Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ test: ## Run tests
ruff format --check .
pytest -n auto --maxfail=10 -v

.PHONY: test2
test2: ## Run pytest only, without the `ruff format --check` step that `test` runs first
pytest -n auto --maxfail=10 -v

.PHONY: watch-tests
watch-tests: ## Watch tests and run on change
ptw --runner "pytest --testmon -n auto"
Expand Down
5 changes: 5 additions & 0 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
from app.clients.letter.dvla import DVLAClient
from app.clients.sms.firetext import FiretextClient
from app.clients.sms.mmg import MMGClient
from app.otel.metrics import otel_metrics
from app.otel.traces import otel_traces

db = SQLAlchemy()
migrate = Migrate()
Expand Down Expand Up @@ -185,6 +187,9 @@ def create_app(application):

cbc_proxy_client.init_app(application)

otel_metrics.init_app(application)
otel_traces.init_app(application)

register_blueprint(application)
register_v2_blueprints(application)

Expand Down
6 changes: 6 additions & 0 deletions app/celery/nightly_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
from app.notifications.notifications_ses_callback import (
check_and_queue_callback_task,
)
from app.otel.metrics import otel_metrics
from app.utils import get_london_midnight_in_utc


Expand Down Expand Up @@ -250,6 +251,11 @@ def timeout_notifications():

for notification in notifications:
statsd_client.incr(f"timeout-sending.{notification.sent_by}")
# Mirror the statsd "timeout-sending" increment as an OpenTelemetry counter increment,
# carrying the notification's `sent_by` value as an attribute instead of in the metric name.
# NOTE(review): the attribute key is "send_by" while the model field is `sent_by` —
# confirm the intended label name before dashboards start depending on it.
otel_metrics.provider_timeout_sending_counter.add(
amount=1,
attributes={"send_by": notification.sent_by},
)

check_and_queue_callback_task(notification)

current_app.logger.info(
Expand Down
16 changes: 16 additions & 0 deletions app/celery/process_ses_receipts_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
determine_notification_bounce_type,
handle_complaint,
)
from app.otel.metrics import otel_metrics


@notify_celery.task(
Expand Down Expand Up @@ -79,10 +80,25 @@ def process_ses_results(self, response):

statsd_client.incr(f"callback.ses.{notification_status}")

otel_metrics.provider_status_counter.add(
amount=1,
attributes={
"provider": "ses",
"status": notification_status,
},
)

if notification.sent_at:
statsd_client.timing_with_dates(
f"callback.ses.{notification_status}.elapsed-time", datetime.utcnow(), notification.sent_at
)
otel_metrics.provider_request_time_histogram.record(
amount=(datetime.utcnow() - notification.sent_at).total_seconds(),
attributes={
"provider": "ses",
"status": notification_status,
},
)

check_and_queue_callback_task(notification)

Expand Down
23 changes: 23 additions & 0 deletions app/celery/process_sms_client_response_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from app.notifications.notifications_ses_callback import (
check_and_queue_callback_task,
)
from app.otel.metrics import otel_metrics

sms_response_mapper = {
"MMG": get_mmg_responses,
Expand Down Expand Up @@ -74,12 +75,27 @@ def _process_for_status(notification_status, client_name, provider_reference, de

statsd_client.incr(f"callback.{client_name.lower()}.{notification_status}")

otel_metrics.provider_callback_counter.add(
amount=1,
attributes={
"provider_name": client_name.lower(),
"notification_status": notification_status,
},
)

if notification.sent_at:
statsd_client.timing_with_dates(
f"callback.{client_name.lower()}.{notification_status}.elapsed-time",
datetime.utcnow(),
notification.sent_at,
)
# Record the time between the notification being sent and this callback arriving,
# in seconds, alongside the equivalent statsd timing.
# The attribute keys are "provider" / "status" so that this histogram uses the same
# labels as its other call sites (the SES receipts task and the @otel-decorated
# SMS clients); mixing "provider_name" and "provider" on one instrument would split
# the data into series that cannot be aggregated together.
otel_metrics.provider_request_time_histogram.record(
    amount=(datetime.utcnow() - notification.sent_at).total_seconds(),
    attributes={
        "provider": client_name.lower(),
        "status": notification_status,
    },
)

if notification.billable_units == 0:
service = notification.service
Expand All @@ -98,3 +114,10 @@ def _process_for_status(notification_status, client_name, provider_reference, de
check_and_queue_callback_task(notification)
if notification.international:
statsd_client.incr(f"international-sms.{notification_status}.{notification.phone_prefix}")
# Count international SMS callbacks, labelled by notification status and phone prefix
# (the OpenTelemetry counterpart of the statsd "international-sms.*" increment).
# The keyword is `amount`, matching every other counter call in this change; the
# previous spelling `ammount` would raise TypeError on the first international callback.
otel_metrics.international_sms_counter.add(
    amount=1,
    attributes={
        "notification_status": notification_status,
        "phone_prefix": notification.phone_prefix,
    },
)
16 changes: 16 additions & 0 deletions app/celery/scheduled_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@
User,
)
from app.notifications.process_notifications import persist_notification, send_notification_to_queue
from app.otel.metrics import otel_metrics
from app.utils import get_london_midnight_in_utc


Expand Down Expand Up @@ -216,6 +217,14 @@ def generate_sms_delivery_stats():
f"slow-delivery.{report.provider}.delivered-within-minutes.{delivery_interval}.ratio", report.slow_ratio
)

otel_metrics.slow_delivery_ratio_gauge.set(
amount=report.slow_ratio,
attributes={
"provider": report.provider,
"delivery_interval": delivery_interval,
},
)

total_notifications = sum(report.total_notifications for report in providers_slow_delivery_reports)
slow_notifications = sum(report.slow_notifications for report in providers_slow_delivery_reports)
ratio_slow_notifications = slow_notifications / total_notifications
Expand All @@ -224,6 +233,13 @@ def generate_sms_delivery_stats():
f"slow-delivery.sms.delivered-within-minutes.{delivery_interval}.ratio", ratio_slow_notifications
)

otel_metrics.slow_sms_delivery_ratio_gauge.set(
amount=ratio_slow_notifications,
attributes={
"delivery_interval": delivery_interval,
},
)

# For the 5-minute delivery interval, let's check the percentage of all text messages sent that were slow.
# TODO: delete this when we have a way to raise these alerts from eg grafana, prometheus, something else.
if delivery_interval == 5 and current_app.should_check_slow_text_message_delivery:
Expand Down
28 changes: 28 additions & 0 deletions app/clients/email/aws_ses.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
EmailClientException,
EmailClientNonRetryableException,
)
from app.otel.metrics import otel_metrics

ses_response_map = {
"Permanent": {
Expand Down Expand Up @@ -102,6 +103,13 @@ def send_email(
)
except botocore.exceptions.ClientError as e:
self.statsd_client.incr("clients.ses.error")
otel_metrics.provider_status_counter.add(
amount=1,
attributes={
"provider": "ses",
"status": "error",
},
)

# https://docs.aws.amazon.com/ses/latest/APIReference-V2/API_SendEmail.html#API_SendEmail_Errors
if e.response["Error"]["Code"] == "InvalidParameterValue":
Expand All @@ -110,6 +118,13 @@ def send_email(
raise AwsSesClientThrottlingSendRateException(str(e)) from e
else:
self.statsd_client.incr("clients.ses.error")
otel_metrics.provider_status_counter.add(
amount=1,
attributes={
"provider": "ses",
"status": "error",
},
)
raise AwsSesClientException(str(e) + e.response["Error"]["Code"]) from e
except Exception as e:
self.statsd_client.incr("clients.ses.error")
Expand All @@ -124,7 +139,20 @@ def send_email(
},
)
self.statsd_client.timing("clients.ses.request-time", elapsed_time)
# Record how long the SES send request took, next to the statsd request-time timing.
# The attribute key is "provider" (not "provider_name") so it matches the
# provider_status_counter calls in this module and the other call sites of this
# histogram; one instrument should use one label name for the provider.
otel_metrics.provider_request_time_histogram.record(
    amount=elapsed_time,
    attributes={
        "provider": "ses",
    },
)
self.statsd_client.incr("clients.ses.success")
otel_metrics.provider_status_counter.add(
amount=1,
attributes={
"provider": "ses",
"status": "success",
},
)
return response["MessageId"]


Expand Down
19 changes: 7 additions & 12 deletions app/clients/sms/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,24 +45,19 @@ def __init__(self, current_app, statsd_client):
**adapter.poolmanager.connection_pool_kw,
}

def record_outcome(self, success):
if success:
self.current_app.logger.info("Provider request for %s %s", self.name, "succeeded" if success else "failed")
self.statsd_client.incr(f"clients.{self.name}.success")
else:
self.statsd_client.incr(f"clients.{self.name}.error")
self.current_app.logger.warning(
"Provider request for %s %s", self.name, "succeeded" if success else "failed"
)

def send_sms(self, to, content, reference, international, sender):
start_time = monotonic()

try:
response = self.try_send_sms(to, content, reference, international, sender)
self.record_outcome(True)
self.current_app.logger.info("Provider request for %s succeeded", self.name)
self.statsd_client.incr(f"clients.{self.name}.success")
except SmsClientResponseException as e:
self.record_outcome(False)
self.statsd_client.incr(f"clients.{self.name}.error")
self.current_app.logger.warning(
"Provider request for %s failed",
self.name,
)
raise e
finally:
elapsed_time = monotonic() - start_time
Expand Down
9 changes: 9 additions & 0 deletions app/clients/sms/firetext.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import requests

from app.clients.sms import SmsClient, SmsClientResponseException
from app.otel.decorators import otel

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -57,6 +58,14 @@ def __init__(self, *args, **kwargs):
self.url = self.current_app.config.get("FIRETEXT_URL")
self.receipt_url = self.current_app.config.get("FIRETEXT_RECEIPT_URL")

# Override that only exists to attach the @otel decorator: the call is forwarded
# unchanged to SmsClient.send_sms and its result is returned as-is.
# The decorator is given the instrument names "provider_status_counter" and
# "provider_request_time_histogram" with the attribute provider="firetext".
# NOTE(review): presumably it counts outcomes and times the call — confirm in app.otel.decorators.
@otel(
"provider_status_counter",
"provider_request_time_histogram",
attributes={"provider": "firetext"},
)
def send_sms(self, *args, **kwargs):
return super().send_sms(*args, **kwargs)

def try_send_sms(self, to, content, reference, international, sender):
data = {
"apiKey": self.international_api_key if international else self.api_key,
Expand Down
9 changes: 9 additions & 0 deletions app/clients/sms/mmg.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import requests

from app.clients.sms import SmsClient, SmsClientResponseException
from app.otel.decorators import otel

# For some extra context, see google drive: GOV.UK Notify -> SMS suppliers -> Detailed failure statuses
mmg_response_map = {
Expand Down Expand Up @@ -88,6 +89,14 @@ def __init__(self, *args, **kwargs):
self.mmg_url = self.current_app.config.get("MMG_URL")
self.receipt_url = self.current_app.config.get("MMG_RECEIPT_URL")

# Override that only exists to attach the @otel decorator: the call is forwarded
# unchanged to SmsClient.send_sms and its result is returned as-is.
# The decorator is given the instrument names "provider_status_counter" and
# "provider_request_time_histogram" with the attribute provider="mmg".
# NOTE(review): presumably it counts outcomes and times the call — confirm in app.otel.decorators.
@otel(
"provider_status_counter",
"provider_request_time_histogram",
attributes={"provider": "mmg"},
)
def send_sms(self, *args, **kwargs):
return super().send_sms(*args, **kwargs)

def try_send_sms(self, to, content, reference, international, sender):
data = {"reqType": "BULK", "MSISDN": to, "msg": content, "sender": sender, "cid": reference, "multi": True}

Expand Down
4 changes: 4 additions & 0 deletions app/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@
Template,
User,
)
from app.otel.decorators import otel


@click.group(name="command", help="Additional commands")
Expand Down Expand Up @@ -397,6 +398,7 @@ def bulk_invite_user_to_service(file_name, service_id, user_id, auth_type, permi
"-s", "--start_date", default=datetime(2017, 2, 1), help="start date inclusive", type=click_dt(format="%Y-%m-%d")
)
@statsd(namespace="tasks")
@otel()
def populate_notification_postage(start_date):
current_app.logger.info("populating historical notification postage")

Expand Down Expand Up @@ -442,6 +444,7 @@ def populate_notification_postage(start_date):
@click.option("-s", "--start_date", required=True, help="start date inclusive", type=click_dt(format="%Y-%m-%d"))
@click.option("-e", "--end_date", required=True, help="end date inclusive", type=click_dt(format="%Y-%m-%d"))
@statsd(namespace="tasks")
@otel()
def update_jobs_archived_flag(start_date, end_date):
current_app.logger.info("Archiving jobs created between %s to %s", start_date, end_date)

Expand Down Expand Up @@ -475,6 +478,7 @@ def update_jobs_archived_flag(start_date, end_date):
@notify_command(name="update-emails-to-remove-gsi")
@click.option("-s", "--service_id", required=True, help="service id. Update all user.email_address to remove .gsi")
@statsd(namespace="tasks")
@otel()
def update_emails_to_remove_gsi(service_id):
users_to_update = """SELECT u.id user_id, u.name, email_address, s.id, s.name
FROM users u
Expand Down
10 changes: 10 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ class Config:
CELERY_WORKER_LOG_LEVEL = os.getenv("CELERY_WORKER_LOG_LEVEL", "CRITICAL").upper()
CELERY_BEAT_LOG_LEVEL = os.getenv("CELERY_BEAT_LOG_LEVEL", "INFO").upper()

# OpenTelemetry settings; each can be overridden by the environment variable of the same name.
# Export type defaults to "otlp" here; the Development and Test configs default it to "none".
OTEL_EXPORT_TYPE = os.getenv("OTEL_EXPORT_TYPE", "otlp")
# Default is host:port with no scheme.
# NOTE(review): presumably the collector's OTLP gRPC address — confirm against app.otel.
OTEL_COLLECTOR_ENDPOINT = os.getenv("OTEL_COLLECTOR_ENDPOINT", "localhost:4317")
# Comma-separated list of instrumentation names.
# NOTE(review): presumably consumed by app.otel to choose which libraries to instrument — confirm.
OTEL_INSTRUMENTATIONS = os.getenv(
"OTEL_INSTRUMENTATIONS", "wsgi,celery,flask,redis,sqlalchemy,requests,psycopg2,boto3sqs"
)

# secrets that internal apps, such as the admin app or document download, must use to authenticate with the API
ADMIN_CLIENT_ID = "notify-admin"
FUNCTIONAL_TESTS_CLIENT_ID = "notify-functional-tests"
Expand Down Expand Up @@ -526,6 +532,8 @@ class Development(Config):

CELERY_WORKER_LOG_LEVEL = "INFO"

OTEL_EXPORT_TYPE = os.getenv("OTEL_EXPORT_TYPE", "none")

CELERY = {
**Config.CELERY,
"broker_transport_options": {
Expand Down Expand Up @@ -586,6 +594,8 @@ class Test(Development):

CELERY_WORKER_LOG_LEVEL = "INFO"

OTEL_EXPORT_TYPE = os.getenv("OTEL_EXPORT_TYPE", "none")

S3_BUCKET_CSV_UPLOAD = "test-notifications-csv-upload"
S3_BUCKET_CONTACT_LIST = "test-contact-list"
S3_BUCKET_TEST_LETTERS = "test-test-letters"
Expand Down
Loading