From ff36be4df64035fabf66fc82b309fbde27efdf9b Mon Sep 17 00:00:00 2001 From: Ben Corlett Date: Tue, 10 Feb 2026 07:31:38 +0000 Subject: [PATCH 1/5] Increase worker_connections to 256 and enable keep alive --- gunicorn_config.py | 4 ++-- tests/test_gunicorn_config.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 7c2c46da3c..bf8b434772 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -17,9 +17,9 @@ def child_exit(server, worker): workers = 4 worker_class = "eventlet" -worker_connections = 8 # limit runaway greenthread creation +worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) -keepalive = 0 # disable temporarily for diagnosing issues +keepalive = 90 timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py index 748402fb5b..e76ab49c2a 100644 --- a/tests/test_gunicorn_config.py +++ b/tests/test_gunicorn_config.py @@ -4,5 +4,5 @@ def test_gunicorn_config(): assert workers == 4 assert worker_class == "eventlet" - assert keepalive == 0 + assert keepalive == 90 assert timeout == 30 From 3992edf60ae96f0da5b70eea983b80cc33ac833a Mon Sep 17 00:00:00 2001 From: Ben Corlett Date: Tue, 10 Feb 2026 14:34:05 +0000 Subject: [PATCH 2/5] Additional logging --- app/notifications/process_notifications.py | 9 ++++ app/request_timings.py | 34 +++++++++++++ app/v2/notifications/post_notifications.py | 56 ++++++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 app/request_timings.py diff --git a/app/notifications/process_notifications.py b/app/notifications/process_notifications.py index 8cd0f8c39f..84096bd508 100644 --- a/app/notifications/process_notifications.py +++ b/app/notifications/process_notifications.py @@ -1,3 +1,4 @@ +import time import uuid from datetime import datetime @@ -34,6 +35,7 @@ dao_delete_notifications_by_id, ) from app.models import Notification +from app.request_timings import add_context, record_timing from app.utils import ( parse_and_format_phone_number, try_download_template_email_file_from_s3, @@ -189,9 +191,13 @@ def persist_notification( # if simulated create a Notification model to return but do not persist the Notification to the dB if not simulated: + db_insert_start = time.perf_counter() dao_create_notification(notification=notification, _autocommit=_autocommit) + record_timing("db_insert_ms", time.perf_counter() - db_insert_start) # Not sure how we can rollback + redis_limits_start = time.perf_counter() increment_daily_limit_caches(service, notification, key_type) + record_timing("redis_daily_limit_ms", time.perf_counter() - redis_limits_start) return notification @@ -236,7 +242,10 @@ def send_notification_to_queue_detached(key_type, notification_type, notificatio deliver_task = get_pdf_for_templated_letter try: + add_context(queue=queue, celery_task_name=getattr(deliver_task, "name", deliver_task.__name__)) + queue_publish_start = time.perf_counter() deliver_task.apply_async([str(notification_id)], queue=queue) + record_timing("queue_publish_ms", time.perf_counter() - queue_publish_start) except Exception: dao_delete_notifications_by_id(notification_id) raise diff --git a/app/request_timings.py b/app/request_timings.py new file mode 100644 index 0000000000..1684bc861b --- /dev/null +++ b/app/request_timings.py @@ -0,0 +1,34 @@ +from __future__ import annotations + +from flask import g, has_request_context + + +def init_request_timings() -> None: + if not has_request_context(): + return + if not hasattr(g, "request_timings"): + g.request_timings = {} + if not hasattr(g, "request_timing_context"): + g.request_timing_context = {} + + +def record_timing(name: str, duration_seconds: float) -> None: + if not has_request_context(): + return + init_request_timings() + g.request_timings[name] = round(duration_seconds * 1000.0, 3) + + +def add_context(**kwargs: object) -> None: + if not has_request_context(): + return + init_request_timings() + g.request_timing_context.update({k: v for k, v in kwargs.items() if v is not None}) + + +def get_timings() -> tuple[dict[str, float], dict[str, object]]: + if not has_request_context(): + return {}, {} + timings = getattr(g, "request_timings", {}) + context = getattr(g, "request_timing_context", {}) + return timings, context diff --git a/app/v2/notifications/post_notifications.py b/app/v2/notifications/post_notifications.py index 21405fbb33..5457f733ac 100644 --- a/app/v2/notifications/post_notifications.py +++ b/app/v2/notifications/post_notifications.py @@ -1,5 +1,6 @@ import base64 import functools +import time import uuid from datetime import datetime @@ -52,6 +53,7 @@ validate_and_format_recipient, validate_template, ) +from app.request_timings import add_context, get_timings, init_request_timings, record_timing from app.schema_validation import validate from app.utils import try_parse_and_format_phone_number from app.v2.errors import BadRequestError @@ -76,8 +78,37 @@ ) +def _log_slow_request(request_start: float, notification_type: str, notification_id: str | None = None) -> None: + request_time = time.perf_counter() - request_start + if request_time <= 1.0: + return + + timings, context = get_timings() + service_id = getattr(authenticated_service, "id", None) + api_key_type = getattr(api_user, "key_type", None) + add_context( + notification_type=notification_type, + service_id=str(service_id) if service_id is not None else None, + api_key_type=api_key_type, + endpoint=request.endpoint, + path=request.path, + notification_id=notification_id, + request_time_seconds=round(request_time, 3), + ) + _, context = get_timings() + current_app.logger.info( + "slow request breakdown", + extra={ + "timings_ms": timings, + **context, + }, + ) + + @v2_notification_blueprint.route(f"/{LETTER_TYPE}", methods=["POST"]) def post_precompiled_letter_notification(): + init_request_timings() + request_start = time.perf_counter() check_rate_limiting(authenticated_service, api_user, notification_type=LETTER_TYPE) request_json = get_valid_json() @@ -104,13 +135,21 @@ def post_precompiled_letter_notification(): precompiled=True, ) + notification_id = None + if isinstance(notification, dict): + notification_id = str(notification.get("id")) + _log_slow_request(request_start, LETTER_TYPE, notification_id=notification_id) + return jsonify(notification), 201 @v2_notification_blueprint.route("/", methods=["POST"]) def post_notification(notification_type): + init_request_timings() + request_start = time.perf_counter() check_rate_limiting(authenticated_service, api_user, notification_type=notification_type) + json_parse_start = time.perf_counter() with POST_NOTIFICATION_JSON_PARSE_DURATION_SECONDS.time(): request_json = get_valid_json() @@ -122,9 +161,11 @@ def post_notification(notification_type): form = validate(request_json, post_letter_request) else: abort(404) + record_timing("json_parse_validate_ms", time.perf_counter() - json_parse_start) check_service_has_permission(authenticated_service, notification_type) + validate_template_start = time.perf_counter() template, template_with_content = validate_template( template_id=form["template_id"], personalisation=form.get("personalisation", {}), @@ -133,6 +174,7 @@ def post_notification(notification_type): check_char_count=False, recipient=form.get("email_address"), ) + record_timing("validate_template_ms", time.perf_counter() - validate_template_start) reply_to = get_reply_to_text(notification_type, form, template) @@ -156,6 +198,11 @@ def post_notification(notification_type): unsubscribe_link=form.get("one_click_unsubscribe_url", None), ) + notification_id = None + if isinstance(notification, dict): + notification_id = str(notification.get("id")) + _log_slow_request(request_start, notification_type, notification_id=notification_id) + return jsonify(notification), 201 @@ -172,27 +219,34 @@ def process_sms_or_email_notification( notification_id = uuid.uuid4() form_send_to = form["email_address"] if notification_type == EMAIL_TYPE else form["phone_number"] + recipient_validate_start = time.perf_counter() recipient_data = validate_and_format_recipient( send_to=form_send_to, key_type=api_user.key_type, service=service, notification_type=notification_type ) + record_timing("recipient_validate_ms", time.perf_counter() - recipient_validate_start) send_to = recipient_data["normalised_to"] if type(recipient_data) is dict else recipient_data # Do not persist or send notification to the queue if it is a simulated recipient simulated = simulated_recipient(send_to, notification_type) + document_upload_start = time.perf_counter() personalisation, document_download_count = process_document_uploads( form.get("personalisation"), service, send_to=send_to, simulated=simulated, ) + record_timing("document_upload_ms", time.perf_counter() - document_upload_start) + add_context(document_download_count=document_download_count) if document_download_count: # We changed personalisation which means we need to update the content template_with_content.values = personalisation # validate content length after url is replaced in personalisation. + content_validation_start = time.perf_counter() check_is_message_too_long(template_with_content) + record_timing("content_validation_ms", time.perf_counter() - content_validation_start) response = create_response_for_post_notification( notification_id=notification_id, @@ -206,6 +260,7 @@ def process_sms_or_email_notification( template_with_content=template_with_content, ) + persist_start = time.perf_counter() persist_notification( notification_id=notification_id, template_id=template.id, @@ -222,6 +277,7 @@ def process_sms_or_email_notification( unsubscribe_link=unsubscribe_link, document_download_count=document_download_count, ) + record_timing("persist_notification_ms", time.perf_counter() - persist_start) if not simulated: send_notification_to_queue_detached( From 9d51fcdb72fa7b92c2db6069ea8d59831b880aa2 Mon Sep 17 00:00:00 2001 From: Ben Corlett Date: Thu, 19 Feb 2026 09:43:22 +0000 Subject: [PATCH 3/5] Make the number if workers and worker connections an optional environment variable --- gunicorn_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index bf8b434772..71954a29a6 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -15,9 +15,9 @@ def child_exit(server, worker): multiprocess.mark_process_dead(worker.pid) -workers = 4 +workers = os.getenv("GUNICORN_WORKERS", 4) worker_class = "eventlet" -worker_connections = 256 +worker_connections = os.getenv("GUNICORN_WORKER_CONNECTIONS", 256) statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 90 timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class From 63fe9e477e4e058a3e021eb57c7b185c9d8975b6 Mon Sep 17 00:00:00 2001 From: Ben Corlett Date: Thu, 19 Feb 2026 12:31:18 +0000 Subject: [PATCH 4/5] More gunicorn options --- gunicorn_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 71954a29a6..45a2dba164 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -16,10 +16,10 @@ def child_exit(server, worker): workers = os.getenv("GUNICORN_WORKERS", 4) -worker_class = "eventlet" +worker_class = os.getenv("GUNICORN_WORKER_CLASS", "eventlet") worker_connections = os.getenv("GUNICORN_WORKER_CONNECTIONS", 256) statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) -keepalive = 90 +keepalive = os.getenv("GUNICORN_KEEPALIVE", 90) timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) From 72074a581abe6df4932e8b38e2807c159ae0aa9f Mon Sep 17 00:00:00 2001 From: Ben Corlett Date: Thu, 19 Feb 2026 14:18:52 +0000 Subject: [PATCH 5/5] Adding more gunicorn options for load testing --- gunicorn_config.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 45a2dba164..5c253e3c38 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -15,12 +15,29 @@ def child_exit(server, worker): multiprocess.mark_process_dead(worker.pid) -workers = os.getenv("GUNICORN_WORKERS", 4) +workers = int(os.getenv("GUNICORN_WORKERS", "4")) worker_class = os.getenv("GUNICORN_WORKER_CLASS", "eventlet") -worker_connections = os.getenv("GUNICORN_WORKER_CONNECTIONS", 256) + +# For async workers (eventlet/gevent) this controls concurrent clients per worker. +# For sync/gthread this setting is ignored. +worker_connections = int(os.getenv("GUNICORN_WORKER_CONNECTIONS", "256")) + +# Used by gthread worker_class (ignored by eventlet/gevent). +threads = int(os.getenv("GUNICORN_THREADS", "4")) + statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) -keepalive = os.getenv("GUNICORN_KEEPALIVE", 90) -timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class +keepalive = int(os.getenv("GUNICORN_KEEPALIVE", "90")) + +# If using eventlet, this has little effect for long-held requests. +timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", "30")) + +# When shutting down/restarting, wait this long for workers to finish requests. +graceful_timeout = int(os.getenv("GUNICORN_GRACEFUL_TIMEOUT", "30")) + +# Optional worker recycling to reduce impact of slow leaks/fragmentation. +# Gunicorn uses 0 to disable. +max_requests = int(os.getenv("GUNICORN_MAX_REQUESTS", "0")) +max_requests_jitter = int(os.getenv("GUNICORN_MAX_REQUESTS_JITTER", "0")) debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) if debug_post_threshold: