From ff36be4df64035fabf66fc82b309fbde27efdf9b Mon Sep 17 00:00:00 2001
From: Ben Corlett <benjamin.corlett@digital.cabinet-office.gov.uk>
Date: Tue, 10 Feb 2026 07:31:38 +0000
Subject: [PATCH 1/5] Increase worker_connections to 256 and enable keep alive

---
 gunicorn_config.py            | 4 ++--
 tests/test_gunicorn_config.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/gunicorn_config.py b/gunicorn_config.py
index 7c2c46da3c..bf8b434772 100644
--- a/gunicorn_config.py
+++ b/gunicorn_config.py
@@ -17,9 +17,9 @@ def child_exit(server, worker):
 
 workers = 4
 worker_class = "eventlet"
-worker_connections = 8  # limit runaway greenthread creation
+worker_connections = 256
 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST"))
-keepalive = 0  # disable temporarily for diagnosing issues
+keepalive = 90
 timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30))  # though has little effect with eventlet worker_class
 
 debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None)
diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py
index 748402fb5b..e76ab49c2a 100644
--- a/tests/test_gunicorn_config.py
+++ b/tests/test_gunicorn_config.py
@@ -4,5 +4,5 @@
 def test_gunicorn_config():
     assert workers == 4
     assert worker_class == "eventlet"
-    assert keepalive == 0
+    assert keepalive == 90
     assert timeout == 30

From 3992edf60ae96f0da5b70eea983b80cc33ac833a Mon Sep 17 00:00:00 2001
From: Ben Corlett <benjamin.corlett@digital.cabinet-office.gov.uk>
Date: Tue, 10 Feb 2026 14:34:05 +0000
Subject: [PATCH 2/5] Additional logging

---
 app/notifications/process_notifications.py |  9 ++++
 app/request_timings.py                     | 34 +++++++++++++
 app/v2/notifications/post_notifications.py | 56 ++++++++++++++++++++++
 3 files changed, 99 insertions(+)
 create mode 100644 app/request_timings.py

diff --git a/app/notifications/process_notifications.py b/app/notifications/process_notifications.py
index 8cd0f8c39f..84096bd508 100644
--- a/app/notifications/process_notifications.py
+++ b/app/notifications/process_notifications.py
@@ -1,3 +1,4 @@
+import time
 import uuid
 from datetime import datetime
 
@@ -34,6 +35,7 @@
     dao_delete_notifications_by_id,
 )
 from app.models import Notification
+from app.request_timings import add_context, record_timing
 from app.utils import (
     parse_and_format_phone_number,
     try_download_template_email_file_from_s3,
@@ -189,9 +191,13 @@ def persist_notification(
 
     # if simulated create a Notification model to return but do not persist the Notification to the dB
     if not simulated:
+        db_insert_start = time.perf_counter()
         dao_create_notification(notification=notification, _autocommit=_autocommit)
+        record_timing("db_insert_ms", time.perf_counter() - db_insert_start)
         # Not sure how we can rollback
+        redis_limits_start = time.perf_counter()
         increment_daily_limit_caches(service, notification, key_type)
+        record_timing("redis_daily_limit_ms", time.perf_counter() - redis_limits_start)
 
     return notification
 
@@ -236,7 +242,10 @@ def send_notification_to_queue_detached(key_type, notification_type, notificatio
         deliver_task = get_pdf_for_templated_letter
 
     try:
+        add_context(queue=queue, celery_task_name=getattr(deliver_task, "name", deliver_task.__name__))
+        queue_publish_start = time.perf_counter()
         deliver_task.apply_async([str(notification_id)], queue=queue)
+        record_timing("queue_publish_ms", time.perf_counter() - queue_publish_start)
     except Exception:
         dao_delete_notifications_by_id(notification_id)
         raise
diff --git a/app/request_timings.py b/app/request_timings.py
new file mode 100644
index 0000000000..1684bc861b
--- /dev/null
+++ b/app/request_timings.py
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from flask import g, has_request_context
+
+
+def init_request_timings() -> None:
+    if not has_request_context():
+        return
+    if not hasattr(g, "request_timings"):
+        g.request_timings = {}
+    if not hasattr(g, "request_timing_context"):
+        g.request_timing_context = {}
+
+
+def record_timing(name: str, duration_seconds: float) -> None:
+    if not has_request_context():
+        return
+    init_request_timings()
+    g.request_timings[name] = round(duration_seconds * 1000.0, 3)
+
+
+def add_context(**kwargs: object) -> None:
+    if not has_request_context():
+        return
+    init_request_timings()
+    g.request_timing_context.update({k: v for k, v in kwargs.items() if v is not None})
+
+
+def get_timings() -> tuple[dict[str, float], dict[str, object]]:
+    if not has_request_context():
+        return {}, {}
+    timings = getattr(g, "request_timings", {})
+    context = getattr(g, "request_timing_context", {})
+    return timings, context
diff --git a/app/v2/notifications/post_notifications.py b/app/v2/notifications/post_notifications.py
index 21405fbb33..5457f733ac 100644
--- a/app/v2/notifications/post_notifications.py
+++ b/app/v2/notifications/post_notifications.py
@@ -1,5 +1,6 @@
 import base64
 import functools
+import time
 import uuid
 from datetime import datetime
 
@@ -52,6 +53,7 @@
     validate_and_format_recipient,
     validate_template,
 )
+from app.request_timings import add_context, get_timings, init_request_timings, record_timing
 from app.schema_validation import validate
 from app.utils import try_parse_and_format_phone_number
 from app.v2.errors import BadRequestError
@@ -76,8 +78,37 @@
 )
 
 
+def _log_slow_request(request_start: float, notification_type: str, notification_id: str | None = None) -> None:
+    request_time = time.perf_counter() - request_start
+    if request_time <= 1.0:
+        return
+
+    timings, context = get_timings()
+    service_id = getattr(authenticated_service, "id", None)
+    api_key_type = getattr(api_user, "key_type", None)
+    add_context(
+        notification_type=notification_type,
+        service_id=str(service_id) if service_id is not None else None,
+        api_key_type=api_key_type,
+        endpoint=request.endpoint,
+        path=request.path,
+        notification_id=notification_id,
+        request_time_seconds=round(request_time, 3),
+    )
+    _, context = get_timings()
+    current_app.logger.info(
+        "slow request breakdown",
+        extra={
+            "timings_ms": timings,
+            **context,
+        },
+    )
+
+
 @v2_notification_blueprint.route(f"/{LETTER_TYPE}", methods=["POST"])
 def post_precompiled_letter_notification():
+    init_request_timings()
+    request_start = time.perf_counter()
     check_rate_limiting(authenticated_service, api_user, notification_type=LETTER_TYPE)
 
     request_json = get_valid_json()
@@ -104,13 +135,21 @@ def post_precompiled_letter_notification():
         precompiled=True,
     )
 
+    notification_id = None
+    if isinstance(notification, dict):
+        notification_id = str(notification.get("id"))
+    _log_slow_request(request_start, LETTER_TYPE, notification_id=notification_id)
+
     return jsonify(notification), 201
 
 
 @v2_notification_blueprint.route("/<notification_type>", methods=["POST"])
 def post_notification(notification_type):
+    init_request_timings()
+    request_start = time.perf_counter()
     check_rate_limiting(authenticated_service, api_user, notification_type=notification_type)
 
+    json_parse_start = time.perf_counter()
     with POST_NOTIFICATION_JSON_PARSE_DURATION_SECONDS.time():
         request_json = get_valid_json()
 
@@ -122,9 +161,11 @@ def post_notification(notification_type):
             form = validate(request_json, post_letter_request)
         else:
             abort(404)
+    record_timing("json_parse_validate_ms", time.perf_counter() - json_parse_start)
 
     check_service_has_permission(authenticated_service, notification_type)
 
+    validate_template_start = time.perf_counter()
     template, template_with_content = validate_template(
         template_id=form["template_id"],
         personalisation=form.get("personalisation", {}),
@@ -133,6 +174,7 @@ def post_notification(notification_type):
         check_char_count=False,
         recipient=form.get("email_address"),
     )
+    record_timing("validate_template_ms", time.perf_counter() - validate_template_start)
 
     reply_to = get_reply_to_text(notification_type, form, template)
 
@@ -156,6 +198,11 @@ def post_notification(notification_type):
             unsubscribe_link=form.get("one_click_unsubscribe_url", None),
         )
 
+    notification_id = None
+    if isinstance(notification, dict):
+        notification_id = str(notification.get("id"))
+    _log_slow_request(request_start, notification_type, notification_id=notification_id)
+
     return jsonify(notification), 201
 
 
@@ -172,27 +219,34 @@ def process_sms_or_email_notification(
     notification_id = uuid.uuid4()
     form_send_to = form["email_address"] if notification_type == EMAIL_TYPE else form["phone_number"]
 
+    recipient_validate_start = time.perf_counter()
     recipient_data = validate_and_format_recipient(
         send_to=form_send_to, key_type=api_user.key_type, service=service, notification_type=notification_type
     )
+    record_timing("recipient_validate_ms", time.perf_counter() - recipient_validate_start)
 
     send_to = recipient_data["normalised_to"] if type(recipient_data) is dict else recipient_data
 
     # Do not persist or send notification to the queue if it is a simulated recipient
     simulated = simulated_recipient(send_to, notification_type)
 
+    document_upload_start = time.perf_counter()
     personalisation, document_download_count = process_document_uploads(
         form.get("personalisation"),
         service,
         send_to=send_to,
         simulated=simulated,
     )
+    record_timing("document_upload_ms", time.perf_counter() - document_upload_start)
+    add_context(document_download_count=document_download_count)
     if document_download_count:
         # We changed personalisation which means we need to update the content
         template_with_content.values = personalisation
 
     # validate content length after url is replaced in personalisation.
+    content_validation_start = time.perf_counter()
     check_is_message_too_long(template_with_content)
+    record_timing("content_validation_ms", time.perf_counter() - content_validation_start)
 
     response = create_response_for_post_notification(
         notification_id=notification_id,
@@ -206,6 +260,7 @@ def process_sms_or_email_notification(
         template_with_content=template_with_content,
     )
 
+    persist_start = time.perf_counter()
     persist_notification(
         notification_id=notification_id,
         template_id=template.id,
@@ -222,6 +277,7 @@ def process_sms_or_email_notification(
         unsubscribe_link=unsubscribe_link,
         document_download_count=document_download_count,
     )
+    record_timing("persist_notification_ms", time.perf_counter() - persist_start)
 
     if not simulated:
         send_notification_to_queue_detached(

From 9d51fcdb72fa7b92c2db6069ea8d59831b880aa2 Mon Sep 17 00:00:00 2001
From: Ben Corlett <benjamin.corlett@digital.cabinet-office.gov.uk>
Date: Thu, 19 Feb 2026 09:43:22 +0000
Subject: [PATCH 3/5] Make the number if workers and worker connections an
 optional environment variable

---
 gunicorn_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gunicorn_config.py b/gunicorn_config.py
index bf8b434772..71954a29a6 100644
--- a/gunicorn_config.py
+++ b/gunicorn_config.py
@@ -15,9 +15,9 @@ def child_exit(server, worker):
     multiprocess.mark_process_dead(worker.pid)
 
 
-workers = 4
+workers = os.getenv("GUNICORN_WORKERS", 4)
 worker_class = "eventlet"
-worker_connections = 256
+worker_connections = os.getenv("GUNICORN_WORKER_CONNECTIONS", 256)
 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST"))
 keepalive = 90
 timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30))  # though has little effect with eventlet worker_class

From 63fe9e477e4e058a3e021eb57c7b185c9d8975b6 Mon Sep 17 00:00:00 2001
From: Ben Corlett <benjamin.corlett@digital.cabinet-office.gov.uk>
Date: Thu, 19 Feb 2026 12:31:18 +0000
Subject: [PATCH 4/5] More gunicorn options

---
 gunicorn_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gunicorn_config.py b/gunicorn_config.py
index 71954a29a6..45a2dba164 100644
--- a/gunicorn_config.py
+++ b/gunicorn_config.py
@@ -16,10 +16,10 @@ def child_exit(server, worker):
 
 
 workers = os.getenv("GUNICORN_WORKERS", 4)
-worker_class = "eventlet"
+worker_class = os.getenv("GUNICORN_WORKER_CLASS", "eventlet")
 worker_connections = os.getenv("GUNICORN_WORKER_CONNECTIONS", 256)
 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST"))
-keepalive = 90
+keepalive = os.getenv("GUNICORN_KEEPALIVE", 90)
 timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30))  # though has little effect with eventlet worker_class
 
 debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None)

From 72074a581abe6df4932e8b38e2807c159ae0aa9f Mon Sep 17 00:00:00 2001
From: Ben Corlett <benjamin.corlett@digital.cabinet-office.gov.uk>
Date: Thu, 19 Feb 2026 14:18:52 +0000
Subject: [PATCH 5/5] Adding more gunicorn options for load testing

---
 gunicorn_config.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/gunicorn_config.py b/gunicorn_config.py
index 45a2dba164..5c253e3c38 100644
--- a/gunicorn_config.py
+++ b/gunicorn_config.py
@@ -15,12 +15,29 @@ def child_exit(server, worker):
     multiprocess.mark_process_dead(worker.pid)
 
 
-workers = os.getenv("GUNICORN_WORKERS", 4)
+workers = int(os.getenv("GUNICORN_WORKERS", "4"))
 worker_class = os.getenv("GUNICORN_WORKER_CLASS", "eventlet")
-worker_connections = os.getenv("GUNICORN_WORKER_CONNECTIONS", 256)
+
+# For async workers (eventlet/gevent) this controls concurrent clients per worker.
+# For sync/gthread this setting is ignored.
+worker_connections = int(os.getenv("GUNICORN_WORKER_CONNECTIONS", "256"))
+
+# Used by gthread worker_class (ignored by eventlet/gevent).
+threads = int(os.getenv("GUNICORN_THREADS", "4"))
+
 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST"))
-keepalive = os.getenv("GUNICORN_KEEPALIVE", 90)
-timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30))  # though has little effect with eventlet worker_class
+keepalive = int(os.getenv("GUNICORN_KEEPALIVE", "90"))
+
+# If using eventlet, this has little effect for long-held requests.
+timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", "30"))
+
+# When shutting down/restarting, wait this long for workers to finish requests.
+graceful_timeout = int(os.getenv("GUNICORN_GRACEFUL_TIMEOUT", "30"))
+
+# Optional worker recycling to reduce impact of slow leaks/fragmentation.
+# Gunicorn uses 0 to disable.
+max_requests = int(os.getenv("GUNICORN_MAX_REQUESTS", "0"))
+max_requests_jitter = int(os.getenv("GUNICORN_MAX_REQUESTS_JITTER", "0"))
 
 debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None)
 if debug_post_threshold: