From 00e279e6994d2673d523129daefe8dacbae9f183 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Wed, 2 Jul 2025 15:53:50 +0100 Subject: [PATCH 01/21] TMP gevent --- app/__init__.py | 6 +-- app/config.py | 2 +- app/errors.py | 6 +-- app/v2/errors.py | 6 +-- application.py | 10 ++-- gunicorn_config.py | 97 ++------------------------------------- requirements.in | 2 +- requirements.txt | 20 ++++++-- requirements_for_test.txt | 24 +++++++++- 9 files changed, 58 insertions(+), 115 deletions(-) diff --git a/app/__init__.py b/app/__init__.py index 9175c2b737..5e3156d0aa 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -27,7 +27,7 @@ from notifications_utils.clients.signing.signing_client import Signing from notifications_utils.clients.statsd.statsd_client import StatsdClient from notifications_utils.clients.zendesk.zendesk_client import ZendeskClient -from notifications_utils.eventlet import EventletTimeout +from notifications_utils.greenlet import RequestHandlingTimeout from notifications_utils.local_vars import LazyLocalGetter from notifications_utils.logging import flask as utils_logging from sqlalchemy import event @@ -425,8 +425,8 @@ def exception(error): code = getattr(error, "code", 500) return jsonify(result="error", message=msg), code - @app.errorhandler(EventletTimeout) - def eventlet_timeout(error): + @app.errorhandler(RequestHandlingTimeout) + def request_handling_timeout(error): app.logger.exception(error) return jsonify(result="error", message="Timeout serving request"), 504 diff --git a/app/config.py b/app/config.py index 6d40a413f4..ea2a6480ec 100644 --- a/app/config.py +++ b/app/config.py @@ -155,7 +155,7 @@ class Config: # Antivirus ANTIVIRUS_ENABLED = True - NOTIFY_EVENTLET_STATS = os.getenv("NOTIFY_EVENTLET_STATS", "0") == "1" + NOTIFY_GREENLET_STATS = os.getenv("NOTIFY_GREENLET_STATS", "0") == "1" ########################### # Default config values ### diff --git a/app/errors.py b/app/errors.py index ffeb4aff39..d99fc6e9aa 100644 --- a/app/errors.py +++ b/app/errors.py @@ -1,7 +1,7 @@ from flask import current_app, json, jsonify from jsonschema import ValidationError as JsonSchemaValidationError from marshmallow import ValidationError -from notifications_utils.eventlet import EventletTimeout +from notifications_utils.greenlet import RequestHandlingTimeout from notifications_utils.recipient_validation.errors import InvalidRecipientError from sqlalchemy.exc import DataError from sqlalchemy.orm.exc import NoResultFound @@ -98,8 +98,8 @@ def no_result_found(e): current_app.logger.info(e, exc_info=True) return jsonify(result="error", message="No result found"), 404 - @blueprint.errorhandler(EventletTimeout) - def eventlet_timeout(error): + @blueprint.errorhandler(RequestHandlingTimeout) + def request_handling_timeout(error): current_app.logger.exception(error) return jsonify(result="error", message="Timeout serving request"), 504 diff --git a/app/v2/errors.py b/app/v2/errors.py index 091948dbaa..291c1315c3 100644 --- a/app/v2/errors.py +++ b/app/v2/errors.py @@ -2,7 +2,7 @@ from flask import current_app, jsonify, request from jsonschema import ValidationError as JsonSchemaValidationError -from notifications_utils.eventlet import EventletTimeout +from notifications_utils.greenlet import RequestHandlingTimeout from notifications_utils.recipient_validation.errors import InvalidPhoneError, InvalidRecipientError from sqlalchemy.exc import DataError from sqlalchemy.orm.exc import NoResultFound @@ -139,8 +139,8 @@ def auth_error(error): current_app.logger.info("API AuthError, client: %s error: %s", request.headers.get("User-Agent"), error) return jsonify(error.to_dict_v2()), error.code - @blueprint.errorhandler(EventletTimeout) - def eventlet_timeout(error): + @blueprint.errorhandler(RequestHandlingTimeout) + def request_handling_timeout(error): current_app.logger.exception(error) return ( jsonify( diff --git a/application.py b/application.py index 0eac7c774e..f079fef67a 100644 --- a/application.py +++ b/application.py @@ -8,20 +8,20 @@ from app import create_app # noqa from app.notify_api_flask_app import NotifyApiFlaskApp # noqa -import notifications_utils.eventlet as utils_eventlet # noqa +import notifications_utils.greenlet as utils_greenlet # noqa application = NotifyApiFlaskApp("app") create_app(application) -if utils_eventlet.using_eventlet: - application.wsgi_app = utils_eventlet.EventletTimeoutMiddleware( +if utils_greenlet.using_eventlet or utils_greenlet.using_gevent: + application.wsgi_app = utils_greenlet.RequestHandlingTimeoutMiddleware( application.wsgi_app, timeout_seconds=int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)), ) - if application.config["NOTIFY_EVENTLET_STATS"]: + if application.config["NOTIFY_GREENLET_STATS"]: import greenlet - greenlet.settrace(utils_eventlet.account_greenlet_times) + greenlet.settrace(utils_greenlet.account_greenlet_times) application._server_greenlet = greenlet.getcurrent() diff --git a/gunicorn_config.py b/gunicorn_config.py index 7c2c46da3c..e06a2cc34b 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -6,7 +6,7 @@ # importing child_exit from gds_metrics.gunicorn has the side effect of eagerly importing -# prometheus_client, flask, werkzeug and more, which is a bad idea to do before eventlet +# prometheus_client, flask, werkzeug and more, which is a bad idea to do before eventlet/gevent # has done its monkeypatching. use a nested import for the rare cases child_exit is actually # called instead. def child_exit(server, worker): @@ -16,99 +16,8 @@ def child_exit(server, worker): workers = 4 -worker_class = "eventlet" +worker_class = "gevent" worker_connections = 8 # limit runaway greenthread creation statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 0 # disable temporarily for diagnosing issues -timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class - -debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) -if debug_post_threshold: - debug_post_threshold_float = float(debug_post_threshold) - profiler = None - - def pre_request(worker, req): - # using os.times() to avoid additional imports before eventlet monkeypatching - req._pre_request_elapsed = os.times().elapsed - - def _process_item(process, attr, process_times_before): - value = process.info[attr] - if attr == "cpu_times": - if process.pid in process_times_before: - value = [ - after - before - for before, after in zip(process_times_before[process.pid], process.info["cpu_times"], strict=False) - ] - else: - # we have no way of giving a sensible value - value = None - - if isinstance(value, tuple): - # convert to list for more compact (and json-able) representation - return list(value) - - return value - - def post_request(worker, req, environ, resp): - elapsed = os.times().elapsed - req._pre_request_elapsed - if elapsed > debug_post_threshold_float: - import json - import time - from io import StringIO - from os import getpid - from pstats import Stats - - import psutil - from eventlet.green import profile - - # include cpu_percent to give later call a "start time" to work with - process_times_before = { - p.pid: p.info["cpu_times"] for p in psutil.process_iter(["cpu_percent", "cpu_times"]) - } - - global profiler - if profiler is None: - profiler = profile.Profile() - - should_profile = not getattr(profiler, "running", False) - if should_profile: - profiler.start() - - perf_counter_before = time.perf_counter() - - time.sleep(0.1) # period over which to profile and calculate cpu_percent - - perf_counter_after = time.perf_counter() - - if should_profile: - profiler.stop() - prof_out_sio = StringIO() - s = Stats(profiler, stream=prof_out_sio) - s.sort_stats("cumulative") - s.print_stats(0.1) - prof_out_str = prof_out_sio.getvalue() - else: - prof_out_str = "profiler already running - no profile collected" - - attrs = ["pid", "name", "cpu_percent", "cpu_times", "status", "memory_info"] - - context = { - "actual_profile_period": perf_counter_after - perf_counter_before, - "request_time": elapsed, - "processes": json.dumps( - [ - attrs, - [ - [_process_item(p, a, process_times_before) for a in attrs] - for p in psutil.process_iter(attrs) - ], - ] - ), - "profile": prof_out_str, - "process_": getpid(), - } - worker.log.info( - "post-request diagnostics for request of %(request_time)ss", - context, - extra=context, - ) +timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with gevent worker_class diff --git a/requirements.in b/requirements.in index 933c4760b4..68bade7595 100644 --- a/requirements.in +++ b/requirements.in @@ -25,7 +25,7 @@ psutil>=6.0.0,<7.0.0 notifications-python-client==10.0.1 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@99.8.0 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@ris-gevent # gds-metrics requires prometheseus 0.2.0, override that requirement as 0.7.1 brings significant performance gains prometheus-client==0.14.1 diff --git a/requirements.txt b/requirements.txt index a53d89ec9b..5c4383ec70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -94,10 +94,15 @@ fqdn==1.5.1 # via jsonschema gds-metrics @ git+https://github.com/alphagov/gds_metrics_python.git@6f1840a57b6fb1ee40b7e84f2f18ec229de8aa72 # via -r requirements.in +gevent==25.5.1 + # via gunicorn govuk-bank-holidays==0.15 # via notifications-utils greenlet==3.2.2 - # via eventlet + # via + # eventlet + # gevent + # sqlalchemy gunicorn==23.0.0 # via # -r requirements.in @@ -150,7 +155,7 @@ mistune==0.8.4 # via notifications-utils notifications-python-client==10.0.1 # via -r requirements.in -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@a97b36f6a32e7bb917152c8cd716fe65fa15ac9f +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@2cf33bd9287d4c0114679ba7d62c19a8f01f1978 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils @@ -181,11 +186,12 @@ pyjwt==2.10.1 # notifications-python-client pypdf==3.17.0 # via notifications-utils -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # arrow # botocore # celery + # notifications-utils python-json-logger==3.3.0 # via notifications-utils pytz==2024.2 @@ -218,6 +224,10 @@ segno==1.6.1 # via notifications-utils sentry-sdk==1.45.1 # via -r requirements.in +setuptools==80.9.0 + # via + # zope-event + # zope-interface six==1.16.0 # via # click-repl @@ -256,3 +266,7 @@ webcolors==1.12 # via jsonschema werkzeug==3.1.3 # via flask +zope-event==5.1 + # via gevent +zope-interface==7.2 + # via gevent diff --git a/requirements_for_test.txt b/requirements_for_test.txt index 1f021e5010..9f1b10e1ce 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -143,6 +143,10 @@ freezegun==1.5.1 # via -r requirements_for_test_common.in gds-metrics @ git+https://github.com/alphagov/gds_metrics_python.git@6f1840a57b6fb1ee40b7e84f2f18ec229de8aa72 # via -r requirements.txt +gevent==25.5.1 + # via + # -r requirements.txt + # gunicorn govuk-bank-holidays==0.15 # via # -r requirements.txt @@ -151,6 +155,8 @@ greenlet==3.2.2 # via # -r requirements.txt # eventlet + # gevent + # sqlalchemy gunicorn==23.0.0 # via # -r requirements.txt @@ -220,7 +226,7 @@ moto==5.0.11 # via -r requirements_for_test.in notifications-python-client==10.0.1 # via -r requirements.txt -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@a97b36f6a32e7bb917152c8cd716fe65fa15ac9f +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@2cf33bd9287d4c0114679ba7d62c19a8f01f1978 # via -r requirements.txt ordered-set==4.1.0 # via @@ -280,7 +286,7 @@ pytest-testmon==2.1.1 # via -r requirements_for_test_common.in pytest-xdist==3.6.1 # via -r requirements_for_test_common.in -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # -r requirements.txt # arrow @@ -288,6 +294,7 @@ python-dateutil==2.8.2 # celery # freezegun # moto + # notifications-utils python-json-logger==3.3.0 # via # -r requirements.txt @@ -344,6 +351,11 @@ segno==1.6.1 # notifications-utils sentry-sdk==1.45.1 # via -r requirements.txt +setuptools==80.9.0 + # via + # -r requirements.txt + # zope-event + # zope-interface six==1.16.0 # via # -r requirements.txt @@ -403,3 +415,11 @@ werkzeug==3.1.3 # pytest-httpserver xmltodict==0.14.2 # via moto +zope-event==5.1 + # via + # -r requirements.txt + # gevent +zope-interface==7.2 + # via + # -r requirements.txt + # gevent From f761627f344759249b245d18fed6cf86e50804a9 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Wed, 2 Jul 2025 16:24:04 +0100 Subject: [PATCH 02/21] TMP --- tests/test_gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py index 748402fb5b..9bc5c61afb 100644 --- a/tests/test_gunicorn_config.py +++ b/tests/test_gunicorn_config.py @@ -3,6 +3,6 @@ def test_gunicorn_config(): assert workers == 4 - assert worker_class == "eventlet" + assert worker_class == "gevent" assert keepalive == 0 assert timeout == 30 From 82c3c7b5c110aa27f7850ff634cab6fa325912ee Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Thu, 3 Jul 2025 16:45:57 +0100 Subject: [PATCH 03/21] TMP back to eventlet --- gunicorn_config.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index e06a2cc34b..87ba56184a 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -15,9 +15,9 @@ def child_exit(server, worker): multiprocess.mark_process_dead(worker.pid) -workers = 4 -worker_class = "gevent" -worker_connections = 8 # limit runaway greenthread creation +workers = 2 +worker_class = "eventlet" +worker_connections = 128 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 0 # disable temporarily for diagnosing issues timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with gevent worker_class From 2e2e98c46e445d5ba8eace3f8abeadce6634dc0d Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 09:31:13 +0100 Subject: [PATCH 04/21] TMP gevent --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 87ba56184a..d5371669d3 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -16,7 +16,7 @@ def child_exit(server, worker): workers = 2 -worker_class = "eventlet" +worker_class = "gevent" worker_connections = 128 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 0 # disable temporarily for diagnosing issues From 318119922ef27acdc18640fe8df694e221f05d13 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 11:33:38 +0100 Subject: [PATCH 05/21] TMP 1 worker --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index d5371669d3..5161062fa6 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -15,7 +15,7 @@ def child_exit(server, worker): multiprocess.mark_process_dead(worker.pid) -workers = 2 +workers = 1 worker_class = "gevent" worker_connections = 128 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) From 3059eb4fafcba33ea9aca8470e20f6e83b5e4e20 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 13:16:21 +0100 Subject: [PATCH 06/21] TMP eventlet again --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 5161062fa6..52b3542597 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -16,7 +16,7 @@ def child_exit(server, worker): workers = 1 -worker_class = "gevent" +worker_class = "eventlet" worker_connections = 128 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 0 # disable temporarily for diagnosing issues From 8bec748d4d8e9d81a740c38905b5dcd105e6edb5 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 14:23:59 +0100 Subject: [PATCH 07/21] TMP keepalive 32 --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 52b3542597..8faddb88ed 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -19,5 +19,5 @@ def child_exit(server, worker): worker_class = "eventlet" worker_connections = 128 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) -keepalive = 0 # disable temporarily for diagnosing issues +keepalive = 32 # disable temporarily for diagnosing issues timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with gevent worker_class From 2c12c13c443f60f71dedcfc008c7f80895c421a0 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 14:33:51 +0100 Subject: [PATCH 08/21] TMP add be slow --- app/status/healthcheck.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/app/status/healthcheck.py b/app/status/healthcheck.py index 8fc87ca3fe..ffac8942f9 100644 --- a/app/status/healthcheck.py +++ b/app/status/healthcheck.py @@ -24,6 +24,12 @@ def show_status(): ) +@status.route("/_be_slow", methods=["GET", "POST"]) +def be_slow(): + while True: + be_slow_db() + + @status.route("/_status/live-service-and-organisation-counts") def live_service_and_organisation_counts(): return ( @@ -35,6 +41,11 @@ def live_service_and_organisation_counts(): ) +def be_slow_db(): + query = "SELECT pg_sleep(1.23)" + return db.session.execute(query).fetchone() + + def get_db_version(): query = "SELECT version_num FROM alembic_version" full_name = db.session.execute(query).fetchone()[0] From 7c462c039f3e6b01464cdca9efd6805b04be4e58 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 16:26:09 +0100 Subject: [PATCH 09/21] TMP conns 256 --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 8faddb88ed..998b6e1bfd 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -17,7 +17,7 @@ def child_exit(server, worker): workers = 1 worker_class = "eventlet" -worker_connections = 128 +worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 32 # disable temporarily for diagnosing issues timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with gevent worker_class From 25a7f1df6aaf0d59e23201a2cde1f859ae4e0d26 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 16:26:25 +0100 Subject: [PATCH 10/21] Revert "TMP add be slow" This reverts commit bdfdea8d633264937cac43694136b2adbe371d61. --- app/status/healthcheck.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/app/status/healthcheck.py b/app/status/healthcheck.py index ffac8942f9..8fc87ca3fe 100644 --- a/app/status/healthcheck.py +++ b/app/status/healthcheck.py @@ -24,12 +24,6 @@ def show_status(): ) -@status.route("/_be_slow", methods=["GET", "POST"]) -def be_slow(): - while True: - be_slow_db() - - @status.route("/_status/live-service-and-organisation-counts") def live_service_and_organisation_counts(): return ( @@ -41,11 +35,6 @@ def live_service_and_organisation_counts(): ) -def be_slow_db(): - query = "SELECT pg_sleep(1.23)" - return db.session.execute(query).fetchone() - - def get_db_version(): query = "SELECT version_num FROM alembic_version" full_name = db.session.execute(query).fetchone()[0] From 9297b878f1b9252f578aff25bf1170e9bddb33ca Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Fri, 4 Jul 2025 16:26:51 +0100 Subject: [PATCH 11/21] TMP v2 be slow --- app/v2/notifications/post_notifications.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/app/v2/notifications/post_notifications.py b/app/v2/notifications/post_notifications.py index 2257309caa..878d1816a3 100644 --- a/app/v2/notifications/post_notifications.py +++ b/app/v2/notifications/post_notifications.py @@ -9,6 +9,7 @@ from app import ( api_user, authenticated_service, + db, document_download_client, notify_celery, ) @@ -76,6 +77,12 @@ ) +@v2_notification_blueprint.route("/_be_slow", methods=["GET", "POST"]) +def be_slow(): + while True: + db.session.execute("SELECT pg_sleep(1.23)").fetchone() + + @v2_notification_blueprint.route(f"/{LETTER_TYPE}", methods=["POST"]) def post_precompiled_letter_notification(): check_rate_limiting(authenticated_service, api_user, notification_type=LETTER_TYPE) From 7db1d99eb38a9255d208a316b0b0c295735e91db Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sat, 5 Jul 2025 18:02:42 +0100 Subject: [PATCH 12/21] TMP -> gevent --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 998b6e1bfd..af1114d0b2 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -16,7 +16,7 @@ def child_exit(server, worker): workers = 1 -worker_class = "eventlet" +worker_class = "gevent" worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 32 # disable temporarily for diagnosing issues From 9dd1326fcecc7157846cdbc40cb33154f4dfffbb Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 12:34:13 +0100 Subject: [PATCH 13/21] TMP post-request dumps --- gunicorn_config.py | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index af1114d0b2..c33d510eb9 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -20,4 +20,33 @@ def child_exit(server, worker): worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 32 # disable temporarily for diagnosing issues -timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with gevent worker_class +timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet/gevent worker_class + +debug_post_threshold_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) +debug_post_threshold_concurrency = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_CONCURRENCY", "0") + +if debug_post_threshold_seconds: + debug_post_threshold_seconds_float = float(debug_post_threshold_seconds) + debug_post_threshold_concurrency_int = int(debug_post_threshold_concurrency) + + concurrent_requests = 0 + + def pre_request(worker, req): + nonlocal concurrent_requests + concurrent_requests += 1 + # using os.times() to avoid additional imports before eventlet monkeypatching + req._pre_request_elapsed = os.times().elapsed + + def post_request(worker, req, environ, resp): + nonlocal concurrent_requests + concurrent_requests -= 1 + elapsed = os.times().elapsed - req._pre_request_elapsed + if elapsed > debug_post_threshold_seconds_float and concurrent_requests > debug_post_threshold_concurrency_int: + if worker_class == "gevent": + from datetime import datetime + from gevent.util import print_run_info + from tempdir import gettempdir + + timestamp = datetime.utcnow().isoformat(timespec="microseconds") + with open(f"{gettempdir()}/dump.{timestamp}", "w") as f: + print_run_info(file=f) From d7d38dd62fed9f9d15d556e2d46ec5f76715b524 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 12:40:51 +0100 Subject: [PATCH 14/21] TMP post-request log -> dump --- gunicorn_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index c33d510eb9..cc3967cf57 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -22,8 +22,8 @@ def child_exit(server, worker): keepalive = 32 # disable temporarily for diagnosing issues timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet/gevent worker_class -debug_post_threshold_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) -debug_post_threshold_concurrency = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_CONCURRENCY", "0") +debug_post_threshold_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_SECONDS", None) +debug_post_threshold_concurrency = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_CONCURRENCY", "0") if debug_post_threshold_seconds: debug_post_threshold_seconds_float = float(debug_post_threshold_seconds) From 1aa8f9ff1239918bbf9b2a0271170e069d4e8634 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 12:51:58 +0100 Subject: [PATCH 15/21] TMP nonlocal -> global --- gunicorn_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index cc3967cf57..f6856bbb90 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -32,13 +32,13 @@ def child_exit(server, worker): concurrent_requests = 0 def pre_request(worker, req): - nonlocal concurrent_requests + global concurrent_requests concurrent_requests += 1 # using os.times() to avoid additional imports before eventlet monkeypatching req._pre_request_elapsed = os.times().elapsed def post_request(worker, req, environ, resp): - nonlocal concurrent_requests + global concurrent_requests concurrent_requests -= 1 elapsed = os.times().elapsed - req._pre_request_elapsed if elapsed > debug_post_threshold_seconds_float and concurrent_requests > debug_post_threshold_concurrency_int: From 8e73f39bd2b176dd7ca31f0e4dd9a13b89347368 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 13:57:41 +0100 Subject: [PATCH 16/21] TMP tempfile --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index f6856bbb90..3272fcd260 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -45,7 +45,7 @@ def post_request(worker, req, environ, resp): if worker_class == "gevent": from datetime import datetime from gevent.util import print_run_info - from tempdir import gettempdir + from tempfile import gettempdir timestamp = datetime.utcnow().isoformat(timespec="microseconds") with open(f"{gettempdir()}/dump.{timestamp}", "w") as f: From 08e30e25fccfa18d774888df80b8bd288f40c3f9 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 14:50:47 +0100 Subject: [PATCH 17/21] TMP more predictable dump --- gunicorn_config.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 3272fcd260..9dfe11522a 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -40,13 +40,17 @@ def pre_request(worker, req): def post_request(worker, req, environ, resp): global concurrent_requests concurrent_requests -= 1 + + if worker_class == "gevent": + from datetime import datetime + from gevent.util import print_run_info + elapsed = os.times().elapsed - req._pre_request_elapsed if elapsed > debug_post_threshold_seconds_float and concurrent_requests > debug_post_threshold_concurrency_int: if worker_class == "gevent": - from datetime import datetime - from gevent.util import print_run_info - from tempfile import gettempdir - timestamp = datetime.utcnow().isoformat(timespec="microseconds") - with open(f"{gettempdir()}/dump.{timestamp}", "w") as f: + p = f"/tmp/dump.{timestamp}" + with open(p, "w") as f: print_run_info(file=f) + + worker.log.info("Wrote dump %s", p) From 2126a8e1a4e7c0f801946a502616b664093d2f10 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 15:49:15 +0100 Subject: [PATCH 18/21] TMP post request dump backoff --- gunicorn_config.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 9dfe11522a..75a2b8bf12 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -24,11 +24,14 @@ def child_exit(server, worker): debug_post_threshold_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_SECONDS", None) debug_post_threshold_concurrency = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_CONCURRENCY", "0") +debug_post_backoff_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_BACKOFF_SECONDS", "1") if debug_post_threshold_seconds: debug_post_threshold_seconds_float = float(debug_post_threshold_seconds) debug_post_threshold_concurrency_int = int(debug_post_threshold_concurrency) + debug_post_backoff_seconds_float = float(debug_post_backoff_seconds) + last_dumped = 0 concurrent_requests = 0 def pre_request(worker, req): @@ -45,8 +48,15 @@ def post_request(worker, req, environ, resp): from datetime import datetime from gevent.util import print_run_info - elapsed = os.times().elapsed - req._pre_request_elapsed - if elapsed > debug_post_threshold_seconds_float and concurrent_requests > debug_post_threshold_concurrency_int: + now_elapsed = os.times().elapsed + elapsed = now_elapsed - req._pre_request_elapsed + if ( + elapsed > debug_post_threshold_seconds_float + and concurrent_requests > debug_post_threshold_concurrency_int + and last_dumped + debug_post_backoff_seconds_float < now_elapsed + ): + global last_dumped + last_dumped = now_elapsed if worker_class == "gevent": timestamp = datetime.utcnow().isoformat(timespec="microseconds") p = f"/tmp/dump.{timestamp}" From a28ce8ae6da8d186c0f5889af45466f80e39b98c Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sun, 6 Jul 2025 15:53:11 +0100 Subject: [PATCH 19/21] TMP fix global last_dumped --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 75a2b8bf12..96dc225745 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -42,6 +42,7 @@ def pre_request(worker, req): def post_request(worker, req, environ, resp): global concurrent_requests + global last_dumped concurrent_requests -= 1 if worker_class == "gevent": @@ -55,7 +56,6 @@ def post_request(worker, req, environ, resp): and concurrent_requests > debug_post_threshold_concurrency_int and last_dumped + debug_post_backoff_seconds_float < now_elapsed ): - global last_dumped last_dumped = now_elapsed if worker_class == "gevent": timestamp = datetime.utcnow().isoformat(timespec="microseconds") From f82ba1a2bed896d113204d058272dff24630271f Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sat, 12 Jul 2025 13:38:56 +0100 Subject: [PATCH 20/21] TMP eventlet, 2 workers --- gunicorn_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 96dc225745..35cd87db02 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -15,8 +15,8 @@ def child_exit(server, worker): multiprocess.mark_process_dead(worker.pid) -workers = 1 -worker_class = "gevent" +workers = 2 +worker_class = "eventlet" worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 32 # disable temporarily for diagnosing issues From cc53bcf2bc6db1796090138318f70d144bdf43e9 Mon Sep 17 00:00:00 2001 From: Robert Scott Date: Sat, 12 Jul 2025 18:03:37 +0100 Subject: [PATCH 21/21] TMP -> gevent --- gunicorn_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gunicorn_config.py b/gunicorn_config.py index 35cd87db02..796a1e8265 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -16,7 +16,7 @@ def child_exit(server, worker): workers = 2 -worker_class = "eventlet" +worker_class = "gevent" worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) keepalive = 32 # disable temporarily for diagnosing issues