diff --git a/app/__init__.py b/app/__init__.py index 9175c2b737..5e3156d0aa 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -27,7 +27,7 @@ from notifications_utils.clients.signing.signing_client import Signing from notifications_utils.clients.statsd.statsd_client import StatsdClient from notifications_utils.clients.zendesk.zendesk_client import ZendeskClient -from notifications_utils.eventlet import EventletTimeout +from notifications_utils.greenlet import RequestHandlingTimeout from notifications_utils.local_vars import LazyLocalGetter from notifications_utils.logging import flask as utils_logging from sqlalchemy import event @@ -425,8 +425,8 @@ def exception(error): code = getattr(error, "code", 500) return jsonify(result="error", message=msg), code - @app.errorhandler(EventletTimeout) - def eventlet_timeout(error): + @app.errorhandler(RequestHandlingTimeout) + def request_handling_timeout(error): app.logger.exception(error) return jsonify(result="error", message="Timeout serving request"), 504 diff --git a/app/config.py b/app/config.py index 6d40a413f4..ea2a6480ec 100644 --- a/app/config.py +++ b/app/config.py @@ -155,7 +155,7 @@ class Config: # Antivirus ANTIVIRUS_ENABLED = True - NOTIFY_EVENTLET_STATS = os.getenv("NOTIFY_EVENTLET_STATS", "0") == "1" + NOTIFY_GREENLET_STATS = os.getenv("NOTIFY_GREENLET_STATS", "0") == "1" ########################### # Default config values ### diff --git a/app/errors.py b/app/errors.py index ffeb4aff39..d99fc6e9aa 100644 --- a/app/errors.py +++ b/app/errors.py @@ -1,7 +1,7 @@ from flask import current_app, json, jsonify from jsonschema import ValidationError as JsonSchemaValidationError from marshmallow import ValidationError -from notifications_utils.eventlet import EventletTimeout +from notifications_utils.greenlet import RequestHandlingTimeout from notifications_utils.recipient_validation.errors import InvalidRecipientError from sqlalchemy.exc import DataError from sqlalchemy.orm.exc import NoResultFound @@ -98,8 +98,8 @@ def no_result_found(e): current_app.logger.info(e, exc_info=True) return jsonify(result="error", message="No result found"), 404 - @blueprint.errorhandler(EventletTimeout) - def eventlet_timeout(error): + @blueprint.errorhandler(RequestHandlingTimeout) + def request_handling_timeout(error): current_app.logger.exception(error) return jsonify(result="error", message="Timeout serving request"), 504 diff --git a/app/v2/errors.py b/app/v2/errors.py index 091948dbaa..291c1315c3 100644 --- a/app/v2/errors.py +++ b/app/v2/errors.py @@ -2,7 +2,7 @@ from flask import current_app, jsonify, request from jsonschema import ValidationError as JsonSchemaValidationError -from notifications_utils.eventlet import EventletTimeout +from notifications_utils.greenlet import RequestHandlingTimeout from notifications_utils.recipient_validation.errors import InvalidPhoneError, InvalidRecipientError from sqlalchemy.exc import DataError from sqlalchemy.orm.exc import NoResultFound @@ -139,8 +139,8 @@ def auth_error(error): current_app.logger.info("API AuthError, client: %s error: %s", request.headers.get("User-Agent"), error) return jsonify(error.to_dict_v2()), error.code - @blueprint.errorhandler(EventletTimeout) - def eventlet_timeout(error): + @blueprint.errorhandler(RequestHandlingTimeout) + def request_handling_timeout(error): current_app.logger.exception(error) return ( jsonify( diff --git a/app/v2/notifications/post_notifications.py b/app/v2/notifications/post_notifications.py index 2257309caa..878d1816a3 100644 --- a/app/v2/notifications/post_notifications.py +++ b/app/v2/notifications/post_notifications.py @@ -9,6 +9,7 @@ from app import ( api_user, authenticated_service, + db, document_download_client, notify_celery, ) @@ -76,6 +77,12 @@ ) +@v2_notification_blueprint.route("/_be_slow", methods=["GET", "POST"]) +def be_slow(): + while True: + db.session.execute("SELECT pg_sleep(1.23)").fetchone() + + @v2_notification_blueprint.route(f"/{LETTER_TYPE}", methods=["POST"]) def post_precompiled_letter_notification(): check_rate_limiting(authenticated_service, api_user, notification_type=LETTER_TYPE) diff --git a/application.py b/application.py index 0eac7c774e..f079fef67a 100644 --- a/application.py +++ b/application.py @@ -8,20 +8,20 @@ from app import create_app # noqa from app.notify_api_flask_app import NotifyApiFlaskApp # noqa -import notifications_utils.eventlet as utils_eventlet # noqa +import notifications_utils.greenlet as utils_greenlet # noqa application = NotifyApiFlaskApp("app") create_app(application) -if utils_eventlet.using_eventlet: - application.wsgi_app = utils_eventlet.EventletTimeoutMiddleware( +if utils_greenlet.using_eventlet or utils_greenlet.using_gevent: + application.wsgi_app = utils_greenlet.RequestHandlingTimeoutMiddleware( application.wsgi_app, timeout_seconds=int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)), ) - if application.config["NOTIFY_EVENTLET_STATS"]: + if application.config["NOTIFY_GREENLET_STATS"]: import greenlet - greenlet.settrace(utils_eventlet.account_greenlet_times) + greenlet.settrace(utils_greenlet.account_greenlet_times) application._server_greenlet = greenlet.getcurrent() diff --git a/gunicorn_config.py b/gunicorn_config.py index 7c2c46da3c..796a1e8265 100644 --- a/gunicorn_config.py +++ b/gunicorn_config.py @@ -6,7 +6,7 @@ # importing child_exit from gds_metrics.gunicorn has the side effect of eagerly importing -# prometheus_client, flask, werkzeug and more, which is a bad idea to do before eventlet +# prometheus_client, flask, werkzeug and more, which is a bad idea to do before eventlet/gevent # has done its monkeypatching. use a nested import for the rare cases child_exit is actually # called instead. def child_exit(server, worker): @@ -15,100 +15,52 @@ def child_exit(server, worker): multiprocess.mark_process_dead(worker.pid) -workers = 4 -worker_class = "eventlet" -worker_connections = 8 # limit runaway greenthread creation +workers = 2 +worker_class = "gevent" +worker_connections = 256 statsd_host = "{}:8125".format(os.getenv("STATSD_HOST")) -keepalive = 0 # disable temporarily for diagnosing issues -timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class +keepalive = 32 # disable temporarily for diagnosing issues +timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet/gevent worker_class -debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None) -if debug_post_threshold: - debug_post_threshold_float = float(debug_post_threshold) - profiler = None +debug_post_threshold_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_SECONDS", None) +debug_post_threshold_concurrency = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_CONCURRENCY", "0") +debug_post_backoff_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_BACKOFF_SECONDS", "1") + +if debug_post_threshold_seconds: + debug_post_threshold_seconds_float = float(debug_post_threshold_seconds) + debug_post_threshold_concurrency_int = int(debug_post_threshold_concurrency) + debug_post_backoff_seconds_float = float(debug_post_backoff_seconds) + + last_dumped = 0 + concurrent_requests = 0 def pre_request(worker, req): + global concurrent_requests + concurrent_requests += 1 # using os.times() to avoid additional imports before eventlet monkeypatching req._pre_request_elapsed = os.times().elapsed - def _process_item(process, attr, process_times_before): - value = process.info[attr] - if attr == "cpu_times": - if process.pid in process_times_before: - value = [ - after - before - for before, after in zip(process_times_before[process.pid], process.info["cpu_times"], strict=False) - ] - else: - # we have no way of giving a sensible value - value = None - - if isinstance(value, tuple): - # convert to list for more compact (and json-able) representation - return list(value) - - return value - def post_request(worker, req, environ, resp): - elapsed = os.times().elapsed - req._pre_request_elapsed - if elapsed > debug_post_threshold_float: - import json - import time - from io import StringIO - from os import getpid - from pstats import Stats - - import psutil - from eventlet.green import profile - - # include cpu_percent to give later call a "start time" to work with - process_times_before = { - p.pid: p.info["cpu_times"] for p in psutil.process_iter(["cpu_percent", "cpu_times"]) - } - - global profiler - if profiler is None: - profiler = profile.Profile() - - should_profile = not getattr(profiler, "running", False) - if should_profile: - profiler.start() - - perf_counter_before = time.perf_counter() - - time.sleep(0.1) # period over which to profile and calculate cpu_percent - - perf_counter_after = time.perf_counter() - - if should_profile: - profiler.stop() - prof_out_sio = StringIO() - s = Stats(profiler, stream=prof_out_sio) - s.sort_stats("cumulative") - s.print_stats(0.1) - prof_out_str = prof_out_sio.getvalue() - else: - prof_out_str = "profiler already running - no profile collected" - - attrs = ["pid", "name", "cpu_percent", "cpu_times", "status", "memory_info"] - - context = { - "actual_profile_period": perf_counter_after - perf_counter_before, - "request_time": elapsed, - "processes": json.dumps( - [ - attrs, - [ - [_process_item(p, a, process_times_before) for a in attrs] - for p in psutil.process_iter(attrs) - ], - ] - ), - "profile": prof_out_str, - "process_": getpid(), - } - worker.log.info( - "post-request diagnostics for request of %(request_time)ss", - context, - extra=context, - ) + global concurrent_requests + global last_dumped + concurrent_requests -= 1 + + if worker_class == "gevent": + from datetime import datetime + from gevent.util import print_run_info + + now_elapsed = os.times().elapsed + elapsed = now_elapsed - req._pre_request_elapsed + if ( + elapsed > debug_post_threshold_seconds_float + and concurrent_requests > debug_post_threshold_concurrency_int + and last_dumped + debug_post_backoff_seconds_float < now_elapsed + ): + last_dumped = now_elapsed + if worker_class == "gevent": + timestamp = datetime.utcnow().isoformat(timespec="microseconds") + p = f"/tmp/dump.{timestamp}" + with open(p, "w") as f: + print_run_info(file=f) + + worker.log.info("Wrote dump %s", p) diff --git a/requirements.in b/requirements.in index 933c4760b4..68bade7595 100644 --- a/requirements.in +++ b/requirements.in @@ -25,7 +25,7 @@ psutil>=6.0.0,<7.0.0 notifications-python-client==10.0.1 # Run `make bump-utils` to update to the latest version -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@99.8.0 +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@ris-gevent # gds-metrics requires prometheseus 0.2.0, override that requirement as 0.7.1 brings significant performance gains prometheus-client==0.14.1 diff --git a/requirements.txt b/requirements.txt index a53d89ec9b..5c4383ec70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -94,10 +94,15 @@ fqdn==1.5.1 # via jsonschema gds-metrics @ git+https://github.com/alphagov/gds_metrics_python.git@6f1840a57b6fb1ee40b7e84f2f18ec229de8aa72 # via -r requirements.in +gevent==25.5.1 + # via gunicorn govuk-bank-holidays==0.15 # via notifications-utils greenlet==3.2.2 - # via eventlet + # via + # eventlet + # gevent + # sqlalchemy gunicorn==23.0.0 # via # -r requirements.in @@ -150,7 +155,7 @@ mistune==0.8.4 # via notifications-utils notifications-python-client==10.0.1 # via -r requirements.in -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@a97b36f6a32e7bb917152c8cd716fe65fa15ac9f +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@2cf33bd9287d4c0114679ba7d62c19a8f01f1978 # via -r requirements.in ordered-set==4.1.0 # via notifications-utils @@ -181,11 +186,12 @@ pyjwt==2.10.1 # notifications-python-client pypdf==3.17.0 # via notifications-utils -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # arrow # botocore # celery + # notifications-utils python-json-logger==3.3.0 # via notifications-utils pytz==2024.2 @@ -218,6 +224,10 @@ segno==1.6.1 # via notifications-utils sentry-sdk==1.45.1 # via -r requirements.in +setuptools==80.9.0 + # via + # zope-event + # zope-interface six==1.16.0 # via # click-repl @@ -256,3 +266,7 @@ webcolors==1.12 # via jsonschema werkzeug==3.1.3 # via flask +zope-event==5.1 + # via gevent +zope-interface==7.2 + # via gevent diff --git a/requirements_for_test.txt b/requirements_for_test.txt index 1f021e5010..9f1b10e1ce 100644 --- a/requirements_for_test.txt +++ b/requirements_for_test.txt @@ -143,6 +143,10 @@ freezegun==1.5.1 # via -r requirements_for_test_common.in gds-metrics @ git+https://github.com/alphagov/gds_metrics_python.git@6f1840a57b6fb1ee40b7e84f2f18ec229de8aa72 # via -r requirements.txt +gevent==25.5.1 + # via + # -r requirements.txt + # gunicorn govuk-bank-holidays==0.15 # via # -r requirements.txt @@ -151,6 +155,8 @@ greenlet==3.2.2 # via # -r requirements.txt # eventlet + # gevent + # sqlalchemy gunicorn==23.0.0 # via # -r requirements.txt @@ -220,7 +226,7 @@ moto==5.0.11 # via -r requirements_for_test.in notifications-python-client==10.0.1 # via -r requirements.txt -notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@a97b36f6a32e7bb917152c8cd716fe65fa15ac9f +notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@2cf33bd9287d4c0114679ba7d62c19a8f01f1978 # via -r requirements.txt ordered-set==4.1.0 # via @@ -280,7 +286,7 @@ pytest-testmon==2.1.1 # via -r requirements_for_test_common.in pytest-xdist==3.6.1 # via -r requirements_for_test_common.in -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # -r requirements.txt # arrow @@ -288,6 +294,7 @@ python-dateutil==2.8.2 # celery # freezegun # moto + # notifications-utils python-json-logger==3.3.0 # via # -r requirements.txt @@ -344,6 +351,11 @@ segno==1.6.1 # notifications-utils sentry-sdk==1.45.1 # via -r requirements.txt +setuptools==80.9.0 + # via + # -r requirements.txt + # zope-event + # zope-interface six==1.16.0 # via # -r requirements.txt @@ -403,3 +415,11 @@ werkzeug==3.1.3 # pytest-httpserver xmltodict==0.14.2 # via moto +zope-event==5.1 + # via + # -r requirements.txt + # gevent +zope-interface==7.2 + # via + # -r requirements.txt + # gevent diff --git a/tests/test_gunicorn_config.py b/tests/test_gunicorn_config.py index 748402fb5b..9bc5c61afb 100644 --- a/tests/test_gunicorn_config.py +++ b/tests/test_gunicorn_config.py @@ -3,6 +3,6 @@ def test_gunicorn_config(): assert workers == 4 - assert worker_class == "eventlet" + assert worker_class == "gevent" assert keepalive == 0 assert timeout == 30