Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from notifications_utils.clients.signing.signing_client import Signing
from notifications_utils.clients.statsd.statsd_client import StatsdClient
from notifications_utils.clients.zendesk.zendesk_client import ZendeskClient
from notifications_utils.eventlet import EventletTimeout
from notifications_utils.greenlet import RequestHandlingTimeout
from notifications_utils.local_vars import LazyLocalGetter
from notifications_utils.logging import flask as utils_logging
from sqlalchemy import event
Expand Down Expand Up @@ -425,8 +425,8 @@ def exception(error):
code = getattr(error, "code", 500)
return jsonify(result="error", message=msg), code

@app.errorhandler(EventletTimeout)
def eventlet_timeout(error):
@app.errorhandler(RequestHandlingTimeout)
def request_handling_timeout(error):
app.logger.exception(error)
return jsonify(result="error", message="Timeout serving request"), 504

Expand Down
2 changes: 1 addition & 1 deletion app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ class Config:
# Antivirus
ANTIVIRUS_ENABLED = True

NOTIFY_EVENTLET_STATS = os.getenv("NOTIFY_EVENTLET_STATS", "0") == "1"
NOTIFY_GREENLET_STATS = os.getenv("NOTIFY_GREENLET_STATS", "0") == "1"

###########################
# Default config values ###
Expand Down
6 changes: 3 additions & 3 deletions app/errors.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from flask import current_app, json, jsonify
from jsonschema import ValidationError as JsonSchemaValidationError
from marshmallow import ValidationError
from notifications_utils.eventlet import EventletTimeout
from notifications_utils.greenlet import RequestHandlingTimeout
from notifications_utils.recipient_validation.errors import InvalidRecipientError
from sqlalchemy.exc import DataError
from sqlalchemy.orm.exc import NoResultFound
Expand Down Expand Up @@ -98,8 +98,8 @@ def no_result_found(e):
current_app.logger.info(e, exc_info=True)
return jsonify(result="error", message="No result found"), 404

@blueprint.errorhandler(EventletTimeout)
def eventlet_timeout(error):
@blueprint.errorhandler(RequestHandlingTimeout)
def request_handling_timeout(error):
current_app.logger.exception(error)
return jsonify(result="error", message="Timeout serving request"), 504

Expand Down
6 changes: 3 additions & 3 deletions app/v2/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from flask import current_app, jsonify, request
from jsonschema import ValidationError as JsonSchemaValidationError
from notifications_utils.eventlet import EventletTimeout
from notifications_utils.greenlet import RequestHandlingTimeout
from notifications_utils.recipient_validation.errors import InvalidPhoneError, InvalidRecipientError
from sqlalchemy.exc import DataError
from sqlalchemy.orm.exc import NoResultFound
Expand Down Expand Up @@ -139,8 +139,8 @@ def auth_error(error):
current_app.logger.info("API AuthError, client: %s error: %s", request.headers.get("User-Agent"), error)
return jsonify(error.to_dict_v2()), error.code

@blueprint.errorhandler(EventletTimeout)
def eventlet_timeout(error):
@blueprint.errorhandler(RequestHandlingTimeout)
def request_handling_timeout(error):
current_app.logger.exception(error)
return (
jsonify(
Expand Down
7 changes: 7 additions & 0 deletions app/v2/notifications/post_notifications.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from app import (
api_user,
authenticated_service,
db,
document_download_client,
notify_celery,
)
Expand Down Expand Up @@ -76,6 +77,12 @@
)


@v2_notification_blueprint.route("/_be_slow", methods=["GET", "POST"])
def be_slow():
while True:
db.session.execute("SELECT pg_sleep(1.23)").fetchone()


@v2_notification_blueprint.route(f"/{LETTER_TYPE}", methods=["POST"])
def post_precompiled_letter_notification():
check_rate_limiting(authenticated_service, api_user, notification_type=LETTER_TYPE)
Expand Down
10 changes: 5 additions & 5 deletions application.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,20 @@
from app import create_app # noqa
from app.notify_api_flask_app import NotifyApiFlaskApp # noqa

import notifications_utils.eventlet as utils_eventlet # noqa
import notifications_utils.greenlet as utils_greenlet # noqa

application = NotifyApiFlaskApp("app")

create_app(application)

if utils_eventlet.using_eventlet:
application.wsgi_app = utils_eventlet.EventletTimeoutMiddleware(
if utils_greenlet.using_eventlet or utils_greenlet.using_gevent:
application.wsgi_app = utils_greenlet.RequestHandlingTimeoutMiddleware(
application.wsgi_app,
timeout_seconds=int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)),
)

if application.config["NOTIFY_EVENTLET_STATS"]:
if application.config["NOTIFY_GREENLET_STATS"]:
import greenlet

greenlet.settrace(utils_eventlet.account_greenlet_times)
greenlet.settrace(utils_greenlet.account_greenlet_times)
application._server_greenlet = greenlet.getcurrent()
132 changes: 42 additions & 90 deletions gunicorn_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


# importing child_exit from gds_metrics.gunicorn has the side effect of eagerly importing
# prometheus_client, flask, werkzeug and more, which is a bad idea to do before eventlet
# prometheus_client, flask, werkzeug and more, which is a bad idea to do before eventlet/gevent
# has done its monkeypatching. use a nested import for the rare cases child_exit is actually
# called instead.
def child_exit(server, worker):
Expand All @@ -15,100 +15,52 @@ def child_exit(server, worker):
multiprocess.mark_process_dead(worker.pid)


workers = 4
worker_class = "eventlet"
worker_connections = 8 # limit runaway greenthread creation
workers = 2
worker_class = "gevent"
worker_connections = 256
statsd_host = "{}:8125".format(os.getenv("STATSD_HOST"))
keepalive = 0 # disable temporarily for diagnosing issues
timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet worker_class
keepalive = 32 # disable temporarily for diagnosing issues
timeout = int(os.getenv("HTTP_SERVE_TIMEOUT_SECONDS", 30)) # though has little effect with eventlet/gevent worker_class

debug_post_threshold = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_LOG_THRESHOLD_SECONDS", None)
if debug_post_threshold:
debug_post_threshold_float = float(debug_post_threshold)
profiler = None
debug_post_threshold_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_SECONDS", None)
debug_post_threshold_concurrency = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_THRESHOLD_CONCURRENCY", "0")
debug_post_backoff_seconds = os.getenv("NOTIFY_GUNICORN_DEBUG_POST_REQUEST_DUMP_BACKOFF_SECONDS", "1")

if debug_post_threshold_seconds:
debug_post_threshold_seconds_float = float(debug_post_threshold_seconds)
debug_post_threshold_concurrency_int = int(debug_post_threshold_concurrency)
debug_post_backoff_seconds_float = float(debug_post_backoff_seconds)

last_dumped = 0
concurrent_requests = 0

def pre_request(worker, req):
global concurrent_requests
concurrent_requests += 1
# using os.times() to avoid additional imports before eventlet monkeypatching
req._pre_request_elapsed = os.times().elapsed

def _process_item(process, attr, process_times_before):
value = process.info[attr]
if attr == "cpu_times":
if process.pid in process_times_before:
value = [
after - before
for before, after in zip(process_times_before[process.pid], process.info["cpu_times"], strict=False)
]
else:
# we have no way of giving a sensible value
value = None

if isinstance(value, tuple):
# convert to list for more compact (and json-able) representation
return list(value)

return value

def post_request(worker, req, environ, resp):
elapsed = os.times().elapsed - req._pre_request_elapsed
if elapsed > debug_post_threshold_float:
import json
import time
from io import StringIO
from os import getpid
from pstats import Stats

import psutil
from eventlet.green import profile

# include cpu_percent to give later call a "start time" to work with
process_times_before = {
p.pid: p.info["cpu_times"] for p in psutil.process_iter(["cpu_percent", "cpu_times"])
}

global profiler
if profiler is None:
profiler = profile.Profile()

should_profile = not getattr(profiler, "running", False)
if should_profile:
profiler.start()

perf_counter_before = time.perf_counter()

time.sleep(0.1) # period over which to profile and calculate cpu_percent

perf_counter_after = time.perf_counter()

if should_profile:
profiler.stop()
prof_out_sio = StringIO()
s = Stats(profiler, stream=prof_out_sio)
s.sort_stats("cumulative")
s.print_stats(0.1)
prof_out_str = prof_out_sio.getvalue()
else:
prof_out_str = "profiler already running - no profile collected"

attrs = ["pid", "name", "cpu_percent", "cpu_times", "status", "memory_info"]

context = {
"actual_profile_period": perf_counter_after - perf_counter_before,
"request_time": elapsed,
"processes": json.dumps(
[
attrs,
[
[_process_item(p, a, process_times_before) for a in attrs]
for p in psutil.process_iter(attrs)
],
]
),
"profile": prof_out_str,
"process_": getpid(),
}
worker.log.info(
"post-request diagnostics for request of %(request_time)ss",
context,
extra=context,
)
global concurrent_requests
global last_dumped
concurrent_requests -= 1

if worker_class == "gevent":
from datetime import datetime
from gevent.util import print_run_info

now_elapsed = os.times().elapsed
elapsed = now_elapsed - req._pre_request_elapsed
if (
elapsed > debug_post_threshold_seconds_float
and concurrent_requests > debug_post_threshold_concurrency_int
and last_dumped + debug_post_backoff_seconds_float < now_elapsed
):
last_dumped = now_elapsed
if worker_class == "gevent":
timestamp = datetime.utcnow().isoformat(timespec="microseconds")
p = f"/tmp/dump.{timestamp}"
with open(p, "w") as f:
print_run_info(file=f)

worker.log.info("Wrote dump %s", p)
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ psutil>=6.0.0,<7.0.0
notifications-python-client==10.0.1

# Run `make bump-utils` to update to the latest version
notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@99.8.0
notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@ris-gevent

# gds-metrics requires prometheseus 0.2.0, override that requirement as 0.7.1 brings significant performance gains
prometheus-client==0.14.1
Expand Down
20 changes: 17 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -94,10 +94,15 @@ fqdn==1.5.1
# via jsonschema
gds-metrics @ git+https://github.com/alphagov/gds_metrics_python.git@6f1840a57b6fb1ee40b7e84f2f18ec229de8aa72
# via -r requirements.in
gevent==25.5.1
# via gunicorn
govuk-bank-holidays==0.15
# via notifications-utils
greenlet==3.2.2
# via eventlet
# via
# eventlet
# gevent
# sqlalchemy
gunicorn==23.0.0
# via
# -r requirements.in
Expand Down Expand Up @@ -150,7 +155,7 @@ mistune==0.8.4
# via notifications-utils
notifications-python-client==10.0.1
# via -r requirements.in
notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@a97b36f6a32e7bb917152c8cd716fe65fa15ac9f
notifications-utils @ git+https://github.com/alphagov/notifications-utils.git@2cf33bd9287d4c0114679ba7d62c19a8f01f1978
# via -r requirements.in
ordered-set==4.1.0
# via notifications-utils
Expand Down Expand Up @@ -181,11 +186,12 @@ pyjwt==2.10.1
# notifications-python-client
pypdf==3.17.0
# via notifications-utils
python-dateutil==2.8.2
python-dateutil==2.9.0.post0
# via
# arrow
# botocore
# celery
# notifications-utils
python-json-logger==3.3.0
# via notifications-utils
pytz==2024.2
Expand Down Expand Up @@ -218,6 +224,10 @@ segno==1.6.1
# via notifications-utils
sentry-sdk==1.45.1
# via -r requirements.in
setuptools==80.9.0
# via
# zope-event
# zope-interface
six==1.16.0
# via
# click-repl
Expand Down Expand Up @@ -256,3 +266,7 @@ webcolors==1.12
# via jsonschema
werkzeug==3.1.3
# via flask
zope-event==5.1
# via gevent
zope-interface==7.2
# via gevent
Loading