diff --git a/cloud_pipelines_backend/instrumentation/metrics.py b/cloud_pipelines_backend/instrumentation/metrics.py
new file mode 100644
index 0000000..5ddb326
--- /dev/null
+++ b/cloud_pipelines_backend/instrumentation/metrics.py
@@ -0,0 +1,36 @@
+"""
+Application-level OpenTelemetry meters and instruments.
+
+Meters should be named after the software component they represent.
+Keep names stable: avoid __name__, which changes when the module is moved.
+
+Instruments should be named after the metric they represent.
+If the metric is a common one, follow the OTel semantic conventions
+(https://opentelemetry.io/docs/specs/semconv/general/metrics/),
+e.g. http.server.request.duration (formerly http.server.duration).
+
+For custom, application-specific measurements, name the instrument
+after what is being measured, not after the software component that
+measures it.
+
+Good example:
+- Meter: tangle.orchestrator
+- Instrument: execution.system_errors
+
+Bad example:
+- Meter: tangle.orchestrator
+- Instrument: orchestrator_execution_system_errors
+"""
+
+from opentelemetry import metrics as otel_metrics
+
+# ---------------------------------------------------------------------------
+# tangle.orchestrator
+# ---------------------------------------------------------------------------
+orchestrator_meter = otel_metrics.get_meter("tangle.orchestrator")
+# Incremented by 1 from orchestrator_sql.record_system_error_exception().
+execution_system_errors = orchestrator_meter.create_counter(
+    name="execution.system_errors",
+    description="Number of execution nodes that ended in SYSTEM_ERROR status",
+    unit="{error}",
+)
diff --git a/cloud_pipelines_backend/orchestrator_sql.py b/cloud_pipelines_backend/orchestrator_sql.py index 0241c38..eb1f1ab 100644 --- a/cloud_pipelines_backend/orchestrator_sql.py +++ b/cloud_pipelines_backend/orchestrator_sql.py @@ -21,6 +21,7 @@ from .launchers import common_annotations from .launchers import interfaces as launcher_interfaces from .instrumentation import contextual_logging +from .instrumentation import metrics as app_metrics _logger = 
logging.getLogger(__name__) @@ -1037,6 +1038,8 @@ def _retry( def record_system_error_exception(execution: bts.ExecutionNode, exception: Exception): + app_metrics.execution_system_errors.add(1) + if execution.extra_data is None: execution.extra_data = {} execution.extra_data[