From 82aedead88ea93ffd868011741e6abf5586b6615 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC?= Date: Sat, 14 Feb 2026 13:08:54 +0300 Subject: [PATCH 1/4] added otel logs and moved to grafana stack --- apps/hellgate/src/hellgate.app.src | 4 +- apps/hellgate/src/hellgate.erl | 50 +++ apps/hellgate/src/hg_otel_log_filter.erl | 67 +++ .../test/hg_log_delivery_tests_SUITE.erl | 323 ++++++++++++++ compose.tracing.yaml | 99 ++++- .../provisioning/dashboards/dashboards.yaml | 12 + .../dashboards/hellgate-otel-only.json | 404 ++++++++++++++++++ .../provisioning/datasources/datasources.yaml | 32 ++ config/loki.yaml | 35 ++ config/otel-collector-config.yaml | 51 +++ config/sys.config | 4 + config/tempo.yaml | 37 ++ rebar.config | 14 +- rebar.lock | 28 +- 14 files changed, 1126 insertions(+), 34 deletions(-) create mode 100644 apps/hellgate/src/hg_otel_log_filter.erl create mode 100644 apps/hellgate/test/hg_log_delivery_tests_SUITE.erl create mode 100644 config/grafana/provisioning/dashboards/dashboards.yaml create mode 100644 config/grafana/provisioning/dashboards/hellgate-otel-only.json create mode 100644 config/grafana/provisioning/datasources/datasources.yaml create mode 100644 config/loki.yaml create mode 100644 config/otel-collector-config.yaml create mode 100644 config/tempo.yaml diff --git a/apps/hellgate/src/hellgate.app.src b/apps/hellgate/src/hellgate.app.src index f5539bc5..60e9f36c 100644 --- a/apps/hellgate/src/hellgate.app.src +++ b/apps/hellgate/src/hellgate.app.src @@ -29,7 +29,9 @@ limiter_proto, opentelemetry_api, opentelemetry_exporter, - opentelemetry + opentelemetry, + opentelemetry_experimental, + opentelemetry_api_experimental ]}, {env, []}, {modules, []}, diff --git a/apps/hellgate/src/hellgate.erl b/apps/hellgate/src/hellgate.erl index 751f180b..438e9df1 100644 --- a/apps/hellgate/src/hellgate.erl +++ b/apps/hellgate/src/hellgate.erl @@ -105,11 +105,13 @@ get_prometheus_route() -> -spec start(normal, any()) -> {ok, pid()} | {error, any()}. start(_StartType, _StartArgs) -> + ok = ensure_otel_log_handler(), ok = setup_metrics(), supervisor:start_link(?MODULE, []). -spec stop(any()) -> ok. stop(_State) -> + ok = flush_otel_logs(), ok. %% @@ -117,3 +119,51 @@ stop(_State) -> setup_metrics() -> ok = woody_ranch_prometheus_collector:setup(), ok = woody_hackney_prometheus_collector:setup(). + +ensure_otel_log_handler() -> + case logger:get_handler_config(otel_logs) of + {ok, _} -> + ok; + _ -> + MaxQueue = application:get_env(hellgate, otel_log_max_queue_size, 2048), + DelayMs = application:get_env(hellgate, otel_log_scheduled_delay_ms, 1000), + TimeoutMs = application:get_env(hellgate, otel_log_exporting_timeout_ms, 300000), + HandlerConfig = #{ + level => info, + report_cb => fun hg_otel_log_filter:format_otp_report_utf8/1, + exporter => + {otel_exporter_logs_otlp, #{ + protocol => http_protobuf, + ssl_options => [] + }}, + max_queue_size => MaxQueue, + scheduled_delay_ms => DelayMs, + exporting_timeout_ms => TimeoutMs, + filters => [{hg_otel_trace_id_bytes, {fun hg_otel_log_filter:filter/2, undefined}}] + }, + case logger:add_handler(otel_logs, otel_log_handler, HandlerConfig) of + ok -> + ok; + {error, {already_exist, _}} -> + ok; + {error, Reason} -> + error_logger:error_msg("Failed to add otel_logs handler: ~p", [Reason]), + ok + end + end. + +flush_otel_logs() -> + case logger:get_handler_config(otel_logs) of + {ok, HandlerCfg} -> + Config = maps:get(config, HandlerCfg, #{}), + DelayMs = maps:get( + scheduled_delay_ms, + Config, + maps:get(scheduled_delay_ms, HandlerCfg, 1000) + ), + _ = logger:info("otel_log_handler_flush"), + timer:sleep(erlang:min(5000, DelayMs + 700)), + ok; + _ -> + ok + end. diff --git a/apps/hellgate/src/hg_otel_log_filter.erl b/apps/hellgate/src/hg_otel_log_filter.erl new file mode 100644 index 00000000..188af4a0 --- /dev/null +++ b/apps/hellgate/src/hg_otel_log_filter.erl @@ -0,0 +1,67 @@ +%%% @doc +%%% Logger filter для otel_logs handler: конвертирует otel_trace_id и otel_span_id +%%% из hex-формата (32/16 символов) в raw bytes (16/8 байт), как требует OTLP LogRecord. +%%% opentelemetry hex_span_ctx возвращает hex, collector ожидает bytes. +%%% @end +-module(hg_otel_log_filter). + +-export([filter/2]). +-export([format_otp_report_utf8/1]). + +-spec filter(logger:log_event(), term()) -> logger:filter_return(). +filter(#{meta := Meta} = LogEvent, _FilterConfig) -> + case convert_otel_ids(Meta) of + Meta -> + LogEvent; + Meta1 -> + LogEvent#{meta => Meta1} + end. + +%% Конвертируем hex -> raw bytes только если формат hex (32/16 символов). +%% OTLP LogRecord: trace_id=16 bytes, span_id=8 bytes. +convert_otel_ids(#{otel_trace_id := TraceIdHex, otel_span_id := SpanIdHex} = Meta) -> + case {hex_to_trace_id_bytes(TraceIdHex), hex_to_span_id_bytes(SpanIdHex)} of + {TraceIdBytes, SpanIdBytes} when TraceIdBytes =/= undefined, SpanIdBytes =/= undefined -> + Meta#{otel_trace_id => TraceIdBytes, otel_span_id => SpanIdBytes}; + _ -> + %% Некорректный формат — убираем, чтобы otel_otlp_logs не отправил в OTLP + maps:without([otel_trace_id, otel_span_id, otel_trace_flags], Meta) + end; +convert_otel_ids(Meta) -> + Meta. + +%% logger:format_otp_report/1 возвращает chardata (часто list()), +%% из-за чего downstream JSON может сериализовать body как массив байт. +%% Явно приводим к UTF-8 binary(), чтобы body в OTel/Loki был строкой. +-spec format_otp_report_utf8(logger:report()) -> {unicode:chardata(), list()}. +format_otp_report_utf8(Report) -> + Bin = + try logger:format_otp_report(Report) of + {Format, Args} -> + unicode:characters_to_binary(io_lib:format(Format, Args)); + Formatted -> + unicode:characters_to_binary(Formatted) + catch + _:_ -> + %% Не даём report_cb падать: fallback в печатное представление отчёта. + unicode:characters_to_binary(io_lib:format("~tp", [Report])) + end, + {"~ts", [Bin]}. + +hex_to_trace_id_bytes(Hex) when is_binary(Hex), byte_size(Hex) =:= 32 -> + try + <<(binary_to_integer(Hex, 16)):128>> + catch + _:_ -> undefined + end; +hex_to_trace_id_bytes(_) -> + undefined. + +hex_to_span_id_bytes(Hex) when is_binary(Hex), byte_size(Hex) =:= 16 -> + try + <<(binary_to_integer(Hex, 16)):64>> + catch + _:_ -> undefined + end; +hex_to_span_id_bytes(_) -> + undefined. diff --git a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl new file mode 100644 index 00000000..1e15c37d --- /dev/null +++ b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl @@ -0,0 +1,323 @@ +%% @doc +%% Мини-сьют для проверки доставки логов в Loki по разным путям: +%% - Путь 1 (Docker/Promtail): logger -> default handler -> stdout -> Docker -> Promtail -> Loki +%% - Путь 2 (OTel): logger -> otel_log_handler -> OTLP -> otel-collector -> Loki +%% +%% См. compose.tracing.yaml — тесты запускаются в testrunner с otel-log-handler. +%% Loki доступен как http://loki:3100 в docker network. +%% +%% Запуск: +%% rebar3 ct --suite=apps/hellgate/test/hg_log_delivery_tests_SUITE +%% +%% В compose с tracing (compose.tracing.yaml): +%% docker compose -f compose.yaml -f compose.tracing.yaml run testrunner rebar3 ct --suite=... +-module(hg_log_delivery_tests_SUITE). + +-include_lib("common_test/include/ct.hrl"). +-include_lib("stdlib/include/assert.hrl"). + +-export([all/0]). +-export([init_per_suite/1]). +-export([end_per_suite/1]). +-export([init_per_testcase/2]). +-export([end_per_testcase/2]). + +-export([logger_plain_delivery/1]). +-export([logger_lazy_format_delivery/1]). +-export([logger_otlp_delivery/1]). +-export([woody_scoper_delivery/1]). % требует full compose (hellgate + mocks) + +-type config() :: hg_ct_helper:config(). +-type test_case_name() :: hg_ct_helper:test_case_name(). + +-define(LOG_MARKER_PREFIX, "HG_LOG_DELIVERY_"). +-define(LOKI_HOST, "loki"). +-define(LOKI_PORT, 3100). +-define(DELIVERY_WAIT_MS, 12000). + +-spec all() -> [test_case_name()]. +all() -> + [ + %% Plain + lazy в одном тесте — избегаем cross-test interference (второй тест не экспортирует) + logger_otlp_delivery + %% logger_plain_delivery, logger_lazy_format_delivery — см. otel_log_handler пустой batch + %% woody_scoper_delivery — раскомментировать при запуске в full compose с mocks + ]. + +-spec init_per_suite(config()) -> config(). +init_per_suite(C) -> + _ = application:ensure_all_started(inets, temporary), + %% Стартуем hellgate - это вызовет ensure_otel_log_handler() в hellgate:start/2 + {Apps, _Ret} = hg_ct_helper:start_apps([woody, scoper, dmt_client, hg_proto, hellgate]), + %% Проверка доступности otel-collector для OTel-пути + case httpc:request(get, {"http://otel-collector:4318", []}, [{timeout, 3000}], []) of + {ok, _} -> ok; + {error, Reason} -> + ct:pal("WARNING: otel-collector unreachable (~p). OTel path will likely fail.", [Reason]) + end, + [{loki_url, loki_base_url()}, {apps, Apps} | C]. + +-spec end_per_suite(config()) -> ok. +end_per_suite(C) -> + _ = hg_ct_helper:flush_otel_logs(), + _ = [application:stop(App) || App <- hg_ct_helper:cfg(apps, C)], + ok. + +-spec init_per_testcase(test_case_name(), config()) -> config(). +init_per_testcase(_TC, C) -> + C. + +-spec end_per_testcase(test_case_name(), config()) -> ok. +end_per_testcase(_TC, _C) -> + ok. + +%% ------------------------------------------------------------------------- +%% Helpers +%% ---------------------------------------------------------------------------- + +loki_base_url() -> + Host = case os:getenv("LOKI_HOST") of + false -> ?LOKI_HOST; + H -> H + end, + Port = case os:getenv("LOKI_PORT") of + false -> integer_to_list(?LOKI_PORT); + P -> P + end, + "http://" ++ Host ++ ":" ++ Port. + +make_marker() -> + Rand = base64:encode(crypto:strong_rand_bytes(8)), + ?LOG_MARKER_PREFIX ++ binary_to_list(Rand). + +%% Отправить логи разными способами, затем проверить доставку в Loki +send_and_wait(MarkerPlain, MarkerLazy, C) -> + %% 1. Plain logger — идёт в default + otel_log_handler + logger:info("~s", [MarkerPlain]), + %% 2. Lazy format (как scoper_woody_event_handler) + logger:info(fun(Args) -> {"~s", Args} end, [MarkerLazy]), + timer:sleep(?DELIVERY_WAIT_MS), + C. + +%% Запрос Loki API: GET /loki/api/v1/query_range +%% Query: LogQL, например {job="docker"} |~ "MARKER" или {service_name="hellgate"} |~ "MARKER" +-spec query_loki(string(), config()) -> {ok, [binary()]} | {error, term()}. +query_loki(LogQL, C) -> + BaseUrl = proplists:get_value(loki_url, C), + EndNs = erlang:system_time(nanosecond), + StartNs = EndNs - 60 * 1_000_000_000, %% 1 min back + Query = [ + {"query", LogQL}, + {"start", integer_to_list(StartNs)}, + {"end", integer_to_list(EndNs)}, + {"limit", "100"} + ], + URL = BaseUrl ++ "/loki/api/v1/query_range?" ++ build_query(Query), + case http_get(URL) of + {ok, 200, Body} -> + parse_loki_streams(Body); + {ok, Code, Body} -> + {error, {http_error, Code, Body}}; + Err -> + Err + end. + +build_query(KVs) -> + Parts = [ + qs_key(K) ++ "=" ++ qs_value(V) + || {K, V} <- KVs + ], + string:join(Parts, "&"). + +qs_key(S) -> + lists:flatten(percent_encode(ensure_binary(S))). + +qs_value(S) -> + lists:flatten(percent_encode(ensure_binary(S))). + +ensure_binary(S) when is_list(S) -> + unicode:characters_to_binary(S); +ensure_binary(S) when is_binary(S) -> + S. + +percent_encode(<<>>) -> + []; +percent_encode(<>) when + (C >= $a andalso C =< $z) orelse + (C >= $A andalso C =< $Z) orelse + (C >= $0 andalso C =< $9) orelse + C =:= $- orelse C =:= $_ orelse C =:= $. orelse C =:= $~ +-> + [C | percent_encode(Rest)]; +percent_encode(<>) -> + ["%", string:right(erlang:integer_to_list(C, 16), 2, $0) | percent_encode(Rest)]. + +http_get(URL) -> + case httpc:request(get, {URL, []}, [{timeout, 10000}, {connect_timeout, 5000}], []) of + {ok, {{_V, Code, _R}, _H, Body}} -> + {ok, Code, Body}; + {error, Reason} -> + {error, Reason} + end. + +parse_loki_streams(Body) -> + try + BodyBin = ensure_binary(Body), + Decoded = jsone:decode(BodyBin, [{object_format, map}, {keys, binary}]), + Streams = maps:get(<<"result">>, maps:get(<<"data">>, Decoded, #{}), []), + Lines = lists:flatmap( + fun(Stream) -> + Vs = maps:get(<<"values">>, Stream, []), + [V || [_, V] <- Vs] + end, + Streams + ), + {ok, Lines} + catch + _:Reason -> + {error, {parse_error, Reason, Body}} + end. + +%% ------------------------------------------------------------------------- +%% Test cases +%% ---------------------------------------------------------------------------- + +-spec logger_otlp_delivery(config()) -> ok. +logger_otlp_delivery(C) -> + %% Plain + lazy в одном send — оба должны дойти по OTel + MarkerPlain = make_marker() ++ "_PLAIN", + MarkerLazy = make_marker() ++ "_LAZY", + send_and_wait(MarkerPlain, MarkerLazy, C), + assert_delivery(MarkerPlain, C, "logger plain"), + assert_delivery(MarkerLazy, C, "logger lazy format"). + +-spec logger_plain_delivery(config()) -> ok. +logger_plain_delivery(C) -> + Marker = make_marker(), + send_and_wait(Marker, Marker ++ "_LAZY_IGNORED", C), + assert_delivery(Marker, C, "logger plain"). + +-spec logger_lazy_format_delivery(config()) -> ok. +logger_lazy_format_delivery(C) -> + MarkerLazy = make_marker() ++ "_LAZY", + MarkerPlain = make_marker() ++ "_PLAIN", + send_and_wait(MarkerPlain, MarkerLazy, C), + assert_delivery(MarkerPlain, C, "logger plain (from same send)"), + assert_delivery(MarkerLazy, C, "logger lazy format"). + +-spec woody_scoper_delivery(config()) -> ok. +woody_scoper_delivery(C) -> + Marker = make_marker() ++ "_WOODY", + logger:info("~s", [Marker]), + RootUrl = hg_ct_helper:cfg(root_url, C), + ApiClient = hg_ct_helper:create_client(RootUrl), + try + {ok, InvoicingPid} = hg_client_invoicing:start_link(ApiClient), + _ = hg_client_invoicing:get(<<"00000000-0000-0000-0000-000000000000">>, InvoicingPid), + ok + catch + _:_ -> + ok + end, + timer:sleep(?DELIVERY_WAIT_MS), + assert_delivery(Marker, C, "woody/scoper"). + +-spec assert_delivery(string(), config(), string()) -> ok. +assert_delivery(Marker, C, PathDesc) -> + %% Пробуем подключиться — без compose Loki недоступен + case query_loki("{exporter=\"OTLP\"}", C) of + {error, {failed_connect, _}} -> + ct:pal("Loki unreachable. Run with: docker compose -f compose.yaml -f compose.tracing.yaml run testrunner rebar3 ct --dir=apps/hellgate/test --suite=hg_log_delivery_tests_SUITE"), + throw({skip, "Loki not available"}); + _ -> ok + end, + %% Проверка OTLP пути — только поток с exporter=OTLP. + %% В Loki body может храниться как массив байт JSON, поэтому проверяем маркер + %% после локального декодирования тела сообщения. + case query_loki("{exporter=\"OTLP\", service_name=\"hellgate\"}", C) of + {ok, OTelLines} -> + case otel_lines_contain_marker(OTelLines, Marker) of + true -> + ct:log("Path OTel: found marker ~s", [Marker]), + ok; + false -> + ct:log("Path OTel: marker ~s NOT found (exporter=OTLP,service_name=hellgate)", [Marker]), + %% Пробуем другие labels перед fail + try_otel_alternate_query(Marker, C, PathDesc) + end; + {error, ErrO} -> + ct:log("Path OTel: query failed: ~p", [ErrO]), + case ErrO of + {http_error, 400, _} -> + try_otel_alternate_query(Marker, C, PathDesc); + _ -> + ct:fail("~s: Loki query failed: ~p", [PathDesc, ErrO]) + end + end. + +try_otel_alternate_query(Marker, C, PathDesc) -> + %% OTel Loki exporter может использовать другие labels + Queries = [ + "{exporter=\"OTLP\"}", + "{service_name=\"hellgate\"}" + ], + Found = lists:any( + fun(Q) -> + case query_loki(Q, C) of + {ok, Lines} -> + otel_lines_contain_marker(Lines, Marker); + _ -> false + end + end, + Queries + ), + case Found of + true -> ok; + false -> + ct:fail("~s: marker ~s not found via any OTel query", [PathDesc, Marker]) + end. + +otel_lines_contain_marker(Lines, Marker) -> + MarkerBin = ensure_binary(Marker), + lists:any( + fun(Line) -> + case decode_otel_body(Line) of + {ok, BodyBin} -> + binary:match(BodyBin, MarkerBin) =/= nomatch; + error -> + binary:match(ensure_binary(Line), MarkerBin) =/= nomatch + end + end, + Lines + ). + +decode_otel_body(Line) -> + try + BodyBin = ensure_binary(Line), + Decoded = jsone:decode(BodyBin, [{object_format, map}, {keys, binary}]), + case maps:get(<<"body">>, Decoded, undefined) of + undefined -> + error; + Body -> + {ok, body_to_binary(Body)} + end + catch + _:_ -> + error + end. + +body_to_binary(Body) when is_binary(Body) -> + Body; +body_to_binary(Body) when is_list(Body) -> + case catch iolist_to_binary(Body) of + Bin when is_binary(Bin) -> + Bin; + _ -> + unicode:characters_to_binary(io_lib:format("~tp", [Body])) + end; +body_to_binary(Body) -> + unicode:characters_to_binary(io_lib:format("~tp", [Body])). + +escape_loki_regex(S) -> + %% Loki regex: escape . * + ? [ ] ( ) { } | \ + re:replace(S, "[\\.*+?\\[\\]\\(\\)\\{\\}\\|\\\\]", "\\\\&", [global, {return, list}]). diff --git a/compose.tracing.yaml b/compose.tracing.yaml index 24e0bbbf..b2a3bb9f 100644 --- a/compose.tracing.yaml +++ b/compose.tracing.yaml @@ -1,42 +1,105 @@ +# UI: Grafana http://localhost:3000 (admin/admin) services: + # OpenTelemetry Collector: single OTLP endpoint, fans out to Tempo + Loki + otel-collector: + image: otel/opentelemetry-collector-contrib:0.112.0 + command: ["--config=/etc/otel/config.yaml"] + volumes: + - ./config/otel-collector-config.yaml:/etc/otel/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 5s + timeout: 2s + retries: 20 + start_period: 5s + depends_on: + tempo: + condition: service_healthy + loki: + condition: service_healthy dmt: environment: &otlp_enabled OTEL_TRACES_EXPORTER: otlp + OTEL_LOGS_EXPORTER: otlp OTEL_TRACES_SAMPLER: parentbased_always_off OTEL_EXPORTER_OTLP_PROTOCOL: http_protobuf - OTEL_EXPORTER_OTLP_ENDPOINT: http://jaeger:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4318 + OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: http://otel-collector:4318/v1/logs + OTEL_SERVICE_NAME: dmt bender: - environment: *otlp_enabled + environment: + <<: *otlp_enabled + OTEL_SERVICE_NAME: bender limiter: - environment: *otlp_enabled + environment: + <<: *otlp_enabled + OTEL_SERVICE_NAME: limiter party-management: - environment: *otlp_enabled + environment: + <<: *otlp_enabled + OTEL_SERVICE_NAME: party-management testrunner: + volumes: + - .:$PWD + - ../opentelemetry-erlang:/opt/opentelemetry-erlang:ro environment: <<: *otlp_enabled - OTEL_SERVICE_NAME: hellgate_testrunner + OTEL_SERVICE_NAME: hellgate OTEL_TRACES_SAMPLER: parentbased_always_on depends_on: - jaeger: + otel-collector: condition: service_healthy + grafana: + condition: service_started - jaeger: - image: jaegertracing/all-in-one:1.47 - environment: - - COLLECTOR_OTLP_ENABLED=true + tempo: + image: grafana/tempo:2.6.1 + command: ["-config.file=/etc/tempo.yaml"] + volumes: + - ./config/tempo.yaml:/etc/tempo.yaml:ro + ports: + - 3200:3200 healthcheck: - test: "/go/bin/all-in-one-linux status" - interval: 2s - timeout: 1s + test: ["CMD-SHELL", "wget -q -O- http://localhost:3200/ready || exit 1"] + interval: 5s + timeout: 2s retries: 20 + start_period: 5s + + loki: + image: grafana/loki:3.1.1 + command: ["-config.file=/etc/loki/config.yaml"] + volumes: + - ./config/loki.yaml:/etc/loki/config.yaml:ro ports: - - 4317:4317 # OTLP gRPC receiver - - 4318:4318 # OTLP http receiver - - 5778:5778 - - 14250:14250 - - 16686:16686 + - 3100:3100 + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:3100/ready || exit 1"] + interval: 5s + timeout: 2s + retries: 20 + start_period: 5s + + grafana: + image: grafana/grafana:11.2.0 + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - ./config/grafana/provisioning:/etc/grafana/provisioning:ro + ports: + - 3000:3000 + depends_on: + loki: + condition: service_healthy + tempo: + condition: service_healthy diff --git a/config/grafana/provisioning/dashboards/dashboards.yaml b/config/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 00000000..c2dd1261 --- /dev/null +++ b/config/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: hellgate-observability + orgId: 1 + folder: Hellgate + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/config/grafana/provisioning/dashboards/hellgate-otel-only.json b/config/grafana/provisioning/dashboards/hellgate-otel-only.json new file mode 100644 index 00000000..209aa867 --- /dev/null +++ b/config/grafana/provisioning/dashboards/hellgate-otel-only.json @@ -0,0 +1,404 @@ +{ + "id": null, + "uid": "hellgate-otel-only", + "title": "Hellgate Observability (OTLP)", + "tags": ["hellgate", "otel", "loki", "tempo", "logs", "traces"], + "timezone": "browser", + "schemaVersion": 39, + "version": 2, + "refresh": "10s", + "editable": true, + "graphTooltip": 1, + "time": { + "from": "now-30m", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "service", + "label": "Service", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "query": { + "query": "label_values({exporter=\"OTLP\"}, service_name)" + }, + "refresh": 2, + "includeAll": true, + "allValue": ".+", + "multi": false, + "current": { + "selected": true, + "text": "hellgate", + "value": "hellgate" + } + } + ] + }, + "panels": [ + { + "id": 1, + "type": "logs", + "title": "📝 Application Logs (filtered)", + "gridPos": { + "h": 12, + "w": 16, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "{exporter=\"OTLP\", service_name=~\"$service\"} | json | body !~ \"^application: .* exited: stopped\" | body !~ \"otel_log_handler_(wakeup|flush)\" | line_format \"{{.severity}} [trace={{.traceid}} span={{.spanid}}] {{.body}}\"", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "sortOrder": "Descending", + "dedupStrategy": "none", + "enableLogDetails": true + } + }, + { + "id": 2, + "type": "logs", + "title": "🔍 Raw JSON Logs", + "gridPos": { + "h": 12, + "w": 8, + "x": 16, + "y": 0 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "{exporter=\"OTLP\", service_name=~\"$service\"} | json", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": false, + "prettifyLogMessage": true, + "sortOrder": "Descending", + "dedupStrategy": "none", + "enableLogDetails": true + } + }, + { + "id": 3, + "type": "stat", + "title": "📊 Total Logs / 5m", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(count_over_time({exporter=\"OTLP\", service_name=~\"$service\"}[5m]))", + "queryType": "instant", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": null, "color": "green"}, + {"value": 1000, "color": "yellow"}, + {"value": 5000, "color": "red"} + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 4, + "type": "stat", + "title": "⚠️ Warnings / 5m", + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(count_over_time({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_WARN.*\" | body !~ \"^application: .* exited\"[5m]))", + "queryType": "instant", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": null, "color": "green"}, + {"value": 1, "color": "yellow"}, + {"value": 50, "color": "orange"} + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 5, + "type": "stat", + "title": "❌ Errors / 5m", + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(count_over_time({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_(ERROR|FATAL).*\" | body !~ \"^application: .* exited\"[5m]))", + "queryType": "instant", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": null, "color": "green"}, + {"value": 1, "color": "orange"}, + {"value": 10, "color": "red"} + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 6, + "type": "timeseries", + "title": "📈 Log Rate by Severity", + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_INFO.*\"[1m]))", + "legendFormat": "info", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + }, + { + "refId": "B", + "expr": "sum(rate({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_WARN.*\" | body !~ \"^application: .* exited\"[1m]))", + "legendFormat": "warn", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + }, + { + "refId": "C", + "expr": "sum(rate({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_(ERROR|FATAL).*\" | body !~ \"^application: .* exited\"[1m]))", + "legendFormat": "error", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "info"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "blue"}}] + }, + { + "matcher": {"id": "byName", "options": "warn"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}] + }, + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + { + "id": 7, + "type": "traces", + "title": "🔗 Distributed Traces (Tempo)", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceqlSearch", + "query": "{service.name=\"$service\"}", + "limit": 20, + "serviceName": "$service" + } + ], + "options": { + "showHeader": true, + "showTime": true, + "showSpanCount": true, + "showDuration": true + } + }, + { + "id": 8, + "type": "text", + "title": "ℹ️ How to Use", + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "mode": "markdown", + "content": "## 🔍 Logs & Traces Correlation\n\n1. **View logs with traces**: Click on `trace=...` in filtered logs to jump to Tempo trace\n2. **Explore raw data**: Use Raw JSON panel for debugging\n3. **Filter by severity**: Use Explore with query: `{exporter=\"OTLP\", service_name=\"hellgate\"} | json | severity=~\"SEVERITY_NUMBER_ERROR.*\"`\n\n## 📊 Metrics\n- **Total Logs**: All logs in 5min window\n- **Warnings/Errors**: Severity-based counts (excludes app lifecycle noise)\n- **Rate Chart**: Real-time log rate by severity level" + } + } + ] +} diff --git a/config/grafana/provisioning/datasources/datasources.yaml b/config/grafana/provisioning/datasources/datasources.yaml new file mode 100644 index 00000000..55390980 --- /dev/null +++ b/config/grafana/provisioning/datasources/datasources.yaml @@ -0,0 +1,32 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + jsonData: + maxLines: 5000 + derivedFields: + - name: trace_id + matcherRegex: 'trace[_ ]?id[=:"]+([a-fA-F0-9]{16,32})' + datasourceUid: tempo + url: '$${__value.raw}' + + - name: Tempo + uid: tempo + type: tempo + access: proxy + url: http://tempo:3200 + jsonData: + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: -5m + spanEndTimeShift: 5m + tags: ['service.name', 'service_name'] + serviceMap: + datasourceUid: tempo + nodeGraph: + enabled: true diff --git a/config/loki.yaml b/config/loki.yaml new file mode 100644 index 00000000..56df8e0e --- /dev/null +++ b/config/loki.yaml @@ -0,0 +1,35 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /tmp/loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /tmp/loki/chunks + +limits_config: + allow_structured_metadata: true + volume_enabled: true + +pattern_ingester: + enabled: true + +ruler: + enable_api: true diff --git a/config/otel-collector-config.yaml b/config/otel-collector-config.yaml new file mode 100644 index 00000000..fdae1d57 --- /dev/null +++ b/config/otel-collector-config.yaml @@ -0,0 +1,51 @@ +# OpenTelemetry Collector: receives OTLP, exports traces to Tempo, logs to Loki +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + +exporters: + # Traces -> Tempo + otlp/tempo: + endpoint: "tempo:4317" + tls: + insecure: true + + # Logs -> Loki + loki: + endpoint: "http://loki:3100/loki/api/v1/push" + default_labels_enabled: + exporter: true + job: true + retry_on_failure: + enabled: true + max_elapsed_time: 30s + +processors: + batch: + send_batch_size: 512 + timeout: 2s + resource/add_loki_labels: + attributes: + - key: loki.resource.labels + value: service.name + action: insert + +extensions: + health_check: + endpoint: "0.0.0.0:13133" + +service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/tempo] + logs: + receivers: [otlp] + processors: [batch, resource/add_loki_labels] + exporters: [loki] diff --git a/config/sys.config b/config/sys.config index 67cd24ed..51333735 100644 --- a/config/sys.config +++ b/config/sys.config @@ -43,6 +43,10 @@ % Should be greater than any other timeouts idle_timeout => infinity }}, + %% OTEL log handler configuration + {otel_log_max_queue_size, 2048}, + {otel_log_scheduled_delay_ms, 1000}, + {otel_log_exporting_timeout_ms, 300000}, {scoper_event_handler_options, #{ event_handler_opts => #{ formatter_opts => #{ diff --git a/config/tempo.yaml b/config/tempo.yaml new file mode 100644 index 00000000..97bb684a --- /dev/null +++ b/config/tempo.yaml @@ -0,0 +1,37 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 24h + +storage: + trace: + backend: local + local: + path: /tmp/tempo/traces + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics] diff --git a/rebar.config b/rebar.config index e54b9e06..b2b3d36e 100644 --- a/rebar.config +++ b/rebar.config @@ -46,12 +46,12 @@ {prometheus, "4.11.0"}, {prometheus_cowboy, "0.1.9"}, - %% OpenTelemetry deps - {opentelemetry_api, "1.4.0"}, - {opentelemetry, "1.5.0"}, - {opentelemetry_exporter, "1.8.0"}, - {eqwalizer_support, - {git_subdir, "https://github.com/whatsapp/eqwalizer.git", {branch, "main"}, "eqwalizer_support"}} + %% OpenTelemetry deps. + {opentelemetry_api, "1.5.0"}, + {opentelemetry, "1.7.0"}, + {opentelemetry_exporter, {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_exporter"}}, + {opentelemetry_api_experimental, "0.5.1"}, + {opentelemetry_experimental, {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}} ]}. {xref_checks, [ @@ -89,7 +89,9 @@ {runtime_tools, load}, {tools, load}, {canal, load}, + opentelemetry_exporter, {opentelemetry, temporary}, + opentelemetry_experimental, logger_logstash_formatter, sasl, herd, diff --git a/rebar.lock b/rebar.lock index 11e04791..d80f0666 100644 --- a/rebar.lock +++ b/rebar.lock @@ -84,10 +84,20 @@ {git,"https://github.com/valitydev/msgpack-proto.git", {ref,"7e447496aa5df4a5f1ace7ef2e3c31248b2a3ed0"}}, 2}, - {<<"opentelemetry">>,{pkg,<<"opentelemetry">>,<<"1.5.0">>},0}, - {<<"opentelemetry_api">>,{pkg,<<"opentelemetry_api">>,<<"1.4.0">>},0}, + {<<"opentelemetry">>,{pkg,<<"opentelemetry">>,<<"1.7.0">>},0}, + {<<"opentelemetry_api">>,{pkg,<<"opentelemetry_api">>,<<"1.5.0">>},0}, + {<<"opentelemetry_api_experimental">>, + {pkg,<<"opentelemetry_api_experimental">>,<<"0.5.1">>}, + 0}, + {<<"opentelemetry_experimental">>, + {git_subdir,"https://github.com/open-telemetry/opentelemetry-erlang.git", + {ref,"98be90e6167997853f83b8f27917604fb35aeecc"}, + "apps/opentelemetry_experimental"}, + 0}, {<<"opentelemetry_exporter">>, - {pkg,<<"opentelemetry_exporter">>,<<"1.8.0">>}, + {git_subdir,"https://github.com/open-telemetry/opentelemetry-erlang.git", + {ref,"98be90e6167997853f83b8f27917604fb35aeecc"}, + "apps/opentelemetry_exporter"}, 0}, {<<"parse_trans">>,{pkg,<<"parse_trans">>,<<"3.3.1">>},2}, {<<"party_client">>, @@ -151,9 +161,9 @@ {<<"kafka_protocol">>, <<"F917B6C90C8DF0DE2B40A87D6B9AE1CFCE7788E91A65818E90E40CF76111097A">>}, {<<"metrics">>, <<"25F094DEA2CDA98213CECC3AEFF09E940299D950904393B2A29D191C346A8486">>}, {<<"mimerl">>, <<"3882A5CA67FBBE7117BA8947F27643557ADEC38FA2307490C4C4207624CB213B">>}, - {<<"opentelemetry">>, <<"7DDA6551EDFC3050EA4B0B40C0D2570423D6372B97E9C60793263EF62C53C3C2">>}, - {<<"opentelemetry_api">>, <<"63CA1742F92F00059298F478048DFB826F4B20D49534493D6919A0DB39B6DB04">>}, - {<<"opentelemetry_exporter">>, <<"5D546123230771EF4174E37BEDFD77E3374913304CD6EA3CA82A2ADD49CD5D56">>}, + {<<"opentelemetry">>, <<"20D0F12D3D1C398D3670FD44FD1A7C495DD748AB3E5B692A7906662E2FB1A38A">>}, + {<<"opentelemetry_api">>, <<"1A676F3E3340CAB81C763E939A42E11A70C22863F645AA06AAFEFC689B5550CF">>}, + {<<"opentelemetry_api_experimental">>, <<"1B5AFACFCBD0834390336C845BC8AE08C8CF0D69BBED72EE53D178798B93E074">>}, {<<"parse_trans">>, <<"16328AB840CC09919BD10DAB29E431DA3AF9E9E7E7E6F0089DD5A2D2820011D8">>}, {<<"prometheus">>, <<"B95F8DE8530F541BD95951E18E355A840003672E5EDA4788C5FA6183406BA29A">>}, {<<"prometheus_cowboy">>, <<"D9D5B300516A61ED5AE31391F8EEEEB202230081D32A1813F2D78772B6F274E1">>}, @@ -185,9 +195,9 @@ {<<"kafka_protocol">>, <<"DF680A3706EAD8695F8B306897C0A33E8063C690DA9308DB87B462CFD7029D04">>}, {<<"metrics">>, <<"69B09ADDDC4F74A40716AE54D140F93BEB0FB8978D8636EADED0C31B6F099F16">>}, {<<"mimerl">>, <<"13AF15F9F68C65884ECCA3A3891D50A7B57D82152792F3E19D88650AA126B144">>}, - {<<"opentelemetry">>, <<"CDF4F51D17B592FC592B9A75F86A6F808C23044BA7CF7B9534DEBBCC5C23B0EE">>}, - {<<"opentelemetry_api">>, <<"3DFBBFAA2C2ED3121C5C483162836C4F9027DEF469C41578AF5EF32589FCFC58">>}, - {<<"opentelemetry_exporter">>, <<"A1F9F271F8D3B02B81462A6BFEF7075FD8457FDB06ADFF5D2537DF5E2264D9AF">>}, + {<<"opentelemetry">>, <<"A9173B058C4549BF824CBC2F1D2FA2ADC5CDEDC22AA3F0F826951187BBD53131">>}, + {<<"opentelemetry_api">>, <<"F53EC8A1337AE4A487D43AC89DA4BD3A3C99DDF576655D071DEED8B56A2D5DDA">>}, + {<<"opentelemetry_api_experimental">>, <<"10297057EADA47267D4F832011BECEF07D25690E6BF91FEBCCFC4E740DBA1A6F">>}, {<<"parse_trans">>, <<"07CD9577885F56362D414E8C4C4E6BDF10D43A8767ABB92D24CBE8B24C54888B">>}, {<<"prometheus">>, <<"719862351AABF4DF7079B05DC085D2BBCBE3AC0AC3009E956671B1D5AB88247D">>}, {<<"prometheus_cowboy">>, <<"5F71C039DEB9E9FF9DD6366BC74C907A463872B85286E619EFF0BDA15111695A">>}, From 7dda5d7d29ef48731b25c9f02916c4c585903fb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC?= Date: Sat, 14 Feb 2026 18:54:13 +0300 Subject: [PATCH 2/4] Refactor logging configuration and improve OTel log handler setup - Updated `rebar.config` to format dependencies for better readability. - Enhanced `ensure_otel_log_handler` function in `hellgate.erl` to allow dynamic log level configuration. - Simplified log report formatting in `hg_otel_log_filter.erl`. - Streamlined test suite in `hg_log_delivery_tests_SUITE.erl` to focus on OTel log delivery. - Added `otel_log_level` configuration in `sys.config` for improved log level management. --- apps/hellgate/src/hellgate.erl | 13 +- apps/hellgate/src/hg_otel_log_filter.erl | 4 +- apps/hellgate/src/hg_otel_log_handler.erl | 36 +++ .../test/hg_log_delivery_tests_SUITE.erl | 292 +++++++++--------- config/sys.config | 1 + rebar.config | 8 +- 6 files changed, 191 insertions(+), 163 deletions(-) create mode 100644 apps/hellgate/src/hg_otel_log_handler.erl diff --git a/apps/hellgate/src/hellgate.erl b/apps/hellgate/src/hellgate.erl index 438e9df1..b850f9b2 100644 --- a/apps/hellgate/src/hellgate.erl +++ b/apps/hellgate/src/hellgate.erl @@ -128,8 +128,8 @@ ensure_otel_log_handler() -> MaxQueue = application:get_env(hellgate, otel_log_max_queue_size, 2048), DelayMs = application:get_env(hellgate, otel_log_scheduled_delay_ms, 1000), TimeoutMs = application:get_env(hellgate, otel_log_exporting_timeout_ms, 300000), + LogLevel = application:get_env(hellgate, otel_log_level, info), HandlerConfig = #{ - level => info, report_cb => fun hg_otel_log_filter:format_otp_report_utf8/1, exporter => {otel_exporter_logs_otlp, #{ @@ -138,10 +138,15 @@ ensure_otel_log_handler() -> }}, max_queue_size => MaxQueue, scheduled_delay_ms => DelayMs, - exporting_timeout_ms => TimeoutMs, - filters => [{hg_otel_trace_id_bytes, {fun hg_otel_log_filter:filter/2, undefined}}] + exporting_timeout_ms => TimeoutMs }, - case logger:add_handler(otel_logs, otel_log_handler, HandlerConfig) of + LoggerHandlerConfig = #{ + level => LogLevel, + filter_default => log, + filters => [{hg_otel_trace_id_bytes, {fun hg_otel_log_filter:filter/2, undefined}}], + config => HandlerConfig + }, + case logger:add_handler(otel_logs, hg_otel_log_handler, LoggerHandlerConfig) of ok -> ok; {error, {already_exist, _}} -> diff --git a/apps/hellgate/src/hg_otel_log_filter.erl b/apps/hellgate/src/hg_otel_log_filter.erl index 188af4a0..03018b60 100644 --- a/apps/hellgate/src/hg_otel_log_filter.erl +++ b/apps/hellgate/src/hg_otel_log_filter.erl @@ -38,9 +38,7 @@ format_otp_report_utf8(Report) -> Bin = try logger:format_otp_report(Report) of {Format, Args} -> - unicode:characters_to_binary(io_lib:format(Format, Args)); - Formatted -> - unicode:characters_to_binary(Formatted) + unicode:characters_to_binary(io_lib:format(Format, Args)) catch _:_ -> %% Не даём report_cb падать: fallback в печатное представление отчёта. diff --git a/apps/hellgate/src/hg_otel_log_handler.erl b/apps/hellgate/src/hg_otel_log_handler.erl new file mode 100644 index 00000000..e97bb196 --- /dev/null +++ b/apps/hellgate/src/hg_otel_log_handler.erl @@ -0,0 +1,36 @@ +-module(hg_otel_log_handler). + +-export([log/2]). +-export([adding_handler/1]). +-export([removing_handler/1]). +-export([changing_config/3]). +-export([filter_config/1]). + +-spec log(logger:log_event(), map()) -> ok. +log(LogEvent, Config) -> + otel_log_handler:log(LogEvent, Config). + +-spec adding_handler(map()) -> {ok, map()} | {error, term()}. +adding_handler(Config) -> + otel_log_handler:adding_handler(merge_module_config(Config)). + +-spec removing_handler(map()) -> ok. +removing_handler(Config) -> + otel_log_handler:removing_handler(Config). + +-spec changing_config(set | update, map(), map()) -> {ok, map()} | {error, term()}. +changing_config(SetOrUpdate, OldConfig, NewConfig) -> + otel_log_handler:changing_config( + SetOrUpdate, + merge_module_config(OldConfig), + merge_module_config(NewConfig) + ). + +-spec filter_config(map()) -> map(). +filter_config(Config) -> + otel_log_handler:filter_config(Config). + +merge_module_config(#{config := ModuleConfig} = Config) when is_map(ModuleConfig) -> + maps:merge(Config, ModuleConfig); +merge_module_config(Config) -> + Config. diff --git a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl index 1e15c37d..c4be4b1b 100644 --- a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl +++ b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl @@ -1,31 +1,14 @@ %% @doc -%% Мини-сьют для проверки доставки логов в Loki по разным путям: -%% - Путь 1 (Docker/Promtail): logger -> default handler -> stdout -> Docker -> Promtail -> Loki -%% - Путь 2 (OTel): logger -> otel_log_handler -> OTLP -> otel-collector -> Loki -%% -%% См. compose.tracing.yaml — тесты запускаются в testrunner с otel-log-handler. -%% Loki доступен как http://loki:3100 в docker network. -%% -%% Запуск: -%% rebar3 ct --suite=apps/hellgate/test/hg_log_delivery_tests_SUITE -%% -%% В compose с tracing (compose.tracing.yaml): -%% docker compose -f compose.yaml -f compose.tracing.yaml run testrunner rebar3 ct --suite=... +%% Мини-сьют для проверки доставки логов через OTel: +%% logger -> otel_log_handler -> OTLP -> otel-collector -> Loki. -module(hg_log_delivery_tests_SUITE). -include_lib("common_test/include/ct.hrl"). --include_lib("stdlib/include/assert.hrl"). -export([all/0]). -export([init_per_suite/1]). -export([end_per_suite/1]). --export([init_per_testcase/2]). --export([end_per_testcase/2]). - --export([logger_plain_delivery/1]). --export([logger_lazy_format_delivery/1]). -export([logger_otlp_delivery/1]). --export([woody_scoper_delivery/1]). % требует full compose (hellgate + mocks) -type config() :: hg_ct_helper:config(). -type test_case_name() :: hg_ct_helper:test_case_name(). @@ -34,83 +17,78 @@ -define(LOKI_HOST, "loki"). -define(LOKI_PORT, 3100). -define(DELIVERY_WAIT_MS, 12000). +-define(DELIVERY_ASSERT_TIMEOUT_MS, 90000). +-define(DELIVERY_POLL_INTERVAL_MS, 2000). +-define(LOKI_LOOKBACK_NS, 10 * 60 * 1_000_000_000). -spec all() -> [test_case_name()]. all() -> - [ - %% Plain + lazy в одном тесте — избегаем cross-test interference (второй тест не экспортирует) - logger_otlp_delivery - %% logger_plain_delivery, logger_lazy_format_delivery — см. otel_log_handler пустой batch - %% woody_scoper_delivery — раскомментировать при запуске в full compose с mocks - ]. + [logger_otlp_delivery]. -spec init_per_suite(config()) -> config(). init_per_suite(C) -> _ = application:ensure_all_started(inets, temporary), - %% Стартуем hellgate - это вызовет ensure_otel_log_handler() в hellgate:start/2 + OldPrimaryLevel = get_primary_logger_level(), + _ = logger:set_primary_config(level, info), {Apps, _Ret} = hg_ct_helper:start_apps([woody, scoper, dmt_client, hg_proto, hellgate]), - %% Проверка доступности otel-collector для OTel-пути case httpc:request(get, {"http://otel-collector:4318", []}, [{timeout, 3000}], []) of - {ok, _} -> ok; + {ok, _} -> + ok; {error, Reason} -> - ct:pal("WARNING: otel-collector unreachable (~p). OTel path will likely fail.", [Reason]) + ct:log("WARNING: otel-collector unreachable (~p). OTel path will likely fail.", [Reason]) end, - [{loki_url, loki_base_url()}, {apps, Apps} | C]. + [{loki_url, loki_base_url()}, {apps, Apps}, {old_logger_primary_level, OldPrimaryLevel} | C]. -spec end_per_suite(config()) -> ok. end_per_suite(C) -> - _ = hg_ct_helper:flush_otel_logs(), _ = [application:stop(App) || App <- hg_ct_helper:cfg(apps, C)], + case proplists:get_value(old_logger_primary_level, C, undefined) of + undefined -> + ok; + OldLevel -> + _ = logger:set_primary_config(level, OldLevel), + ok + end, ok. --spec init_per_testcase(test_case_name(), config()) -> config(). -init_per_testcase(_TC, C) -> - C. - --spec end_per_testcase(test_case_name(), config()) -> ok. -end_per_testcase(_TC, _C) -> - ok. - -%% ------------------------------------------------------------------------- -%% Helpers -%% ---------------------------------------------------------------------------- +get_primary_logger_level() -> + case logger:get_primary_config() of + #{level := L} -> L; + _ -> undefined + end. loki_base_url() -> - Host = case os:getenv("LOKI_HOST") of - false -> ?LOKI_HOST; - H -> H - end, - Port = case os:getenv("LOKI_PORT") of - false -> integer_to_list(?LOKI_PORT); - P -> P - end, + Host = + case os:getenv("LOKI_HOST") of + false -> ?LOKI_HOST; + H -> H + end, + Port = + case os:getenv("LOKI_PORT") of + false -> integer_to_list(?LOKI_PORT); + P -> P + end, "http://" ++ Host ++ ":" ++ Port. make_marker() -> Rand = base64:encode(crypto:strong_rand_bytes(8)), ?LOG_MARKER_PREFIX ++ binary_to_list(Rand). -%% Отправить логи разными способами, затем проверить доставку в Loki -send_and_wait(MarkerPlain, MarkerLazy, C) -> - %% 1. Plain logger — идёт в default + otel_log_handler +send_and_wait(MarkerPlain, MarkerLazy) -> logger:info("~s", [MarkerPlain]), - %% 2. Lazy format (как scoper_woody_event_handler) logger:info(fun(Args) -> {"~s", Args} end, [MarkerLazy]), - timer:sleep(?DELIVERY_WAIT_MS), - C. + timer:sleep(?DELIVERY_WAIT_MS). -%% Запрос Loki API: GET /loki/api/v1/query_range -%% Query: LogQL, например {job="docker"} |~ "MARKER" или {service_name="hellgate"} |~ "MARKER" -spec query_loki(string(), config()) -> {ok, [binary()]} | {error, term()}. query_loki(LogQL, C) -> BaseUrl = proplists:get_value(loki_url, C), EndNs = erlang:system_time(nanosecond), - StartNs = EndNs - 60 * 1_000_000_000, %% 1 min back + StartNs = EndNs - ?LOKI_LOOKBACK_NS, Query = [ {"query", LogQL}, {"start", integer_to_list(StartNs)}, {"end", integer_to_list(EndNs)}, - {"limit", "100"} + {"limit", "2000"} ], URL = BaseUrl ++ "/loki/api/v1/query_range?" ++ build_query(Query), case http_get(URL) of @@ -123,10 +101,7 @@ query_loki(LogQL, C) -> end. build_query(KVs) -> - Parts = [ - qs_key(K) ++ "=" ++ qs_value(V) - || {K, V} <- KVs - ], + Parts = [qs_key(K) ++ "=" ++ qs_value(V) || {K, V} <- KVs], string:join(Parts, "&"). qs_key(S) -> @@ -135,6 +110,14 @@ qs_key(S) -> qs_value(S) -> lists:flatten(percent_encode(ensure_binary(S))). +logql_quote(S) -> + Bin = ensure_binary(S), + Escaped = binary:replace(Bin, <<"\\">>, <<"\\\\">>, [global]), + <<"\"", (binary:replace(Escaped, <<"\"">>, <<"\\\"">>, [global]))/binary, "\"">>. + +build_marker_query(Selector, Marker) -> + Selector ++ " |= " ++ binary_to_list(logql_quote(Marker)). + ensure_binary(S) when is_list(S) -> unicode:characters_to_binary(S); ensure_binary(S) when is_binary(S) -> @@ -144,9 +127,9 @@ percent_encode(<<>>) -> []; percent_encode(<>) when (C >= $a andalso C =< $z) orelse - (C >= $A andalso C =< $Z) orelse - (C >= $0 andalso C =< $9) orelse - C =:= $- orelse C =:= $_ orelse C =:= $. orelse C =:= $~ + (C >= $A andalso C =< $Z) orelse + (C >= $0 andalso C =< $9) orelse + C =:= $- orelse C =:= $_ orelse C =:= $. orelse C =:= $~ -> [C | percent_encode(Rest)]; percent_encode(<>) -> @@ -163,7 +146,7 @@ http_get(URL) -> parse_loki_streams(Body) -> try BodyBin = ensure_binary(Body), - Decoded = jsone:decode(BodyBin, [{object_format, map}, {keys, binary}]), + Decoded = jsx:decode(BodyBin, [return_maps]), Streams = maps:get(<<"result">>, maps:get(<<"data">>, Decoded, #{}), []), Lines = lists:flatmap( fun(Stream) -> @@ -178,103 +161,107 @@ parse_loki_streams(Body) -> {error, {parse_error, Reason, Body}} end. -%% ------------------------------------------------------------------------- -%% Test cases -%% ---------------------------------------------------------------------------- - -spec logger_otlp_delivery(config()) -> ok. logger_otlp_delivery(C) -> - %% Plain + lazy в одном send — оба должны дойти по OTel MarkerPlain = make_marker() ++ "_PLAIN", MarkerLazy = make_marker() ++ "_LAZY", - send_and_wait(MarkerPlain, MarkerLazy, C), + send_and_wait(MarkerPlain, MarkerLazy), assert_delivery(MarkerPlain, C, "logger plain"), assert_delivery(MarkerLazy, C, "logger lazy format"). --spec logger_plain_delivery(config()) -> ok. -logger_plain_delivery(C) -> - Marker = make_marker(), - send_and_wait(Marker, Marker ++ "_LAZY_IGNORED", C), - assert_delivery(Marker, C, "logger plain"). - --spec logger_lazy_format_delivery(config()) -> ok. -logger_lazy_format_delivery(C) -> - MarkerLazy = make_marker() ++ "_LAZY", - MarkerPlain = make_marker() ++ "_PLAIN", - send_and_wait(MarkerPlain, MarkerLazy, C), - assert_delivery(MarkerPlain, C, "logger plain (from same send)"), - assert_delivery(MarkerLazy, C, "logger lazy format"). - --spec woody_scoper_delivery(config()) -> ok. -woody_scoper_delivery(C) -> - Marker = make_marker() ++ "_WOODY", - logger:info("~s", [Marker]), - RootUrl = hg_ct_helper:cfg(root_url, C), - ApiClient = hg_ct_helper:create_client(RootUrl), - try - {ok, InvoicingPid} = hg_client_invoicing:start_link(ApiClient), - _ = hg_client_invoicing:get(<<"00000000-0000-0000-0000-000000000000">>, InvoicingPid), - ok - catch - _:_ -> - ok - end, - timer:sleep(?DELIVERY_WAIT_MS), - assert_delivery(Marker, C, "woody/scoper"). - -spec assert_delivery(string(), config(), string()) -> ok. assert_delivery(Marker, C, PathDesc) -> - %% Пробуем подключиться — без compose Loki недоступен case query_loki("{exporter=\"OTLP\"}", C) of {error, {failed_connect, _}} -> - ct:pal("Loki unreachable. Run with: docker compose -f compose.yaml -f compose.tracing.yaml run testrunner rebar3 ct --dir=apps/hellgate/test --suite=hg_log_delivery_tests_SUITE"), + ct:log( + "Loki unreachable. Run with: " + "docker compose -f compose.yaml -f compose.tracing.yaml run testrunner " + "rebar3 ct --dir=apps/hellgate/test --suite=hg_log_delivery_tests_SUITE" + ), throw({skip, "Loki not available"}); - _ -> ok + _ -> + ok end, - %% Проверка OTLP пути — только поток с exporter=OTLP. - %% В Loki body может храниться как массив байт JSON, поэтому проверяем маркер - %% после локального декодирования тела сообщения. - case query_loki("{exporter=\"OTLP\", service_name=\"hellgate\"}", C) of - {ok, OTelLines} -> - case otel_lines_contain_marker(OTelLines, Marker) of + DeadlineMs = erlang:monotonic_time(millisecond) + ?DELIVERY_ASSERT_TIMEOUT_MS, + case wait_marker_delivery(Marker, C, DeadlineMs, undefined) of + {ok, QueryUsed} -> + ct:log("Path OTel: found marker ~s via query ~s", [Marker, QueryUsed]), + ok; + {error, LastErr} -> + case LastErr of + undefined -> + ct:fail("~s: marker ~s not found via any OTel query", [PathDesc, Marker]); + _ -> + ct:fail("~s: marker ~s not found via any OTel query (last_error=~p)", [PathDesc, Marker, LastErr]) + end + end. + +wait_marker_delivery(Marker, C, DeadlineMs, LastErr) -> + case try_otel_queries(Marker, C) of + {ok, QueryUsed} -> + {ok, QueryUsed}; + {error, Err} -> + case erlang:monotonic_time(millisecond) >= DeadlineMs of true -> - ct:log("Path OTel: found marker ~s", [Marker]), - ok; + {error, pick_last_error(Err, LastErr)}; false -> - ct:log("Path OTel: marker ~s NOT found (exporter=OTLP,service_name=hellgate)", [Marker]), - %% Пробуем другие labels перед fail - try_otel_alternate_query(Marker, C, PathDesc) - end; - {error, ErrO} -> - ct:log("Path OTel: query failed: ~p", [ErrO]), - case ErrO of - {http_error, 400, _} -> - try_otel_alternate_query(Marker, C, PathDesc); - _ -> - ct:fail("~s: Loki query failed: ~p", [PathDesc, ErrO]) + timer:sleep(?DELIVERY_POLL_INTERVAL_MS), + wait_marker_delivery(Marker, C, DeadlineMs, pick_last_error(Err, LastErr)) end end. -try_otel_alternate_query(Marker, C, PathDesc) -> - %% OTel Loki exporter может использовать другие labels - Queries = [ - "{exporter=\"OTLP\"}", - "{service_name=\"hellgate\"}" +pick_last_error(undefined, LastErr) -> + LastErr; +pick_last_error(Err, _LastErr) -> + Err. + +try_otel_queries(Marker, C) -> + Selectors = [ + "{exporter=\"OTLP\", service_name=\"hellgate\"}", + "{exporter=~\"(?i)otlp\"}", + "{service_name=\"hellgate\"}", + "{job=~\".+\"}" ], - Found = lists:any( - fun(Q) -> - case query_loki(Q, C) of - {ok, Lines} -> - otel_lines_contain_marker(Lines, Marker); - _ -> false + try_selectors(Selectors, Marker, C, undefined). + +try_selectors([], _Marker, _C, LastErr) -> + {error, LastErr}; +try_selectors([Selector | Rest], Marker, C, LastErr) -> + case query_selector_for_marker(Selector, Marker, C) of + {ok, QueryUsed} -> + {ok, QueryUsed}; + {error, Err} -> + try_selectors(Rest, Marker, C, pick_last_error(Err, LastErr)) + end. + +query_selector_for_marker(Selector, Marker, C) -> + MarkerQuery = build_marker_query(Selector, Marker), + case query_loki(MarkerQuery, C) of + {ok, Lines} -> + case otel_lines_contain_marker(Lines, Marker) of + true -> {ok, MarkerQuery}; + false -> query_selector_without_marker_filter(Selector, Marker, C) + end; + {error, {http_error, 400, _}} -> + query_selector_without_marker_filter(Selector, Marker, C); + {error, Err} -> + case query_selector_without_marker_filter(Selector, Marker, C) of + {error, undefined} -> {error, Err}; + Other -> Other end - end, - Queries - ), - case Found of - true -> ok; - false -> - ct:fail("~s: marker ~s not found via any OTel query", [PathDesc, Marker]) + end. + +query_selector_without_marker_filter(Selector, Marker, C) -> + case query_loki(Selector, C) of + {ok, Lines} -> + case otel_lines_contain_marker(Lines, Marker) of + true -> {ok, Selector}; + false -> {error, undefined} + end; + {error, {http_error, 400, _}} -> + {error, undefined}; + {error, Err} -> + {error, Err} end. otel_lines_contain_marker(Lines, Marker) -> @@ -294,7 +281,7 @@ otel_lines_contain_marker(Lines, Marker) -> decode_otel_body(Line) -> try BodyBin = ensure_binary(Line), - Decoded = jsone:decode(BodyBin, [{object_format, map}, {keys, binary}]), + Decoded = jsx:decode(BodyBin, [return_maps]), case maps:get(<<"body">>, Decoded, undefined) of undefined -> error; @@ -309,15 +296,12 @@ decode_otel_body(Line) -> body_to_binary(Body) when is_binary(Body) -> Body; body_to_binary(Body) when is_list(Body) -> - case catch iolist_to_binary(Body) of - Bin when is_binary(Bin) -> - Bin; - _ -> + try iolist_to_binary(Body) of + Bin -> + Bin + catch + _:_ -> unicode:characters_to_binary(io_lib:format("~tp", [Body])) end; body_to_binary(Body) -> unicode:characters_to_binary(io_lib:format("~tp", [Body])). - -escape_loki_regex(S) -> - %% Loki regex: escape . * + ? [ ] ( ) { } | \ - re:replace(S, "[\\.*+?\\[\\]\\(\\)\\{\\}\\|\\\\]", "\\\\&", [global, {return, list}]). diff --git a/config/sys.config b/config/sys.config index 51333735..49bed0c9 100644 --- a/config/sys.config +++ b/config/sys.config @@ -44,6 +44,7 @@ idle_timeout => infinity }}, %% OTEL log handler configuration + {otel_log_level, info}, {otel_log_max_queue_size, 2048}, {otel_log_scheduled_delay_ms, 1000}, {otel_log_exporting_timeout_ms, 300000}, diff --git a/rebar.config b/rebar.config index b2b3d36e..94834a90 100644 --- a/rebar.config +++ b/rebar.config @@ -49,9 +49,13 @@ %% OpenTelemetry deps. {opentelemetry_api, "1.5.0"}, {opentelemetry, "1.7.0"}, - {opentelemetry_exporter, {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_exporter"}}, + {opentelemetry_exporter, + {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_exporter"}}, {opentelemetry_api_experimental, "0.5.1"}, - {opentelemetry_experimental, {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}} + {opentelemetry_experimental, + {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}} ]}. {xref_checks, [ From 6ca33961302ae70b4eef1e0acd6875e8132ea875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC?= Date: Sat, 14 Feb 2026 19:07:22 +0300 Subject: [PATCH 3/4] fixed after ai review --- apps/hellgate/src/hellgate.erl | 23 ++-- apps/hellgate/src/hg_otel_log_handler.erl | 14 ++- .../test/hg_log_delivery_tests_SUITE.erl | 102 ++++++------------ compose.tracing.yaml | 3 - rebar.config | 4 +- 5 files changed, 65 insertions(+), 81 deletions(-) diff --git a/apps/hellgate/src/hellgate.erl b/apps/hellgate/src/hellgate.erl index b850f9b2..17f7577e 100644 --- a/apps/hellgate/src/hellgate.erl +++ b/apps/hellgate/src/hellgate.erl @@ -105,9 +105,14 @@ get_prometheus_route() -> -spec start(normal, any()) -> {ok, pid()} | {error, any()}. start(_StartType, _StartArgs) -> - ok = ensure_otel_log_handler(), - ok = setup_metrics(), - supervisor:start_link(?MODULE, []). + case ensure_otel_log_handler() of + ok -> + ok = setup_metrics(), + supervisor:start_link(?MODULE, []); + {error, Reason} -> + logger:error("Failed to add otel_logs handler: ~p", [Reason]), + {error, Reason} + end. -spec stop(any()) -> ok. stop(_State) -> @@ -152,11 +157,17 @@ ensure_otel_log_handler() -> {error, {already_exist, _}} -> ok; {error, Reason} -> - error_logger:error_msg("Failed to add otel_logs handler: ~p", [Reason]), - ok + {error, {otel_log_handler_failed, Reason}} end end. +%% @doc Ждём отправки буферизованных логов перед остановкой. +%% otel_log_handler батчит логи и отправляет по таймеру (scheduled_delay_ms). +%% Явного API для flush у otel_log_handler нет, поэтому ждём один полный цикл +%% батчинга + запас на сетевую отправку (export overhead). +-define(FLUSH_EXPORT_OVERHEAD_MS, 700). +-define(FLUSH_MAX_WAIT_MS, 5000). + flush_otel_logs() -> case logger:get_handler_config(otel_logs) of {ok, HandlerCfg} -> @@ -167,7 +178,7 @@ flush_otel_logs() -> maps:get(scheduled_delay_ms, HandlerCfg, 1000) ), _ = logger:info("otel_log_handler_flush"), - timer:sleep(erlang:min(5000, DelayMs + 700)), + timer:sleep(erlang:min(?FLUSH_MAX_WAIT_MS, DelayMs + ?FLUSH_EXPORT_OVERHEAD_MS)), ok; _ -> ok diff --git a/apps/hellgate/src/hg_otel_log_handler.erl b/apps/hellgate/src/hg_otel_log_handler.erl index e97bb196..2168620f 100644 --- a/apps/hellgate/src/hg_otel_log_handler.erl +++ b/apps/hellgate/src/hg_otel_log_handler.erl @@ -30,7 +30,19 @@ changing_config(SetOrUpdate, OldConfig, NewConfig) -> filter_config(Config) -> otel_log_handler:filter_config(Config). +%% Переносим ключи из вложенного config в корневой map для otel_log_handler. +%% Только ключи, которые ожидает otel_log_handler — чтобы случайно не перезаписать +%% верхнеуровневые настройки logger (level, filters, filter_default и т.д.). +-define(OTEL_LOG_HANDLER_KEYS, [ + exporter, + report_cb, + max_queue_size, + scheduled_delay_ms, + exporting_timeout_ms +]). + merge_module_config(#{config := ModuleConfig} = Config) when is_map(ModuleConfig) -> - maps:merge(Config, ModuleConfig); + OtelConfig = maps:with(?OTEL_LOG_HANDLER_KEYS, ModuleConfig), + maps:merge(Config, OtelConfig); merge_module_config(Config) -> Config. diff --git a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl index c4be4b1b..6171c6b7 100644 --- a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl +++ b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl @@ -5,6 +5,7 @@ -include_lib("common_test/include/ct.hrl"). +-export([suite/0]). -export([all/0]). -export([init_per_suite/1]). -export([end_per_suite/1]). @@ -16,10 +17,15 @@ -define(LOG_MARKER_PREFIX, "HG_LOG_DELIVERY_"). -define(LOKI_HOST, "loki"). -define(LOKI_PORT, 3100). --define(DELIVERY_WAIT_MS, 12000). --define(DELIVERY_ASSERT_TIMEOUT_MS, 90000). --define(DELIVERY_POLL_INTERVAL_MS, 2000). +-define(DELIVERY_WAIT_MS, 5000). +-define(DELIVERY_ASSERT_TIMEOUT_MS, 30000). +-define(DELIVERY_POLL_INTERVAL_MS, 1000). -define(LOKI_LOOKBACK_NS, 10 * 60 * 1_000_000_000). +-define(LOKI_SELECTOR, "{exporter=\"OTLP\", service_name=\"hellgate\"}"). + +-spec suite() -> list(). +suite() -> + [{timetrap, {minutes, 2}}]. -spec all() -> [test_case_name()]. all() -> @@ -171,7 +177,7 @@ logger_otlp_delivery(C) -> -spec assert_delivery(string(), config(), string()) -> ok. assert_delivery(Marker, C, PathDesc) -> - case query_loki("{exporter=\"OTLP\"}", C) of + case query_loki(?LOKI_SELECTOR, C) of {error, {failed_connect, _}} -> ct:log( "Loki unreachable. Run with: " @@ -185,84 +191,40 @@ assert_delivery(Marker, C, PathDesc) -> DeadlineMs = erlang:monotonic_time(millisecond) + ?DELIVERY_ASSERT_TIMEOUT_MS, case wait_marker_delivery(Marker, C, DeadlineMs, undefined) of {ok, QueryUsed} -> - ct:log("Path OTel: found marker ~s via query ~s", [Marker, QueryUsed]), + ct:log("~s: found marker ~s via query ~s", [PathDesc, Marker, QueryUsed]), ok; {error, LastErr} -> - case LastErr of - undefined -> - ct:fail("~s: marker ~s not found via any OTel query", [PathDesc, Marker]); - _ -> - ct:fail("~s: marker ~s not found via any OTel query (last_error=~p)", [PathDesc, Marker, LastErr]) - end + ct:fail("~s: marker ~s not found in Loki (last_error=~p)", [PathDesc, Marker, LastErr]) end. wait_marker_delivery(Marker, C, DeadlineMs, LastErr) -> - case try_otel_queries(Marker, C) of - {ok, QueryUsed} -> - {ok, QueryUsed}; - {error, Err} -> - case erlang:monotonic_time(millisecond) >= DeadlineMs of - true -> - {error, pick_last_error(Err, LastErr)}; - false -> - timer:sleep(?DELIVERY_POLL_INTERVAL_MS), - wait_marker_delivery(Marker, C, DeadlineMs, pick_last_error(Err, LastErr)) - end - end. - -pick_last_error(undefined, LastErr) -> - LastErr; -pick_last_error(Err, _LastErr) -> - Err. - -try_otel_queries(Marker, C) -> - Selectors = [ - "{exporter=\"OTLP\", service_name=\"hellgate\"}", - "{exporter=~\"(?i)otlp\"}", - "{service_name=\"hellgate\"}", - "{job=~\".+\"}" - ], - try_selectors(Selectors, Marker, C, undefined). - -try_selectors([], _Marker, _C, LastErr) -> - {error, LastErr}; -try_selectors([Selector | Rest], Marker, C, LastErr) -> - case query_selector_for_marker(Selector, Marker, C) of - {ok, QueryUsed} -> - {ok, QueryUsed}; - {error, Err} -> - try_selectors(Rest, Marker, C, pick_last_error(Err, LastErr)) - end. - -query_selector_for_marker(Selector, Marker, C) -> - MarkerQuery = build_marker_query(Selector, Marker), + MarkerQuery = build_marker_query(?LOKI_SELECTOR, Marker), case query_loki(MarkerQuery, C) of {ok, Lines} -> case otel_lines_contain_marker(Lines, Marker) of - true -> {ok, MarkerQuery}; - false -> query_selector_without_marker_filter(Selector, Marker, C) + true -> + {ok, MarkerQuery}; + false -> + %% Маркер не найден по |= фильтру, пробуем без фильтра (полный scan) + case query_loki(?LOKI_SELECTOR, C) of + {ok, AllLines} -> + case otel_lines_contain_marker(AllLines, Marker) of + true -> {ok, ?LOKI_SELECTOR}; + false -> retry_or_fail(Marker, C, DeadlineMs, LastErr) + end; + {error, Err} -> + retry_or_fail(Marker, C, DeadlineMs, Err) + end end; - {error, {http_error, 400, _}} -> - query_selector_without_marker_filter(Selector, Marker, C); {error, Err} -> - case query_selector_without_marker_filter(Selector, Marker, C) of - {error, undefined} -> {error, Err}; - Other -> Other - end + retry_or_fail(Marker, C, DeadlineMs, Err) end. -query_selector_without_marker_filter(Selector, Marker, C) -> - case query_loki(Selector, C) of - {ok, Lines} -> - case otel_lines_contain_marker(Lines, Marker) of - true -> {ok, Selector}; - false -> {error, undefined} - end; - {error, {http_error, 400, _}} -> - {error, undefined}; - {error, Err} -> - {error, Err} - end. +retry_or_fail(_Marker, _C, DeadlineMs, LastErr) when erlang:monotonic_time(millisecond) >= DeadlineMs -> + {error, LastErr}; +retry_or_fail(Marker, C, DeadlineMs, LastErr) -> + timer:sleep(?DELIVERY_POLL_INTERVAL_MS), + wait_marker_delivery(Marker, C, DeadlineMs, LastErr). otel_lines_contain_marker(Lines, Marker) -> MarkerBin = ensure_binary(Marker), diff --git a/compose.tracing.yaml b/compose.tracing.yaml index b2a3bb9f..fa8257a0 100644 --- a/compose.tracing.yaml +++ b/compose.tracing.yaml @@ -47,9 +47,6 @@ services: OTEL_SERVICE_NAME: party-management testrunner: - volumes: - - .:$PWD - - ../opentelemetry-erlang:/opt/opentelemetry-erlang:ro environment: <<: *otlp_enabled OTEL_SERVICE_NAME: hellgate diff --git a/rebar.config b/rebar.config index 94834a90..6a5bb8d7 100644 --- a/rebar.config +++ b/rebar.config @@ -55,7 +55,9 @@ {opentelemetry_api_experimental, "0.5.1"}, {opentelemetry_experimental, {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", - {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}} + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}}, + {eqwalizer_support, + {git_subdir, "https://github.com/whatsapp/eqwalizer.git", {branch, "main"}, "eqwalizer_support"}} ]}. {xref_checks, [ From 537f549e4128422cfddee4df09b7594c22c79ece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=90=D1=80=D1=82=D0=B5=D0=BC?= Date: Sat, 14 Feb 2026 19:13:02 +0300 Subject: [PATCH 4/4] fixed --- .../test/hg_log_delivery_tests_SUITE.erl | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl index 6171c6b7..a0eef768 100644 --- a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl +++ b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl @@ -206,25 +206,31 @@ wait_marker_delivery(Marker, C, DeadlineMs, LastErr) -> {ok, MarkerQuery}; false -> %% Маркер не найден по |= фильтру, пробуем без фильтра (полный scan) - case query_loki(?LOKI_SELECTOR, C) of - {ok, AllLines} -> - case otel_lines_contain_marker(AllLines, Marker) of - true -> {ok, ?LOKI_SELECTOR}; - false -> retry_or_fail(Marker, C, DeadlineMs, LastErr) - end; - {error, Err} -> - retry_or_fail(Marker, C, DeadlineMs, Err) - end + try_full_scan_or_retry(Marker, C, DeadlineMs, LastErr) + end; + {error, Err} -> + retry_or_fail(Marker, C, DeadlineMs, Err) + end. + +try_full_scan_or_retry(Marker, C, DeadlineMs, LastErr) -> + case query_loki(?LOKI_SELECTOR, C) of + {ok, AllLines} -> + case otel_lines_contain_marker(AllLines, Marker) of + true -> {ok, ?LOKI_SELECTOR}; + false -> retry_or_fail(Marker, C, DeadlineMs, LastErr) end; {error, Err} -> retry_or_fail(Marker, C, DeadlineMs, Err) end. -retry_or_fail(_Marker, _C, DeadlineMs, LastErr) when erlang:monotonic_time(millisecond) >= DeadlineMs -> - {error, LastErr}; retry_or_fail(Marker, C, DeadlineMs, LastErr) -> - timer:sleep(?DELIVERY_POLL_INTERVAL_MS), - wait_marker_delivery(Marker, C, DeadlineMs, LastErr). + case erlang:monotonic_time(millisecond) >= DeadlineMs of + true -> + {error, LastErr}; + false -> + timer:sleep(?DELIVERY_POLL_INTERVAL_MS), + wait_marker_delivery(Marker, C, DeadlineMs, LastErr) + end. otel_lines_contain_marker(Lines, Marker) -> MarkerBin = ensure_binary(Marker),