diff --git a/apps/hellgate/src/hellgate.app.src b/apps/hellgate/src/hellgate.app.src index f5539bc5..60e9f36c 100644 --- a/apps/hellgate/src/hellgate.app.src +++ b/apps/hellgate/src/hellgate.app.src @@ -29,7 +29,9 @@ limiter_proto, opentelemetry_api, opentelemetry_exporter, - opentelemetry + opentelemetry, + opentelemetry_experimental, + opentelemetry_api_experimental ]}, {env, []}, {modules, []}, diff --git a/apps/hellgate/src/hellgate.erl b/apps/hellgate/src/hellgate.erl index 751f180b..17f7577e 100644 --- a/apps/hellgate/src/hellgate.erl +++ b/apps/hellgate/src/hellgate.erl @@ -105,11 +105,18 @@ get_prometheus_route() -> -spec start(normal, any()) -> {ok, pid()} | {error, any()}. start(_StartType, _StartArgs) -> - ok = setup_metrics(), - supervisor:start_link(?MODULE, []). + case ensure_otel_log_handler() of + ok -> + ok = setup_metrics(), + supervisor:start_link(?MODULE, []); + {error, Reason} -> + logger:error("Failed to add otel_logs handler: ~p", [Reason]), + {error, Reason} + end. -spec stop(any()) -> ok. stop(_State) -> + ok = flush_otel_logs(), ok. %% @@ -117,3 +124,62 @@ stop(_State) -> setup_metrics() -> ok = woody_ranch_prometheus_collector:setup(), ok = woody_hackney_prometheus_collector:setup(). + +ensure_otel_log_handler() -> + case logger:get_handler_config(otel_logs) of + {ok, _} -> + ok; + _ -> + MaxQueue = application:get_env(hellgate, otel_log_max_queue_size, 2048), + DelayMs = application:get_env(hellgate, otel_log_scheduled_delay_ms, 1000), + TimeoutMs = application:get_env(hellgate, otel_log_exporting_timeout_ms, 300000), + LogLevel = application:get_env(hellgate, otel_log_level, info), + HandlerConfig = #{ + report_cb => fun hg_otel_log_filter:format_otp_report_utf8/1, + exporter => + {otel_exporter_logs_otlp, #{ + protocol => http_protobuf, + ssl_options => [] + }}, + max_queue_size => MaxQueue, + scheduled_delay_ms => DelayMs, + exporting_timeout_ms => TimeoutMs + }, + LoggerHandlerConfig = #{ + level => LogLevel, + filter_default => log, + filters => [{hg_otel_trace_id_bytes, {fun hg_otel_log_filter:filter/2, undefined}}], + config => HandlerConfig + }, + case logger:add_handler(otel_logs, hg_otel_log_handler, LoggerHandlerConfig) of + ok -> + ok; + {error, {already_exist, _}} -> + ok; + {error, Reason} -> + {error, {otel_log_handler_failed, Reason}} + end + end. + +%% @doc Ждём отправки буферизованных логов перед остановкой. +%% otel_log_handler батчит логи и отправляет по таймеру (scheduled_delay_ms). +%% Явного API для flush у otel_log_handler нет, поэтому ждём один полный цикл +%% батчинга + запас на сетевую отправку (export overhead). +-define(FLUSH_EXPORT_OVERHEAD_MS, 700). +-define(FLUSH_MAX_WAIT_MS, 5000). + +flush_otel_logs() -> + case logger:get_handler_config(otel_logs) of + {ok, HandlerCfg} -> + Config = maps:get(config, HandlerCfg, #{}), + DelayMs = maps:get( + scheduled_delay_ms, + Config, + maps:get(scheduled_delay_ms, HandlerCfg, 1000) + ), + _ = logger:info("otel_log_handler_flush"), + timer:sleep(erlang:min(?FLUSH_MAX_WAIT_MS, DelayMs + ?FLUSH_EXPORT_OVERHEAD_MS)), + ok; + _ -> + ok + end. diff --git a/apps/hellgate/src/hg_otel_log_filter.erl b/apps/hellgate/src/hg_otel_log_filter.erl new file mode 100644 index 00000000..03018b60 --- /dev/null +++ b/apps/hellgate/src/hg_otel_log_filter.erl @@ -0,0 +1,65 @@ +%%% @doc +%%% Logger filter для otel_logs handler: конвертирует otel_trace_id и otel_span_id +%%% из hex-формата (32/16 символов) в raw bytes (16/8 байт), как требует OTLP LogRecord. +%%% opentelemetry hex_span_ctx возвращает hex, collector ожидает bytes. +%%% @end +-module(hg_otel_log_filter). + +-export([filter/2]). +-export([format_otp_report_utf8/1]). + +-spec filter(logger:log_event(), term()) -> logger:filter_return(). +filter(#{meta := Meta} = LogEvent, _FilterConfig) -> + case convert_otel_ids(Meta) of + Meta -> + LogEvent; + Meta1 -> + LogEvent#{meta => Meta1} + end. + +%% Конвертируем hex -> raw bytes только если формат hex (32/16 символов). +%% OTLP LogRecord: trace_id=16 bytes, span_id=8 bytes. +convert_otel_ids(#{otel_trace_id := TraceIdHex, otel_span_id := SpanIdHex} = Meta) -> + case {hex_to_trace_id_bytes(TraceIdHex), hex_to_span_id_bytes(SpanIdHex)} of + {TraceIdBytes, SpanIdBytes} when TraceIdBytes =/= undefined, SpanIdBytes =/= undefined -> + Meta#{otel_trace_id => TraceIdBytes, otel_span_id => SpanIdBytes}; + _ -> + %% Некорректный формат — убираем, чтобы otel_otlp_logs не отправил в OTLP + maps:without([otel_trace_id, otel_span_id, otel_trace_flags], Meta) + end; +convert_otel_ids(Meta) -> + Meta. + +%% logger:format_otp_report/1 возвращает chardata (часто list()), +%% из-за чего downstream JSON может сериализовать body как массив байт. +%% Явно приводим к UTF-8 binary(), чтобы body в OTel/Loki был строкой. +-spec format_otp_report_utf8(logger:report()) -> {unicode:chardata(), list()}. +format_otp_report_utf8(Report) -> + Bin = + try logger:format_otp_report(Report) of + {Format, Args} -> + unicode:characters_to_binary(io_lib:format(Format, Args)) + catch + _:_ -> + %% Не даём report_cb падать: fallback в печатное представление отчёта. + unicode:characters_to_binary(io_lib:format("~tp", [Report])) + end, + {"~ts", [Bin]}. + +hex_to_trace_id_bytes(Hex) when is_binary(Hex), byte_size(Hex) =:= 32 -> + try + <<(binary_to_integer(Hex, 16)):128>> + catch + _:_ -> undefined + end; +hex_to_trace_id_bytes(_) -> + undefined. + +hex_to_span_id_bytes(Hex) when is_binary(Hex), byte_size(Hex) =:= 16 -> + try + <<(binary_to_integer(Hex, 16)):64>> + catch + _:_ -> undefined + end; +hex_to_span_id_bytes(_) -> + undefined. diff --git a/apps/hellgate/src/hg_otel_log_handler.erl b/apps/hellgate/src/hg_otel_log_handler.erl new file mode 100644 index 00000000..2168620f --- /dev/null +++ b/apps/hellgate/src/hg_otel_log_handler.erl @@ -0,0 +1,48 @@ +-module(hg_otel_log_handler). + +-export([log/2]). +-export([adding_handler/1]). +-export([removing_handler/1]). +-export([changing_config/3]). +-export([filter_config/1]). + +-spec log(logger:log_event(), map()) -> ok. +log(LogEvent, Config) -> + otel_log_handler:log(LogEvent, Config). + +-spec adding_handler(map()) -> {ok, map()} | {error, term()}. +adding_handler(Config) -> + otel_log_handler:adding_handler(merge_module_config(Config)). + +-spec removing_handler(map()) -> ok. +removing_handler(Config) -> + otel_log_handler:removing_handler(Config). + +-spec changing_config(set | update, map(), map()) -> {ok, map()} | {error, term()}. +changing_config(SetOrUpdate, OldConfig, NewConfig) -> + otel_log_handler:changing_config( + SetOrUpdate, + merge_module_config(OldConfig), + merge_module_config(NewConfig) + ). + +-spec filter_config(map()) -> map(). +filter_config(Config) -> + otel_log_handler:filter_config(Config). + +%% Переносим ключи из вложенного config в корневой map для otel_log_handler. +%% Только ключи, которые ожидает otel_log_handler — чтобы случайно не перезаписать +%% верхнеуровневые настройки logger (level, filters, filter_default и т.д.). +-define(OTEL_LOG_HANDLER_KEYS, [ + exporter, + report_cb, + max_queue_size, + scheduled_delay_ms, + exporting_timeout_ms +]). + +merge_module_config(#{config := ModuleConfig} = Config) when is_map(ModuleConfig) -> + OtelConfig = maps:with(?OTEL_LOG_HANDLER_KEYS, ModuleConfig), + maps:merge(Config, OtelConfig); +merge_module_config(Config) -> + Config. diff --git a/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl new file mode 100644 index 00000000..a0eef768 --- /dev/null +++ b/apps/hellgate/test/hg_log_delivery_tests_SUITE.erl @@ -0,0 +1,275 @@ +%% @doc +%% Мини-сьют для проверки доставки логов через OTel: +%% logger -> otel_log_handler -> OTLP -> otel-collector -> Loki. +-module(hg_log_delivery_tests_SUITE). + +-include_lib("common_test/include/ct.hrl"). + +-export([suite/0]). +-export([all/0]). +-export([init_per_suite/1]). +-export([end_per_suite/1]). +-export([logger_otlp_delivery/1]). + +-type config() :: hg_ct_helper:config(). +-type test_case_name() :: hg_ct_helper:test_case_name(). + +-define(LOG_MARKER_PREFIX, "HG_LOG_DELIVERY_"). +-define(LOKI_HOST, "loki"). +-define(LOKI_PORT, 3100). +-define(DELIVERY_WAIT_MS, 5000). +-define(DELIVERY_ASSERT_TIMEOUT_MS, 30000). +-define(DELIVERY_POLL_INTERVAL_MS, 1000). +-define(LOKI_LOOKBACK_NS, 10 * 60 * 1_000_000_000). +-define(LOKI_SELECTOR, "{exporter=\"OTLP\", service_name=\"hellgate\"}"). + +-spec suite() -> list(). +suite() -> + [{timetrap, {minutes, 2}}]. + +-spec all() -> [test_case_name()]. +all() -> + [logger_otlp_delivery]. + +-spec init_per_suite(config()) -> config(). +init_per_suite(C) -> + _ = application:ensure_all_started(inets, temporary), + OldPrimaryLevel = get_primary_logger_level(), + _ = logger:set_primary_config(level, info), + {Apps, _Ret} = hg_ct_helper:start_apps([woody, scoper, dmt_client, hg_proto, hellgate]), + case httpc:request(get, {"http://otel-collector:4318", []}, [{timeout, 3000}], []) of + {ok, _} -> + ok; + {error, Reason} -> + ct:log("WARNING: otel-collector unreachable (~p). OTel path will likely fail.", [Reason]) + end, + [{loki_url, loki_base_url()}, {apps, Apps}, {old_logger_primary_level, OldPrimaryLevel} | C]. + +-spec end_per_suite(config()) -> ok. +end_per_suite(C) -> + _ = [application:stop(App) || App <- hg_ct_helper:cfg(apps, C)], + case proplists:get_value(old_logger_primary_level, C, undefined) of + undefined -> + ok; + OldLevel -> + _ = logger:set_primary_config(level, OldLevel), + ok + end, + ok. + +get_primary_logger_level() -> + case logger:get_primary_config() of + #{level := L} -> L; + _ -> undefined + end. + +loki_base_url() -> + Host = + case os:getenv("LOKI_HOST") of + false -> ?LOKI_HOST; + H -> H + end, + Port = + case os:getenv("LOKI_PORT") of + false -> integer_to_list(?LOKI_PORT); + P -> P + end, + "http://" ++ Host ++ ":" ++ Port. + +make_marker() -> + Rand = base64:encode(crypto:strong_rand_bytes(8)), + ?LOG_MARKER_PREFIX ++ binary_to_list(Rand). + +send_and_wait(MarkerPlain, MarkerLazy) -> + logger:info("~s", [MarkerPlain]), + logger:info(fun(Args) -> {"~s", Args} end, [MarkerLazy]), + timer:sleep(?DELIVERY_WAIT_MS). + +-spec query_loki(string(), config()) -> {ok, [binary()]} | {error, term()}. +query_loki(LogQL, C) -> + BaseUrl = proplists:get_value(loki_url, C), + EndNs = erlang:system_time(nanosecond), + StartNs = EndNs - ?LOKI_LOOKBACK_NS, + Query = [ + {"query", LogQL}, + {"start", integer_to_list(StartNs)}, + {"end", integer_to_list(EndNs)}, + {"limit", "2000"} + ], + URL = BaseUrl ++ "/loki/api/v1/query_range?" ++ build_query(Query), + case http_get(URL) of + {ok, 200, Body} -> + parse_loki_streams(Body); + {ok, Code, Body} -> + {error, {http_error, Code, Body}}; + Err -> + Err + end. + +build_query(KVs) -> + Parts = [qs_key(K) ++ "=" ++ qs_value(V) || {K, V} <- KVs], + string:join(Parts, "&"). + +qs_key(S) -> + lists:flatten(percent_encode(ensure_binary(S))). + +qs_value(S) -> + lists:flatten(percent_encode(ensure_binary(S))). + +logql_quote(S) -> + Bin = ensure_binary(S), + Escaped = binary:replace(Bin, <<"\\">>, <<"\\\\">>, [global]), + <<"\"", (binary:replace(Escaped, <<"\"">>, <<"\\\"">>, [global]))/binary, "\"">>. + +build_marker_query(Selector, Marker) -> + Selector ++ " |= " ++ binary_to_list(logql_quote(Marker)). + +ensure_binary(S) when is_list(S) -> + unicode:characters_to_binary(S); +ensure_binary(S) when is_binary(S) -> + S. + +percent_encode(<<>>) -> + []; +percent_encode(<>) when + (C >= $a andalso C =< $z) orelse + (C >= $A andalso C =< $Z) orelse + (C >= $0 andalso C =< $9) orelse + C =:= $- orelse C =:= $_ orelse C =:= $. orelse C =:= $~ +-> + [C | percent_encode(Rest)]; +percent_encode(<>) -> + ["%", string:right(erlang:integer_to_list(C, 16), 2, $0) | percent_encode(Rest)]. + +http_get(URL) -> + case httpc:request(get, {URL, []}, [{timeout, 10000}, {connect_timeout, 5000}], []) of + {ok, {{_V, Code, _R}, _H, Body}} -> + {ok, Code, Body}; + {error, Reason} -> + {error, Reason} + end. + +parse_loki_streams(Body) -> + try + BodyBin = ensure_binary(Body), + Decoded = jsx:decode(BodyBin, [return_maps]), + Streams = maps:get(<<"result">>, maps:get(<<"data">>, Decoded, #{}), []), + Lines = lists:flatmap( + fun(Stream) -> + Vs = maps:get(<<"values">>, Stream, []), + [V || [_, V] <- Vs] + end, + Streams + ), + {ok, Lines} + catch + _:Reason -> + {error, {parse_error, Reason, Body}} + end. + +-spec logger_otlp_delivery(config()) -> ok. +logger_otlp_delivery(C) -> + MarkerPlain = make_marker() ++ "_PLAIN", + MarkerLazy = make_marker() ++ "_LAZY", + send_and_wait(MarkerPlain, MarkerLazy), + assert_delivery(MarkerPlain, C, "logger plain"), + assert_delivery(MarkerLazy, C, "logger lazy format"). + +-spec assert_delivery(string(), config(), string()) -> ok. +assert_delivery(Marker, C, PathDesc) -> + case query_loki(?LOKI_SELECTOR, C) of + {error, {failed_connect, _}} -> + ct:log( + "Loki unreachable. Run with: " + "docker compose -f compose.yaml -f compose.tracing.yaml run testrunner " + "rebar3 ct --dir=apps/hellgate/test --suite=hg_log_delivery_tests_SUITE" + ), + throw({skip, "Loki not available"}); + _ -> + ok + end, + DeadlineMs = erlang:monotonic_time(millisecond) + ?DELIVERY_ASSERT_TIMEOUT_MS, + case wait_marker_delivery(Marker, C, DeadlineMs, undefined) of + {ok, QueryUsed} -> + ct:log("~s: found marker ~s via query ~s", [PathDesc, Marker, QueryUsed]), + ok; + {error, LastErr} -> + ct:fail("~s: marker ~s not found in Loki (last_error=~p)", [PathDesc, Marker, LastErr]) + end. + +wait_marker_delivery(Marker, C, DeadlineMs, LastErr) -> + MarkerQuery = build_marker_query(?LOKI_SELECTOR, Marker), + case query_loki(MarkerQuery, C) of + {ok, Lines} -> + case otel_lines_contain_marker(Lines, Marker) of + true -> + {ok, MarkerQuery}; + false -> + %% Маркер не найден по |= фильтру, пробуем без фильтра (полный scan) + try_full_scan_or_retry(Marker, C, DeadlineMs, LastErr) + end; + {error, Err} -> + retry_or_fail(Marker, C, DeadlineMs, Err) + end. + +try_full_scan_or_retry(Marker, C, DeadlineMs, LastErr) -> + case query_loki(?LOKI_SELECTOR, C) of + {ok, AllLines} -> + case otel_lines_contain_marker(AllLines, Marker) of + true -> {ok, ?LOKI_SELECTOR}; + false -> retry_or_fail(Marker, C, DeadlineMs, LastErr) + end; + {error, Err} -> + retry_or_fail(Marker, C, DeadlineMs, Err) + end. + +retry_or_fail(Marker, C, DeadlineMs, LastErr) -> + case erlang:monotonic_time(millisecond) >= DeadlineMs of + true -> + {error, LastErr}; + false -> + timer:sleep(?DELIVERY_POLL_INTERVAL_MS), + wait_marker_delivery(Marker, C, DeadlineMs, LastErr) + end. + +otel_lines_contain_marker(Lines, Marker) -> + MarkerBin = ensure_binary(Marker), + lists:any( + fun(Line) -> + case decode_otel_body(Line) of + {ok, BodyBin} -> + binary:match(BodyBin, MarkerBin) =/= nomatch; + error -> + binary:match(ensure_binary(Line), MarkerBin) =/= nomatch + end + end, + Lines + ). + +decode_otel_body(Line) -> + try + BodyBin = ensure_binary(Line), + Decoded = jsx:decode(BodyBin, [return_maps]), + case maps:get(<<"body">>, Decoded, undefined) of + undefined -> + error; + Body -> + {ok, body_to_binary(Body)} + end + catch + _:_ -> + error + end. + +body_to_binary(Body) when is_binary(Body) -> + Body; +body_to_binary(Body) when is_list(Body) -> + try iolist_to_binary(Body) of + Bin -> + Bin + catch + _:_ -> + unicode:characters_to_binary(io_lib:format("~tp", [Body])) + end; +body_to_binary(Body) -> + unicode:characters_to_binary(io_lib:format("~tp", [Body])). diff --git a/compose.tracing.yaml b/compose.tracing.yaml index 24e0bbbf..fa8257a0 100644 --- a/compose.tracing.yaml +++ b/compose.tracing.yaml @@ -1,42 +1,102 @@ +# UI: Grafana http://localhost:3000 (admin/admin) services: + # OpenTelemetry Collector: single OTLP endpoint, fans out to Tempo + Loki + otel-collector: + image: otel/opentelemetry-collector-contrib:0.112.0 + command: ["--config=/etc/otel/config.yaml"] + volumes: + - ./config/otel-collector-config.yaml:/etc/otel/config.yaml:ro + ports: + - "4317:4317" # OTLP gRPC + - "4318:4318" # OTLP HTTP + healthcheck: + test: ["CMD", "/otelcol-contrib", "--version"] + interval: 5s + timeout: 2s + retries: 20 + start_period: 5s + depends_on: + tempo: + condition: service_healthy + loki: + condition: service_healthy dmt: environment: &otlp_enabled OTEL_TRACES_EXPORTER: otlp + OTEL_LOGS_EXPORTER: otlp OTEL_TRACES_SAMPLER: parentbased_always_off OTEL_EXPORTER_OTLP_PROTOCOL: http_protobuf - OTEL_EXPORTER_OTLP_ENDPOINT: http://jaeger:4318 + OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4318 + OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: http://otel-collector:4318/v1/logs + OTEL_SERVICE_NAME: dmt bender: - environment: *otlp_enabled + environment: + <<: *otlp_enabled + OTEL_SERVICE_NAME: bender limiter: - environment: *otlp_enabled + environment: + <<: *otlp_enabled + OTEL_SERVICE_NAME: limiter party-management: - environment: *otlp_enabled + environment: + <<: *otlp_enabled + OTEL_SERVICE_NAME: party-management testrunner: environment: <<: *otlp_enabled - OTEL_SERVICE_NAME: hellgate_testrunner + OTEL_SERVICE_NAME: hellgate OTEL_TRACES_SAMPLER: parentbased_always_on depends_on: - jaeger: + otel-collector: condition: service_healthy + grafana: + condition: service_started - jaeger: - image: jaegertracing/all-in-one:1.47 - environment: - - COLLECTOR_OTLP_ENABLED=true + tempo: + image: grafana/tempo:2.6.1 + command: ["-config.file=/etc/tempo.yaml"] + volumes: + - ./config/tempo.yaml:/etc/tempo.yaml:ro + ports: + - 3200:3200 healthcheck: - test: "/go/bin/all-in-one-linux status" - interval: 2s - timeout: 1s + test: ["CMD-SHELL", "wget -q -O- http://localhost:3200/ready || exit 1"] + interval: 5s + timeout: 2s retries: 20 + start_period: 5s + + loki: + image: grafana/loki:3.1.1 + command: ["-config.file=/etc/loki/config.yaml"] + volumes: + - ./config/loki.yaml:/etc/loki/config.yaml:ro ports: - - 4317:4317 # OTLP gRPC receiver - - 4318:4318 # OTLP http receiver - - 5778:5778 - - 14250:14250 - - 16686:16686 + - 3100:3100 + healthcheck: + test: ["CMD-SHELL", "wget -q -O- http://localhost:3100/ready || exit 1"] + interval: 5s + timeout: 2s + retries: 20 + start_period: 5s + + grafana: + image: grafana/grafana:11.2.0 + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + volumes: + - ./config/grafana/provisioning:/etc/grafana/provisioning:ro + ports: + - 3000:3000 + depends_on: + loki: + condition: service_healthy + tempo: + condition: service_healthy diff --git a/config/grafana/provisioning/dashboards/dashboards.yaml b/config/grafana/provisioning/dashboards/dashboards.yaml new file mode 100644 index 00000000..c2dd1261 --- /dev/null +++ b/config/grafana/provisioning/dashboards/dashboards.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +providers: + - name: hellgate-observability + orgId: 1 + folder: Hellgate + type: file + disableDeletion: false + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/provisioning/dashboards diff --git a/config/grafana/provisioning/dashboards/hellgate-otel-only.json b/config/grafana/provisioning/dashboards/hellgate-otel-only.json new file mode 100644 index 00000000..209aa867 --- /dev/null +++ b/config/grafana/provisioning/dashboards/hellgate-otel-only.json @@ -0,0 +1,404 @@ +{ + "id": null, + "uid": "hellgate-otel-only", + "title": "Hellgate Observability (OTLP)", + "tags": ["hellgate", "otel", "loki", "tempo", "logs", "traces"], + "timezone": "browser", + "schemaVersion": 39, + "version": 2, + "refresh": "10s", + "editable": true, + "graphTooltip": 1, + "time": { + "from": "now-30m", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "service", + "label": "Service", + "type": "query", + "datasource": { + "type": "loki", + "uid": "loki" + }, + "query": { + "query": "label_values({exporter=\"OTLP\"}, service_name)" + }, + "refresh": 2, + "includeAll": true, + "allValue": ".+", + "multi": false, + "current": { + "selected": true, + "text": "hellgate", + "value": "hellgate" + } + } + ] + }, + "panels": [ + { + "id": 1, + "type": "logs", + "title": "📝 Application Logs (filtered)", + "gridPos": { + "h": 12, + "w": 16, + "x": 0, + "y": 0 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "{exporter=\"OTLP\", service_name=~\"$service\"} | json | body !~ \"^application: .* exited: stopped\" | body !~ \"otel_log_handler_(wakeup|flush)\" | line_format \"{{.severity}} [trace={{.traceid}} span={{.spanid}}] {{.body}}\"", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": false, + "showCommonLabels": false, + "wrapLogMessage": true, + "prettifyLogMessage": false, + "sortOrder": "Descending", + "dedupStrategy": "none", + "enableLogDetails": true + } + }, + { + "id": 2, + "type": "logs", + "title": "🔍 Raw JSON Logs", + "gridPos": { + "h": 12, + "w": 8, + "x": 16, + "y": 0 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "{exporter=\"OTLP\", service_name=~\"$service\"} | json", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "options": { + "showTime": true, + "showLabels": true, + "showCommonLabels": false, + "wrapLogMessage": false, + "prettifyLogMessage": true, + "sortOrder": "Descending", + "dedupStrategy": "none", + "enableLogDetails": true + } + }, + { + "id": 3, + "type": "stat", + "title": "📊 Total Logs / 5m", + "gridPos": { + "h": 4, + "w": 4, + "x": 0, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(count_over_time({exporter=\"OTLP\", service_name=~\"$service\"}[5m]))", + "queryType": "instant", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": null, "color": "green"}, + {"value": 1000, "color": "yellow"}, + {"value": 5000, "color": "red"} + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 4, + "type": "stat", + "title": "⚠️ Warnings / 5m", + "gridPos": { + "h": 4, + "w": 4, + "x": 4, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(count_over_time({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_WARN.*\" | body !~ \"^application: .* exited\"[5m]))", + "queryType": "instant", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": null, "color": "green"}, + {"value": 1, "color": "yellow"}, + {"value": 50, "color": "orange"} + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 5, + "type": "stat", + "title": "❌ Errors / 5m", + "gridPos": { + "h": 4, + "w": 4, + "x": 8, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(count_over_time({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_(ERROR|FATAL).*\" | body !~ \"^application: .* exited\"[5m]))", + "queryType": "instant", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + {"value": null, "color": "green"}, + {"value": 1, "color": "orange"}, + {"value": 10, "color": "red"} + ] + } + } + }, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": ["lastNotNull"], + "fields": "", + "values": false + }, + "textMode": "value_and_name" + } + }, + { + "id": 6, + "type": "timeseries", + "title": "📈 Log Rate by Severity", + "gridPos": { + "h": 4, + "w": 12, + "x": 12, + "y": 12 + }, + "datasource": { + "type": "loki", + "uid": "loki" + }, + "targets": [ + { + "refId": "A", + "expr": "sum(rate({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_INFO.*\"[1m]))", + "legendFormat": "info", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + }, + { + "refId": "B", + "expr": "sum(rate({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_WARN.*\" | body !~ \"^application: .* exited\"[1m]))", + "legendFormat": "warn", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + }, + { + "refId": "C", + "expr": "sum(rate({exporter=\"OTLP\", service_name=~\"$service\"} | json | severity=~\"SEVERITY_NUMBER_(ERROR|FATAL).*\" | body !~ \"^application: .* exited\"[1m]))", + "legendFormat": "error", + "queryType": "range", + "datasource": { + "type": "loki", + "uid": "loki" + } + } + ], + "fieldConfig": { + "defaults": { + "custom": { + "lineWidth": 1, + "fillOpacity": 10, + "showPoints": "never" + } + }, + "overrides": [ + { + "matcher": {"id": "byName", "options": "info"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "blue"}}] + }, + { + "matcher": {"id": "byName", "options": "warn"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "yellow"}}] + }, + { + "matcher": {"id": "byName", "options": "error"}, + "properties": [{"id": "color", "value": {"mode": "fixed", "fixedColor": "red"}}] + } + ] + }, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + } + }, + { + "id": 7, + "type": "traces", + "title": "🔗 Distributed Traces (Tempo)", + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 16 + }, + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "targets": [ + { + "refId": "A", + "datasource": { + "type": "tempo", + "uid": "tempo" + }, + "queryType": "traceqlSearch", + "query": "{service.name=\"$service\"}", + "limit": 20, + "serviceName": "$service" + } + ], + "options": { + "showHeader": true, + "showTime": true, + "showSpanCount": true, + "showDuration": true + } + }, + { + "id": 8, + "type": "text", + "title": "ℹ️ How to Use", + "gridPos": { + "h": 4, + "w": 24, + "x": 0, + "y": 24 + }, + "options": { + "mode": "markdown", + "content": "## 🔍 Logs & Traces Correlation\n\n1. **View logs with traces**: Click on `trace=...` in filtered logs to jump to Tempo trace\n2. **Explore raw data**: Use Raw JSON panel for debugging\n3. **Filter by severity**: Use Explore with query: `{exporter=\"OTLP\", service_name=\"hellgate\"} | json | severity=~\"SEVERITY_NUMBER_ERROR.*\"`\n\n## 📊 Metrics\n- **Total Logs**: All logs in 5min window\n- **Warnings/Errors**: Severity-based counts (excludes app lifecycle noise)\n- **Rate Chart**: Real-time log rate by severity level" + } + } + ] +} diff --git a/config/grafana/provisioning/datasources/datasources.yaml b/config/grafana/provisioning/datasources/datasources.yaml new file mode 100644 index 00000000..55390980 --- /dev/null +++ b/config/grafana/provisioning/datasources/datasources.yaml @@ -0,0 +1,32 @@ +apiVersion: 1 + +datasources: + - name: Loki + uid: loki + type: loki + access: proxy + url: http://loki:3100 + isDefault: true + jsonData: + maxLines: 5000 + derivedFields: + - name: trace_id + matcherRegex: 'trace[_ ]?id[=:"]+([a-fA-F0-9]{16,32})' + datasourceUid: tempo + url: '$${__value.raw}' + + - name: Tempo + uid: tempo + type: tempo + access: proxy + url: http://tempo:3200 + jsonData: + tracesToLogsV2: + datasourceUid: loki + spanStartTimeShift: -5m + spanEndTimeShift: 5m + tags: ['service.name', 'service_name'] + serviceMap: + datasourceUid: tempo + nodeGraph: + enabled: true diff --git a/config/loki.yaml b/config/loki.yaml new file mode 100644 index 00000000..56df8e0e --- /dev/null +++ b/config/loki.yaml @@ -0,0 +1,35 @@ +auth_enabled: false + +server: + http_listen_port: 3100 + +common: + path_prefix: /tmp/loki + replication_factor: 1 + ring: + kvstore: + store: inmemory + +schema_config: + configs: + - from: 2024-01-01 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h + +storage_config: + filesystem: + directory: /tmp/loki/chunks + +limits_config: + allow_structured_metadata: true + volume_enabled: true + +pattern_ingester: + enabled: true + +ruler: + enable_api: true diff --git a/config/otel-collector-config.yaml b/config/otel-collector-config.yaml new file mode 100644 index 00000000..fdae1d57 --- /dev/null +++ b/config/otel-collector-config.yaml @@ -0,0 +1,51 @@ +# OpenTelemetry Collector: receives OTLP, exports traces to Tempo, logs to Loki +receivers: + otlp: + protocols: + grpc: + endpoint: "0.0.0.0:4317" + http: + endpoint: "0.0.0.0:4318" + +exporters: + # Traces -> Tempo + otlp/tempo: + endpoint: "tempo:4317" + tls: + insecure: true + + # Logs -> Loki + loki: + endpoint: "http://loki:3100/loki/api/v1/push" + default_labels_enabled: + exporter: true + job: true + retry_on_failure: + enabled: true + max_elapsed_time: 30s + +processors: + batch: + send_batch_size: 512 + timeout: 2s + resource/add_loki_labels: + attributes: + - key: loki.resource.labels + value: service.name + action: insert + +extensions: + health_check: + endpoint: "0.0.0.0:13133" + +service: + extensions: [health_check] + pipelines: + traces: + receivers: [otlp] + processors: [batch] + exporters: [otlp/tempo] + logs: + receivers: [otlp] + processors: [batch, resource/add_loki_labels] + exporters: [loki] diff --git a/config/sys.config b/config/sys.config index 67cd24ed..49bed0c9 100644 --- a/config/sys.config +++ b/config/sys.config @@ -43,6 +43,11 @@ % Should be greater than any other timeouts idle_timeout => infinity }}, + %% OTEL log handler configuration + {otel_log_level, info}, + {otel_log_max_queue_size, 2048}, + {otel_log_scheduled_delay_ms, 1000}, + {otel_log_exporting_timeout_ms, 300000}, {scoper_event_handler_options, #{ event_handler_opts => #{ formatter_opts => #{ diff --git a/config/tempo.yaml b/config/tempo.yaml new file mode 100644 index 00000000..97bb684a --- /dev/null +++ b/config/tempo.yaml @@ -0,0 +1,37 @@ +server: + http_listen_port: 3200 + +distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +ingester: + max_block_duration: 5m + +compactor: + compaction: + block_retention: 24h + +storage: + trace: + backend: local + local: + path: /tmp/tempo/traces + +metrics_generator: + registry: + external_labels: + source: tempo + cluster: docker-compose + storage: + path: /tmp/tempo/generator/wal + +overrides: + defaults: + metrics_generator: + processors: [service-graphs, span-metrics] diff --git a/rebar.config b/rebar.config index e54b9e06..6a5bb8d7 100644 --- a/rebar.config +++ b/rebar.config @@ -46,10 +46,16 @@ {prometheus, "4.11.0"}, {prometheus_cowboy, "0.1.9"}, - %% OpenTelemetry deps - {opentelemetry_api, "1.4.0"}, - {opentelemetry, "1.5.0"}, - {opentelemetry_exporter, "1.8.0"}, + %% OpenTelemetry deps. + {opentelemetry_api, "1.5.0"}, + {opentelemetry, "1.7.0"}, + {opentelemetry_exporter, + {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_exporter"}}, + {opentelemetry_api_experimental, "0.5.1"}, + {opentelemetry_experimental, + {git_subdir, "https://github.com/valitydev/opentelemetry-erlang.git", + {branch, "fix/otlp-common-charlist-string"}, "apps/opentelemetry_experimental"}}, {eqwalizer_support, {git_subdir, "https://github.com/whatsapp/eqwalizer.git", {branch, "main"}, "eqwalizer_support"}} ]}. @@ -89,7 +95,9 @@ {runtime_tools, load}, {tools, load}, {canal, load}, + opentelemetry_exporter, {opentelemetry, temporary}, + opentelemetry_experimental, logger_logstash_formatter, sasl, herd, diff --git a/rebar.lock b/rebar.lock index 11e04791..d80f0666 100644 --- a/rebar.lock +++ b/rebar.lock @@ -84,10 +84,20 @@ {git,"https://github.com/valitydev/msgpack-proto.git", {ref,"7e447496aa5df4a5f1ace7ef2e3c31248b2a3ed0"}}, 2}, - {<<"opentelemetry">>,{pkg,<<"opentelemetry">>,<<"1.5.0">>},0}, - {<<"opentelemetry_api">>,{pkg,<<"opentelemetry_api">>,<<"1.4.0">>},0}, + {<<"opentelemetry">>,{pkg,<<"opentelemetry">>,<<"1.7.0">>},0}, + {<<"opentelemetry_api">>,{pkg,<<"opentelemetry_api">>,<<"1.5.0">>},0}, + {<<"opentelemetry_api_experimental">>, + {pkg,<<"opentelemetry_api_experimental">>,<<"0.5.1">>}, + 0}, + {<<"opentelemetry_experimental">>, + {git_subdir,"https://github.com/open-telemetry/opentelemetry-erlang.git", + {ref,"98be90e6167997853f83b8f27917604fb35aeecc"}, + "apps/opentelemetry_experimental"}, + 0}, {<<"opentelemetry_exporter">>, - {pkg,<<"opentelemetry_exporter">>,<<"1.8.0">>}, + {git_subdir,"https://github.com/open-telemetry/opentelemetry-erlang.git", + {ref,"98be90e6167997853f83b8f27917604fb35aeecc"}, + "apps/opentelemetry_exporter"}, 0}, {<<"parse_trans">>,{pkg,<<"parse_trans">>,<<"3.3.1">>},2}, {<<"party_client">>, @@ -151,9 +161,9 @@ {<<"kafka_protocol">>, <<"F917B6C90C8DF0DE2B40A87D6B9AE1CFCE7788E91A65818E90E40CF76111097A">>}, {<<"metrics">>, <<"25F094DEA2CDA98213CECC3AEFF09E940299D950904393B2A29D191C346A8486">>}, {<<"mimerl">>, <<"3882A5CA67FBBE7117BA8947F27643557ADEC38FA2307490C4C4207624CB213B">>}, - {<<"opentelemetry">>, <<"7DDA6551EDFC3050EA4B0B40C0D2570423D6372B97E9C60793263EF62C53C3C2">>}, - {<<"opentelemetry_api">>, <<"63CA1742F92F00059298F478048DFB826F4B20D49534493D6919A0DB39B6DB04">>}, - {<<"opentelemetry_exporter">>, <<"5D546123230771EF4174E37BEDFD77E3374913304CD6EA3CA82A2ADD49CD5D56">>}, + {<<"opentelemetry">>, <<"20D0F12D3D1C398D3670FD44FD1A7C495DD748AB3E5B692A7906662E2FB1A38A">>}, + {<<"opentelemetry_api">>, <<"1A676F3E3340CAB81C763E939A42E11A70C22863F645AA06AAFEFC689B5550CF">>}, + {<<"opentelemetry_api_experimental">>, <<"1B5AFACFCBD0834390336C845BC8AE08C8CF0D69BBED72EE53D178798B93E074">>}, {<<"parse_trans">>, <<"16328AB840CC09919BD10DAB29E431DA3AF9E9E7E7E6F0089DD5A2D2820011D8">>}, {<<"prometheus">>, <<"B95F8DE8530F541BD95951E18E355A840003672E5EDA4788C5FA6183406BA29A">>}, {<<"prometheus_cowboy">>, <<"D9D5B300516A61ED5AE31391F8EEEEB202230081D32A1813F2D78772B6F274E1">>}, @@ -185,9 +195,9 @@ {<<"kafka_protocol">>, <<"DF680A3706EAD8695F8B306897C0A33E8063C690DA9308DB87B462CFD7029D04">>}, {<<"metrics">>, <<"69B09ADDDC4F74A40716AE54D140F93BEB0FB8978D8636EADED0C31B6F099F16">>}, {<<"mimerl">>, <<"13AF15F9F68C65884ECCA3A3891D50A7B57D82152792F3E19D88650AA126B144">>}, - {<<"opentelemetry">>, <<"CDF4F51D17B592FC592B9A75F86A6F808C23044BA7CF7B9534DEBBCC5C23B0EE">>}, - {<<"opentelemetry_api">>, <<"3DFBBFAA2C2ED3121C5C483162836C4F9027DEF469C41578AF5EF32589FCFC58">>}, - {<<"opentelemetry_exporter">>, <<"A1F9F271F8D3B02B81462A6BFEF7075FD8457FDB06ADFF5D2537DF5E2264D9AF">>}, + {<<"opentelemetry">>, <<"A9173B058C4549BF824CBC2F1D2FA2ADC5CDEDC22AA3F0F826951187BBD53131">>}, + {<<"opentelemetry_api">>, <<"F53EC8A1337AE4A487D43AC89DA4BD3A3C99DDF576655D071DEED8B56A2D5DDA">>}, + {<<"opentelemetry_api_experimental">>, <<"10297057EADA47267D4F832011BECEF07D25690E6BF91FEBCCFC4E740DBA1A6F">>}, {<<"parse_trans">>, <<"07CD9577885F56362D414E8C4C4E6BDF10D43A8767ABB92D24CBE8B24C54888B">>}, {<<"prometheus">>, <<"719862351AABF4DF7079B05DC085D2BBCBE3AC0AC3009E956671B1D5AB88247D">>}, {<<"prometheus_cowboy">>, <<"5F71C039DEB9E9FF9DD6366BC74C907A463872B85286E619EFF0BDA15111695A">>},