From 45db43cc248a97459b78edc3cb2ff44a828e7045 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Xavier=20THIRY?= Date: Thu, 16 Apr 2026 11:20:21 +0200 Subject: [PATCH 1/6] chore(tests): add VictoriaLogs event fixtures corpus (D-25-2) Fifteen anonymised single-line JSON events under tests/fixtures/vl_events/ cover the shapes encountered in practice: nginx flat-dotted, cisco regex- extractable, k8s with label keys, Windows PascalCase, journald _SYSTEMD_*, OpenTelemetry (randomised IDs, not the W3C canonical examples), raw _msg only, JSON-embedded _msg, edge cases (unicode CJK + emoji, empty fields, literal dotted keys), audit, docker, syslog daemon. A YAML manifest documents each fixture (description, source_system, tags, prevents_regression_for). The helper module tests/common/vl_events.rs exposes load_fixture, load_fixtures_by_tag, load_fixtures_by_source_system, and all_fixtures, with the manifest cached lazily via OnceLock and indexed as a BTreeMap for determinism across parallel test runs. Two integration test files ship the first consumers. The consistency test checks index <-> filesystem coherence, one-line JSON shape, required VL fields, and tag/source enumeration. The parameterised template test exercises a byte-exact render of _msg/_time against the whole corpus, the #25 dotted-key regression on the dotted_keys tag, the empty-field contract on edge_empty_fields, the lenient-undefined contract on raw sources, and unicode round-tripping on the unicode fixture. This delivers deferred item D-25-2 and lays groundwork for v2.0.0 multi- source testing. 
--- tests/common/mod.rs | 12 ++ tests/common/vl_events.rs | 182 ++++++++++++++++++ tests/fixtures/vl_events/audit_event.json | 1 + .../fixtures/vl_events/cisco_sw_critical.json | 1 + .../vl_events/docker_container_exit.json | 1 + .../vl_events/edge_dotted_keys_literal.json | 1 + .../fixtures/vl_events/edge_empty_fields.json | 1 + .../fixtures/vl_events/edge_unicode_msg.json | 1 + tests/fixtures/vl_events/index.yaml | 107 ++++++++++ tests/fixtures/vl_events/journald_cron.json | 1 + .../fixtures/vl_events/json_embedded_msg.json | 1 + tests/fixtures/vl_events/k8s_pod_oom.json | 1 + tests/fixtures/vl_events/nginx_http_400.json | 1 + tests/fixtures/vl_events/nginx_http_500.json | 1 + tests/fixtures/vl_events/otel_span.json | 1 + .../fixtures/vl_events/raw_message_only.json | 1 + .../fixtures/vl_events/syslog_raw_daemon.json | 1 + .../vl_events/windows_security_logoff.json | 1 + tests/template_against_vl_fixtures.rs | 160 +++++++++++++++ tests/vl_fixtures_consistency.rs | 164 ++++++++++++++++ 20 files changed, 640 insertions(+) create mode 100644 tests/common/mod.rs create mode 100644 tests/common/vl_events.rs create mode 100644 tests/fixtures/vl_events/audit_event.json create mode 100644 tests/fixtures/vl_events/cisco_sw_critical.json create mode 100644 tests/fixtures/vl_events/docker_container_exit.json create mode 100644 tests/fixtures/vl_events/edge_dotted_keys_literal.json create mode 100644 tests/fixtures/vl_events/edge_empty_fields.json create mode 100644 tests/fixtures/vl_events/edge_unicode_msg.json create mode 100644 tests/fixtures/vl_events/index.yaml create mode 100644 tests/fixtures/vl_events/journald_cron.json create mode 100644 tests/fixtures/vl_events/json_embedded_msg.json create mode 100644 tests/fixtures/vl_events/k8s_pod_oom.json create mode 100644 tests/fixtures/vl_events/nginx_http_400.json create mode 100644 tests/fixtures/vl_events/nginx_http_500.json create mode 100644 tests/fixtures/vl_events/otel_span.json create mode 100644 
tests/fixtures/vl_events/raw_message_only.json create mode 100644 tests/fixtures/vl_events/syslog_raw_daemon.json create mode 100644 tests/fixtures/vl_events/windows_security_logoff.json create mode 100644 tests/template_against_vl_fixtures.rs create mode 100644 tests/vl_fixtures_consistency.rs diff --git a/tests/common/mod.rs b/tests/common/mod.rs new file mode 100644 index 0000000..38be906 --- /dev/null +++ b/tests/common/mod.rs @@ -0,0 +1,12 @@ +//! Shared helpers for integration tests. +//! +//! Each file under `tests/` compiles as its own crate, so shared helpers live +//! here and are pulled in via `mod common;` from the consuming test file. +//! +//! The `#[allow(dead_code)]` below is intentional: not every consumer uses +//! every sub-module, and Cargo's per-test-crate compilation would otherwise +//! flag unused items. + +#![allow(dead_code)] + +pub mod vl_events; diff --git a/tests/common/vl_events.rs b/tests/common/vl_events.rs new file mode 100644 index 0000000..1186837 --- /dev/null +++ b/tests/common/vl_events.rs @@ -0,0 +1,182 @@ +//! Loader for the versioned VictoriaLogs event fixture corpus. +//! +//! Fixtures live under `tests/fixtures/vl_events/*.json`, one anonymised +//! one-line JSON event per file. `index.yaml` in the same directory is the +//! authoritative manifest: it documents each fixture (description, +//! `source_system`, `tags`, `prevents_regression_for`). +//! +//! The manifest is parsed once per test crate via `OnceLock`. +//! +//! # Panics +//! +//! Helpers panic aggressively on misuse (missing fixture file, fixture not +//! listed in the index, malformed manifest). The assumption is that these +//! are programmer errors in tests, not runtime failures to recover from. +//! The consistency test `vl_fixtures_consistency.rs` catches them in CI. +//! +//! # Examples +//! +//! ```ignore +//! mod common; +//! use common::vl_events::{load_fixture, load_fixtures_by_tag}; +//! +//! let event = load_fixture("nginx_http_400.json"); +//! 
assert_eq!(event["_stream"].as_str().is_some(), true); +//! +//! let dotted = load_fixtures_by_tag("dotted_keys"); +//! assert!(!dotted.is_empty()); +//! ``` + +use serde::Deserialize; +use serde_json::Value; +use std::collections::BTreeMap; +use std::path::{Path, PathBuf}; +use std::sync::OnceLock; + +/// Relative path (from crate root) to the fixture directory. +const FIXTURES_DIR: &str = "tests/fixtures/vl_events"; + +/// Filename of the manifest inside `FIXTURES_DIR`. +const INDEX_FILENAME: &str = "index.yaml"; + +/// One manifest entry as declared in `index.yaml`. +#[derive(Debug, Clone, Deserialize)] +pub struct FixtureEntry { + pub description: String, + pub source_system: String, + #[serde(default)] + pub tags: Vec<String>, + #[serde(default)] + pub prevents_regression_for: Vec<String>, +} + +/// Root of the manifest document. +#[derive(Debug, Clone, Deserialize)] +struct Manifest { + fixtures: BTreeMap<String, FixtureEntry>, +} + +/// Cache: manifest parsed once per test crate. +static MANIFEST: OnceLock<BTreeMap<String, FixtureEntry>> = OnceLock::new(); + +/// Absolute path to the fixture directory. +fn fixtures_dir() -> PathBuf { + let manifest_dir = env!("CARGO_MANIFEST_DIR"); + Path::new(manifest_dir).join(FIXTURES_DIR) +} + +/// Load and cache the manifest. +fn manifest() -> &'static BTreeMap<String, FixtureEntry> { + MANIFEST.get_or_init(|| { + let path = fixtures_dir().join(INDEX_FILENAME); + let raw = std::fs::read_to_string(&path).unwrap_or_else(|e| { + panic!("failed to read fixture manifest {}: {}", path.display(), e) + }); + let parsed: Manifest = serde_yaml::from_str(&raw).unwrap_or_else(|e| { + panic!("failed to parse fixture manifest {}: {}", path.display(), e) + }); + parsed.fixtures + }) +} + +/// Return the full manifest (for consistency tests). +pub fn manifest_entries() -> &'static BTreeMap<String, FixtureEntry> { + manifest() +} + +/// Read and parse a fixture by stem (e.g. `"nginx_http_400"`) or by full +/// filename (e.g. `"nginx_http_400.json"`). The `.json` suffix is optional. 
+/// +/// # Panics +/// +/// Panics if the fixture is not listed in `index.yaml`, if the file is +/// missing, or if it is not valid JSON. +pub fn load_fixture(name: &str) -> Value { + let filename = if name.ends_with(".json") { + name.to_string() + } else { + format!("{}.json", name) + }; + if !manifest().contains_key(&filename) { + panic!( + "fixture '{}' is not declared in {}/{}. Add it to the manifest \ + or pick an existing name.", + name, FIXTURES_DIR, INDEX_FILENAME + ); + } + load_fixture_from_disk(&filename) +} + +/// Read and parse a fixture file from disk without checking the manifest. +/// +/// Used internally to catch the "listed in index but missing on disk" case +/// with a clearer message than a plain IO error. +fn load_fixture_from_disk(name: &str) -> Value { + let path = fixtures_dir().join(name); + let raw = std::fs::read_to_string(&path).unwrap_or_else(|e| { + panic!( + "fixture '{}' declared in index but missing on disk ({}): {}", + name, + path.display(), + e + ) + }); + serde_json::from_str(&raw) + .unwrap_or_else(|e| panic!("fixture '{}' is not valid JSON: {}", name, e)) +} + +/// Return every fixture listed in the manifest as `(name, value)`. +/// +/// Ordering is stable (manifest keys are a `BTreeMap`). +pub fn all_fixtures() -> Vec<(String, Value)> { + manifest() + .keys() + .map(|name| (name.clone(), load_fixture_from_disk(name))) + .collect() +} + +/// Return all fixtures whose manifest entry has `tag` in its `tags` list. +/// +/// Returns an empty `Vec` for an unknown tag; callers that expect a +/// non-empty result should assert it themselves. +pub fn load_fixtures_by_tag(tag: &str) -> Vec<(String, Value)> { + manifest() + .iter() + .filter(|(_, entry)| entry.tags.iter().any(|t| t == tag)) + .map(|(name, _)| (name.clone(), load_fixture_from_disk(name))) + .collect() +} + +/// Return all fixtures whose manifest entry has `source_system == system`. +/// +/// Empty `Vec` for unknown systems; callers assert the expected size. 
+pub fn load_fixtures_by_source_system(system: &str) -> Vec<(String, Value)> { + manifest() + .iter() + .filter(|(_, entry)| entry.source_system == system) + .map(|(name, _)| (name.clone(), load_fixture_from_disk(name))) + .collect() +} + +/// Return the set of filenames physically present in `FIXTURES_DIR`, +/// excluding the manifest itself. Used by the consistency test. +pub fn filesystem_fixtures() -> Vec<String> { + let dir = fixtures_dir(); + let entries = std::fs::read_dir(&dir) + .unwrap_or_else(|e| panic!("failed to read fixture directory {}: {}", dir.display(), e)); + let mut names: Vec<String> = entries + .filter_map(|e| e.ok()) + .filter_map(|entry| { + let file_name = entry.file_name().to_string_lossy().to_string(); + if file_name == INDEX_FILENAME { + None + } else if entry.path().extension().and_then(|s| s.to_str()) == Some("json") { + Some(file_name) + } else { + None + } + }) + .collect(); + names.sort(); + names +} diff --git a/tests/fixtures/vl_events/audit_event.json b/tests/fixtures/vl_events/audit_event.json new file mode 100644 index 0000000..f472c70 --- /dev/null +++ b/tests/fixtures/vl_events/audit_event.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T11:10:08.770Z","_stream":"{group=\"audit\",host=\"app-host-07\"}","_stream_id":"00000000000000005566778899aabbcc","_msg":"USER_LOGIN pid=4422 uid=0 auid=1001 ses=3 msg='op=login id=1001 exe=\"/usr/sbin/sshd\" hostname=? 
addr=198.51.100.77 terminal=ssh res=success'","type":"USER_LOGIN","auid":"1001","uid":"0","res":"success","host":"app-host-07"} diff --git a/tests/fixtures/vl_events/cisco_sw_critical.json b/tests/fixtures/vl_events/cisco_sw_critical.json new file mode 100644 index 0000000..e6f1ccf --- /dev/null +++ b/tests/fixtures/vl_events/cisco_sw_critical.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:20:17.000Z","_stream":"{group=\"network\",source=\"syslog\"}","_stream_id":"0000000000000000b2c3d4e5f6071122","_msg":"<187>Apr 15 10:20:17 switch-sw-01 %LINK-3-UPDOWN: Interface GigabitEthernet0/24, changed state to down","host":"switch-sw-01","facility":"local7","severity":"critical","appname":"cisco-ios"} diff --git a/tests/fixtures/vl_events/docker_container_exit.json b/tests/fixtures/vl_events/docker_container_exit.json new file mode 100644 index 0000000..0f24592 --- /dev/null +++ b/tests/fixtures/vl_events/docker_container_exit.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T11:15:44.229Z","_stream":"{group=\"docker\",host=\"app-host-08\"}","_stream_id":"000000000000000066778899aabbccdd","_msg":"container sidecar-logger exited with code 1","container_id":"9f8e7d6c5b4a3e2f1d0c9b8a7654321fedcba","container_name":"sidecar-logger","image":"ghcr.io/example/sidecar-logger:1.2.3","exit_code":"1","host":"app-host-08"} diff --git a/tests/fixtures/vl_events/edge_dotted_keys_literal.json b/tests/fixtures/vl_events/edge_dotted_keys_literal.json new file mode 100644 index 0000000..8a5bc76 --- /dev/null +++ b/tests/fixtures/vl_events/edge_dotted_keys_literal.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T11:05:12.333Z","_stream":"{group=\"app\",host=\"app-host-06\"}","_stream_id":"0000000000000000445566778899aabb","_msg":"GET /healthz -> 200","nginx.http.method":"GET","nginx.http.path":"/healthz","nginx.http.status":"200","nginx.http.request_id":"req-0f1e2d3c4b5a","nginx.http.remote_addr":"203.0.113.9","host":"app-host-06"} diff --git a/tests/fixtures/vl_events/edge_empty_fields.json 
b/tests/fixtures/vl_events/edge_empty_fields.json new file mode 100644 index 0000000..84ee27e --- /dev/null +++ b/tests/fixtures/vl_events/edge_empty_fields.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T11:00:00.000Z","_stream":"{group=\"app\",host=\"app-host-05\"}","_stream_id":"00000000000000003344556677889900","_msg":"request completed","host":"app-host-05","request_id":"","user_id":"","session":"","error":""} diff --git a/tests/fixtures/vl_events/edge_unicode_msg.json b/tests/fixtures/vl_events/edge_unicode_msg.json new file mode 100644 index 0000000..34d1e0d --- /dev/null +++ b/tests/fixtures/vl_events/edge_unicode_msg.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:55:41.671Z","_stream":"{group=\"app\",host=\"app-host-04\"}","_stream_id":"000000000000000022334455667788aa","_msg":"\u2705 \u652f\u4ed8\u5931\u8d25 payment failed for user \u5c71\u7530\u592a\u90ce","host":"app-host-04","app":"checkout","locale":"ja_JP.UTF-8"} diff --git a/tests/fixtures/vl_events/index.yaml b/tests/fixtures/vl_events/index.yaml new file mode 100644 index 0000000..c9c7a2d --- /dev/null +++ b/tests/fixtures/vl_events/index.yaml @@ -0,0 +1,107 @@ +# Manifest for VictoriaLogs event fixtures. +# +# Each entry documents one file under this directory. The entry key MUST match +# the filename on disk exactly; the consistency test (`tests/vl_fixtures_consistency.rs`) +# enforces the invariant that index.yaml and the filesystem are in sync. +# +# Schema per entry: +# description: free-form sentence describing what this event represents. +# source_system: one of nginx | cisco | k8s | windows | journald | otel | raw | docker | audit | syslog | edge_case +# tags: list of free-form strings. Consistent vocabulary: +# dotted_keys, flat_keys, regex_extract, pascal_case, multiline_msg, +# unicode, http, syslog, empty_fields, embedded_json, minimal, labels. +# prevents_regression_for: list of issue references (e.g. "#25"). Empty list is valid. +# +# All data is anonymised. 
Hostnames are fake (app-host-NN, switch-sw-01, k8s-node-01). +# IPs use RFC 5737 ranges (192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24). +# No real tokens, trace ids, or tenants. +fixtures: + nginx_http_400.json: + description: "nginx reverse proxy 400 response with vector-forwarded flat-dotted keys" + source_system: nginx + tags: [dotted_keys, flat_keys, http] + prevents_regression_for: ["#25"] + + nginx_http_500.json: + description: "nginx 500 upstream failure, same flat-dotted shape as 400 case" + source_system: nginx + tags: [dotted_keys, flat_keys, http] + prevents_regression_for: ["#25"] + + cisco_sw_critical.json: + description: "Cisco IOS syslog link-down event with regex-extractable switch name in _msg" + source_system: cisco + tags: [regex_extract, syslog] + prevents_regression_for: [] + + k8s_pod_oom.json: + description: "Kubernetes container OOMKilled event with kubernetes.* labels as flat-dotted keys" + source_system: k8s + tags: [dotted_keys, flat_keys, labels] + prevents_regression_for: [] + + windows_security_logoff.json: + description: "Windows Security Event 4634 (account logoff) with PascalCase keys and escaped multiline _msg" + source_system: windows + tags: [pascal_case, multiline_msg] + prevents_regression_for: [] + + journald_cron.json: + description: "journald CRON execution with _SYSTEMD_* / _TRANSPORT fields" + source_system: journald + tags: [flat_keys, syslog] + prevents_regression_for: [] + + otel_span.json: + description: "OpenTelemetry server span with trace_id, span_id and dotted service.* / http.* attributes" + source_system: otel + tags: [dotted_keys, flat_keys] + prevents_regression_for: [] + + raw_message_only.json: + description: "Minimal event: only _time, _stream, _stream_id and _msg, no structured fields" + source_system: raw + tags: [minimal] + prevents_regression_for: [] + + json_embedded_msg.json: + description: "_msg contains an embedded JSON payload, verbatim (not pre-parsed by the ingester)" + source_system: raw + tags: 
[embedded_json] + prevents_regression_for: [] + + edge_unicode_msg.json: + description: "_msg containing CJK characters and an emoji to exercise UTF-8 handling" + source_system: edge_case + tags: [unicode] + prevents_regression_for: [] + + edge_empty_fields.json: + description: "Several structured fields present but empty string, to exercise empty-guard paths" + source_system: edge_case + tags: [empty_fields] + prevents_regression_for: ["#26"] + + edge_dotted_keys_literal.json: + description: "Exact shape from #25 with literal dotted keys nginx.http.* preserved top-level" + source_system: edge_case + tags: [dotted_keys, flat_keys, http] + prevents_regression_for: ["#25"] + + audit_event.json: + description: "Linux auditd USER_LOGIN record with embedded key=value payload inside _msg" + source_system: audit + tags: [regex_extract, flat_keys] + prevents_regression_for: [] + + docker_container_exit.json: + description: "Docker engine event for a sidecar container exiting with non-zero status" + source_system: docker + tags: [flat_keys] + prevents_regression_for: [] + + syslog_raw_daemon.json: + description: "Raw RFC3164 syslog line from chronyd forwarded via a syslog collector" + source_system: syslog + tags: [syslog, regex_extract] + prevents_regression_for: [] diff --git a/tests/fixtures/vl_events/journald_cron.json b/tests/fixtures/vl_events/journald_cron.json new file mode 100644 index 0000000..c1380eb --- /dev/null +++ b/tests/fixtures/vl_events/journald_cron.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:35:00.112Z","_stream":"{group=\"systemd\",host=\"app-host-01\"}","_stream_id":"0000000000000000e5f6071122334455","_msg":"(root) CMD (test -x /usr/sbin/anacron || ( cd / && run-parts --report /etc/cron.hourly ))","_SYSTEMD_UNIT":"cron.service","_SYSTEMD_SLICE":"system.slice","_TRANSPORT":"journald","_HOSTNAME":"app-host-01","_PID":"2481","_UID":"0","SYSLOG_IDENTIFIER":"CRON","PRIORITY":"6"} diff --git a/tests/fixtures/vl_events/json_embedded_msg.json 
b/tests/fixtures/vl_events/json_embedded_msg.json new file mode 100644 index 0000000..43ec67b --- /dev/null +++ b/tests/fixtures/vl_events/json_embedded_msg.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:50:55.818Z","_stream":"{group=\"app\",host=\"app-host-03\"}","_stream_id":"0000000000000000112233445566778a","_msg":"{\"level\":\"error\",\"event\":\"payment_declined\",\"user_id\":\"u-00012\",\"amount\":42.50,\"reason\":\"insufficient_funds\"}","host":"app-host-03","app":"billing"} diff --git a/tests/fixtures/vl_events/k8s_pod_oom.json b/tests/fixtures/vl_events/k8s_pod_oom.json new file mode 100644 index 0000000..023c1b9 --- /dev/null +++ b/tests/fixtures/vl_events/k8s_pod_oom.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:25:44.877Z","_stream":"{group=\"k8s\",namespace=\"payments\",pod=\"worker-7c9b\"}","_stream_id":"0000000000000000c3d4e5f607112233","_msg":"Container worker terminated (OOMKilled), exit code 137","kubernetes.namespace_name":"payments","kubernetes.pod_name":"worker-7c9b4d5f-abc12","kubernetes.container_name":"worker","kubernetes.node_name":"k8s-node-01","kubernetes.labels.app":"worker","kubernetes.labels.version":"1.8.3","reason":"OOMKilled","exit_code":"137"} diff --git a/tests/fixtures/vl_events/nginx_http_400.json b/tests/fixtures/vl_events/nginx_http_400.json new file mode 100644 index 0000000..809c763 --- /dev/null +++ b/tests/fixtures/vl_events/nginx_http_400.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:12:33.421Z","_stream":"{group=\"web\",host=\"app-host-01\"}","_stream_id":"0000000000000000a1b2c3d4e5f60011","_msg":"400 Bad Request from 192.0.2.17","nginx.http.method":"POST","nginx.http.status":"400","nginx.http.request_id":"req-7f9c2a1b8d4e","nginx.http.path":"/api/v1/events","nginx.http.remote_addr":"192.0.2.17","nginx.http.user_agent":"curl/8.6.0","host":"app-host-01"} diff --git a/tests/fixtures/vl_events/nginx_http_500.json b/tests/fixtures/vl_events/nginx_http_500.json new file mode 100644 index 0000000..b7c8de2 --- /dev/null +++ 
b/tests/fixtures/vl_events/nginx_http_500.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:14:02.108Z","_stream":"{group=\"web\",host=\"app-host-02\"}","_stream_id":"0000000000000000a1b2c3d4e5f60022","_msg":"500 Internal Server Error while proxying to upstream","nginx.http.method":"GET","nginx.http.status":"500","nginx.http.request_id":"req-1122aabbccdd","nginx.http.path":"/api/v1/render","nginx.http.remote_addr":"198.51.100.42","nginx.http.upstream":"backend-api:8080","host":"app-host-02"} diff --git a/tests/fixtures/vl_events/otel_span.json b/tests/fixtures/vl_events/otel_span.json new file mode 100644 index 0000000..63cb75a --- /dev/null +++ b/tests/fixtures/vl_events/otel_span.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:40:22.340Z","_stream":"{group=\"otel\",service=\"checkout\"}","_stream_id":"0000000000000000f607112233445566","_msg":"span completed: POST /api/checkout","trace_id":"5e8f3c1d92a94fb8b7d6a1e2c3f40517","span_id":"9e3a7b4d15f82c60","parent_span_id":"7a3f8d2e1c4b5a6f","service.name":"checkout","service.version":"2.4.1","http.method":"POST","http.status_code":"200","span.kind":"server","duration_ns":"48210000"} diff --git a/tests/fixtures/vl_events/raw_message_only.json b/tests/fixtures/vl_events/raw_message_only.json new file mode 100644 index 0000000..be09415 --- /dev/null +++ b/tests/fixtures/vl_events/raw_message_only.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:45:11.002Z","_stream":"{group=\"misc\"}","_stream_id":"00000000000000000711223344556677","_msg":"disk space usage at 87% on /var"} diff --git a/tests/fixtures/vl_events/syslog_raw_daemon.json b/tests/fixtures/vl_events/syslog_raw_daemon.json new file mode 100644 index 0000000..1a5970a --- /dev/null +++ b/tests/fixtures/vl_events/syslog_raw_daemon.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T11:20:30.444Z","_stream":"{group=\"syslog\",host=\"app-host-09\"}","_stream_id":"00000000000000007788899aabbccdde","_msg":"<30>Apr 15 11:20:30 app-host-09 chronyd[812]: Selected source 203.0.113.123 
(pool.example.test)","facility":"daemon","severity":"info","appname":"chronyd","host":"app-host-09"} diff --git a/tests/fixtures/vl_events/windows_security_logoff.json b/tests/fixtures/vl_events/windows_security_logoff.json new file mode 100644 index 0000000..9350067 --- /dev/null +++ b/tests/fixtures/vl_events/windows_security_logoff.json @@ -0,0 +1 @@ +{"_time":"2026-04-15T10:31:09.550Z","_stream":"{group=\"windows\",host=\"WIN-APP-01\"}","_stream_id":"0000000000000000d4e5f60711223344","_msg":"An account was logged off.\r\n\r\nSubject:\r\n\tSecurity ID:\t\tS-1-5-21-0-0-0-1001\r\n\tAccount Name:\t\ttestuser\r\n\tAccount Domain:\t\tEXAMPLE\r\n\tLogon ID:\t\t0x3E7","EventID":"4634","Channel":"Security","Computer":"WIN-APP-01.example.test","Provider":"Microsoft-Windows-Security-Auditing","LogonType":"3","TargetUserName":"testuser","TargetDomainName":"EXAMPLE"} diff --git a/tests/template_against_vl_fixtures.rs b/tests/template_against_vl_fixtures.rs new file mode 100644 index 0000000..20c0a8f --- /dev/null +++ b/tests/template_against_vl_fixtures.rs @@ -0,0 +1,160 @@ +//! Parameterised template × fixture integration tests. +//! +//! Demonstrates how the corpus plugs into the `TemplateEngine` and catches +//! regressions that motivated the chore: +//! - #25: `{{ nginx.http.request_id }}` against flat-dotted keys. +//! - empty-field rendering against `edge_empty_fields.json`. +//! +//! The existing inline-event tests (e.g. `integration_notify.rs`) are left +//! untouched; this file is purely additive. + +use std::collections::HashMap; + +use valerter::config::CompiledTemplate; +use valerter::template::TemplateEngine; + +mod common; + +use common::vl_events::{ + all_fixtures, load_fixture, load_fixtures_by_source_system, load_fixtures_by_tag, +}; + +/// Build a single-template engine wired to `{{ title_tpl }}` / `{{ body_tpl }}`. 
+fn engine_with(title_tpl: &str, body_tpl: &str) -> TemplateEngine { + let mut templates = HashMap::new(); + templates.insert( + "t".to_string(), + CompiledTemplate { + title: title_tpl.to_string(), + body: body_tpl.to_string(), + email_body_html: None, + accent_color: None, + }, + ); + TemplateEngine::new(templates) +} + +#[test] +fn smoke_every_fixture_renders_msg_and_time_matching_event() { + // For every fixture, render `{{ _msg }} @ {{ _time }}` and assert the + // output matches what the event literally carries. Stronger than a bare + // `!is_empty()` assertion: catches silent drift if templating or unflat- + // tening starts mangling top-level fields. + let engine = engine_with("{{ _msg }}", "{{ _msg }} @ {{ _time }}"); + for (name, value) in all_fixtures() { + let msg = value + .as_object() + .and_then(|o| o.get("_msg")) + .and_then(|v| v.as_str()) + .unwrap_or_else(|| panic!("fixture {} missing string _msg", name)); + let time = value + .as_object() + .and_then(|o| o.get("_time")) + .and_then(|v| v.as_str()) + .unwrap_or_else(|| panic!("fixture {} missing string _time", name)); + let expected = format!("{} @ {}", msg, time); + let rendered = engine + .render("t", &value, "smoke") + .unwrap_or_else(|e| panic!("fixture {} failed to render: {}", name, e)); + assert_eq!( + rendered.body, expected, + "fixture {} rendered body did not match the event's _msg/_time", + name + ); + assert_eq!( + rendered.title, msg, + "fixture {} rendered title drifted", + name + ); + } +} + +#[test] +fn regression_gh25_dotted_keys_render_their_value() { + // Regression guard for #25: `{{ nginx.http.request_id }}` must render + // the flat-dotted value from the event. Before the fix, minijinja + // treated `nginx.http.request_id` as nested attribute lookup and the + // expression resolved to empty under Lenient undefined behaviour. 
+ let engine = engine_with("req", "{{ nginx.http.request_id }}"); + let hits = load_fixtures_by_tag("dotted_keys"); + assert!( + !hits.is_empty(), + "expected at least one `dotted_keys` fixture for #25 regression" + ); + let mut checked_any = false; + for (name, value) in hits { + // Skip fixtures that don't carry the specific dotted key we test. + let expected = value + .as_object() + .and_then(|o| o.get("nginx.http.request_id")) + .and_then(|v| v.as_str()); + let Some(expected) = expected else { + continue; + }; + let rendered = engine + .render("t", &value, "gh25") + .unwrap_or_else(|e| panic!("fixture {} failed to render for #25: {}", name, e)); + assert_eq!( + rendered.body, expected, + "fixture {} did not surface nginx.http.request_id via template", + name + ); + checked_any = true; + } + assert!( + checked_any, + "no dotted_keys fixture carried `nginx.http.request_id`; add one or \ + retag the existing nginx fixtures" + ); +} + +#[test] +fn regression_empty_fields_render_as_empty_string() { + // `edge_empty_fields.json` carries `request_id: ""`. Templates that + // reference these must render to empty string (not fail, not produce + // `"None"`, etc.). This is the contract the empty-guard in #26 relies on. + let engine = engine_with("t", "[{{ request_id }}][{{ user_id }}][{{ error }}]"); + let event = load_fixture("edge_empty_fields.json"); + let rendered = engine + .render("t", &event, "empty_fields") + .expect("render should succeed with lenient undefined"); + assert_eq!(rendered.body, "[][][]"); +} + +#[test] +fn raw_source_fixtures_render_missing_as_empty_under_lenient() { + // Fixtures whose manifest source_system is `raw` carry no structured + // fields beyond the VL envelope. Rendering `{{ _msg }}` must still + // work, and rendering `{{ missing_field }}` must yield empty under + // Lenient (not error). This protects against accidental Strict flips. 
+ let engine = engine_with("{{ _msg }}", "{{ unknown_field_that_does_not_exist }}"); + let fixtures = load_fixtures_by_source_system("raw"); + assert!( + !fixtures.is_empty(), + "expected at least one fixture with source_system: raw" + ); + for (name, value) in fixtures { + let rendered = engine + .render("t", &value, "raw") + .unwrap_or_else(|e| panic!("fixture {} failed lenient render: {}", name, e)); + assert!(!rendered.title.is_empty(), "title empty for {}", name); + assert!( + rendered.body.is_empty(), + "body should be empty string for missing field (fixture {})", + name + ); + } +} + +#[test] +fn unicode_fixture_preserves_cjk_and_emoji() { + // Render the unicode fixture via `{{ _msg }}` and check the raw bytes + // survive. This catches encoding bugs in the template pipeline. + let engine = engine_with("{{ _msg }}", "{{ _msg }}"); + let event = load_fixture("edge_unicode_msg.json"); + let rendered = engine + .render("t", &event, "unicode") + .expect("render should succeed on unicode event"); + assert!(rendered.body.contains("支付失败"), "lost CJK codepoints"); + assert!(rendered.body.contains('\u{2705}'), "lost emoji codepoint"); +} diff --git a/tests/vl_fixtures_consistency.rs b/tests/vl_fixtures_consistency.rs new file mode 100644 index 0000000..02f9674 --- /dev/null +++ b/tests/vl_fixtures_consistency.rs @@ -0,0 +1,164 @@ +//! Consistency checks between `tests/fixtures/vl_events/index.yaml` and the +//! filesystem. This is the gate that keeps the corpus honest: no orphaned +//! files, no phantom entries, every fixture parses, every fixture carries the +//! VL-mandatory fields. + +use std::collections::BTreeSet; + +mod common; + +use common::vl_events::{ + all_fixtures, filesystem_fixtures, load_fixtures_by_tag, manifest_entries, +}; + +/// Required top-level fields on every `/select/logsql/tail` event. 
+const REQUIRED_FIELDS: &[&str] = &["_msg", "_time", "_stream"]; + +#[test] +fn corpus_contains_at_least_fifteen_fixtures() { + let count = manifest_entries().len(); + assert!( + count >= 15, + "spec requires at least 15 fixtures, found {}", + count + ); +} + +#[test] +fn index_and_filesystem_agree() { + let index_names: BTreeSet<String> = manifest_entries().keys().cloned().collect(); + let disk_names: BTreeSet<String> = filesystem_fixtures().into_iter().collect(); + + let orphans: Vec<&String> = disk_names.difference(&index_names).collect(); + let phantoms: Vec<&String> = index_names.difference(&disk_names).collect(); + + assert!( + orphans.is_empty(), + "orphan fixtures on disk without an index.yaml entry: {:?}. \ + Either add them to tests/fixtures/vl_events/index.yaml or delete them.", + orphans + ); + assert!( + phantoms.is_empty(), + "phantom fixtures declared in index.yaml but missing on disk: {:?}. \ + Either create the file or remove the entry from index.yaml.", + phantoms + ); +} + +#[test] +fn every_fixture_parses_as_json() { + // `all_fixtures()` already panics on parse error; this test just exercises + // every file so CI surfaces the panic with a clear path. + let loaded = all_fixtures(); + assert!(!loaded.is_empty(), "no fixtures discovered"); + for (name, value) in &loaded { + assert!( + value.is_object(), + "fixture {} parsed but is not a JSON object (got {})", + name, + match value { + serde_json::Value::Null => "null", + serde_json::Value::Bool(_) => "bool", + serde_json::Value::Number(_) => "number", + serde_json::Value::String(_) => "string", + serde_json::Value::Array(_) => "array", + serde_json::Value::Object(_) => "object", + } + ); + } +} + +#[test] +fn every_fixture_is_single_line_json() { + // VL's /select/logsql/tail emits one event per line; the corpus must + // mirror that shape so consumers can copy-paste fixtures into mocks. 
+ let dir = std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/vl_events"); + for name in filesystem_fixtures() { + let path = dir.join(&name); + let raw = std::fs::read_to_string(&path) + .unwrap_or_else(|e| panic!("failed to read {}: {}", path.display(), e)); + // Allow a single trailing newline (POSIX convention) but no embedded + // newlines: the JSON payload itself must be on one line. + let trimmed = raw.strip_suffix('\n').unwrap_or(&raw); + assert!( + !trimmed.contains('\n'), + "fixture {} spans multiple lines. VL tail output is one JSON per \ + line; fixtures must match.", + name + ); + } +} + +#[test] +fn every_fixture_has_required_vl_fields() { + // Self-contained: do not assume parses_as_json ran first. `cargo test` + // parallelises tests by default, so cross-test dependencies via panic + // messages are unreliable. + for (name, value) in all_fixtures() { + let obj = value.as_object().unwrap_or_else(|| { + panic!( + "fixture {} is not a top-level JSON object (see also parses_as_json test)", + name + ) + }); + for field in REQUIRED_FIELDS { + assert!( + obj.contains_key(*field), + "fixture {} is missing required VL field `{}`", + name, + field + ); + } + // _msg must be a non-empty string so the smoke test can render it. 
+ let msg = obj + .get("_msg") + .and_then(|v| v.as_str()) + .unwrap_or_else(|| panic!("fixture {} has non-string _msg", name)); + assert!( + !msg.is_empty(), + "fixture {} has empty _msg (not useful for template smoke tests)", + name + ); + } +} + +#[test] +fn every_entry_has_description_source_and_tags() { + for (name, entry) in manifest_entries() { + assert!( + !entry.description.trim().is_empty(), + "fixture {} has an empty description", + name + ); + assert!( + !entry.source_system.trim().is_empty(), + "fixture {} has an empty source_system", + name + ); + assert!( + !entry.tags.is_empty(), + "fixture {} has no tags (require at least one for discoverability)", + name + ); + } +} + +#[test] +fn tag_lookup_returns_empty_for_unknown() { + // Contract from the spec: unknown tags yield an empty Vec, caller asserts. + let hits = load_fixtures_by_tag("this_tag_does_not_exist_anywhere_xyz"); + assert!(hits.is_empty(), "expected empty result for unknown tag"); +} + +#[test] +fn dotted_keys_tag_is_populated() { + // The corpus exists primarily to catch regressions like #25. If this + // tag ever ends up empty, the smoke test below becomes a no-op. + let hits = load_fixtures_by_tag("dotted_keys"); + assert!( + !hits.is_empty(), + "no fixtures tagged `dotted_keys` — the regression harness for #25 \ + would silently skip" + ); +} From 02bc03db067dfdcd60e9c420fd1bde3a52c5d9aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Xavier=20THIRY?= Date: Thu, 16 Apr 2026 13:12:07 +0200 Subject: [PATCH 2/6] feat!: multi-source VictoriaLogs (v2.0.0 part 1, #34) `victorialogs:` becomes a `BTreeMap` indexed by source name. Rules gain an optional `vl_sources: [name, ...]` field; absent means fan out to every configured source. The engine spawns one task per (rule, source) pair with per-source cancellation and reconnect isolation, so a single unhealthy source does not stop alerts on the others. 
Synthetic `vl_source` is injected into every render context where `rule_name` already flowed: layer 1 templates (title, body, email_body_html), throttle.key, and notifier-level layer 2 contexts. The default throttle key becomes `{rule}-{source}:global` so multi-source deployments get isolated buckets per source out of the box. Source names are restricted to `^[a-zA-Z0-9_]+$` to keep the default throttle key separator (`-`) unambiguous. Rules reject duplicate `vl_sources` entries at load. The legacy single-URL config shape is rejected with an actionable migration error. `AlertPayload.vl_source` propagates end-to-end. The Mattermost footer gains a source segment and the default webhook payload exposes `vl_source` as a top-level JSON field. See CHANGELOG v2.0.0 Breaking section for the full migration. Tests consume the fixtures corpus (PR #35). The new `tests/multi_source_integration.rs` exercises the full routing matrix across two wiremock VL backends, including a negative-evidence assertion that a pinned rule does not leak to the unpinned source. Observability and guardrails (metrics `vl_source` label, `valerter_vl_source_up` gauge, `max_streams` cap, reconnect jitter) ship as a separate spec part 2 before the v2.0.0 tag. Cargo.toml not bumped here; version bump lives in the release PR. 
--- CHANGELOG.md | 41 ++ config/config.example.yaml | 48 +- docs/architecture.md | 22 +- docs/configuration.md | 90 +++- docs/notifiers.md | 6 + src/config/mod.rs | 2 +- src/config/runtime.rs | 15 +- src/config/tests.rs | 499 +++++++++++++++--- src/config/types.rs | 137 ++++- src/engine.rs | 317 +++++++---- src/main.rs | 11 +- src/notify/email.rs | 3 + src/notify/mattermost.rs | 17 +- src/notify/payload.rs | 4 + src/notify/telegram.rs | 2 + src/notify/tests.rs | 4 + src/notify/webhook.rs | 7 + src/tail.rs | 19 +- src/template.rs | 203 +++++-- src/throttle.rs | 164 ++++-- tests/fixtures/config_disabled_invalid.yaml | 3 +- .../config_email_missing_email_body_html.yaml | 3 +- tests/fixtures/config_invalid_basic_auth.yaml | 9 +- .../config_invalid_notifier_type.yaml | 3 +- tests/fixtures/config_invalid_regex.yaml | 3 +- tests/fixtures/config_invalid_template.yaml | 3 +- tests/fixtures/config_minimal.yaml | 3 +- tests/fixtures/config_no_notifier.yaml | 3 +- tests/fixtures/config_no_template.yaml | 3 +- tests/fixtures/config_valid.yaml | 3 +- tests/fixtures/config_with_auth.yaml | 19 +- tests/fixtures/config_with_notifiers.yaml | 3 +- .../fixtures/multi-file-collision/config.yaml | 3 +- .../fixtures/multi-file-cross-ref/config.yaml | 3 +- tests/fixtures/multi-file-empty/config.yaml | 3 +- .../multi-file-intra-collision/config.yaml | 3 +- tests/fixtures/multi-file-invalid/config.yaml | 3 +- .../multi-file-notifier-collision/config.yaml | 3 +- tests/fixtures/multi-file-only-d/config.yaml | 3 +- .../multi-file-template-collision/config.yaml | 3 +- tests/fixtures/multi-file/config.yaml | 3 +- tests/integration_notify.rs | 3 +- tests/integration_validate.rs | 4 +- tests/multi_source_integration.rs | 392 ++++++++++++++ tests/smtp_integration.rs | 2 + tests/template_against_vl_fixtures.rs | 10 +- 46 files changed, 1753 insertions(+), 354 deletions(-) create mode 100644 tests/multi_source_integration.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 97b59e2..98aff3c 100644 --- 
a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,47 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.0.0] - unreleased + +### Breaking changes + +- **`victorialogs` is now a map of named sources.** A single valerter instance can tail multiple VL backends and route alerts per source. The v1.x single-URL shape (`victorialogs.url: ...` at the top level) is rejected at load with an actionable migration error. + + Migrate from: + + ```yaml + victorialogs: + url: "http://victorialogs:9428" + basic_auth: + username: "u" + password: "p" + ``` + + To: + + ```yaml + victorialogs: + default: + url: "http://victorialogs:9428" + basic_auth: + username: "u" + password: "p" + ``` + + Then optionally target sources per rule via `vl_sources: [name, ...]`, or omit the field to fan out across every configured source. Credentials, TLS, and headers are per-source, self-contained in each `VlSourceConfig`. + +- **Default throttle key is now `{rule}-{source}:global`** (was `{rule}:global` in v1.x). Multi-source deployments get isolated throttle buckets per source with no extra config. Users who want cross-source dedup must override `throttle.key` explicitly (e.g. `key: "{{ rule_name }}"`). + +- **Source names are restricted to `^[a-zA-Z0-9_]+$`.** No dashes, colons, dots, or spaces allowed. Validated at load. The constraint avoids ambiguity in the default throttle key format above. + +- **Notifier output formats extended with `vl_source`.** The Mattermost footer now reads `valerter | | | ` instead of `valerter | | `. The default webhook payload exposes `vl_source` as a top-level JSON field. Downstream parsers / dashboards that match exact strings in either output need to update. + +### Added + +- **Multi-source VictoriaLogs support** (issue #34). 
The engine spawns one task per `(rule, source)` pair with per-source cancellation and reconnect isolation, so a single unhealthy source does not stop alerts on the others. +- **`{{ vl_source }}` template variable** available everywhere `{{ rule_name }}` is: layer 1 templates (`title`, `body`, `email_body_html`), `throttle.key`, and notifier-level layer 2 contexts (`subject_template`, `body_template`). Always non-empty, owned `String`, equal to the source name currently processing the event. Synthetic value wins over any event field literally named `vl_source` (matches the `rule_name` collision policy). +- **`AlertPayload.vl_source`** propagated end-to-end so notifiers can render the source name. See Breaking changes above for the related output format updates on Mattermost and webhook destinations. + ## [1.2.1] - unreleased ### Fixed diff --git a/config/config.example.yaml b/config/config.example.yaml index b1474e9..7cb21fd 100644 --- a/config/config.example.yaml +++ b/config/config.example.yaml @@ -58,25 +58,41 @@ # ┌────────────────────────────────────────────────────────────────────────────┐ -# │ VICTORIALOGS CONNECTION [REQUIRED]│ +# │ VICTORIALOGS SOURCES [REQUIRED]│ # └────────────────────────────────────────────────────────────────────────────┘ +# `victorialogs` is a map of named sources (breaking change in v2.0.0). +# Each rule can target a subset via `vl_sources: [name, ...]`, or omit the +# field to fan out across every configured source. +# +# See docs/configuration.md for the v1.x → v2.0 migration snippet. 
victorialogs: - url: "http://127.0.0.1:9428" - - # ── Authentication (optional) ────────────────────────────────────────────── - # - # basic_auth: - # username: "${VL_USER}" - # password: "${VL_PASS}" - # - # headers: # For Bearer tokens or API keys - # Authorization: "Bearer ${VL_TOKEN}" - - # ── TLS (optional) ───────────────────────────────────────────────────────── + default: + url: "http://127.0.0.1:9428" + + # ── Authentication (optional, per source) ────────────────────────────── + # + # basic_auth: + # username: "${VL_USER}" + # password: "${VL_PASS}" + # + # headers: # For Bearer tokens or API keys + # Authorization: "Bearer ${VL_TOKEN}" + + # ── TLS (optional, per source) ───────────────────────────────────────── + # + # tls: + # verify: true # Set false for self-signed certs + + # ── Example: multi-source (production + dev) ─────────────────────────────── # - # tls: - # verify: true # Set false for self-signed certs + # vlprod: + # url: "https://victorialogs.prod.example.com:9428" + # basic_auth: + # username: "${VL_PROD_USER}" + # password: "${VL_PROD_PASS}" + # vldev: + # url: "http://victorialogs.dev.internal:9428" # ┌────────────────────────────────────────────────────────────────────────────┐ @@ -285,6 +301,7 @@ notifiers: # notify - Notification settings (required) # enabled - Enable/disable rule (optional, default: true) # throttle - Override default throttle (optional) +# vl_sources - List of VL source names to target (optional; default: all) # # Parser (at least one of regex or json should be configured): # json.fields - List of JSON fields to extract @@ -306,6 +323,7 @@ rules: key: "{{ host }}" # Throttle independently per host count: 3 window: 5m + # vl_sources: [default] # Optional: restrict to specific sources notify: template: default_alert destinations: diff --git a/docs/architecture.md b/docs/architecture.md index 8c5039e..5ae3916 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -51,10 +51,9 @@ src/ Valerter connects to 
VictoriaLogs' `/select/logsql/tail` endpoint via HTTP streaming. Unlike polling-based approaches: - **Real-time:** Logs arrive within seconds of being ingested -- **Efficient:** Single long-lived connection per rule +- **Efficient:** Single long-lived connection per `(rule, source)` pair - **Resilient:** Automatic reconnection with exponential backoff - -Each rule maintains its own streaming connection, providing isolation. +- **Multi-source:** Each rule fans out to every configured `victorialogs.` source (or to a named subset via `vl_sources:`), with one task per pair so an unhealthy source never blocks others. ### 2. Parse @@ -103,18 +102,19 @@ main.rs └── RuleEngine::run() │ └── JoinSet<()> - ├── rule_task("rule-1") ──► tail → parse → throttle → template → queue - ├── rule_task("rule-2") ──► tail → parse → throttle → template → queue - └── rule_task("rule-N") ──► ... + ├── rule_task("rule-1", "vlprod") ──► tail → parse → throttle → template → queue + ├── rule_task("rule-1", "vldev") ──► tail → parse → throttle → template → queue + ├── rule_task("rule-2", "vlprod") ──► tail → parse → throttle → template → queue + └── rule_task("rule-N", "") ──► ... ``` **Key properties:** -- **1 task per rule:** Rules are fully isolated via `JoinSet` -- **Error isolation:** One rule's failure doesn't affect others -- **Panic recovery:** Panicked rules are respawned after 5s delay (`PANIC_RESTART_DELAY`) -- **Graceful shutdown:** All tasks respect the `CancellationToken` -- **Metric:** `valerter_rule_panics_total{rule_name}` tracks panics per rule +- **1 task per `(rule, source)` pair:** rules and sources are both fully isolated via `JoinSet`. A rule with `vl_sources: [a, b]` against a config defining sources `{a, b, c}` spawns 2 tasks; a rule with no `vl_sources` spawns N (one per configured source). +- **Error isolation:** one task's failure doesn't affect others — neither sibling sources of the same rule, nor sibling rules of the same source. 
+- **Panic recovery:** panicked tasks are respawned after 5s delay (`PANIC_RESTART_DELAY`), keyed by `(rule, source)` so the right cancellation token and source config are restored. +- **Graceful shutdown:** all tasks respect the `CancellationToken` +- **Metric:** `valerter_rule_panics_total{rule_name}` tracks panics per rule. (Per-source label split is deferred to the v2.0.0 part 2 observability spec.) ## Reconnection Strategy diff --git a/docs/configuration.md b/docs/configuration.md index 077e931..512b056 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -102,26 +102,77 @@ templates: # Message templates (REQUIRED) rules: # Alert rules (REQUIRED, at least one) ``` -## VictoriaLogs Connection +## VictoriaLogs Sources (multi-source) + +`victorialogs` is a map of named sources. A single valerter instance can tail +multiple VL backends concurrently and route alerts per source. At least one +source is required. ```yaml victorialogs: - url: "http://victorialogs:9428" # REQUIRED + default: # Source name (used as `vl_source`) + url: "http://victorialogs:9428" # REQUIRED - # Optional: Basic Authentication - basic_auth: - username: "${VL_USER}" - password: "${VL_PASS}" + # Optional: Basic Authentication (per-source) + basic_auth: + username: "${VL_USER}" + password: "${VL_PASS}" + + # Optional: Custom headers (for tokens, API keys) + headers: + Authorization: "Bearer ${VL_TOKEN}" + + # Optional: TLS configuration + tls: + verify: true # Set to false for self-signed certs +``` - # Optional: Custom headers (for tokens, API keys) - headers: - Authorization: "Bearer ${VL_TOKEN}" +### Multi-source example - # Optional: TLS configuration - tls: - verify: true # Set to false for self-signed certs +```yaml +victorialogs: + vlprod: + url: "https://victorialogs.prod.example.com:9428" + basic_auth: + username: "${VL_PROD_USER}" + password: "${VL_PROD_PASS}" + vldev: + url: "http://victorialogs.dev.internal:9428" ``` +Rules can target a subset of sources via 
`vl_sources: [name, ...]`, or omit +the field to fan out across every configured source. The current source name +is exposed in templates as `{{ vl_source }}` (layer 1 templates, +`throttle.key`, and notifier-level layer 2 contexts). + +### Migration from v1.x (breaking change) + +The v1.x single-URL shape (`victorialogs.url: ...` at the top level) is +rejected at load with a clear error. Wrap your existing settings under a +named key (we recommend `default` for single-source deployments): + +```yaml +# Before (v1.x): +victorialogs: + url: "http://victorialogs:9428" + basic_auth: + username: "u" + password: "p" + +# After (v2.0+): +victorialogs: + default: + url: "http://victorialogs:9428" + basic_auth: + username: "u" + password: "p" +``` + +The default throttle key also changed from `{rule}:global` to +`{rule}-{source}:global` so multi-source buckets are isolated by default. To +preserve v1.x cross-source dedup, set `throttle.key: "{{ rule_name }}"` +explicitly on the rules that need it. + ### Reverse Proxy Configuration If VictoriaLogs is behind a reverse proxy (nginx, Traefik, etc.), you **must** disable buffering and caching for the `/select/logsql/tail` endpoint. Valerter uses HTTP streaming to receive logs in real-time, and proxy buffering will cause delays or connection issues. 
@@ -199,6 +250,7 @@ Variables come from the parser output plus built-in fields: | Variable | Description | |----------|-------------| | `rule_name` | Name of the rule that triggered | +| `vl_source` | Name of the VictoriaLogs source the event came from | | `_msg` | Original log message (from VictoriaLogs) | | `_time` | Log timestamp (raw from VictoriaLogs) | | `_stream` | Stream labels | @@ -206,10 +258,11 @@ Variables come from the parser output plus built-in fields: | `log_timestamp_formatted` | Human-readable timestamp (respects `timestamp_timezone` setting) | | Custom fields | Extracted by regex/JSON parser | -**Note:** `rule_name` is available in all template contexts: the top-level -template fields (`title`, `body`, `email_body_html`), the `throttle.key`, and -the notifier-level templates (`subject_template`, `body_template`). If an -event field happens to be named `rule_name`, the synthetic rule name wins. +**Note:** `rule_name` and `vl_source` are available in all template contexts: +the top-level template fields (`title`, `body`, `email_body_html`), the +`throttle.key`, and the notifier-level templates (`subject_template`, +`body_template`). If an event field happens to be named `rule_name` or +`vl_source`, the synthetic value wins. **Note:** `log_timestamp` and `log_timestamp_formatted` are available in: - Email subject and body templates @@ -243,6 +296,11 @@ rules: count: 3 window: 5m + vl_sources: [vlprod] # Optional: target specific sources + # Empty/omitted = fan out across all + # sources defined in `victorialogs:`. + # Unknown names rejected at load. 
+ notify: # REQUIRED template: "custom_template" # REQUIRED: template name destinations: # REQUIRED: at least one notifier diff --git a/docs/notifiers.md b/docs/notifiers.md index b8dcf1b..d15e7bc 100644 --- a/docs/notifiers.md +++ b/docs/notifiers.md @@ -66,6 +66,7 @@ If `body_template` is omitted, sends: { "alert_name": "", "rule_name": "...", + "vl_source": "", "title": "...", "body": "...", "timestamp": "", @@ -89,9 +90,14 @@ When using `body_template`, these variables are available: | `title` | Alert title | | `body` | Alert body | | `rule_name` | Name of the rule | +| `vl_source` | Name of the VictoriaLogs source the event came from | | `log_timestamp` | Original log timestamp (ISO 8601) | | `log_timestamp_formatted` | Human-readable timestamp | +`{{ vl_source }}` is available wherever `{{ rule_name }}` is, and follows the +same collision policy: an event field literally named `vl_source` is masked +by the synthetic source name. + ### Examples **Slack:** diff --git a/src/config/mod.rs b/src/config/mod.rs index b6cb481..b637122 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -23,7 +23,7 @@ pub use secret::SecretString; pub use types::{ BasicAuthConfig, Config, DEFAULT_CONFIG_PATH, DefaultsConfig, JsonParserConfig, MetricsConfig, NotifyConfig, ParserConfig, RuleConfig, TemplateConfig, ThrottleConfig, TlsConfig, - VictoriaLogsConfig, + VlSourceConfig, }; pub use validation::validate_template_render; diff --git a/src/config/runtime.rs b/src/config/runtime.rs index 08bfc4e..ef7ea20 100644 --- a/src/config/runtime.rs +++ b/src/config/runtime.rs @@ -2,18 +2,22 @@ use super::notifiers::NotifiersConfig; use super::types::{ - Config, DefaultsConfig, JsonParserConfig, MetricsConfig, NotifyConfig, VictoriaLogsConfig, + Config, DefaultsConfig, JsonParserConfig, MetricsConfig, NotifyConfig, VlSourceConfig, }; use crate::error::ConfigError; use regex::Regex; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::path::Path; use 
std::time::Duration; /// Runtime configuration with pre-compiled regex (FR15). +/// +/// `victorialogs` is a BTreeMap of named sources. BTreeMap (not HashMap) is +/// intentional: deterministic iteration order is required for spawn order, +/// diagnostic logs, and test assertions. #[derive(Debug)] pub struct RuntimeConfig { - pub victorialogs: VictoriaLogsConfig, + pub victorialogs: BTreeMap, pub defaults: DefaultsConfig, pub templates: HashMap, pub rules: Vec, @@ -31,6 +35,10 @@ pub struct CompiledRule { pub parser: CompiledParser, pub throttle: Option, pub notify: NotifyConfig, + /// VL sources this rule is bound to. Empty means "fan out across all + /// configured sources"; non-empty restricts to the named subset. All + /// names are validated against the top-level map at config load time. + pub vl_sources: Vec, } /// Parser with pre-compiled regex pattern. @@ -133,6 +141,7 @@ impl Config { window: t.window, }), notify: rule.notify, + vl_sources: rule.vl_sources, } }) .collect(); diff --git a/src/config/tests.rs b/src/config/tests.rs index 59bc574..de15c1b 100644 --- a/src/config/tests.rs +++ b/src/config/tests.rs @@ -19,8 +19,12 @@ fn fixture_path(name: &str) -> PathBuf { fn load_valid_config() { let config = Config::load(&fixture_path("config_valid.yaml")).unwrap(); - // VictoriaLogs settings - assert_eq!(config.victorialogs.url, "http://victorialogs:9428"); + // VictoriaLogs settings (multi-source map — single `default` source) + assert_eq!(config.victorialogs.len(), 1); + assert_eq!( + config.victorialogs.get("default").unwrap().url, + "http://victorialogs:9428" + ); // Defaults assert_eq!(config.defaults.throttle.count, 5); @@ -96,7 +100,10 @@ fn config_example_yaml_is_valid() { .join("config.example.yaml"); let config = Config::load(&example_path).expect("config.example.yaml should be valid"); - assert!(!config.victorialogs.url.is_empty()); + assert!(!config.victorialogs.is_empty()); + for source in config.victorialogs.values() { + 
assert!(!source.url.is_empty()); + } assert!(!config.templates.is_empty()); assert!(!config.rules.is_empty()); } @@ -148,7 +155,8 @@ fn validate_invalid_template_returns_error() { fn validate_no_rules_fails() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -244,7 +252,10 @@ fn compile_preserves_config_values() { let runtime = config.compile(&path).unwrap(); - assert_eq!(runtime.victorialogs.url, "http://victorialogs:9428"); + assert_eq!( + runtime.victorialogs.get("default").unwrap().url, + "http://victorialogs:9428" + ); assert_eq!(runtime.defaults.throttle.count, 5); assert!(runtime.templates.contains_key("default_alert")); } @@ -257,8 +268,9 @@ fn compile_preserves_config_values() { fn load_config_with_basic_auth() { let config = Config::load(&fixture_path("config_with_auth.yaml")).unwrap(); - assert!(config.victorialogs.basic_auth.is_some()); - let basic_auth = config.victorialogs.basic_auth.as_ref().unwrap(); + let source = config.victorialogs.get("default").unwrap(); + assert!(source.basic_auth.is_some()); + let basic_auth = source.basic_auth.as_ref().unwrap(); assert_eq!(basic_auth.username, "testuser"); assert_eq!(basic_auth.password.expose(), "testpassword"); } @@ -267,8 +279,9 @@ fn load_config_with_basic_auth() { fn load_config_with_headers() { let config = Config::load(&fixture_path("config_with_auth.yaml")).unwrap(); - assert!(config.victorialogs.headers.is_some()); - let headers = config.victorialogs.headers.as_ref().unwrap(); + let source = config.victorialogs.get("default").unwrap(); + assert!(source.headers.is_some()); + let headers = source.headers.as_ref().unwrap(); assert_eq!(headers.len(), 2); assert_eq!( headers.get("X-API-Key").unwrap().expose(), @@ -280,8 +293,9 @@ fn load_config_with_headers() { fn load_config_with_tls_verify_false() { let config = Config::load(&fixture_path("config_with_auth.yaml")).unwrap(); - 
assert!(config.victorialogs.tls.is_some()); - assert!(!config.victorialogs.tls.as_ref().unwrap().verify); + let source = config.victorialogs.get("default").unwrap(); + assert!(source.tls.is_some()); + assert!(!source.tls.as_ref().unwrap().verify); } #[test] @@ -324,11 +338,18 @@ fn load_config_without_notifiers_section_loads_none() { fn make_runtime_config_with_destinations(destinations: Vec) -> RuntimeConfig { RuntimeConfig { - victorialogs: VictoriaLogsConfig { - url: "http://localhost:9428".to_string(), - basic_auth: None, - headers: None, - tls: None, + victorialogs: { + let mut m = std::collections::BTreeMap::new(); + m.insert( + "default".to_string(), + VlSourceConfig { + url: "http://localhost:9428".to_string(), + basic_auth: None, + headers: None, + tls: None, + }, + ); + m }, defaults: DefaultsConfig { throttle: ThrottleConfig { @@ -365,6 +386,7 @@ fn make_runtime_config_with_destinations(destinations: Vec) -> RuntimeCo mattermost_channel: None, destinations, }, + vl_sources: Vec::new(), }], metrics: MetricsConfig::default(), notifiers: Some(std::collections::HashMap::new()), @@ -430,11 +452,18 @@ fn rules_use_defaults_when_not_specified() { #[test] fn validate_collects_all_errors() { let config = Config { - victorialogs: VictoriaLogsConfig { - url: "http://localhost:9428".to_string(), - basic_auth: None, - headers: None, - tls: None, + victorialogs: { + let mut m = std::collections::BTreeMap::new(); + m.insert( + "default".to_string(), + VlSourceConfig { + url: "http://localhost:9428".to_string(), + basic_auth: None, + headers: None, + tls: None, + }, + ); + m }, defaults: DefaultsConfig { throttle: ThrottleConfig { @@ -472,6 +501,7 @@ fn validate_collects_all_errors() { mattermost_channel: None, destinations: vec!["test".to_string()], }, + vl_sources: Vec::new(), }, RuleConfig { name: "rule2_invalid".to_string(), @@ -487,6 +517,7 @@ fn validate_collects_all_errors() { mattermost_channel: None, destinations: vec!["test".to_string()], }, + vl_sources: 
Vec::new(), }, ], metrics: MetricsConfig::default(), @@ -507,11 +538,18 @@ fn validate_collects_all_errors() { #[test] fn validate_throttle_key_template() { let config = Config { - victorialogs: VictoriaLogsConfig { - url: "http://localhost:9428".to_string(), - basic_auth: None, - headers: None, - tls: None, + victorialogs: { + let mut m = std::collections::BTreeMap::new(); + m.insert( + "default".to_string(), + VlSourceConfig { + url: "http://localhost:9428".to_string(), + basic_auth: None, + headers: None, + tls: None, + }, + ); + m }, defaults: DefaultsConfig { throttle: ThrottleConfig { @@ -552,6 +590,7 @@ fn validate_throttle_key_template() { mattermost_channel: None, destinations: vec!["test".to_string()], }, + vl_sources: Vec::new(), }], metrics: MetricsConfig::default(), notifiers: None, @@ -570,11 +609,18 @@ fn validate_throttle_key_template() { #[test] fn validate_nonexistent_notify_template_fails() { let config = Config { - victorialogs: VictoriaLogsConfig { - url: "http://localhost:9428".to_string(), - basic_auth: None, - headers: None, - tls: None, + victorialogs: { + let mut m = std::collections::BTreeMap::new(); + m.insert( + "default".to_string(), + VlSourceConfig { + url: "http://localhost:9428".to_string(), + basic_auth: None, + headers: None, + tls: None, + }, + ); + m }, defaults: DefaultsConfig { throttle: ThrottleConfig { @@ -611,6 +657,7 @@ fn validate_nonexistent_notify_template_fails() { mattermost_channel: None, destinations: vec!["test".to_string()], }, + vl_sources: Vec::new(), }], metrics: MetricsConfig::default(), notifiers: None, @@ -630,9 +677,23 @@ fn validate_nonexistent_notify_template_fails() { #[test] fn load_config_without_auth_options() { let config = Config::load(&fixture_path("config_valid.yaml")).unwrap(); - assert!(config.victorialogs.basic_auth.is_none()); - assert!(config.victorialogs.headers.is_none()); - assert!(config.victorialogs.tls.is_none()); + assert!( + config + .victorialogs + .get("default") + .unwrap() + 
.basic_auth + .is_none() + ); + assert!( + config + .victorialogs + .get("default") + .unwrap() + .headers + .is_none() + ); + assert!(config.victorialogs.get("default").unwrap().tls.is_none()); } #[test] @@ -713,7 +774,8 @@ fn load_config_with_unknown_notifier_type_fails() { fn validate_email_body_html_syntax_error_detected() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -750,7 +812,8 @@ rules: [] fn parse_rejects_old_body_html_field_name() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -788,7 +851,8 @@ rules: [] fn validate_config_with_invalid_accent_color_fails() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -818,7 +882,8 @@ rules: [] fn validate_config_with_short_hex_fails() { let yaml = r##" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -848,7 +913,8 @@ rules: [] fn validate_config_with_valid_accent_color_passes() { let yaml = r##" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -881,7 +947,8 @@ rules: fn validate_template_render_in_body_detects_unknown_filter() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -914,7 +981,8 @@ rules: [] fn validate_valid_templates_pass() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -945,11 +1013,18 @@ rules: #[test] fn validate_rule_destinations_collects_all_errors() { let config = RuntimeConfig { - victorialogs: VictoriaLogsConfig { - url: "http://localhost:9428".to_string(), - basic_auth: None, - headers: None, - tls: None, + 
victorialogs: { + let mut m = std::collections::BTreeMap::new(); + m.insert( + "default".to_string(), + VlSourceConfig { + url: "http://localhost:9428".to_string(), + basic_auth: None, + headers: None, + tls: None, + }, + ); + m }, defaults: DefaultsConfig { throttle: ThrottleConfig { @@ -987,6 +1062,7 @@ fn validate_rule_destinations_collects_all_errors() { mattermost_channel: None, destinations: vec!["unknown-1".to_string()], }, + vl_sources: Vec::new(), }, CompiledRule { name: "rule_2".to_string(), @@ -1002,6 +1078,7 @@ fn validate_rule_destinations_collects_all_errors() { mattermost_channel: None, destinations: vec!["unknown-2".to_string()], }, + vl_sources: Vec::new(), }, ], metrics: MetricsConfig::default(), @@ -1277,7 +1354,8 @@ fn load_with_notifier_collision_fails() { fn defaults_notify_is_rejected() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1317,7 +1395,8 @@ rules: fn unknown_field_rejected_in_config_root() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1358,8 +1437,9 @@ unknown_root_field: "should fail" fn unknown_field_rejected_in_victorialogs_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 - unknown_vl_field: "should fail" + default: + url: http://localhost:9428 + unknown_vl_field: "should fail" notifiers: test: type: mattermost @@ -1399,7 +1479,8 @@ rules: fn unknown_field_rejected_in_metrics_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 metrics: enabled: true port: 9090 @@ -1443,7 +1524,8 @@ rules: fn unknown_field_rejected_in_throttle_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1484,7 +1566,8 @@ rules: fn unknown_field_rejected_in_template_config() { let yaml = r#" victorialogs: - 
url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1525,7 +1608,8 @@ rules: fn unknown_field_rejected_in_rule_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1563,7 +1647,8 @@ rules: fn unknown_field_rejected_in_parser_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1602,7 +1687,8 @@ rules: fn unknown_field_rejected_in_json_parser_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1643,7 +1729,8 @@ rules: fn unknown_field_rejected_in_mattermost_notifier() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: mattermost @@ -1684,7 +1771,8 @@ rules: fn unknown_field_rejected_in_webhook_notifier() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: webhook @@ -1726,7 +1814,8 @@ rules: fn unknown_field_rejected_in_email_notifier() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: email @@ -1772,7 +1861,8 @@ rules: fn unknown_field_rejected_in_smtp_config() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: email @@ -1822,7 +1912,8 @@ rules: fn invalid_url_rejected_in_victorialogs() { let yaml = r#" victorialogs: - url: "not-a-valid-url" + default: + url: "not-a-valid-url" notifiers: test: type: mattermost @@ -1857,7 +1948,8 @@ rules: fn invalid_url_rejected_in_webhook_notifier() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: test: type: webhook @@ -1893,7 +1985,8 @@ rules: fn 
telegram_notifier_with_empty_chat_ids_fails_validation() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: telegram-broken: type: telegram @@ -1934,7 +2027,8 @@ rules: fn telegram_notifier_with_populated_chat_ids_passes_validation() { let yaml = r#" victorialogs: - url: http://localhost:9428 + default: + url: http://localhost:9428 notifiers: telegram-ok: type: telegram @@ -1965,3 +2059,280 @@ rules: "Telegram notifier with non-empty chat_ids should validate" ); } + +// ============================================================ +// v2.0.0: multi-source config schema tests +// ============================================================ + +/// Shared YAML tail reused by the multi-source validation suite. Isolates +/// source-shape changes from notifier / template / rule noise. +const MULTI_SOURCE_TAIL: &str = r#" +notifiers: + mm: + type: mattermost + webhook_url: "https://mattermost.example.com/hooks/test" +defaults: + throttle: { count: 5, window: 60s } +templates: + default: + title: "t" + body: "b" +rules: + - name: r + query: "*" + parser: { json: { fields: [x] } } + notify: { template: default, destinations: [mm] } +"#; + +#[test] +fn schema_parses_map_with_single_source() { + let yaml = format!( + r#" +victorialogs: + default: + url: "http://localhost:9428" +{}"#, + MULTI_SOURCE_TAIL + ); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + assert_eq!(config.victorialogs.len(), 1); + assert!(config.victorialogs.contains_key("default")); + assert!(config.validate().is_ok()); +} + +#[test] +fn schema_parses_map_with_multiple_sources() { + let yaml = format!( + r#" +victorialogs: + vlprod: + url: "https://vl.prod.example.com:9428" + vldev: + url: "http://vl.dev.internal:9428" +{}"#, + MULTI_SOURCE_TAIL + ); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + assert_eq!(config.victorialogs.len(), 2); + assert!(config.victorialogs.contains_key("vlprod")); + 
assert!(config.victorialogs.contains_key("vldev")); +} + +#[test] +fn schema_parses_rule_vl_sources_field() { + let yaml = r#" +victorialogs: + vlprod: + url: "http://vlprod:9428" + vldev: + url: "http://vldev:9428" +notifiers: + mm: + type: mattermost + webhook_url: "https://mattermost.example.com/hooks/test" +defaults: + throttle: { count: 5, window: 60s } +templates: + default: + title: "t" + body: "b" +rules: + - name: prod_only + query: "*" + parser: { json: { fields: [x] } } + vl_sources: [vlprod] + notify: { template: default, destinations: [mm] } + - name: fan_out + query: "*" + parser: { json: { fields: [x] } } + notify: { template: default, destinations: [mm] } +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + assert_eq!(config.rules.len(), 2); + assert_eq!(config.rules[0].vl_sources, vec!["vlprod".to_string()]); + assert!(config.rules[1].vl_sources.is_empty()); // default = fan-out + assert!(config.validate().is_ok()); +} + +#[test] +fn validate_rejects_zero_sources() { + let yaml = format!( + r#" +victorialogs: {{}} +{}"#, + MULTI_SOURCE_TAIL + ); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + let errors = config.validate().expect_err("zero sources must reject"); + assert!( + errors + .iter() + .any(|e| e.to_string().contains("at least one source required")), + "expected 'at least one source required' error, got: {:?}", + errors + ); +} + +#[test] +fn validate_rejects_unknown_vl_sources_ref() { + let yaml = r#" +victorialogs: + vlprod: + url: "http://vlprod:9428" +notifiers: + mm: + type: mattermost + webhook_url: "https://mattermost.example.com/hooks/test" +defaults: + throttle: { count: 5, window: 60s } +templates: + default: + title: "t" + body: "b" +rules: + - name: bad_ref + query: "*" + parser: { json: { fields: [x] } } + vl_sources: [vlprod, missing_source] + notify: { template: default, destinations: [mm] } +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + let errors = config + .validate() + 
.expect_err("unknown source ref must reject"); + let msg = errors + .iter() + .map(|e| e.to_string()) + .collect::<Vec<_>>() + .join("\n"); + assert!( + msg.contains("unknown source 'missing_source'"), + "msg: {}", + msg + ); + assert!( + msg.contains("vlprod"), + "error must list known sources; msg: {}", + msg + ); +} + +#[test] +fn validate_rejects_invalid_source_name() { + // Source name with `-` would create ambiguity in the default throttle key + // `{rule}-{source}:global`. Restricted to `^[a-zA-Z0-9_]+$`. + let yaml = r#" +victorialogs: + prod-eu: + url: "http://vl:9428" +notifiers: + mm: + type: mattermost + webhook_url: "https://mattermost.example.com/hooks/test" +defaults: + throttle: { count: 5, window: 60s } +templates: + default: + title: "t" + body: "b" +rules: + - name: r + query: "*" + parser: { json: { fields: [x] } } + notify: { template: default, destinations: [mm] } +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + let errors = config + .validate() + .expect_err("source name with `-` must reject"); + let msg = errors + .iter() + .map(|e| e.to_string()) + .collect::<Vec<_>>() + .join("\n"); + assert!( + msg.contains("source name 'prod-eu' is invalid"), + "msg: {}", + msg + ); + assert!( + msg.contains("[a-zA-Z0-9_]"), + "msg must hint at allowed chars: {}", + msg + ); +} + +#[test] +fn validate_rejects_duplicate_vl_sources_entry() { + let yaml = r#" +victorialogs: + vlprod: + url: "http://vl:9428" +notifiers: + mm: + type: mattermost + webhook_url: "https://mattermost.example.com/hooks/test" +defaults: + throttle: { count: 5, window: 60s } +templates: + default: + title: "t" + body: "b" +rules: + - name: dupe + query: "*" + parser: { json: { fields: [x] } } + vl_sources: [vlprod, vlprod] + notify: { template: default, destinations: [mm] } +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + let errors = config + .validate() + .expect_err("duplicate vl_sources must reject"); + let msg = errors + .iter() + .map(|e| e.to_string()) + 
.collect::<Vec<_>>() + .join("\n"); + assert!( + msg.contains("duplicate entry 'vlprod'"), + "expected duplicate entry mention, got: {}", + msg + ); +} + +#[test] +fn load_rejects_legacy_single_url_shape_with_migration_message() { + let legacy_yaml = r#" +victorialogs: + url: "http://victorialogs:9428" +notifiers: + mm: + type: mattermost + webhook_url: "https://mattermost.example.com/hooks/test" +defaults: + throttle: { count: 5, window: 60s } +templates: + default: + title: "t" + body: "b" +rules: + - name: r + query: "*" + parser: { json: { fields: [x] } } + notify: { template: default, destinations: [mm] } +"#; + let err = serde_yaml::from_str::<Config>(legacy_yaml) + .expect_err("legacy single-URL shape must fail to parse"); + let err_str = err.to_string(); + assert!( + err_str.contains("map of named sources"), + "expected migration hint mentioning 'map of named sources', got: {}", + err_str + ); + assert!( + err_str.contains("victorialogs:"), + "expected migration YAML snippet in error, got: {}", + err_str + ); +} diff --git a/src/config/types.rs b/src/config/types.rs index 952d1f6..29eefe7 100644 --- a/src/config/types.rs +++ b/src/config/types.rs @@ -6,7 +6,7 @@ use super::validation::{validate_hex_color, validate_jinja_template, validate_ur use crate::error::ConfigError; use regex::Regex; use serde::Deserialize; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::path::{Path, PathBuf}; use std::time::Duration; @@ -14,11 +14,16 @@ use std::time::Duration; pub const DEFAULT_CONFIG_PATH: &str = "/etc/valerter/config.yaml"; /// Main configuration structure for valerter. +/// +/// `victorialogs` is a map of named source configurations. A BTreeMap is used +/// explicitly (never HashMap) so iteration over sources is deterministic — +/// crucial for test assertions, diagnostic logs, and spawn order. #[derive(Debug, Deserialize)] #[serde(deny_unknown_fields)] pub struct Config { - /// VictoriaLogs connection settings. 
- pub victorialogs: VictoriaLogsConfig, + /// Named VictoriaLogs sources. At least one is required (enforced at validate()). + #[serde(deserialize_with = "deserialize_vl_sources")] + pub victorialogs: BTreeMap<String, VlSourceConfig>, /// Default values for throttle and notify. pub defaults: DefaultsConfig, /// Reusable message templates. @@ -35,10 +40,13 @@ pub struct Config { pub notifiers: Option, } -/// VictoriaLogs connection configuration. +/// Configuration for a single named VictoriaLogs source. +/// +/// Renamed from `VictoriaLogsConfig` in v2.0.0 (multi-source support). The +/// field set is unchanged; only the containing map structure changed. #[derive(Debug, Clone, Deserialize)] #[serde(deny_unknown_fields)] -pub struct VictoriaLogsConfig { +pub struct VlSourceConfig { /// URL of the VictoriaLogs instance. pub url: String, /// Optional Basic Auth credentials. @@ -52,6 +60,62 @@ pub struct VlSourceConfig { pub tls: Option<TlsConfig>, } +/// Legacy single-URL `victorialogs` shape. Kept for detection only so we can +/// emit a precise migration error when users upgrade from v1.x. +/// +/// Intentionally does NOT set `deny_unknown_fields`: v1 users with a forked +/// build or an extra field still land on the migration error rather than a +/// cryptic v2 parse failure. +#[derive(Debug, Deserialize)] +struct LegacyVictoriaLogsConfig { + #[allow(dead_code)] + url: String, + #[serde(default)] + #[allow(dead_code)] + basic_auth: Option<BasicAuthConfig>, + #[serde(default)] + #[allow(dead_code)] + headers: Option<HashMap<String, String>>, + #[serde(default)] + #[allow(dead_code)] + tls: Option<TlsConfig>, +} + +/// Migration error text pointing users from the v1 single-URL shape to the +/// v2 map shape. Exposed so tests can assert wording. 
+pub(crate) const LEGACY_VL_MIGRATION_MESSAGE: &str = "`victorialogs` is now a map of named sources (breaking change in v2.0.0).\n\nMigrate from:\n victorialogs:\n url: \"http://...\"\n basic_auth:\n username: \"u\"\n password: \"p\"\nTo:\n victorialogs:\n default:\n url: \"http://...\"\n basic_auth:\n username: \"u\"\n password: \"p\"\n\nThen optionally target sources per rule via `vl_sources: [default]` (or omit to fan out across all sources). See CHANGELOG v2.0.0 for the full migration note."; + +/// Deserialize `victorialogs` as `BTreeMap<String, VlSourceConfig>`, but +/// emit a migration-oriented error when the legacy single-object shape +/// (`url: ...` at the `victorialogs` level) is detected. +fn deserialize_vl_sources<'de, D>( + deserializer: D, +) -> Result<BTreeMap<String, VlSourceConfig>, D::Error> +where + D: serde::Deserializer<'de>, +{ + let raw = serde_yaml::Value::deserialize(deserializer)?; + + // Legacy-shape detection: a YAML mapping that parses cleanly as the v1 + // struct is the old shape. The new shape's outer map has named keys whose + // values are objects — these fail the v1 struct parse due to + // `deny_unknown_fields`. + if serde_yaml::from_value::<LegacyVictoriaLogsConfig>(raw.clone()).is_ok() { + return Err(serde::de::Error::custom(LEGACY_VL_MIGRATION_MESSAGE)); + } + + serde_yaml::from_value::<BTreeMap<String, VlSourceConfig>>(raw) + .map_err(serde::de::Error::custom) +} + +/// A VictoriaLogs source name is valid if it is non-empty and contains only +/// alphanumeric ASCII characters or underscores. This restriction guarantees +/// the default throttle key `{rule}-{source}:global` parses unambiguously and +/// avoids collisions when rule and source names share the `-` separator. +fn is_valid_source_name(name: &str) -> bool { + !name.is_empty() && name.chars().all(|c| c.is_ascii_alphanumeric() || c == '_') +} + /// Basic Auth configuration for VictoriaLogs connection. /// /// Both `username` and `password` are required when Basic Auth is configured. 
@@ -174,6 +238,12 @@ pub struct RuleConfig { #[serde(default)] pub throttle: Option, pub notify: NotifyConfig, + /// Optional list of VictoriaLogs source names to target. An empty list (the + /// default) means "fan out across every configured source". All listed + /// names must exist in the top-level `victorialogs` map; unknown refs are + /// rejected at `Config::validate()` time. + #[serde(default)] + pub vl_sources: Vec<String>, } /// Rule configuration without the `name` field, for deserializing `.d/` files. @@ -188,6 +258,8 @@ struct RuleConfigWithoutName { #[serde(default)] pub throttle: Option, pub notify: NotifyConfig, + #[serde(default)] + pub vl_sources: Vec<String>, } impl RuleConfigWithoutName { @@ -200,6 +272,7 @@ impl RuleConfigWithoutName { parser: self.parser, throttle: self.throttle, notify: self.notify, + vl_sources: self.vl_sources, } } } @@ -468,14 +541,54 @@ impl Config { pub fn validate(&self) -> Result<(), Vec<ConfigError>> { let mut errors = Vec::new(); - // ===== URL validations ===== + // ===== VictoriaLogs source validations ===== - // Validate victorialogs.url - if let Err(e) = validate_url(&self.victorialogs.url) { - errors.push(ConfigError::ValidationError(format!( - "victorialogs.url: {}", - e - ))); + // At least one source is required (zero-source rejection). + if self.victorialogs.is_empty() { + errors.push(ConfigError::ValidationError( + "victorialogs: at least one source required (define e.g. `victorialogs: { default: { url: \"http://...\" } }`)" + .to_string(), + )); + } + + // Validate each source's URL and name format. + for (source_name, source) in &self.victorialogs { + if let Err(e) = validate_url(&source.url) { + errors.push(ConfigError::ValidationError(format!( + "victorialogs.{}.url: {}", + source_name, e + ))); + } + if !is_valid_source_name(source_name) { + errors.push(ConfigError::ValidationError(format!( + "victorialogs source name '{}' is invalid: must match `^[a-zA-Z0-9_]+$` (alphanumeric or underscore). 
\ + This avoids ambiguity in the default throttle key `{{rule}}-{{source}}:global`.", + source_name + ))); + } + } + + // Validate that every rule.vl_sources entry references a declared source + // and that the rule's vl_sources list contains no duplicates. + let known_sources: Vec<&str> = self.victorialogs.keys().map(String::as_str).collect(); + for rule in &self.rules { + let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new(); + for referenced in &rule.vl_sources { + if !self.victorialogs.contains_key(referenced) { + errors.push(ConfigError::ValidationError(format!( + "rule '{}': vl_sources references unknown source '{}' (known sources: [{}])", + rule.name, + referenced, + known_sources.join(", ") + ))); + } + if !seen.insert(referenced.as_str()) { + errors.push(ConfigError::ValidationError(format!( + "rule '{}': vl_sources contains duplicate entry '{}' (each source may appear at most once)", + rule.name, referenced + ))); + } + } } // Validate notifier URLs diff --git a/src/engine.rs b/src/engine.rs index 0d527f3..cb03c39 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -32,7 +32,7 @@ //! engine.run(cancel).await?; //! ``` -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; use std::time::Duration; @@ -40,9 +40,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{Instrument, debug, error, info, trace, warn}; -use crate::config::{ - BasicAuthConfig, CompiledRule, CompiledThrottle, RuntimeConfig, SecretString, TlsConfig, -}; +use crate::config::{CompiledRule, CompiledThrottle, RuntimeConfig, VlSourceConfig}; use crate::error::RuleError; use crate::notify::{AlertPayload, NotificationQueue}; use crate::parser::{RuleParser, record_log_matched, record_parse_error}; @@ -53,15 +51,39 @@ use crate::throttle::{ThrottleResult, Throttler}; /// Delay before restarting a rule after panic (AD-07 inspired). 
const PANIC_RESTART_DELAY: Duration = Duration::from_secs(5); -/// Context needed to spawn a rule task. +/// Resolve the set of `(source_name, source_config)` pairs to spawn for a rule. +/// +/// - Empty `rule.vl_sources`: fan out across every configured source. +/// - Non-empty `rule.vl_sources`: restrict to the named subset, preserving +/// BTreeMap iteration order so the spawn order is deterministic regardless +/// of the order names appear in the rule's list. +/// +/// Unknown source names must have been rejected at `Config::validate()` time; +/// any mismatch here is silently ignored (treated as defensive filtering). +pub(crate) fn resolve_sources( + rule: &CompiledRule, + sources: &BTreeMap<String, VlSourceConfig>, +) -> Vec<(String, VlSourceConfig)> { + if rule.vl_sources.is_empty() { + return sources + .iter() + .map(|(name, cfg)| (name.clone(), cfg.clone())) + .collect(); + } + sources + .iter() + .filter(|(name, _)| rule.vl_sources.iter().any(|r| r == *name)) + .map(|(name, cfg)| (name.clone(), cfg.clone())) + .collect() +} + +/// Context needed to spawn a single `(rule, source)` task. /// Stored to allow respawning after panic. #[derive(Clone)] struct RuleSpawnContext { rule: CompiledRule, - vl_url: String, - vl_basic_auth: Option<BasicAuthConfig>, - vl_headers: Option<HashMap<String, String>>, - vl_tls: Option<TlsConfig>, + vl_source_name: String, + vl_source_config: VlSourceConfig, queue: NotificationQueue, template_engine: Arc<TemplateEngine>, default_throttle: CompiledThrottle, @@ -120,23 +142,24 @@ impl RuleEngine { /// /// Returns `Ok(())` when cancelled, or propagates fatal errors. pub async fn run(&self, cancel: CancellationToken) -> Result<(), RuleError> { - let mut tasks: JoinSet<(String, Result<(), RuleError>)> = JoinSet::new(); - // Map AbortHandle ID to (rule_name, spawn_context) for respawn after panic - let mut handle_to_context: HashMap = + let mut tasks: JoinSet<(String, String, Result<(), RuleError>)> = JoinSet::new(); + // Map AbortHandle ID to (rule_name, vl_source_name, spawn_context) for + // respawn after panic.
Each `(rule, source)` pair is a distinct task. + let mut handle_to_context: HashMap = HashMap::new(); - // Spawn a task per enabled rule - let enabled_count = + // Spawn a task per (enabled rule, resolved source) pair + let spawned_count = self.spawn_rule_tasks(&mut tasks, &mut handle_to_context, cancel.clone()); - if enabled_count == 0 { + if spawned_count == 0 { warn!("No enabled rules found, engine will exit"); return Ok(()); } info!( - rule_count = enabled_count, - "Rule engine started, supervising rules" + task_count = spawned_count, + "Rule engine started, supervising rule-source tasks" ); // Supervision loop @@ -144,13 +167,13 @@ impl RuleEngine { .await } - /// Spawn tasks for all enabled rules. + /// Spawn tasks for all enabled `(rule, source)` pairs. /// - /// Returns the number of enabled rules spawned. + /// Returns the number of tasks spawned. fn spawn_rule_tasks( &self, - tasks: &mut JoinSet<(String, Result<(), RuleError>)>, - handle_to_context: &mut HashMap, + tasks: &mut JoinSet<(String, String, Result<(), RuleError>)>, + handle_to_context: &mut HashMap, cancel: CancellationToken, ) -> usize { let mut count = 0; @@ -171,72 +194,91 @@ impl RuleEngine { continue; } - debug!(rule_name = %rule.name, "Spawning rule task"); - - let ctx = RuleSpawnContext { - rule: rule.clone(), - vl_url: self.runtime_config.victorialogs.url.clone(), - vl_basic_auth: self.runtime_config.victorialogs.basic_auth.clone(), - vl_headers: self.runtime_config.victorialogs.headers.clone(), - vl_tls: self.runtime_config.victorialogs.tls.clone(), - queue: self.queue.clone(), - template_engine: Arc::clone(&template_engine), - default_throttle: default_throttle.clone(), - timestamp_timezone: self.runtime_config.defaults.timestamp_timezone.clone(), - }; - - let rule_name = rule.name.clone(); - Self::spawn_single_rule(tasks, handle_to_context, &ctx, &rule_name, cancel.clone()); - count += 1; + let resolved = resolve_sources(rule, &self.runtime_config.victorialogs); + if 
resolved.is_empty() { + // Should not happen: validation rejects zero sources at load. + warn!( + rule_name = %rule.name, + "Rule resolved to zero sources, skipping" + ); + continue; + } + + for (source_name, source_cfg) in resolved { + trace!( + rule_name = %rule.name, + vl_source = %source_name, + "Spawning (rule, source) task" + ); + + let ctx = RuleSpawnContext { + rule: rule.clone(), + vl_source_name: source_name.clone(), + vl_source_config: source_cfg, + queue: self.queue.clone(), + template_engine: Arc::clone(&template_engine), + default_throttle: default_throttle.clone(), + timestamp_timezone: self.runtime_config.defaults.timestamp_timezone.clone(), + }; + + Self::spawn_single_rule(tasks, handle_to_context, &ctx, cancel.clone()); + count += 1; + } } count } - /// Spawn a single rule task and track its handle. + /// Spawn a single `(rule, source)` task and track its handle. fn spawn_single_rule( - tasks: &mut JoinSet<(String, Result<(), RuleError>)>, - handle_to_context: &mut HashMap, + tasks: &mut JoinSet<(String, String, Result<(), RuleError>)>, + handle_to_context: &mut HashMap, ctx: &RuleSpawnContext, - rule_name: &str, cancel: CancellationToken, ) { - let rule_name_owned = rule_name.to_string(); + let rule_name_owned = ctx.rule.name.clone(); + let vl_source_owned = ctx.vl_source_name.clone(); let ctx_clone = ctx.clone(); let abort_handle = tasks.spawn(async move { + let rule_name_for_return = ctx_clone.rule.name.clone(); + let vl_source_for_return = ctx_clone.vl_source_name.clone(); let result = run_rule(ctx_clone, cancel).await; - (rule_name_owned, result) + (rule_name_for_return, vl_source_for_return, result) }); // Track context by task ID for respawn - handle_to_context.insert(abort_handle.id(), (rule_name.to_string(), ctx.clone())); + handle_to_context.insert( + abort_handle.id(), + (rule_name_owned, vl_source_owned, ctx.clone()), + ); } /// Supervise running tasks and handle completion/errors/panics. 
async fn supervise_tasks( &self, - tasks: &mut JoinSet<(String, Result<(), RuleError>)>, - handle_to_context: &mut HashMap, + tasks: &mut JoinSet<(String, String, Result<(), RuleError>)>, + handle_to_context: &mut HashMap, cancel: CancellationToken, ) -> Result<(), RuleError> { loop { tokio::select! { Some(result) = tasks.join_next_with_id() => { match result { - Ok((task_id, (rule_name, Ok(_)))) => { + Ok((task_id, (rule_name, vl_source, Ok(_)))) => { // Task completed normally (shouldn't happen - rules run forever) - info!(rule_name = %rule_name, "Rule task completed normally"); + info!(rule_name = %rule_name, vl_source = %vl_source, "Rule task completed normally"); handle_to_context.remove(&task_id); } - Ok((task_id, (rule_name, Err(e)))) => { - // Fatal error from rule + Ok((task_id, (rule_name, vl_source, Err(e)))) => { + // Fatal error from rule — scoped to this (rule, source) pair only. + // Other pairs keep running (per-source isolation). error!( rule_name = %rule_name, + vl_source = %vl_source, error = %e, "Rule task failed fatally" ); - // Don't restart - fatal errors are not recoverable metrics::counter!( "valerter_rule_errors_total", "rule_name" => rule_name @@ -244,46 +286,40 @@ impl RuleEngine { handle_to_context.remove(&task_id); } Err(join_error) if join_error.is_panic() => { - // Get task ID from the JoinError let task_id = join_error.id(); - // PANIC - extract rule info from our tracking map - if let Some((rule_name, ctx)) = handle_to_context.remove(&task_id) { - // Log CRITICAL with rule_name (Fix H3) + if let Some((rule_name, vl_source, ctx)) = handle_to_context.remove(&task_id) { error!( rule_name = %rule_name, + vl_source = %vl_source, error = %join_error, "Rule task panicked - CRITICAL" ); - // Increment metric with rule_name label (Fix H2) metrics::counter!( "valerter_rule_panics_total", "rule_name" => rule_name.clone() ).increment(1); - // Respawn after delay if not cancelled (Fix H1) if !cancel.is_cancelled() { info!( rule_name = 
%rule_name, + vl_source = %vl_source, delay_secs = PANIC_RESTART_DELAY.as_secs(), - "Respawning rule after panic delay" + "Respawning rule-source task after panic delay" ); tokio::time::sleep(PANIC_RESTART_DELAY).await; - // Respawn the task if !cancel.is_cancelled() { Self::spawn_single_rule( tasks, handle_to_context, &ctx, - &rule_name, cancel.clone(), ); - info!(rule_name = %rule_name, "Rule respawned after panic"); + info!(rule_name = %rule_name, vl_source = %vl_source, "Rule-source task respawned after panic"); } } } else { - // Should never happen, but log if it does error!( error = %join_error, "Rule task panicked but context not found - CRITICAL" @@ -292,14 +328,12 @@ impl RuleEngine { } } Err(join_error) => { - // Task was cancelled - get ID and clean up let task_id = join_error.id(); tracing::debug!(error = %join_error, "Rule task cancelled"); handle_to_context.remove(&task_id); } } - // If no tasks remain and not cancelled, exit if tasks.is_empty() && !cancel.is_cancelled() { warn!("All rule tasks completed unexpectedly"); return Ok(()); @@ -309,7 +343,6 @@ impl RuleEngine { info!("Shutdown signal received, aborting all rules"); tasks.abort_all(); - // Drain remaining tasks while tasks.join_next().await.is_some() {} info!("All rule tasks stopped"); @@ -366,10 +399,14 @@ impl ReconnectCallback for ThrottleResetCallback { /// The function runs until cancelled or a fatal error occurs. /// All recoverable errors are logged and the loop continues (Log+Continue pattern). 
async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<(), RuleError> { - let span = tracing::info_span!("run_rule", rule_name = %ctx.rule.name); + let span = tracing::info_span!( + "run_rule", + rule_name = %ctx.rule.name, + vl_source = %ctx.vl_source_name + ); async move { - debug!("Rule task started"); + debug!("Rule-source task started"); // Create parser for this rule let parser = RuleParser::from_compiled(&ctx.rule.parser); @@ -379,9 +416,16 @@ async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<() "Parser initialized" ); - // Create throttler for this rule (uses rule's config or defaults) + // Create throttler for this (rule, source) pair. Each task owns its + // throttler: when no custom key_template is set, the default key path + // uses (rule_name, vl_source) so buckets are naturally isolated per + // source without cross-contamination. let throttle_config = ctx.rule.throttle.as_ref().unwrap_or(&ctx.default_throttle); - let throttler = Arc::new(Throttler::new(Some(throttle_config), &ctx.rule.name)); + let throttler = Arc::new(Throttler::new( + Some(throttle_config), + &ctx.rule.name, + &ctx.vl_source_name, + )); debug!( throttle_count = throttle_config.count, throttle_window_secs = throttle_config.window.as_secs(), @@ -403,20 +447,14 @@ async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<() // Create callback for throttle reset on reconnection let reconnect_callback = ThrottleResetCallback::new(Arc::clone(&throttler)); - // Create TailClient for VictoriaLogs - let tail_config = TailConfig { - base_url: ctx.vl_url.clone(), - query: ctx.rule.query.clone(), - start: None, - basic_auth: ctx.vl_basic_auth.clone(), - headers: ctx.vl_headers.clone(), - tls: ctx.vl_tls.clone(), - }; + // Create TailClient for this VictoriaLogs source + let tail_config = TailConfig::from_source(&ctx.vl_source_config, ctx.rule.query.clone()); let mut tail_client = 
TailClient::new(tail_config).map_err(RuleError::Stream)?; // Stream with reconnection - runs until cancelled let rule_name = ctx.rule.name.clone(); + let vl_source = Arc::new(ctx.vl_source_name.clone()); let template_name = Arc::new(template_name); let destinations = Arc::new(destinations); let template_engine = ctx.template_engine; @@ -426,18 +464,17 @@ async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<() let stream_result = tail_client .stream_with_reconnect(&rule_name, Some(&reconnect_callback), |line| { // Process each log line - // Clone Arcs for the async block (cheap reference counting) let queue = queue.clone(); let parser = Arc::clone(&parser); let throttler = Arc::clone(&throttler); let template_engine = Arc::clone(&template_engine); let template_name = Arc::clone(&template_name); let rule_name = rule_name.clone(); + let vl_source = Arc::clone(&vl_source); let destinations = Arc::clone(&destinations); let timestamp_timezone = Arc::clone(×tamp_timezone); async move { - // Pattern Log+Continue: never propagate errors, just log and continue if let Err(e) = process_log_line( &line, &parser, @@ -445,15 +482,16 @@ async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<() &template_engine, &template_name, &rule_name, + &vl_source, &destinations, &queue, ×tamp_timezone, ) .await { - // Log at appropriate level based on error type debug!( rule_name = %rule_name, + vl_source = %vl_source, error = %e, "Failed to process log line, continuing" ); @@ -463,13 +501,11 @@ async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<() }) .await; - // Check if we were cancelled or had an error if cancel.is_cancelled() { - info!("Rule task stopping due to cancellation"); + info!("Rule-source task stopping due to cancellation"); return Ok(()); } - // If we get here, stream ended unexpectedly stream_result.map_err(RuleError::Stream) } .instrument(span) @@ -490,6 +526,7 @@ async fn process_log_line( 
template_engine: &TemplateEngine, template_name: &str, rule_name: &str, + vl_source: &str, destinations: &[String], queue: &NotificationQueue, timestamp_timezone: &str, @@ -514,17 +551,17 @@ async fn process_log_line( // Step 1.5: Record successful match (before throttle check) record_log_matched(rule_name); - // Step 2: Check throttle + // Step 2: Check throttle (renders key with both rule_name and vl_source) match throttler.check(&fields) { ThrottleResult::Pass => { /* continue */ } ThrottleResult::Throttled => { - // Throttled - not an error, just skip sending return Ok(()); } } - // Step 3: Render template - let rendered = template_engine.render_with_fallback(template_name, &fields, rule_name); + // Step 3: Render template (layer 1 sees both rule_name and vl_source) + let rendered = + template_engine.render_with_fallback(template_name, &fields, rule_name, vl_source); // Step 4: Extract _time from parsed fields for log timestamp let log_timestamp = fields @@ -532,7 +569,11 @@ async fn process_log_line( .and_then(|v| v.as_str()) .map(|s| s.to_string()) .unwrap_or_else(|| { - warn!(rule_name = %rule_name, "Missing _time field in log, using current time"); + warn!( + rule_name = %rule_name, + vl_source = %vl_source, + "Missing _time field in log, using current time" + ); chrono::Utc::now().to_rfc3339() }); @@ -543,6 +584,7 @@ async fn process_log_line( let payload = AlertPayload { message: rendered, rule_name: rule_name.to_string(), + vl_source: vl_source.to_string(), destinations: destinations.to_vec(), log_timestamp, log_timestamp_formatted, @@ -551,6 +593,7 @@ async fn process_log_line( if let Err(e) = queue.send(payload) { warn!( rule_name = %rule_name, + vl_source = %vl_source, error = %e, "Failed to send to notification queue" ); @@ -584,9 +627,9 @@ mod tests { use super::*; use crate::config::{ CompiledParser, CompiledRule, CompiledTemplate, DefaultsConfig, MetricsConfig, - NotifyConfig, ThrottleConfig, VictoriaLogsConfig, + NotifyConfig, ThrottleConfig, 
VlSourceConfig, }; - use std::collections::HashMap; + use std::collections::{BTreeMap, HashMap}; fn make_test_client() -> reqwest::Client { reqwest::Client::builder() @@ -614,17 +657,23 @@ mod tests { mattermost_channel: None, destinations: vec!["mattermost-test".to_string()], }, + vl_sources: Vec::new(), } } fn make_test_runtime_config(rules: Vec) -> RuntimeConfig { - RuntimeConfig { - victorialogs: VictoriaLogsConfig { + let mut sources = BTreeMap::new(); + sources.insert( + "default".to_string(), + VlSourceConfig { url: "http://localhost:9428".to_string(), basic_auth: None, headers: None, tls: None, }, + ); + RuntimeConfig { + victorialogs: sources, defaults: DefaultsConfig { throttle: ThrottleConfig { key: None, @@ -854,7 +903,7 @@ mod tests { count: 10, window: Duration::from_secs(60), }; - let throttler = Throttler::new(Some(&throttle_config), "test_rule"); + let throttler = Throttler::new(Some(&throttle_config), "test_rule", "vlprod"); let template_engine = TemplateEngine::new(make_test_templates()); let queue = NotificationQueue::new(10); let _rx = queue.subscribe(); @@ -870,6 +919,7 @@ mod tests { &template_engine, "default", "test_rule", + "vlprod", &destinations, &queue, "UTC", @@ -889,7 +939,7 @@ mod tests { count: 10, window: Duration::from_secs(60), }; - let throttler = Throttler::new(Some(&throttle_config), "test_rule"); + let throttler = Throttler::new(Some(&throttle_config), "test_rule", "vlprod"); let template_engine = TemplateEngine::new(make_test_templates()); let queue = NotificationQueue::new(10); let _rx = queue.subscribe(); @@ -903,6 +953,7 @@ mod tests { &template_engine, "default", "test_rule", + "vlprod", &[], // Empty destinations = use default &queue, "UTC", @@ -923,7 +974,7 @@ mod tests { count: 1, // Only allow 1 alert window: Duration::from_secs(60), }; - let throttler = Throttler::new(Some(&throttle_config), "test_rule"); + let throttler = Throttler::new(Some(&throttle_config), "test_rule", "vlprod"); let template_engine = 
TemplateEngine::new(make_test_templates()); let queue = NotificationQueue::new(10); let _rx = queue.subscribe(); @@ -938,6 +989,7 @@ mod tests { &template_engine, "default", "test_rule", + "vlprod", &[], // Empty destinations = use default &queue, "UTC", @@ -954,6 +1006,7 @@ mod tests { &template_engine, "default", "test_rule", + "vlprod", &[], &queue, "UTC", @@ -972,7 +1025,7 @@ mod tests { count: 10, window: Duration::from_secs(60), }; - let throttler = Throttler::new(Some(&throttle_config), "test_rule"); + let throttler = Throttler::new(Some(&throttle_config), "test_rule", "vlprod"); let template_engine = TemplateEngine::new(make_test_templates()); let queue = NotificationQueue::new(10); let mut rx = queue.subscribe(); @@ -988,6 +1041,7 @@ mod tests { &template_engine, "default", "test_rule", + "vlprod", &destinations, &queue, "UTC", @@ -997,13 +1051,14 @@ mod tests { assert!(result.is_ok()); assert_eq!(queue.len(), 1); - // Verify the payload has the destinations and timestamps + // Verify the payload has the destinations, timestamps, and vl_source let payload = rx.recv().await.unwrap(); assert_eq!(payload.destinations.len(), 2); assert_eq!(payload.destinations[0], "mattermost-infra"); assert_eq!(payload.destinations[1], "mattermost-ops"); assert_eq!(payload.log_timestamp, "2026-01-09T10:00:00Z"); assert_eq!(payload.log_timestamp_formatted, "09/01/2026 10:00:00 UTC"); + assert_eq!(payload.vl_source, "vlprod"); } // =================================================================== @@ -1017,7 +1072,11 @@ mod tests { count: 1, window: Duration::from_secs(60), }; - let throttler = Arc::new(Throttler::new(Some(&throttle_config), "test_rule")); + let throttler = Arc::new(Throttler::new( + Some(&throttle_config), + "test_rule", + "vlprod", + )); let callback = ThrottleResetCallback::new(Arc::clone(&throttler)); // Use up the throttle limit @@ -1037,4 +1096,62 @@ mod tests { assert_eq!(ProcessError::Parse.to_string(), "parse error"); 
assert_eq!(ProcessError::Queue.to_string(), "queue error"); } + + // =================================================================== + // v2.0.0: resolve_sources multi-source fan-out semantics + // =================================================================== + + fn sources_map(names: &[&str]) -> BTreeMap { + let mut m = BTreeMap::new(); + for name in names { + m.insert( + (*name).to_string(), + VlSourceConfig { + url: format!("http://{}:9428", name), + basic_auth: None, + headers: None, + tls: None, + }, + ); + } + m + } + + #[test] + fn resolve_sources_empty_rule_fans_out_to_all_sources() { + let sources = sources_map(&["vldev", "vlprod"]); + let rule = make_test_rule("r", true); // vl_sources is empty + + let resolved = resolve_sources(&rule, &sources); + + let names: Vec<&str> = resolved.iter().map(|(n, _)| n.as_str()).collect(); + // BTreeMap ordering → deterministic [vldev, vlprod] + assert_eq!(names, vec!["vldev", "vlprod"]); + } + + #[test] + fn resolve_sources_with_subset_restricts_to_named_sources() { + let sources = sources_map(&["vldev", "vlprod", "vlstaging"]); + let mut rule = make_test_rule("r", true); + rule.vl_sources = vec!["vlprod".to_string()]; + + let resolved = resolve_sources(&rule, &sources); + + let names: Vec<&str> = resolved.iter().map(|(n, _)| n.as_str()).collect(); + assert_eq!(names, vec!["vlprod"]); + } + + #[test] + fn resolve_sources_preserves_btreemap_order_regardless_of_rule_list_order() { + // Rule lists sources in reverse alphabetical order; resolver must + // still emit them in BTreeMap (deterministic) order. 
+ let sources = sources_map(&["vldev", "vlprod", "vlstaging"]); + let mut rule = make_test_rule("r", true); + rule.vl_sources = vec!["vlstaging".to_string(), "vldev".to_string()]; + + let resolved = resolve_sources(&rule, &sources); + + let names: Vec<&str> = resolved.iter().map(|(n, _)| n.as_str()).collect(); + assert_eq!(names, vec!["vldev", "vlstaging"]); + } } diff --git a/src/main.rs b/src/main.rs index 194cb91..db5d0ee 100644 --- a/src/main.rs +++ b/src/main.rs @@ -245,7 +245,16 @@ fn main() -> Result<()> { // Validate mode: display success and exit if cli.validate { println!("Configuration is valid: {}", cli.config.display()); - println!(" VictoriaLogs URL: {}", config.victorialogs.url); + println!( + " VictoriaLogs sources: {} [{}]", + config.victorialogs.len(), + config + .victorialogs + .iter() + .map(|(name, src)| format!("{}={}", name, src.url)) + .collect::>() + .join(", ") + ); println!( " Rules: {} ({} enabled)", config.rules.len(), diff --git a/src/notify/email.rs b/src/notify/email.rs index f4d417d..4a6ab06 100644 --- a/src/notify/email.rs +++ b/src/notify/email.rs @@ -382,6 +382,7 @@ impl EmailNotifier { title => &alert.message.title, body => &alert.message.body, rule_name => &alert.rule_name, + vl_source => &alert.vl_source, accent_color => &alert.message.accent_color, log_timestamp => &alert.log_timestamp, log_timestamp_formatted => &alert.log_timestamp_formatted, @@ -420,6 +421,7 @@ impl EmailNotifier { title => &alert.message.title, body => body_safe, rule_name => &alert.rule_name, + vl_source => &alert.vl_source, accent_color => &alert.message.accent_color, log_timestamp => &alert.log_timestamp, log_timestamp_formatted => &alert.log_timestamp_formatted, @@ -799,6 +801,7 @@ mod tests { accent_color: Some("#ff0000".to_string()), }, rule_name: rule_name.to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), diff 
--git a/src/notify/mattermost.rs b/src/notify/mattermost.rs index ede5e6e..cf2b384 100644 --- a/src/notify/mattermost.rs +++ b/src/notify/mattermost.rs @@ -44,9 +44,11 @@ struct MattermostPayload { } /// Build Mattermost webhook payload from rendered message. +#[allow(clippy::too_many_arguments)] fn build_mattermost_payload( message: &crate::template::RenderedMessage, rule_name: &str, + vl_source: &str, log_timestamp_formatted: &str, channel: Option<&str>, username: Option<&str>, @@ -61,7 +63,10 @@ fn build_mattermost_payload( color: message.accent_color.clone(), title: message.title.clone(), text: message.body.clone(), - footer: format!("valerter | {} | {}", rule_name, log_timestamp_formatted), + footer: format!( + "valerter | {} | {} | {}", + rule_name, vl_source, log_timestamp_formatted + ), }], } } @@ -170,6 +175,7 @@ impl Notifier for MattermostNotifier { let mattermost_payload = build_mattermost_payload( &alert.message, &alert.rule_name, + &alert.vl_source, &alert.log_timestamp_formatted, self.channel.as_deref(), self.username.as_deref(), @@ -305,6 +311,7 @@ mod tests { let payload = build_mattermost_payload( &message, "test_rule", + "vlprod", "15/01/2026 10:49:35 UTC", None, None, @@ -319,7 +326,7 @@ mod tests { assert_eq!(attachment.color, Some("#ff0000".to_string())); assert_eq!( attachment.footer, - "valerter | test_rule | 15/01/2026 10:49:35 UTC" + "valerter | test_rule | vlprod | 15/01/2026 10:49:35 UTC" ); // Optional fields should be None assert!(payload.channel.is_none()); @@ -339,6 +346,7 @@ mod tests { let payload = build_mattermost_payload( &message, "simple_rule", + "vlprod", "09/01/2026 10:00:00 UTC", None, None, @@ -361,6 +369,7 @@ mod tests { let payload = build_mattermost_payload( &message, "rule", + "vlprod", "09/01/2026 10:00:00 UTC", Some("infra-alerts"), Some("valerter-bot"), @@ -387,6 +396,7 @@ mod tests { let payload = build_mattermost_payload( &message, "rule", + "vlprod", "09/01/2026 10:00:00 UTC", None, None, @@ -399,7 +409,7 @@ mod 
tests { assert!(json.contains("\"title\":\"Test\"")); assert!(json.contains("\"text\":\"Body\"")); assert!(json.contains("\"color\":\"#00ff00\"")); - assert!(json.contains("\"footer\":\"valerter | rule | 09/01/2026 10:00:00 UTC\"")); + assert!(json.contains("\"footer\":\"valerter | rule | vlprod | 09/01/2026 10:00:00 UTC\"")); // Optional fields should be omitted when None assert!(!json.contains("channel")); assert!(!json.contains("username")); @@ -418,6 +428,7 @@ mod tests { let payload = build_mattermost_payload( &message, "rule", + "vlprod", "09/01/2026 10:00:00 UTC", Some("alerts"), Some("bot"), diff --git a/src/notify/payload.rs b/src/notify/payload.rs index aa8ba37..5e71773 100644 --- a/src/notify/payload.rs +++ b/src/notify/payload.rs @@ -14,6 +14,10 @@ pub struct AlertPayload { pub message: RenderedMessage, /// Rule name for tracing and metrics. pub rule_name: String, + /// Name of the VictoriaLogs source that produced the matching event. + /// Owned `String` (not `&str`) so it survives task lifetimes and can be + /// cloned into notifier render contexts safely. + pub vl_source: String, /// Notification destinations (notifier names). /// If empty, uses the default notifier. 
pub destinations: Vec, diff --git a/src/notify/telegram.rs b/src/notify/telegram.rs index cec1f49..de4c3b1 100644 --- a/src/notify/telegram.rs +++ b/src/notify/telegram.rs @@ -98,6 +98,7 @@ fn render_body_template(source: &str, alert: &AlertPayload) -> Result &alert.message.title, body => &alert.message.body, rule_name => &alert.rule_name, + vl_source => &alert.vl_source, log_timestamp => &alert.log_timestamp, log_timestamp_formatted => &alert.log_timestamp_formatted, }) @@ -524,6 +525,7 @@ mod tests { accent_color: None, }, rule_name: "test_rule".to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], log_timestamp: "2026-04-14T10:00:00Z".to_string(), log_timestamp_formatted: "14/04/2026 10:00:00 UTC".to_string(), diff --git a/src/notify/tests.rs b/src/notify/tests.rs index 506d77d..b7a88c0 100644 --- a/src/notify/tests.rs +++ b/src/notify/tests.rs @@ -27,6 +27,7 @@ fn make_payload(rule_name: &str) -> AlertPayload { accent_color: Some("#ff0000".to_string()), }, rule_name: rule_name.to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], // Uses default notifier log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), @@ -42,6 +43,7 @@ fn make_payload_with_destinations(rule_name: &str, destinations: Vec) -> accent_color: Some("#ff0000".to_string()), }, rule_name: rule_name.to_string(), + vl_source: "vlprod".to_string(), destinations, log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), @@ -640,6 +642,7 @@ fn alert_payload_clone_works() { accent_color: Some("#ff0000".to_string()), }, rule_name: "my_rule".to_string(), + vl_source: "vlprod".to_string(), destinations: vec!["mattermost-infra".to_string()], log_timestamp: "2026-01-15T10:00:00Z".to_string(), log_timestamp_formatted: "15/01/2026 10:00:00 UTC".to_string(), @@ -647,6 +650,7 @@ fn alert_payload_clone_works() { let cloned = payload.clone(); 
assert_eq!(cloned.rule_name, payload.rule_name); + assert_eq!(cloned.vl_source, payload.vl_source); assert_eq!(cloned.message.title, payload.message.title); assert_eq!(cloned.destinations, payload.destinations); assert_eq!(cloned.log_timestamp, payload.log_timestamp); diff --git a/src/notify/webhook.rs b/src/notify/webhook.rs index 79a34d0..af45f97 100644 --- a/src/notify/webhook.rs +++ b/src/notify/webhook.rs @@ -36,6 +36,8 @@ pub struct DefaultWebhookPayload { pub alert_name: String, /// Name of the rule that triggered the alert. pub rule_name: String, + /// Name of the VictoriaLogs source that produced the matching event. + pub vl_source: String, /// Alert title (rendered). pub title: String, /// Alert body (rendered). @@ -54,6 +56,7 @@ impl DefaultWebhookPayload { Self { alert_name: notifier_name.to_string(), rule_name: alert.rule_name.clone(), + vl_source: alert.vl_source.clone(), title: alert.message.title.clone(), body: alert.message.body.clone(), timestamp: Utc::now().to_rfc3339(), @@ -119,6 +122,7 @@ fn render_body_template(source: &str, alert: &AlertPayload) -> Result &alert.message.title, body => &alert.message.body, rule_name => &alert.rule_name, + vl_source => &alert.vl_source, log_timestamp => &alert.log_timestamp, log_timestamp_formatted => &alert.log_timestamp_formatted, }) @@ -387,6 +391,7 @@ mod tests { accent_color: Some("#ff0000".to_string()), }, rule_name: rule_name.to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), @@ -657,6 +662,7 @@ mod tests { accent_color: None, }, rule_name: "simple_rule".to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], log_timestamp: "2026-01-15T10:00:00Z".to_string(), log_timestamp_formatted: "15/01/2026 10:00:00 UTC".to_string(), @@ -797,6 +803,7 @@ mod tests { accent_color: Some("#ff0000".to_string()), }, rule_name: "test_rule".to_string(), + vl_source: 
"vlprod".to_string(), destinations: vec![], log_timestamp: "2026-01-15T10:00:00Z".to_string(), log_timestamp_formatted: "15/01/2026 10:00:00 UTC".to_string(), diff --git a/src/tail.rs b/src/tail.rs index 8d6dfbc..09e918a 100644 --- a/src/tail.rs +++ b/src/tail.rs @@ -36,7 +36,7 @@ use futures_util::StreamExt; use reqwest::Client; use tracing::{debug, info, trace, warn}; -use crate::config::{BasicAuthConfig, SecretString, TlsConfig}; +use crate::config::{BasicAuthConfig, SecretString, TlsConfig, VlSourceConfig}; use crate::error::StreamError; use crate::stream_buffer::StreamBuffer; @@ -71,6 +71,23 @@ pub struct TailConfig { pub tls: Option, } +impl TailConfig { + /// Build a `TailConfig` from a named VL source and a rule query. + /// + /// Credentials, TLS, and headers are all per-source. `start` is unset so + /// the tail endpoint follows live tail semantics. + pub fn from_source(source: &VlSourceConfig, query: String) -> Self { + Self { + base_url: source.url.clone(), + query, + start: None, + basic_auth: source.basic_auth.clone(), + headers: source.headers.clone(), + tls: source.tls.clone(), + } + } +} + /// Client for streaming logs from VictoriaLogs tail endpoint. /// /// Handles connection establishment, reconnection with exponential backoff, diff --git a/src/template.rs b/src/template.rs index 48dc02e..7583f3c 100644 --- a/src/template.rs +++ b/src/template.rs @@ -127,6 +127,7 @@ impl TemplateEngine { template_name: &str, fields: &Value, rule_name: &str, + vl_source: &str, ) -> Result { tracing::trace!(template_name = %template_name, "Starting template render"); @@ -138,15 +139,20 @@ impl TemplateEngine { name: template_name.to_string(), })?; - // Render each field. rule_name is injected in the render helpers so - // it is available at layer 1 (title, body, email_body_html), matching - // the existing layer 2 notifier-level contexts (issue #31). 
- let title = self.render_string(&template.title, fields, rule_name)?; - let body = self.render_string(&template.body, fields, rule_name)?; + // Render each field. `rule_name` and `vl_source` are injected in the + // render helpers so they are available at layer 1 (title, body, + // email_body_html), matching the notifier-level (layer 2) contexts. + let title = self.render_string(&template.title, fields, rule_name, vl_source)?; + let body = self.render_string(&template.body, fields, rule_name, vl_source)?; // Render email_body_html with HTML auto-escape if present let email_body_html = if let Some(email_body_html_template) = &template.email_body_html { - Some(self.render_string_html_escaped(email_body_html_template, fields, rule_name)?) + Some(self.render_string_html_escaped( + email_body_html_template, + fields, + rule_name, + vl_source, + )?) } else { None }; @@ -173,9 +179,10 @@ impl TemplateEngine { template_str: &str, fields: &Value, rule_name: &str, + vl_source: &str, ) -> Result { let mut ctx = crate::parser::unflatten_dotted_keys(fields); - inject_rule_name(&mut ctx, rule_name); + inject_context(&mut ctx, rule_name, vl_source); self.env .render_str(template_str, &ctx) .map_err(|e| TemplateError::RenderFailed { @@ -190,9 +197,10 @@ impl TemplateEngine { template_str: &str, fields: &Value, rule_name: &str, + vl_source: &str, ) -> Result { let mut ctx = crate::parser::unflatten_dotted_keys(fields); - inject_rule_name(&mut ctx, rule_name); + inject_context(&mut ctx, rule_name, vl_source); self.html_env .render_str(template_str, &ctx) .map_err(|e| TemplateError::RenderFailed { @@ -220,15 +228,21 @@ impl TemplateEngine { template_name: &str, fields: &Value, rule_name: &str, + vl_source: &str, ) -> RenderedMessage { - match self.render(template_name, fields, rule_name) { + match self.render(template_name, fields, rule_name, vl_source) { Ok(msg) => { - tracing::trace!(rule_name = %rule_name, "Template render successful"); + tracing::trace!( + rule_name = 
%rule_name, + vl_source = %vl_source, + "Template render successful" + ); msg } Err(e) => { tracing::warn!( rule_name = %rule_name, + vl_source = %vl_source, template = %template_name, error = %e, "Template render failed, using fallback" @@ -248,18 +262,23 @@ impl TemplateEngine { } } -/// Inject the synthetic `rule_name` key into a render context (issue #31). +/// Inject the synthetic `rule_name` (issue #31) and `vl_source` (v2.0.0) +/// keys into a render context. /// -/// The synthetic value wins over any event field literally named `rule_name` -/// so operators can rely on `{{ rule_name }}` consistently across layer 1 +/// The synthetic values win over any event field literally named `rule_name` +/// or `vl_source` so operators can rely on them consistently across layer 1 /// and layer 2 templates. If `ctx` is not a JSON object (should not happen /// in practice — VL events are always objects), injection is skipped. -fn inject_rule_name(ctx: &mut Value, rule_name: &str) { +fn inject_context(ctx: &mut Value, rule_name: &str, vl_source: &str) { if let Some(obj) = ctx.as_object_mut() { obj.insert( "rule_name".to_string(), Value::String(rule_name.to_string()), ); + obj.insert( + "vl_source".to_string(), + Value::String(vl_source.to_string()), + ); } } @@ -320,7 +339,9 @@ mod tests { "message": "CPU usage high" }); - let result = engine.render("alert", &fields, "test_rule").unwrap(); + let result = engine + .render("alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Alert: server-01"); assert_eq!(result.body, "Host server-01 reported: CPU usage high"); @@ -346,14 +367,14 @@ mod tests { // Test critical severity let fields_critical = json!({"severity": "critical"}); let result = engine - .render("alert", &fields_critical, "test_rule") + .render("alert", &fields_critical, "test_rule", "vlprod") .unwrap(); assert_eq!(result.title, "🚨 CRITICAL"); // Test non-critical severity let fields_warning = json!({"severity": "warning"}); let result = 
engine - .render("alert", &fields_warning, "test_rule") + .render("alert", &fields_warning, "test_rule", "vlprod") .unwrap(); assert_eq!(result.title, "⚠️ Warning"); } @@ -384,7 +405,9 @@ mod tests { } }); - let result = engine.render("alert", &fields, "test_rule").unwrap(); + let result = engine + .render("alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Server: prod-server-01"); assert_eq!(result.body, "Region: us-east-1, Status: alert"); @@ -406,7 +429,9 @@ mod tests { let fields = json!({"host": "server-01"}); // Should NOT return an error, missing field renders as empty string - let result = engine.render("alert", &fields, "test_rule").unwrap(); + let result = engine + .render("alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Host: server-01"); assert_eq!(result.body, "Missing: "); // Empty string for missing field @@ -430,7 +455,9 @@ mod tests { "body": "Something went wrong" }); - let result = engine.render("full_alert", &fields, "test_rule").unwrap(); + let result = engine + .render("full_alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Critical Alert"); assert_eq!(result.body, "Something went wrong"); @@ -447,7 +474,7 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render("nonexistent", &fields, "test_rule"); + let result = engine.render("nonexistent", &fields, "test_rule", "vlprod"); assert!(result.is_err()); match result.unwrap_err() { @@ -471,11 +498,11 @@ mod tests { let fields = json!({"host": "server-01"}); // render() should return error - let result = engine.render("bad_template", &fields, "test_rule"); + let result = engine.render("bad_template", &fields, "test_rule", "vlprod"); assert!(result.is_err()); // render_with_fallback() should return fallback message - let fallback = engine.render_with_fallback("bad_template", &fields, "test_rule"); + let fallback = 
engine.render_with_fallback("bad_template", &fields, "test_rule", "vlprod"); assert_eq!(fallback.title, "[test_rule] Alert"); assert!(fallback.body.contains("Template render failed")); assert!(fallback.body.contains("Check logs for details")); @@ -501,13 +528,13 @@ mod tests { // Render for "rule 1" let fields1 = json!({"host": "server-01", "message": "Error A"}); let result1 = engine - .render("shared_template", &fields1, "rule_1") + .render("shared_template", &fields1, "rule_1", "vlprod") .unwrap(); // Render for "rule 2" with different data let fields2 = json!({"host": "server-02", "message": "Error B"}); let result2 = engine - .render("shared_template", &fields2, "rule_2") + .render("shared_template", &fields2, "rule_2", "vlprod") .unwrap(); // Both should render correctly with their own data @@ -533,7 +560,9 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({}); - let result = engine.render("alert", &fields, "test_rule").unwrap(); + let result = engine + .render("alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Static Title"); assert_eq!(result.body, "Static Body"); } @@ -544,7 +573,7 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render_with_fallback("missing", &fields, "my_rule"); + let result = engine.render_with_fallback("missing", &fields, "my_rule", "vlprod"); assert_eq!(result.title, "[my_rule] Alert"); assert!(result.body.contains("not found")); @@ -563,7 +592,7 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render_with_fallback("valid", &fields, "test_rule"); + let result = engine.render_with_fallback("valid", &fields, "test_rule", "vlprod"); // Should return rendered message, NOT fallback assert_eq!(result.title, "Alert: server-01"); @@ -621,7 +650,9 @@ mod tests { "items": ["apple", "banana", "cherry"] }); - let result = 
engine.render("list", &fields, "test_rule").unwrap(); + let result = engine + .render("list", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Items (3)"); assert!(result.body.contains("- apple")); assert!(result.body.contains("- banana")); @@ -637,7 +668,9 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render("empty", &fields, "test_rule").unwrap(); + let result = engine + .render("empty", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, ""); assert_eq!(result.body, ""); } @@ -663,7 +696,9 @@ mod tests { } }); - let result = engine.render("deep", &fields, "test_rule").unwrap(); + let result = engine + .render("deep", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "deep_value"); } @@ -699,7 +734,9 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render("email_alert", &fields, "test_rule").unwrap(); + let result = engine + .render("email_alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "Alert: server-01"); assert_eq!(result.body, "Host server-01 down"); @@ -722,7 +759,9 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"hostname": ""}); - let result = engine.render("email_alert", &fields, "test_rule").unwrap(); + let result = engine + .render("email_alert", &fields, "test_rule", "vlprod") + .unwrap(); let email_body_html = result.email_body_html.unwrap(); // HTML should be escaped @@ -756,7 +795,9 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"nginx.http.request_id": "abc"}); - let result = engine.render("alert", &fields, "test_rule").unwrap(); + let result = engine + .render("alert", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "abc"); assert_eq!(result.body, "id=abc"); } @@ -784,7 +825,9 @@ mod tests { "nginx.http.status_code": 
"400" }); - let result = engine.render("my_template", &fields, "test_rule").unwrap(); + let result = engine + .render("my_template", &fields, "test_rule", "vlprod") + .unwrap(); assert_eq!(result.title, "T"); assert_eq!(result.body, "B"); let email_body_html = result.email_body_html.unwrap(); @@ -814,7 +857,7 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render("alert", &fields, "VM_OFF").unwrap(); + let result = engine.render("alert", &fields, "VM_OFF", "vlprod").unwrap(); assert_eq!(result.title, "Alert VM_OFF"); } @@ -832,7 +875,7 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render("alert", &fields, "VM_OFF").unwrap(); + let result = engine.render("alert", &fields, "VM_OFF", "vlprod").unwrap(); assert_eq!( result.body, "rule=VM_OFF\nhost=server-01\nrule_again=VM_OFF" @@ -854,13 +897,93 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"host": "server-01"}); - let result = engine.render("email_alert", &fields, "VM_OFF").unwrap(); + let result = engine + .render("email_alert", &fields, "VM_OFF", "vlprod") + .unwrap(); assert_eq!( result.email_body_html.unwrap(), "

Rule: VM_OFF on server-01

" ); } + // =================================================================== + // v2.0.0: vl_source available in layer 1 templates (title, body, + // email_body_html), matching the layer 2 notifier-level contexts and + // the throttle key render context. + // =================================================================== + + #[test] + fn render_injects_vl_source_in_title() { + let mut templates = HashMap::new(); + templates.insert( + "alert".to_string(), + make_template("[{{ vl_source }}] {{ rule_name }}", "body"), + ); + + let engine = TemplateEngine::new(templates); + let fields = json!({"host": "server-01"}); + + let result = engine.render("alert", &fields, "VM_OFF", "vlprod").unwrap(); + assert_eq!(result.title, "[vlprod] VM_OFF"); + } + + #[test] + fn render_injects_vl_source_in_body() { + let mut templates = HashMap::new(); + templates.insert( + "alert".to_string(), + make_template("title", "source={{ vl_source }}\nhost={{ host }}"), + ); + + let engine = TemplateEngine::new(templates); + let fields = json!({"host": "server-01"}); + + let result = engine.render("alert", &fields, "VM_OFF", "vldev").unwrap(); + assert_eq!(result.body, "source=vldev\nhost=server-01"); + } + + #[test] + fn render_injects_vl_source_in_email_body_html() { + let mut templates = HashMap::new(); + templates.insert( + "email_alert".to_string(), + make_template_with_email_body_html( + "t", + "b", + "

Source: {{ vl_source }}, host: {{ host }}

", + ), + ); + + let engine = TemplateEngine::new(templates); + let fields = json!({"host": "server-01"}); + + let result = engine + .render("email_alert", &fields, "VM_OFF", "vlprod") + .unwrap(); + assert_eq!( + result.email_body_html.unwrap(), + "

Source: vlprod, host: server-01

" + ); + } + + #[test] + fn render_vl_source_synthetic_overrides_event_field() { + // Collision policy: synthetic vl_source wins over any event field + // literally named "vl_source" (matches rule_name collision policy). + let mut templates = HashMap::new(); + templates.insert( + "alert".to_string(), + make_template("{{ vl_source }}", "{{ vl_source }}"), + ); + + let engine = TemplateEngine::new(templates); + let fields = json!({"vl_source": "evil", "host": "server-01"}); + + let result = engine.render("alert", &fields, "VM_OFF", "vlprod").unwrap(); + assert_eq!(result.title, "vlprod"); + assert_eq!(result.body, "vlprod"); + } + #[test] fn render_rule_name_synthetic_overrides_event_field() { // Collision policy: synthetic rule_name wins over any event field @@ -874,7 +997,7 @@ mod tests { let engine = TemplateEngine::new(templates); let fields = json!({"rule_name": "event-value", "host": "server-01"}); - let result = engine.render("alert", &fields, "VM_OFF").unwrap(); + let result = engine.render("alert", &fields, "VM_OFF", "vlprod").unwrap(); assert_eq!(result.title, "VM_OFF"); assert_eq!(result.body, "VM_OFF"); } @@ -891,7 +1014,7 @@ mod tests { let fields = json!({"host": "server-01"}); let result = engine - .render("mattermost_alert", &fields, "test_rule") + .render("mattermost_alert", &fields, "test_rule", "vlprod") .unwrap(); assert!( diff --git a/src/throttle.rs b/src/throttle.rs index c9260e5..9be6af6 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -65,6 +65,9 @@ pub struct Throttler { max_count: u32, /// Rule name for logging and metrics (Arc to avoid cloning). rule_name: Arc, + /// VL source name bound to this throttler (per-task). Threaded into every + /// rendered key so the `(rule, source)` bucket is isolated by default. + vl_source: Arc, /// Pre-created Jinja environment for template rendering (H1 fix). jinja_env: Environment<'static>, } @@ -76,14 +79,11 @@ impl Throttler { /// /// * `config` - Optional throttle configuration. 
If None, creates a pass-through throttler. /// * `rule_name` - Name of the rule for logging and metrics. - /// - /// # Example - /// - /// ```ignore - /// let throttler = Throttler::new(Some(&compiled_throttle), "my_rule"); - /// ``` - pub fn new(config: Option<&CompiledThrottle>, rule_name: &str) -> Self { - Self::with_capacity(config, rule_name, DEFAULT_MAX_CAPACITY) + /// * `vl_source` - VL source name bound to the task (injected into render + /// context so `{{ vl_source }}` works in `throttle.key` and the default + /// key is per-source by construction). + pub fn new(config: Option<&CompiledThrottle>, rule_name: &str, vl_source: &str) -> Self { + Self::with_capacity(config, rule_name, vl_source, DEFAULT_MAX_CAPACITY) } /// Create a new Throttler with custom max capacity (for testing). @@ -92,10 +92,12 @@ impl Throttler { /// /// * `config` - Optional throttle configuration. /// * `rule_name` - Name of the rule for logging and metrics. + /// * `vl_source` - VL source name bound to this task's throttler. /// * `max_capacity` - Maximum number of keys in the cache (FR25). pub fn with_capacity( config: Option<&CompiledThrottle>, rule_name: &str, + vl_source: &str, max_capacity: u64, ) -> Self { let (key_template, max_count, window) = match config { @@ -108,12 +110,14 @@ impl Throttler { if t.count == 0 { tracing::warn!( rule_name = %rule_name, + vl_source = %vl_source, "Throttle count is 0, all alerts after first will be throttled" ); } if t.window.is_zero() { tracing::warn!( rule_name = %rule_name, + vl_source = %vl_source, "Throttle window is 0, entries will expire immediately" ); } @@ -133,6 +137,7 @@ impl Throttler { key_template, max_count, rule_name: Arc::from(rule_name), + vl_source: Arc::from(vl_source), jinja_env, } } @@ -196,26 +201,27 @@ impl Throttler { /// Render the throttle key from template and fields. /// - /// If no template is configured, returns a global key for the rule. 
- /// If template rendering fails, logs a warning and returns a fallback key. + /// If no template is configured, returns the per-source default key + /// `"{rule}-{source}:global"` so multi-source deployments see isolated + /// throttle buckets without any config. If rendering fails, logs a + /// warning and returns a fallback key. fn render_key(&self, fields: &Value) -> String { match &self.key_template { Some(template) => { - // Inject synthetic `rule_name` (issue #31) so users can write - // `{{ rule_name }}` in throttle.key. Synthetic wins over any - // event field with the same name, matching layer 1/2 template + // Inject synthetic `rule_name` (issue #31) and `vl_source` + // (multi-source v2.0.0). Synthetic values win over any event + // field with the same name, matching layer 1/2 template // behavior. - let enriched_ctx = enrich_with_rule_name(fields, &self.rule_name); - // H1 fix: Use pre-created jinja_env instead of creating new one + let enriched_ctx = enrich_with_context(fields, &self.rule_name, &self.vl_source); match self.jinja_env.render_str(template, &enriched_ctx) { Ok(key) => { tracing::trace!(rendered_key = %key, "Throttle key rendered"); key } Err(e) => { - // Template error - log and use fallback tracing::warn!( rule_name = %self.rule_name, + vl_source = %self.vl_source, template = %template, error = %e, "Failed to render throttle key, using fallback" @@ -225,8 +231,11 @@ impl Throttler { } } None => { - // No template = global throttle for the rule - format!("{}:global", self.rule_name) + // Default key is per-(rule, source) so buckets are isolated + // per-source by construction. Equivalent to rendering + // `"{{ rule_name }}-{{ vl_source }}:global"` via Jinja, but + // inlined to avoid the render round-trip on every call. + format!("{}-{}:global", self.rule_name, self.vl_source) } } } @@ -242,20 +251,25 @@ impl Throttler { } /// Unflatten dotted event keys (issue #25) then inject the synthetic -/// `rule_name` key (issue #31). 
Matches the layer 1 template rendering path -/// so users can reference both dotted event fields (`{{ nginx.http.status }}`) -/// and `{{ rule_name }}` inside a `throttle.key` template consistently. +/// `rule_name` (issue #31) and `vl_source` (v2.0.0 multi-source) keys. +/// Matches the layer 1 template rendering path so users can reference both +/// dotted event fields (`{{ nginx.http.status }}`) and the synthetic keys +/// inside a `throttle.key` template consistently. /// /// Returns the original value unchanged if it is not a JSON object (should -/// not happen in practice, VL events are always objects). The synthetic -/// value wins over any event field literally named `rule_name`. -fn enrich_with_rule_name(fields: &Value, rule_name: &str) -> Value { +/// not happen in practice, VL events are always objects). Synthetic values +/// win over any event field literally named `rule_name` or `vl_source`. +fn enrich_with_context(fields: &Value, rule_name: &str, vl_source: &str) -> Value { let mut ctx = crate::parser::unflatten_dotted_keys(fields); if let Some(obj) = ctx.as_object_mut() { obj.insert( "rule_name".to_string(), Value::String(rule_name.to_string()), ); + obj.insert( + "vl_source".to_string(), + Value::String(vl_source.to_string()), + ); } ctx } @@ -291,7 +305,7 @@ mod tests { #[test] fn render_key_with_simple_template() { let config = make_config(Some("{{ host }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01", "port": "Gi0/1"}); let key = throttler.render_key(&fields); @@ -306,7 +320,7 @@ mod tests { #[test] fn render_key_with_composite_template() { let config = make_config(Some("{{ host }}-{{ port }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01", "port": "Gi0/1"}); let key = 
throttler.render_key(&fields); @@ -321,7 +335,7 @@ mod tests { #[test] fn render_key_with_missing_field_returns_empty_value() { let config = make_config(Some("{{ host }}-{{ missing }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); let key = throttler.render_key(&fields); @@ -337,7 +351,7 @@ mod tests { #[test] fn first_alert_passes() { let config = make_config(Some("{{ host }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); let result = throttler.check(&fields); @@ -352,7 +366,7 @@ mod tests { #[test] fn alerts_up_to_count_pass() { let config = make_config(Some("{{ host }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); @@ -369,7 +383,7 @@ mod tests { #[test] fn alert_after_count_is_throttled() { let config = make_config(Some("{{ host }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); @@ -395,7 +409,7 @@ mod tests { count: 2, window: Duration::from_millis(100), // Very short for testing }; - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); @@ -424,7 +438,7 @@ mod tests { let config = make_config(Some("{{ key }}"), 2, 3600); // Use with_capacity to set a small max (5 keys) - let throttler = Throttler::with_capacity(Some(&config), "test_rule", 5); + let throttler = Throttler::with_capacity(Some(&config), "test_rule", "vlprod", 5); // Fill cache with 5 different keys, each gets 2 alerts (at max) for i in 0..5 { @@ 
-469,27 +483,44 @@ mod tests { #[test] fn no_key_template_uses_global_key() { let config = make_config(None, 2, 60); - let throttler = Throttler::new(Some(&config), "my_rule"); + let throttler = Throttler::new(Some(&config), "my_rule", "vlprod"); let fields1 = json!({"host": "SW-01"}); let fields2 = json!({"host": "SW-02"}); - // Both should use the same global key "my_rule:global" + // Both should use the same per-source default key + // "my_rule-vlprod:global" - cross-host but not cross-source. assert_eq!(throttler.check(&fields1), ThrottleResult::Pass); assert_eq!(throttler.check(&fields2), ThrottleResult::Pass); - // Third from either should be throttled (same global key) + // Third from either should be throttled (same default key) assert_eq!(throttler.check(&fields1), ThrottleResult::Throttled); } #[test] - fn global_key_format() { + fn default_key_format_is_rule_dash_source_global() { + // Spec: default throttle key is `{rule}-{source}:global`. + // Used to be `{rule}:global` (pre-v2.0.0); breaking change for + // multi-source deployments and locks per-source bucket isolation. let config = make_config(None, 2, 60); - let throttler = Throttler::new(Some(&config), "my_rule"); + let throttler = Throttler::new(Some(&config), "my_rule", "vlprod"); let fields = json!({}); let key = throttler.render_key(&fields); - assert_eq!(key, "my_rule:global"); + assert_eq!(key, "my_rule-vlprod:global"); + } + + #[test] + fn default_key_isolates_buckets_per_source() { + // Two throttlers with the same rule but different sources must + // produce different default keys, so buckets are isolated per-source. 
+ let config = make_config(None, 2, 60); + let throttler_a = Throttler::new(Some(&config), "VM_OFF", "vlprod"); + let throttler_b = Throttler::new(Some(&config), "VM_OFF", "vldev"); + + let fields = json!({}); + assert_eq!(throttler_a.render_key(&fields), "VM_OFF-vlprod:global"); + assert_eq!(throttler_b.render_key(&fields), "VM_OFF-vldev:global"); } // =================================================================== @@ -499,7 +530,7 @@ mod tests { #[test] fn render_key_with_nested_fields() { let config = make_config(Some("{{ data.server.name }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({ "data": { @@ -520,7 +551,7 @@ mod tests { #[test] fn different_keys_are_throttled_independently() { let config = make_config(Some("{{ host }}"), 2, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let sw01 = json!({"host": "SW-01"}); let sw02 = json!({"host": "SW-02"}); @@ -538,7 +569,7 @@ mod tests { #[test] fn no_config_passes_all() { - let throttler = Throttler::new(None, "test_rule"); + let throttler = Throttler::new(None, "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); @@ -551,7 +582,7 @@ mod tests { #[test] fn reset_clears_all_entries() { let config = make_config(Some("{{ host }}"), 2, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); @@ -570,7 +601,7 @@ mod tests { #[test] fn debug_format_shows_useful_info() { let config = make_config(Some("{{ host }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let debug = format!("{:?}", throttler); @@ -590,7 +621,7 @@ mod tests { #[test] fn render_key_includes_rule_name() { let config = 
make_config(Some("{{ rule_name }}-{{ host }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "VM_OFF"); + let throttler = Throttler::new(Some(&config), "VM_OFF", "vlprod"); let fields = json!({"host": "SW-01"}); let key = throttler.render_key(&fields); @@ -604,7 +635,7 @@ mod tests { // tened the same way template rendering does, so `{{ nginx.http.status }}` // works here too (not just in `title`/`body`). let config = make_config(Some("{{ rule_name }}-{{ nginx.http.status_code }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "VM_OFF"); + let throttler = Throttler::new(Some(&config), "VM_OFF", "vlprod"); let fields = json!({"nginx.http.status_code": "404"}); let key = throttler.render_key(&fields); @@ -617,7 +648,7 @@ mod tests { // Collision policy: synthetic rule_name wins over any event field // literally named "rule_name". let config = make_config(Some("{{ rule_name }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "VM_OFF"); + let throttler = Throttler::new(Some(&config), "VM_OFF", "vlprod"); let fields = json!({"rule_name": "event-value", "host": "SW-01"}); let key = throttler.render_key(&fields); @@ -625,11 +656,50 @@ mod tests { assert_eq!(key, "VM_OFF"); } + // =================================================================== + // v2.0.0: vl_source injected into throttle key render context + // =================================================================== + + #[test] + fn render_key_includes_vl_source() { + let config = make_config(Some("{{ rule_name }}-{{ vl_source }}"), 3, 60); + let throttler = Throttler::new(Some(&config), "VM_OFF", "vlprod"); + + let fields = json!({"host": "SW-01"}); + let key = throttler.render_key(&fields); + + assert_eq!(key, "VM_OFF-vlprod"); + } + + #[test] + fn render_key_vl_source_synthetic_overrides_event_field() { + // Collision policy: synthetic vl_source wins over any event field + // literally named "vl_source" (matches rule_name collision policy). 
+ let config = make_config(Some("{{ vl_source }}"), 3, 60); + let throttler = Throttler::new(Some(&config), "VM_OFF", "vlprod"); + + let fields = json!({"vl_source": "evil", "host": "SW-01"}); + let key = throttler.render_key(&fields); + + assert_eq!(key, "vlprod"); + } + + #[test] + fn render_key_custom_template_with_both_synthetics() { + let config = make_config(Some("{{ rule_name }}-{{ vl_source }}-{{ host }}"), 3, 60); + let throttler = Throttler::new(Some(&config), "VM_OFF", "vlprod"); + + let fields = json!({"host": "SW-01"}); + let key = throttler.render_key(&fields); + + assert_eq!(key, "VM_OFF-vlprod-SW-01"); + } + #[test] fn template_error_uses_fallback_key() { // Invalid template syntax that minijinja can't render let config = make_config(Some("{{ nonexistent_filter | bad_filter }}"), 3, 60); - let throttler = Throttler::new(Some(&config), "test_rule"); + let throttler = Throttler::new(Some(&config), "test_rule", "vlprod"); let fields = json!({"host": "SW-01"}); let key = throttler.render_key(&fields); diff --git a/tests/fixtures/config_disabled_invalid.yaml b/tests/fixtures/config_disabled_invalid.yaml index b0f146b..7f23240 100644 --- a/tests/fixtures/config_disabled_invalid.yaml +++ b/tests/fixtures/config_disabled_invalid.yaml @@ -1,7 +1,8 @@ # Configuration fixture with disabled rule containing invalid regex # Validation should STILL fail even for disabled rules (AD-11) victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" notifiers: test: diff --git a/tests/fixtures/config_email_missing_email_body_html.yaml b/tests/fixtures/config_email_missing_email_body_html.yaml index 3a73135..2accd97 100644 --- a/tests/fixtures/config_email_missing_email_body_html.yaml +++ b/tests/fixtures/config_email_missing_email_body_html.yaml @@ -2,7 +2,8 @@ # This should fail validation at startup (AC7) victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" defaults: throttle: diff --git 
a/tests/fixtures/config_invalid_basic_auth.yaml b/tests/fixtures/config_invalid_basic_auth.yaml index 29c74fe..79de6a1 100644 --- a/tests/fixtures/config_invalid_basic_auth.yaml +++ b/tests/fixtures/config_invalid_basic_auth.yaml @@ -1,9 +1,10 @@ # Configuration fixture with incomplete Basic Auth (missing password) victorialogs: - url: "https://victorialogs.secure.local:9428" - basic_auth: - username: "testuser" - # password is missing - this should fail to parse + default: + url: "https://victorialogs.secure.local:9428" + basic_auth: + username: "testuser" + # password is missing - this should fail to parse defaults: throttle: diff --git a/tests/fixtures/config_invalid_notifier_type.yaml b/tests/fixtures/config_invalid_notifier_type.yaml index 67ebce2..08f6946 100644 --- a/tests/fixtures/config_invalid_notifier_type.yaml +++ b/tests/fixtures/config_invalid_notifier_type.yaml @@ -1,6 +1,7 @@ # Configuration fixture with invalid notifier type (Story 6.2 - AC4) victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" defaults: throttle: diff --git a/tests/fixtures/config_invalid_regex.yaml b/tests/fixtures/config_invalid_regex.yaml index 161d11b..22c4b24 100644 --- a/tests/fixtures/config_invalid_regex.yaml +++ b/tests/fixtures/config_invalid_regex.yaml @@ -1,6 +1,7 @@ # Configuration fixture with invalid regex pattern for testing fail-fast validation victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" notifiers: test: diff --git a/tests/fixtures/config_invalid_template.yaml b/tests/fixtures/config_invalid_template.yaml index 9728f1c..f511146 100644 --- a/tests/fixtures/config_invalid_template.yaml +++ b/tests/fixtures/config_invalid_template.yaml @@ -1,6 +1,7 @@ # Configuration fixture with invalid Jinja template for testing fail-fast validation victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" notifiers: test: diff --git 
a/tests/fixtures/config_minimal.yaml b/tests/fixtures/config_minimal.yaml index b0e714f..2be185e 100644 --- a/tests/fixtures/config_minimal.yaml +++ b/tests/fixtures/config_minimal.yaml @@ -1,6 +1,7 @@ # Minimal valid config - matches README example victorialogs: - url: "http://localhost:9428" + default: + url: "http://localhost:9428" notifiers: mattermost-ops: diff --git a/tests/fixtures/config_no_notifier.yaml b/tests/fixtures/config_no_notifier.yaml index 40cb944..bcc1b72 100644 --- a/tests/fixtures/config_no_notifier.yaml +++ b/tests/fixtures/config_no_notifier.yaml @@ -1,7 +1,8 @@ # Invalid config - no notifiers configured # This should fail validation with "no notifiers configured" error victorialogs: - url: "http://localhost:9428" + default: + url: "http://localhost:9428" defaults: throttle: diff --git a/tests/fixtures/config_no_template.yaml b/tests/fixtures/config_no_template.yaml index 7c1a841..da952dc 100644 --- a/tests/fixtures/config_no_template.yaml +++ b/tests/fixtures/config_no_template.yaml @@ -1,7 +1,8 @@ # Invalid config - no templates defined # This should fail validation with "no templates defined" error victorialogs: - url: "http://localhost:9428" + default: + url: "http://localhost:9428" notifiers: test: diff --git a/tests/fixtures/config_valid.yaml b/tests/fixtures/config_valid.yaml index f9e50da..c24c573 100644 --- a/tests/fixtures/config_valid.yaml +++ b/tests/fixtures/config_valid.yaml @@ -1,6 +1,7 @@ # Valid configuration fixture for tests victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" notifiers: default-mattermost: diff --git a/tests/fixtures/config_with_auth.yaml b/tests/fixtures/config_with_auth.yaml index 0176fac..b124158 100644 --- a/tests/fixtures/config_with_auth.yaml +++ b/tests/fixtures/config_with_auth.yaml @@ -2,15 +2,16 @@ # NOTE: This is a TEST FIXTURE only - credentials are intentionally hardcoded. # In production, always use environment variables: ${VL_USER}, ${VL_PASS}, etc. 
victorialogs: - url: "https://victorialogs.secure.local:9428" - basic_auth: - username: "testuser" - password: "testpassword" - headers: - X-API-Key: "secret-api-key-12345" - X-Custom-Header: "custom-value" - tls: - verify: false + default: + url: "https://victorialogs.secure.local:9428" + basic_auth: + username: "testuser" + password: "testpassword" + headers: + X-API-Key: "secret-api-key-12345" + X-Custom-Header: "custom-value" + tls: + verify: false notifiers: test-notifier: diff --git a/tests/fixtures/config_with_notifiers.yaml b/tests/fixtures/config_with_notifiers.yaml index e12825f..c248d15 100644 --- a/tests/fixtures/config_with_notifiers.yaml +++ b/tests/fixtures/config_with_notifiers.yaml @@ -1,6 +1,7 @@ # Configuration fixture with notifiers section (Story 6.2) victorialogs: - url: "http://victorialogs:9428" + default: + url: "http://victorialogs:9428" defaults: throttle: diff --git a/tests/fixtures/multi-file-collision/config.yaml b/tests/fixtures/multi-file-collision/config.yaml index 88f7a65..cc73fe4 100644 --- a/tests/fixtures/multi-file-collision/config.yaml +++ b/tests/fixtures/multi-file-collision/config.yaml @@ -1,7 +1,8 @@ # Config with a rule that will collide with one in rules.d/ victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 notifiers: test: diff --git a/tests/fixtures/multi-file-cross-ref/config.yaml b/tests/fixtures/multi-file-cross-ref/config.yaml index ce78aa1..cca4373 100644 --- a/tests/fixtures/multi-file-cross-ref/config.yaml +++ b/tests/fixtures/multi-file-cross-ref/config.yaml @@ -1,7 +1,8 @@ # Config for testing cross-file references victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 notifiers: test: diff --git a/tests/fixtures/multi-file-empty/config.yaml b/tests/fixtures/multi-file-empty/config.yaml index ebd599a..952fb46 100644 --- a/tests/fixtures/multi-file-empty/config.yaml +++ b/tests/fixtures/multi-file-empty/config.yaml @@ -1,7 +1,8 @@ # Config 
with empty .d/ directories (should work fine) victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 notifiers: test: diff --git a/tests/fixtures/multi-file-intra-collision/config.yaml b/tests/fixtures/multi-file-intra-collision/config.yaml index ecdde8a..7ad2bd6 100644 --- a/tests/fixtures/multi-file-intra-collision/config.yaml +++ b/tests/fixtures/multi-file-intra-collision/config.yaml @@ -1,7 +1,8 @@ # Config for testing collision within .d/ directory victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 notifiers: test: diff --git a/tests/fixtures/multi-file-invalid/config.yaml b/tests/fixtures/multi-file-invalid/config.yaml index efe4f00..4df2cd1 100644 --- a/tests/fixtures/multi-file-invalid/config.yaml +++ b/tests/fixtures/multi-file-invalid/config.yaml @@ -1,7 +1,8 @@ # Valid config, but rules.d/ contains invalid YAML victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 defaults: throttle: diff --git a/tests/fixtures/multi-file-notifier-collision/config.yaml b/tests/fixtures/multi-file-notifier-collision/config.yaml index c75f71b..64a7f98 100644 --- a/tests/fixtures/multi-file-notifier-collision/config.yaml +++ b/tests/fixtures/multi-file-notifier-collision/config.yaml @@ -1,5 +1,6 @@ victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 defaults: throttle: diff --git a/tests/fixtures/multi-file-only-d/config.yaml b/tests/fixtures/multi-file-only-d/config.yaml index 4304f87..5182365 100644 --- a/tests/fixtures/multi-file-only-d/config.yaml +++ b/tests/fixtures/multi-file-only-d/config.yaml @@ -2,7 +2,8 @@ # Everything loaded from .d/ directories victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 defaults: throttle: diff --git a/tests/fixtures/multi-file-template-collision/config.yaml b/tests/fixtures/multi-file-template-collision/config.yaml index 97cb76c..971177d 100644 --- 
a/tests/fixtures/multi-file-template-collision/config.yaml +++ b/tests/fixtures/multi-file-template-collision/config.yaml @@ -1,5 +1,6 @@ victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 notifiers: test: diff --git a/tests/fixtures/multi-file/config.yaml b/tests/fixtures/multi-file/config.yaml index 277187a..34c64c4 100644 --- a/tests/fixtures/multi-file/config.yaml +++ b/tests/fixtures/multi-file/config.yaml @@ -1,7 +1,8 @@ # Base config with inline rules/templates/notifiers for multi-file merge testing victorialogs: - url: http://victorialogs:9428 + default: + url: http://victorialogs:9428 defaults: throttle: diff --git a/tests/integration_notify.rs b/tests/integration_notify.rs index d28900d..2aefb26 100644 --- a/tests/integration_notify.rs +++ b/tests/integration_notify.rs @@ -27,6 +27,7 @@ fn make_payload_with_destinations(rule_name: &str, destinations: Vec) -> accent_color: Some("#ff0000".to_string()), }, rule_name: rule_name.to_string(), + vl_source: "vlprod".to_string(), destinations, log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), @@ -207,7 +208,7 @@ async fn test_mattermost_payload_format() { "fallback": "Alert from format_rule", "title": "Alert from format_rule", "text": "Test body content", - "footer": "valerter | format_rule | 15/01/2026 10:49:35 UTC" + "footer": "valerter | format_rule | vlprod | 15/01/2026 10:49:35 UTC" }] }))) .respond_with(ResponseTemplate::new(200)) diff --git a/tests/integration_validate.rs b/tests/integration_validate.rs index 70dab3f..f8623f2 100644 --- a/tests/integration_validate.rs +++ b/tests/integration_validate.rs @@ -57,8 +57,8 @@ fn validate_valid_config_exits_success() { stdout ); assert!( - stdout.contains("VictoriaLogs URL"), - "Output should show VictoriaLogs URL: {}", + stdout.contains("VictoriaLogs sources"), + "Output should show VictoriaLogs sources summary: {}", stdout ); assert!( diff --git 
a/tests/multi_source_integration.rs b/tests/multi_source_integration.rs new file mode 100644 index 0000000..8d9e7ff --- /dev/null +++ b/tests/multi_source_integration.rs @@ -0,0 +1,392 @@ +//! End-to-end integration test for the v2.0.0 multi-source VL core. +//! +//! Spins up **two** wiremock `MockServer` instances to stand in for two +//! VictoriaLogs sources (`vlprod`, `vldev`), serves distinct fixture events +//! on each, runs `RuleEngine` against the pair, and inspects the alert +//! payloads arriving on the notification queue to prove: +//! +//! 1. A rule with `vl_sources: [vlprod]` spawns exactly one task and its +//! payloads carry `vl_source == "vlprod"`. +//! 2. A rule with empty `vl_sources` fans out across every source and +//! payloads arrive tagged with each source name. +//! 3. The synthetic `vl_source` field is rendered in template output +//! (layer 1) and survives in `AlertPayload` for layer 2 notifiers. +//! 4. Per-source default throttle buckets are isolated: two sources sending +//! identical events both pass through on first delivery rather than the +//! second being dropped as a duplicate. +//! +//! The fixture corpus from `tests/fixtures/vl_events/` (chore/vl-fixtures-corpus) +//! is consumed via `common::vl_events::load_fixture`. 
+ +mod common; + +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Duration; + +use serde_json::Value; +use tokio::sync::broadcast; +use tokio_util::sync::CancellationToken; +use valerter::config::{ + CompiledParser, CompiledRule, CompiledTemplate, DefaultsConfig, JsonParserConfig, + MetricsConfig, NotifyConfig, RuntimeConfig, ThrottleConfig, VlSourceConfig, +}; +use valerter::notify::{AlertPayload, NotificationQueue}; +use valerter::{RuleEngine, TemplateEngine}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +use common::vl_events::load_fixture; + +/// Re-serialize a JSON fixture (which in the corpus is a single object) into +/// an NDJSON response body — one JSON object followed by a trailing newline. +fn ndjson_body(events: &[&Value]) -> Vec<u8> { + let mut out = Vec::new(); + for ev in events { + out.extend_from_slice( + serde_json::to_vec(ev) + .expect("fixture is valid JSON") + .as_slice(), + ); + out.push(b'\n'); + } + out +} + +/// Build an NDJSON mock that replies once with the given events.
+async fn mount_ndjson(server: &MockServer, events: &[&Value]) { + Mock::given(method("GET")) + .and(path("/select/logsql/tail")) + .respond_with( + ResponseTemplate::new(200).set_body_raw(ndjson_body(events), "application/x-ndjson"), + ) + .mount(server) + .await; +} + +fn rule(name: &str, vl_sources: Vec<String>) -> CompiledRule { + CompiledRule { + name: name.to_string(), + enabled: true, + query: "_stream:test".to_string(), + parser: CompiledParser { + regex: None, + json: Some(JsonParserConfig { + fields: vec!["_msg".to_string()], + }), + }, + throttle: None, + notify: NotifyConfig { + template: "tpl".to_string(), + mattermost_channel: None, + destinations: vec!["dest".to_string()], + }, + vl_sources, + } +} + +fn vl_source(uri: &str) -> VlSourceConfig { + VlSourceConfig { + url: uri.to_string(), + basic_auth: None, + headers: None, + tls: None, + } +} + +fn runtime(sources: BTreeMap<String, VlSourceConfig>, rules: Vec<CompiledRule>) -> RuntimeConfig { + let mut templates = std::collections::HashMap::new(); + templates.insert( + "tpl".to_string(), + CompiledTemplate { + title: "[{{ vl_source }}] {{ rule_name }}".to_string(), + body: "source={{ vl_source }} msg={{ _msg }}".to_string(), + email_body_html: None, + accent_color: None, + }, + ); + + RuntimeConfig { + victorialogs: sources, + defaults: DefaultsConfig { + throttle: ThrottleConfig { + key: None, + count: 5, + window: Duration::from_secs(60), + }, + timestamp_timezone: "UTC".to_string(), + }, + templates, + rules, + metrics: MetricsConfig::default(), + notifiers: None, + config_dir: std::path::PathBuf::from("."), + } +} + +/// Collect up to `max` alerts from a pre-created receiver within `deadline`, +/// returning whatever arrived. Using a receiver created BEFORE the engine +/// spawns avoids the broadcast channel's "messages before subscribe are lost" +/// behaviour (see tokio::sync::broadcast docs).
+async fn drain_from( + rx: &mut broadcast::Receiver<AlertPayload>, + max: usize, + deadline: Duration, +) -> Vec<AlertPayload> { + let mut out = Vec::new(); + let _ = tokio::time::timeout(deadline, async { + while out.len() < max { + match rx.recv().await { + Ok(p) => out.push(p), + Err(_) => break, + } + } + }) + .await; + out +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn multi_source_rule_with_vl_sources_list_targets_single_source() { + let vlprod = MockServer::start().await; + let vldev = MockServer::start().await; + + let ev_prod = load_fixture("nginx_http_400.json"); + let ev_dev = load_fixture("nginx_http_500.json"); + + mount_ndjson(&vlprod, &[&ev_prod]).await; + mount_ndjson(&vldev, &[&ev_dev]).await; + + let mut sources = BTreeMap::new(); + sources.insert("vlprod".to_string(), vl_source(&vlprod.uri())); + sources.insert("vldev".to_string(), vl_source(&vldev.uri())); + + // Rule pinned to vlprod only. vldev's stream must never produce an alert + // via this rule. + let rules = vec![rule("prod_only", vec!["vlprod".to_string()])]; + + let queue = NotificationQueue::new(64); + // Subscribe BEFORE spawning the engine so no payloads are lost. + let mut rx = queue.subscribe(); + + let cfg = runtime(sources, rules); + let engine = RuleEngine::new(cfg, reqwest::Client::new(), queue.clone()); + + let cancel = CancellationToken::new(); + let cancel_clone = cancel.clone(); + let handle = tokio::spawn(async move { engine.run(cancel_clone).await }); + + let alerts = drain_from(&mut rx, 1, Duration::from_secs(3)).await; + // Negative-evidence proof: vldev MUST NOT have served any tail request, + // since the only rule is pinned to vlprod. Catches regressions where the + // resolve_sources filter is bypassed and the rule fans out anyway.
+ let vldev_hits = vldev.received_requests().await.unwrap_or_default(); + cancel.cancel(); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; + + assert!( + !alerts.is_empty(), + "rule pinned to vlprod should have produced at least one alert" + ); + for a in &alerts { + assert_eq!( + a.vl_source, "vlprod", + "payload carried wrong vl_source: {}", + a.vl_source + ); + assert!( + a.message.title.contains("vlprod"), + "layer 1 title must contain rendered vl_source, got: {}", + a.message.title + ); + } + assert!( + vldev_hits.is_empty(), + "vldev served {} request(s) for a rule pinned to vlprod (negative-evidence assertion)", + vldev_hits.len() + ); +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn multi_source_rule_without_vl_sources_fans_out_across_all() { + let vlprod = MockServer::start().await; + let vldev = MockServer::start().await; + + let ev = load_fixture("k8s_pod_oom.json"); + mount_ndjson(&vlprod, &[&ev]).await; + mount_ndjson(&vldev, &[&ev]).await; + + let mut sources = BTreeMap::new(); + sources.insert("vlprod".to_string(), vl_source(&vlprod.uri())); + sources.insert("vldev".to_string(), vl_source(&vldev.uri())); + + // vl_sources empty = fan out across all sources. + let rules = vec![rule("fan_out", Vec::new())]; + + let queue = NotificationQueue::new(64); + let mut rx = queue.subscribe(); + let cfg = runtime(sources, rules); + let engine = RuleEngine::new(cfg, reqwest::Client::new(), queue.clone()); + + let cancel = CancellationToken::new(); + let cancel_clone = cancel.clone(); + let handle = tokio::spawn(async move { engine.run(cancel_clone).await }); + + // Drain a fixed time window with a high `max` so we don't exit before + // the slower of the two parallel source tasks delivers its first alert. 
+    let alerts = drain_from(&mut rx, 200, Duration::from_secs(2)).await;
+    cancel.cancel();
+    let _ = tokio::time::timeout(Duration::from_secs(1), handle).await;
+
+    let mut sources_seen: std::collections::HashSet<String> = Default::default();
+    for a in &alerts {
+        sources_seen.insert(a.vl_source.clone());
+    }
+
+    assert!(
+        sources_seen.contains("vlprod"),
+        "expected vlprod alert, got sources: {:?}",
+        sources_seen
+    );
+    assert!(
+        sources_seen.contains("vldev"),
+        "expected vldev alert, got sources: {:?}",
+        sources_seen
+    );
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+async fn multi_source_default_throttle_buckets_are_isolated_per_source() {
+    // The same event delivered twice on two different sources must not be
+    // deduped as a single bucket when the rule uses the default throttle
+    // (no custom key). The default key is `{rule}-{source}:global`, so
+    // each source has its own bucket and both alerts should land.
+    let vlprod = MockServer::start().await;
+    let vldev = MockServer::start().await;
+
+    // Low count (=1) would cause a cross-source collision under v1 default.
+    let ev = load_fixture("nginx_http_500.json");
+    mount_ndjson(&vlprod, &[&ev]).await;
+    mount_ndjson(&vldev, &[&ev]).await;
+
+    let mut sources = BTreeMap::new();
+    sources.insert("vlprod".to_string(), vl_source(&vlprod.uri()));
+    sources.insert("vldev".to_string(), vl_source(&vldev.uri()));
+
+    // Tight throttle count=1: if buckets were shared the second source
+    // would be blocked.
+    let mut cfg = runtime(sources, vec![rule("isolate", Vec::new())]);
+    cfg.defaults.throttle.count = 1;
+
+    let queue = NotificationQueue::new(64);
+    let mut rx = queue.subscribe();
+    let engine = RuleEngine::new(cfg, reqwest::Client::new(), queue.clone());
+
+    let cancel = CancellationToken::new();
+    let cancel_clone = cancel.clone();
+    let handle = tokio::spawn(async move { engine.run(cancel_clone).await });
+
+    // Drain for a fixed window long enough for both sources' first stream
+    // to land. With throttle count=1 each (rule, source) bucket allows only
+    // one alert through, but both buckets are independent so both deliver.
+    // Use a high `max` so we don't exit early before both sources land.
+    let alerts = drain_from(&mut rx, 100, Duration::from_secs(2)).await;
+    cancel.cancel();
+    let _ = tokio::time::timeout(Duration::from_secs(1), handle).await;
+
+    let sources_seen: std::collections::HashSet<String> =
+        alerts.iter().map(|a| a.vl_source.clone()).collect();
+
+    assert!(
+        sources_seen.contains("vlprod") && sources_seen.contains("vldev"),
+        "per-source default throttle buckets must be isolated; saw: {:?} (alerts: {})",
+        sources_seen,
+        alerts.len()
+    );
+}
+
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
+async fn multi_source_event_field_named_vl_source_is_masked_by_synthetic() {
+    // If an event literally carries a `vl_source` field, the synthetic
+    // value wins in layer 1 and in the AlertPayload (collision policy
+    // matches rule_name, v1.2.1).
+    let server = MockServer::start().await;
+
+    // Build an event with a hostile literal vl_source value. Use a minimal
+    // VL shape: _time, _stream, _msg, plus the collision field.
+ let hostile: Value = serde_json::json!({ + "_time": "2026-04-15T10:00:00Z", + "_stream": "{}", + "_msg": "hostile", + "vl_source": "evil" + }); + mount_ndjson(&server, &[&hostile]).await; + + let mut sources = BTreeMap::new(); + sources.insert("real_source".to_string(), vl_source(&server.uri())); + + let rules = vec![rule("collision", Vec::new())]; + + let queue = NotificationQueue::new(64); + let mut rx = queue.subscribe(); + let engine = RuleEngine::new( + runtime(sources, rules), + reqwest::Client::new(), + queue.clone(), + ); + + let cancel = CancellationToken::new(); + let cancel_clone = cancel.clone(); + let handle = tokio::spawn(async move { engine.run(cancel_clone).await }); + + let alerts = drain_from(&mut rx, 1, Duration::from_secs(3)).await; + cancel.cancel(); + let _ = tokio::time::timeout(Duration::from_secs(1), handle).await; + + assert!(!alerts.is_empty(), "expected at least one alert"); + for a in &alerts { + assert_eq!( + a.vl_source, "real_source", + "AlertPayload.vl_source must be synthetic, not event-literal 'evil'" + ); + assert!( + a.message.title.contains("real_source"), + "layer 1 title must show synthetic vl_source, got: {}", + a.message.title + ); + assert!( + !a.message.title.contains("evil"), + "title must not leak event-literal 'evil' into rendered output: {}", + a.message.title + ); + } +} + +#[tokio::test] +async fn template_engine_renders_vl_source_directly_without_http() { + // Smoke test the template-level contract independently of the engine + // so that if the integration tests above time out in CI under load we + // still have a fast unit-level guarantee that vl_source threads through + // layer 1. This also doubles as an assertion that the fixture corpus is + // consumable from template rendering (guards against future shape drift). 
+ let mut templates = std::collections::HashMap::new(); + templates.insert( + "tpl".to_string(), + CompiledTemplate { + title: "[{{ vl_source }}] {{ rule_name }} {{ _msg }}".to_string(), + body: "b".to_string(), + email_body_html: None, + accent_color: None, + }, + ); + let engine = Arc::new(TemplateEngine::new(templates)); + + let ev = load_fixture("nginx_http_400.json"); + let rendered = engine.render_with_fallback("tpl", &ev, "nginx_rule", "vlprod"); + + assert!( + rendered.title.starts_with("[vlprod] nginx_rule "), + "expected title to include synthetic vl_source, got: {}", + rendered.title + ); +} diff --git a/tests/smtp_integration.rs b/tests/smtp_integration.rs index fd25937..d35c7c3 100644 --- a/tests/smtp_integration.rs +++ b/tests/smtp_integration.rs @@ -217,6 +217,7 @@ fn make_alert_payload(rule_name: &str, title: &str, body: &str) -> AlertPayload accent_color: Some("#ff0000".to_string()), }, rule_name: rule_name.to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), @@ -404,6 +405,7 @@ async fn test_send_email_html_format() { accent_color: Some("#ff0000".to_string()), }, rule_name: "html_rule".to_string(), + vl_source: "vlprod".to_string(), destinations: vec![], log_timestamp: "2026-01-15T10:49:35.799Z".to_string(), log_timestamp_formatted: "15/01/2026 10:49:35 UTC".to_string(), diff --git a/tests/template_against_vl_fixtures.rs b/tests/template_against_vl_fixtures.rs index 20c0a8f..a6a846a 100644 --- a/tests/template_against_vl_fixtures.rs +++ b/tests/template_against_vl_fixtures.rs @@ -54,7 +54,7 @@ fn smoke_every_fixture_renders_msg_and_time_matching_event() { .unwrap_or_else(|| panic!("fixture {} missing string _time", name)); let expected = format!("{} @ {}", msg, time); let rendered = engine - .render("t", &value, "smoke") + .render("t", &value, "smoke", "vlprod") .unwrap_or_else(|e| panic!("fixture {} failed to 
render: {}", name, e)); assert_eq!( rendered.body, expected, @@ -92,7 +92,7 @@ fn regression_gh25_dotted_keys_render_their_value() { continue; }; let rendered = engine - .render("t", &value, "gh25") + .render("t", &value, "gh25", "vlprod") .unwrap_or_else(|e| panic!("fixture {} failed to render for #25: {}", name, e)); assert_eq!( rendered.body, expected, @@ -116,7 +116,7 @@ fn regression_empty_fields_render_as_empty_string() { let engine = engine_with("t", "[{{ request_id }}][{{ user_id }}][{{ error }}]"); let event = load_fixture("edge_empty_fields.json"); let rendered = engine - .render("t", &event, "empty_fields") + .render("t", &event, "empty_fields", "vlprod") .expect("render should succeed with lenient undefined"); assert_eq!(rendered.body, "[][][]"); } @@ -135,7 +135,7 @@ fn raw_source_fixtures_render_missing_as_empty_under_lenient() { ); for (name, value) in fixtures { let rendered = engine - .render("t", &value, "raw") + .render("t", &value, "raw", "vlprod") .unwrap_or_else(|e| panic!("fixture {} failed lenient render: {}", name, e)); assert!(!rendered.title.is_empty(), "title empty for {}", name); assert!( @@ -153,7 +153,7 @@ fn unicode_fixture_preserves_cjk_and_emoji() { let engine = engine_with("{{ _msg }}", "{{ _msg }}"); let event = load_fixture("edge_unicode_msg.json"); let rendered = engine - .render("t", &event, "unicode") + .render("t", &event, "unicode", "vlprod") .expect("render should succeed on unicode event"); assert!(rendered.body.contains("支付失败"), "lost CJK codepoints"); assert!(rendered.body.contains('\u{2705}'), "lost emoji codepoint"); From d5e156432af48c895d05287cbfd21cf2357de71d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Xavier=20THIRY?= Date: Thu, 16 Apr 2026 14:36:26 +0200 Subject: [PATCH 3/6] feat!: multi-source observability and guardrails (v2.0.0 part 2, #34) Adds the `vl_source` Prometheus label to every per-rule counter, replaces the legacy `valerter_victorialogs_up{rule_name}` gauge with a per-source 
`valerter_vl_source_up{vl_source}`, introduces a `defaults.max_streams` cap with load-time enforcement, applies +/-10% uniform jitter on reconnect backoff per (rule, source) task to break thundering-herd alignment, and ships a `/metrics` snapshot integration test that catches accidental relabel or rename in future PRs. Multi-source operators can now attribute every alert and error rate to a specific source. The new gauge makes partial-source outages visible without requiring per-rule reachability inference. The cap prevents an accidental (rule x source) fan-out from DoS'ing a backend at startup. Post-review hardening folded in: dated v1.2.1 in CHANGELOG, added PromQL migration snippet for dashboards, extended snapshot coverage to include lines_discarded / query_duration / last_query_timestamp / per-notifier sentinel counters, made the jitter floor test exercise the actual clamp branch, demoted backoff_delay_default to pub(crate), extended ReconnectCallback::on_reconnect to receive vl_source, and rejected max_streams=0 at load with a clear error. Cargo.toml not bumped here; version bump lives in the release PR. 
--- CHANGELOG.md | 21 +- Cargo.lock | 1 + Cargo.toml | 3 + docs/configuration.md | 17 ++ docs/metrics.md | 61 ++++-- src/config/mod.rs | 6 +- src/config/tests.rs | 183 +++++++++++++++++ src/config/types.rs | 53 +++++ src/engine.rs | 103 ++++++---- src/main.rs | 34 ++- src/metrics.rs | 118 ++++++++--- src/notify/email.rs | 12 +- src/notify/mattermost.rs | 15 +- src/notify/queue.rs | 3 +- src/notify/telegram.rs | 9 +- src/notify/webhook.rs | 15 +- src/parser.rs | 40 ++-- src/tail.rs | 248 ++++++++++++++++++---- src/throttle.rs | 17 +- tests/integration_streaming.rs | 81 +++++--- tests/metrics_snapshot.rs | 330 ++++++++++++++++++++++++++++++ tests/multi_source_integration.rs | 1 + 22 files changed, 1171 insertions(+), 200 deletions(-) create mode 100644 tests/metrics_snapshot.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index 98aff3c..f9f67e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,13 +40,32 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Notifier output formats extended with `vl_source`.** The Mattermost footer now reads `valerter | | | ` instead of `valerter | | `. The default webhook payload exposes `vl_source` as a top-level JSON field. Downstream parsers / dashboards that match exact strings in either output need to update. +- **All per-rule Prometheus metrics now also carry a `vl_source` label.** Affected counters: `valerter_alerts_sent_total`, `valerter_alerts_throttled_total`, `valerter_alerts_passed_total`, `valerter_alerts_failed_total`, `valerter_email_recipient_errors_total`, `valerter_lines_discarded_total`, `valerter_logs_matched_total`, `valerter_notify_errors_total`, `valerter_parse_errors_total`, `valerter_reconnections_total`, `valerter_rule_panics_total`, `valerter_rule_errors_total`. Affected gauge/histogram: `valerter_last_query_timestamp`, `valerter_query_duration_seconds`. 
Dashboards and alerts that grouped by `rule_name` alone keep working but get an extra `vl_source` dimension; PromQL using `sum by (rule_name) (...)` still rolls up correctly. `valerter_queue_size` stays unlabeled (the queue is shared, not per-source). + +- **`valerter_victorialogs_up{rule_name}` removed and replaced by `valerter_vl_source_up{vl_source}`.** The new gauge is per-source (one value per configured source, regardless of how many rules tail it) since reachability is a property of the source, not the rule. Alerts and panels need to migrate from per-rule to per-source semantics. Examples: + + ```promql + # v1.x (per-rule): valerter_victorialogs_up{rule_name="nginx-5xx"} == 0 + # v2.0.0 (per-source): valerter_vl_source_up{vl_source="prod"} == 0 + + # v1.x (any rule down): min(valerter_victorialogs_up) == 0 + # v2.0.0 (any source): min(valerter_vl_source_up) == 0 + ``` + + The label key is now `vl_source` (not `rule_name`), and the cardinality drops from `|rules|` to `|sources|`. + +- **`defaults.max_streams` cap introduced (default 50).** Total VictoriaLogs streams = sum of `(rule, source)` pairs spawned for enabled rules. Breaching the cap fails the config at load with both the actual count and the cap value. Configurable via `defaults.max_streams: `. Disabled rules do not contribute. Prevents accidental fan-out from DoSing a backend. + ### Added - **Multi-source VictoriaLogs support** (issue #34). The engine spawns one task per `(rule, source)` pair with per-source cancellation and reconnect isolation, so a single unhealthy source does not stop alerts on the others. - **`{{ vl_source }}` template variable** available everywhere `{{ rule_name }}` is: layer 1 templates (`title`, `body`, `email_body_html`), `throttle.key`, and notifier-level layer 2 contexts (`subject_template`, `body_template`). Always non-empty, owned `String`, equal to the source name currently processing the event. 
Synthetic value wins over any event field literally named `vl_source` (matches the `rule_name` collision policy). - **`AlertPayload.vl_source`** propagated end-to-end so notifiers can render the source name. See Breaking changes above for the related output format updates on Mattermost and webhook destinations. +- **`valerter_vl_source_up{vl_source}` per-source reachability gauge.** Initialized to 0 for every configured source at startup; engine flips to 1 on tail connect success and back to 0 on permanent failure or stream error. Replaces the v1.x per-rule `valerter_victorialogs_up`. +- **`±10%` uniform jitter on reconnect backoff** (per `(rule, source)` task). Sources behind a flapping load balancer no longer reconnect in lock-step, breaking the thundering-herd alignment over a few cycles. Hardcoded jitter range; not configurable in this release. +- **`tests/metrics_snapshot.rs` integration test.** Spins up a 2-source 1-rule engine, scrapes `/metrics`, and asserts the set of metric names + label keys (not values) against an inline expected string. Catches accidental relabel/rename in future PRs. -## [1.2.1] - unreleased +## [1.2.1] - 2026-04-16 ### Fixed - **`{{ rule_name }}` available in top-level templates and throttle key** (issue #31). `rule_name` is now injected into the render context of `templates..title`, `body`, and `email_body_html`, and also into the `throttle.key` template, not just the notifier-level `subject_template` / `body_template`. Configs that referenced `{{ rule_name }}` in a top-level template previously rendered an empty string; they now render the rule name. If an event field happens to be literally named `rule_name`, the synthetic rule name wins, matching the collision policy of the existing notifier-level contexts. 
diff --git a/Cargo.lock b/Cargo.lock index 7c3e0a4..0c93d4e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2350,6 +2350,7 @@ dependencies = [ "minijinja", "moka", "portpicker", + "rand 0.8.5", "regex", "reqwest", "serde", diff --git a/Cargo.toml b/Cargo.toml index d960ad6..25ac221 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,6 +31,9 @@ minijinja = { version = "2.12", features = ["builtins", "json"] } # Caching/Throttle moka = { version = "0.12", features = ["sync"] } +# Random (jitter on reconnect backoff to break thundering-herd alignment). +rand = "0.8" + # Observability tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } diff --git a/docs/configuration.md b/docs/configuration.md index 512b056..70608e2 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -216,8 +216,25 @@ defaults: count: 5 # Max alerts per window window: 60s # Time window (e.g., 60s, 5m, 1h) # timestamp_timezone: "Europe/Paris" # Optional: timezone for formatted timestamps (default: UTC) + # max_streams: 50 # Optional: hard cap on total VictoriaLogs streams (default: 50) ``` +### `max_streams` — fan-out guardrail + +Multi-source deployments spawn one stream per `(enabled rule, target source)` +pair. With unscoped fan-out rules and many sources the total scales as +`rules × sources`, which can DoS a backend by accident. `defaults.max_streams` +caps that total at load time: + +``` +total = sum(if rule.vl_sources is empty then sources.len() else rule.vl_sources.len() + for rule in enabled_rules) +``` + +Disabled rules do not contribute. Breaching the cap fails `valerter --validate` +with a message stating both the actual count and the cap so an operator knows +whether to raise the cap or trim rules. Default: `50`. + ### Timestamp Timezone The `timestamp_timezone` setting controls the timezone used for `{{ log_timestamp_formatted }}` in templates and Mattermost footers. 
diff --git a/docs/metrics.md b/docs/metrics.md index 8c2cf81..f900c9d 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -12,32 +12,40 @@ metrics: ## Exposed Metrics +> **v2.0.0 — multi-source label.** Every per-rule metric also carries a +> `vl_source` label naming the VictoriaLogs source that produced the event. +> The legacy `valerter_victorialogs_up{rule_name}` gauge is **removed** and +> replaced by `valerter_vl_source_up{vl_source}` (per-source, no `rule_name`). +> Dashboards that grouped by `rule_name` alone now have an extra dimension +> available; alerts that matched on `valerter_victorialogs_up` must move to +> `valerter_vl_source_up`. + ### Counters | Metric | Labels | Description | |--------|--------|-------------| -| `valerter_alerts_sent_total` | `rule_name`, `notifier_name`, `notifier_type` | Alerts sent successfully | -| `valerter_alerts_throttled_total` | `rule_name` | Alerts blocked by throttling | -| `valerter_alerts_passed_total` | `rule_name` | Alerts that passed throttling | +| `valerter_alerts_sent_total` | `rule_name`, `vl_source`, `notifier_name`, `notifier_type` | Alerts sent successfully | +| `valerter_alerts_throttled_total` | `rule_name`, `vl_source` | Alerts blocked by throttling | +| `valerter_alerts_passed_total` | `rule_name`, `vl_source` | Alerts that passed throttling | | `valerter_alerts_dropped_total` | - | Alerts dropped (queue full, global counter) | -| `valerter_alerts_failed_total` | `rule_name`, `notifier_name`, `notifier_type` | Alerts that permanently failed | -| `valerter_email_recipient_errors_total` | `rule_name`, `notifier_name` | Email delivery failures per recipient | -| `valerter_lines_discarded_total` | `rule_name`, `reason` | Log lines discarded (e.g., reason=oversized for lines > 1MB) | -| `valerter_logs_matched_total` | `rule_name` | Logs matched by rule (before throttling) | +| `valerter_alerts_failed_total` | `rule_name`, `vl_source`, `notifier_name`, `notifier_type` | Alerts that permanently failed | +| 
`valerter_email_recipient_errors_total` | `rule_name`, `vl_source`, `notifier_name` | Email delivery failures per recipient | +| `valerter_lines_discarded_total` | `rule_name`, `vl_source`, `reason` | Log lines discarded (e.g., reason=oversized for lines > 1MB) | +| `valerter_logs_matched_total` | `rule_name`, `vl_source` | Logs matched by rule (before throttling) | | `valerter_notifier_config_errors_total` | `notifier`, `error_type` | Notifier configuration errors (e.g., env var resolution) | -| `valerter_notify_errors_total` | `rule_name`, `notifier_name`, `notifier_type` | Notification send errors | -| `valerter_parse_errors_total` | `rule_name`, `error_type` | Parsing errors | -| `valerter_reconnections_total` | `rule_name` | VictoriaLogs reconnections | -| `valerter_rule_panics_total` | `rule_name` | Rule task panics (auto-restarted) | -| `valerter_rule_errors_total` | `rule_name` | Fatal rule errors | +| `valerter_notify_errors_total` | `rule_name`, `vl_source`, `notifier_name`, `notifier_type` | Notification send errors | +| `valerter_parse_errors_total` | `rule_name`, `vl_source`, `error_type` | Parsing errors | +| `valerter_reconnections_total` | `rule_name`, `vl_source` | VictoriaLogs reconnections | +| `valerter_rule_panics_total` | `rule_name`, `vl_source` | Rule task panics (auto-restarted) | +| `valerter_rule_errors_total` | `rule_name`, `vl_source` | Fatal rule errors | ### Gauges | Metric | Labels | Description | |--------|--------|-------------| -| `valerter_queue_size` | - | Current notification queue size | -| `valerter_last_query_timestamp` | `rule_name` | Unix timestamp of last successful query | -| `valerter_victorialogs_up` | `rule_name` | VictoriaLogs connection status (1=connected, 0=disconnected or error) | +| `valerter_queue_size` | - | Current notification queue size (shared queue, not per-source) | +| `valerter_last_query_timestamp` | `rule_name`, `vl_source` | Unix timestamp of last successful query chunk | +| `valerter_vl_source_up` | 
`vl_source` | Per-source VictoriaLogs reachability (1=connected, 0=disconnected). Replaces v1.x `valerter_victorialogs_up{rule_name}`. | | `valerter_uptime_seconds` | - | Time since valerter started | | `valerter_build_info` | `version` | Build information (always 1) | @@ -45,7 +53,16 @@ metrics: | Metric | Labels | Description | |--------|--------|-------------| -| `valerter_query_duration_seconds` | `rule_name` | VictoriaLogs query latency (time to first chunk) | +| `valerter_query_duration_seconds` | `rule_name`, `vl_source` | VictoriaLogs query latency (time to first chunk) | + +### Reconnect Backoff Jitter + +Reconnect attempts apply `±10%` uniform jitter per `(rule, source)` task on top +of the existing exponential backoff (1s base, 60s cap). When `N` sources behind +a flapping load balancer would otherwise reconnect in lock-step, the jitter +spreads attempts in a `[0.9·D, 1.1·D]` window so the herd dissolves over a few +cycles. The jitter is hardcoded (not configurable in v2.0.0) and never drops +the effective delay below 100ms. ## Prometheus Scrape Configuration @@ -74,15 +91,15 @@ groups: summary: "Valerter rule {{ $labels.rule_name }} not querying" description: "No queries received from rule {{ $labels.rule_name }} for over 5 minutes" - # VictoriaLogs connection lost - - alert: ValerterVictoriaLogsDown - expr: valerter_victorialogs_up == 0 + # VictoriaLogs source unreachable (per-source gauge, v2.0.0) + - alert: ValerterVlSourceDown + expr: valerter_vl_source_up == 0 for: 2m labels: severity: critical annotations: - summary: "Valerter disconnected from VictoriaLogs" - description: "Rule {{ $labels.rule_name }} lost connection to VictoriaLogs. Check network and VictoriaLogs health." + summary: "Valerter disconnected from VictoriaLogs source {{ $labels.vl_source }}" + description: "Source {{ $labels.vl_source }} is unreachable. Check network and VictoriaLogs health." 
# Alerts failing to send - alert: ValerterAlertsFailing @@ -128,7 +145,7 @@ groups: ### Health -- `valerter_victorialogs_up` - VictoriaLogs connection status (1=connected, 0=error) +- `valerter_vl_source_up` - Per-source VictoriaLogs reachability (1=connected, 0=disconnected) - `valerter_uptime_seconds` - Process uptime (detect restarts) ### Performance diff --git a/src/config/mod.rs b/src/config/mod.rs index b637122..783eaae 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -21,9 +21,9 @@ pub use runtime::{ }; pub use secret::SecretString; pub use types::{ - BasicAuthConfig, Config, DEFAULT_CONFIG_PATH, DefaultsConfig, JsonParserConfig, MetricsConfig, - NotifyConfig, ParserConfig, RuleConfig, TemplateConfig, ThrottleConfig, TlsConfig, - VlSourceConfig, + BasicAuthConfig, Config, DEFAULT_CONFIG_PATH, DEFAULT_MAX_STREAMS, DefaultsConfig, + JsonParserConfig, MetricsConfig, NotifyConfig, ParserConfig, RuleConfig, TemplateConfig, + ThrottleConfig, TlsConfig, VlSourceConfig, }; pub use validation::validate_template_render; diff --git a/src/config/tests.rs b/src/config/tests.rs index de15c1b..8e47c06 100644 --- a/src/config/tests.rs +++ b/src/config/tests.rs @@ -358,6 +358,7 @@ fn make_runtime_config_with_destinations(destinations: Vec) -> RuntimeCo window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: super::DEFAULT_MAX_STREAMS, }, templates: { let mut t = std::collections::HashMap::new(); @@ -472,6 +473,7 @@ fn validate_collects_all_errors() { window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: super::DEFAULT_MAX_STREAMS, }, templates: { let mut t = std::collections::HashMap::new(); @@ -558,6 +560,7 @@ fn validate_throttle_key_template() { window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: super::DEFAULT_MAX_STREAMS, }, templates: { let mut t = std::collections::HashMap::new(); @@ -629,6 +632,7 @@ fn validate_nonexistent_notify_template_fails() { 
window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: super::DEFAULT_MAX_STREAMS, }, templates: { let mut t = std::collections::HashMap::new(); @@ -1033,6 +1037,7 @@ fn validate_rule_destinations_collects_all_errors() { window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: super::DEFAULT_MAX_STREAMS, }, templates: { let mut t = std::collections::HashMap::new(); @@ -2336,3 +2341,181 @@ rules: err_str ); } + +// ============================================================ +// v2.0.0 part 2: defaults.max_streams cap (multi-source guardrail). +// +// Total fan-out is `sum_over_enabled_rules(if vl_sources.is_empty() then +// sources.len() else vl_sources.len())`. Disabled rules do not contribute. +// Breach is rejected at load with both numbers in the error. +// ============================================================ + +/// YAML helper: build a config with `n_sources` declared sources, `n_rules` +/// enabled rules each with empty `vl_sources` (full fan-out), and an explicit +/// `defaults.max_streams: cap`. +fn config_with_fan_out(n_sources: usize, n_rules: usize, cap: usize) -> String { + let mut yaml = String::from("victorialogs:\n"); + for i in 0..n_sources { + yaml.push_str(&format!(" src{}:\n url: http://h{}:9428\n", i, i)); + } + yaml.push_str(&format!( + "defaults:\n max_streams: {}\n throttle:\n count: 5\n window: 1m\n", + cap + )); + yaml.push_str("templates:\n t:\n title: x\n body: y\n"); + yaml.push_str( + "notifiers:\n n:\n type: mattermost\n webhook_url: https://example.com/hooks/x\n", + ); + yaml.push_str("rules:\n"); + for i in 0..n_rules { + yaml.push_str(&format!( + " - name: r{}\n query: 't'\n parser:\n json:\n fields: [_msg]\n notify:\n template: t\n destinations: [n]\n", + i + )); + } + yaml +} + +#[test] +fn validate_max_streams_under_cap_passes() { + // 3 sources × 4 fan-out rules = 12 streams ≤ 50. 
+ let yaml = config_with_fan_out(3, 4, 50); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + config + .validate() + .expect("12 streams under cap of 50 should validate"); +} + +#[test] +fn validate_max_streams_at_exact_cap_passes() { + // 5 × 10 = 50, exactly the cap — boundary case allowed. + let yaml = config_with_fan_out(5, 10, 50); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + config + .validate() + .expect("50 streams at cap of 50 should validate"); +} + +#[test] +fn validate_max_streams_breach_fails_with_actual_and_cap() { + // 5 sources × 12 fan-out rules = 60 > 50. + let yaml = config_with_fan_out(5, 12, 50); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + let errors = config.validate().expect_err("60 streams > cap should fail"); + let has_max_streams_error = errors.iter().any(|e| match e { + crate::error::ConfigError::ValidationError(msg) => { + msg.contains("max_streams") && msg.contains("60") && msg.contains("50") + } + _ => false, + }); + assert!( + has_max_streams_error, + "expected max_streams error mentioning actual=60 and cap=50, got: {:?}", + errors + ); +} + +#[test] +fn validate_max_streams_default_value_is_fifty() { + // Omit `defaults.max_streams` entirely → DEFAULT_MAX_STREAMS (50). 51 + // streams must fail; the default is what the cap reads as. 
+ let mut yaml = String::from("victorialogs:\n"); + for i in 0..51 { + yaml.push_str(&format!(" src{}:\n url: http://h{}:9428\n", i, i)); + } + yaml.push_str("defaults:\n throttle:\n count: 5\n window: 1m\n"); + yaml.push_str("templates:\n t:\n title: x\n body: y\n"); + yaml.push_str( + "notifiers:\n n:\n type: mattermost\n webhook_url: https://example.com/hooks/x\n", + ); + yaml.push_str("rules:\n - name: r0\n query: 't'\n parser:\n json:\n fields: [_msg]\n notify:\n template: t\n destinations: [n]\n"); + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + let errors = config + .validate() + .expect_err("51 streams under default cap of 50 should fail"); + assert!( + errors.iter().any(|e| matches!( + e, + crate::error::ConfigError::ValidationError(m) if m.contains("max_streams") + )), + "expected max_streams error, got: {:?}", + errors + ); +} + +#[test] +fn validate_max_streams_disabled_rules_do_not_contribute() { + // 5 sources × (1 enabled fan-out rule + 100 disabled fan-out rules) = + // 5 enabled streams. Even though raw `vl_sources.len()` would sum to + // hundreds if we counted disabled rules, the cap is on enabled only. 
+ let mut yaml = String::from("victorialogs:\n"); + for i in 0..5 { + yaml.push_str(&format!(" src{}:\n url: http://h{}:9428\n", i, i)); + } + yaml.push_str("defaults:\n max_streams: 5\n throttle:\n count: 5\n window: 1m\n"); + yaml.push_str("templates:\n t:\n title: x\n body: y\n"); + yaml.push_str( + "notifiers:\n n:\n type: mattermost\n webhook_url: https://example.com/hooks/x\n", + ); + yaml.push_str("rules:\n"); + yaml.push_str(" - name: enabled_rule\n query: 't'\n parser:\n json:\n fields: [_msg]\n notify:\n template: t\n destinations: [n]\n"); + for i in 0..100 { + yaml.push_str(&format!( + " - name: disabled{}\n enabled: false\n query: 't'\n parser:\n json:\n fields: [_msg]\n notify:\n template: t\n destinations: [n]\n", + i + )); + } + let config: Config = serde_yaml::from_str(&yaml).unwrap(); + config + .validate() + .expect("disabled rules should not contribute to fan-out total"); +} + +#[test] +fn validate_max_streams_pinned_rule_counts_only_listed_sources() { + // 5 sources, 2 pinned rules each `vl_sources: [src0]`, 3 fan-out rules. + // total = 2*1 + 3*5 = 17, well under default cap. 
+ let yaml = r#" +victorialogs: + src0: { url: http://h0:9428 } + src1: { url: http://h1:9428 } + src2: { url: http://h2:9428 } + src3: { url: http://h3:9428 } + src4: { url: http://h4:9428 } +defaults: + throttle: + count: 5 + window: 1m +templates: + t: { title: x, body: y } +notifiers: + n: { type: mattermost, webhook_url: https://example.com/hooks/x } +rules: + - name: pinned1 + query: 't' + parser: { json: { fields: [_msg] } } + vl_sources: [src0] + notify: { template: t, destinations: [n] } + - name: pinned2 + query: 't' + parser: { json: { fields: [_msg] } } + vl_sources: [src0] + notify: { template: t, destinations: [n] } + - name: fan1 + query: 't' + parser: { json: { fields: [_msg] } } + notify: { template: t, destinations: [n] } + - name: fan2 + query: 't' + parser: { json: { fields: [_msg] } } + notify: { template: t, destinations: [n] } + - name: fan3 + query: 't' + parser: { json: { fields: [_msg] } } + notify: { template: t, destinations: [n] } +"#; + let config: Config = serde_yaml::from_str(yaml).unwrap(); + config + .validate() + .expect("17 streams under default cap of 50 should validate"); +} diff --git a/src/config/types.rs b/src/config/types.rs index 29eefe7..066dd4a 100644 --- a/src/config/types.rs +++ b/src/config/types.rs @@ -197,6 +197,25 @@ pub struct DefaultsConfig { /// Timezone for formatted timestamps (e.g., "UTC", "Europe/Paris"). #[serde(default = "default_timestamp_timezone")] pub timestamp_timezone: String, + /// Maximum total number of concurrent VictoriaLogs streams (sum of + /// `(rule, source)` task pairs for enabled rules). Hard cap enforced at + /// load time to prevent unintentional fan-out from DoSing a backend. + /// + /// Default: 50. Configurable via `defaults.max_streams: ` in + /// `config.yaml`. + #[serde(default = "default_max_streams")] + pub max_streams: usize, +} + +/// Default upper bound on the total number of concurrent VL streams. 
+/// +/// Picked to comfortably accommodate small/mid-size deployments (~10 sources × +/// a handful of fan-out rules) while still failing fast on accidentally large +/// fan-outs. Configurable per-deployment via `defaults.max_streams`. +pub const DEFAULT_MAX_STREAMS: usize = 50; + +fn default_max_streams() -> usize { + DEFAULT_MAX_STREAMS } fn default_timestamp_timezone() -> String { @@ -768,6 +787,40 @@ impl Config { ))); } + // Validate `defaults.max_streams` cap against the actual fan-out. + // Total stream count = sum, over enabled rules, of: + // - `sources.len()` if `rule.vl_sources` is empty (fan out across all) + // - `rule.vl_sources.len()` otherwise. + // Disabled rules do not contribute. Enforced at load (not runtime) so + // a misconfigured fan-out fails fast at startup. + if self.defaults.max_streams == 0 { + errors.push(ConfigError::ValidationError( + "defaults.max_streams must be >= 1 (a value of 0 would reject every config). \ + Omit the field to use the default (50) or set an explicit positive value." + .to_string(), + )); + } + let source_count = self.victorialogs.len(); + let total_streams: usize = self + .rules + .iter() + .filter(|r| r.enabled) + .map(|r| { + if r.vl_sources.is_empty() { + source_count + } else { + r.vl_sources.len() + } + }) + .sum(); + if total_streams > self.defaults.max_streams { + errors.push(ConfigError::ValidationError(format!( + "defaults.max_streams exceeded: {} stream(s) required by enabled rules > cap of {}. 
\ + Either raise `defaults.max_streams` or trim rules / `vl_sources` to reduce fan-out.", + total_streams, self.defaults.max_streams + ))); + } + if errors.is_empty() { Ok(()) } else { diff --git a/src/engine.rs b/src/engine.rs index cb03c39..4fed0cc 100644 --- a/src/engine.rs +++ b/src/engine.rs @@ -281,7 +281,8 @@ impl RuleEngine { ); metrics::counter!( "valerter_rule_errors_total", - "rule_name" => rule_name + "rule_name" => rule_name, + "vl_source" => vl_source, ).increment(1); handle_to_context.remove(&task_id); } @@ -297,7 +298,8 @@ impl RuleEngine { metrics::counter!( "valerter_rule_panics_total", - "rule_name" => rule_name.clone() + "rule_name" => rule_name.clone(), + "vl_source" => vl_source.clone(), ).increment(1); if !cancel.is_cancelled() { @@ -324,7 +326,12 @@ impl RuleEngine { error = %join_error, "Rule task panicked but context not found - CRITICAL" ); - metrics::counter!("valerter_rule_panics_total", "rule_name" => "unknown").increment(1); + metrics::counter!( + "valerter_rule_panics_total", + "rule_name" => "unknown", + "vl_source" => "unknown", + ) + .increment(1); } } Err(join_error) => { @@ -382,7 +389,11 @@ impl ThrottleResetCallback { } impl ReconnectCallback for ThrottleResetCallback { - fn on_reconnect(&self, _rule_name: &str) { + fn on_reconnect(&self, _rule_name: &str, _vl_source: &str) { + // Each (rule, source) task owns its own Throttler instance, so a + // reset here is already scoped to the source that just recovered. + // The vl_source parameter is accepted for trait conformance and + // future use (e.g. selective reset across shared throttle stores). 
         self.throttler.reset();
     }
 }
@@ -462,43 +473,48 @@ async fn run_rule(ctx: RuleSpawnContext, cancel: CancellationToken) -> Result<()
     let timestamp_timezone = Arc::new(ctx.timestamp_timezone.clone());
     let stream_result = tail_client
-        .stream_with_reconnect(&rule_name, Some(&reconnect_callback), |line| {
-            // Process each log line
-            let queue = queue.clone();
-            let parser = Arc::clone(&parser);
-            let throttler = Arc::clone(&throttler);
-            let template_engine = Arc::clone(&template_engine);
-            let template_name = Arc::clone(&template_name);
-            let rule_name = rule_name.clone();
-            let vl_source = Arc::clone(&vl_source);
-            let destinations = Arc::clone(&destinations);
-            let timestamp_timezone = Arc::clone(&timestamp_timezone);
-
-            async move {
-                if let Err(e) = process_log_line(
-                    &line,
-                    &parser,
-                    &throttler,
-                    &template_engine,
-                    &template_name,
-                    &rule_name,
-                    &vl_source,
-                    &destinations,
-                    &queue,
-                    &timestamp_timezone,
-                )
-                .await
-                {
-                    debug!(
-                        rule_name = %rule_name,
-                        vl_source = %vl_source,
-                        error = %e,
-                        "Failed to process log line, continuing"
-                    );
+        .stream_with_reconnect(
+            &rule_name,
+            vl_source.as_str(),
+            Some(&reconnect_callback),
+            |line| {
+                // Process each log line
+                let queue = queue.clone();
+                let parser = Arc::clone(&parser);
+                let throttler = Arc::clone(&throttler);
+                let template_engine = Arc::clone(&template_engine);
+                let template_name = Arc::clone(&template_name);
+                let rule_name = rule_name.clone();
+                let vl_source = Arc::clone(&vl_source);
+                let destinations = Arc::clone(&destinations);
+                let timestamp_timezone = Arc::clone(&timestamp_timezone);
+
+                async move {
+                    if let Err(e) = process_log_line(
+                        &line,
+                        &parser,
+                        &throttler,
+                        &template_engine,
+                        &template_name,
+                        &rule_name,
+                        &vl_source,
+                        &destinations,
+                        &queue,
+                        &timestamp_timezone,
+                    )
+                    .await
+                    {
+                        debug!(
+                            rule_name = %rule_name,
+                            vl_source = %vl_source,
+                            error = %e,
+                            "Failed to process log line, continuing"
+                        );
+                    }
+                    Ok(())
                 }
-                Ok(())
-            }
-        })
+            },
+        )
         .await;
     if cancel.is_cancelled() {
@@ -543,13 +559,13 @@ async fn process_log_line( f } Err(e) => { - record_parse_error(rule_name, &e); + record_parse_error(rule_name, vl_source, &e); return Err(ProcessError::Parse); } }; // Step 1.5: Record successful match (before throttle check) - record_log_matched(rule_name); + record_log_matched(rule_name, vl_source); // Step 2: Check throttle (renders key with both rule_name and vl_source) match throttler.check(&fields) { @@ -681,6 +697,7 @@ mod tests { window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: crate::config::DEFAULT_MAX_STREAMS, }, templates: { let mut t = HashMap::new(); @@ -1085,7 +1102,7 @@ mod tests { assert_eq!(throttler.check(&fields), ThrottleResult::Throttled); // Reset via callback - callback.on_reconnect("test_rule"); + callback.on_reconnect("test_rule", "vlprod"); // Should pass again after reset assert_eq!(throttler.check(&fields), ThrottleResult::Pass); diff --git a/src/main.rs b/src/main.rs index db5d0ee..2ca8225 100644 --- a/src/main.rs +++ b/src/main.rs @@ -339,12 +339,38 @@ async fn run(runtime_config: valerter::config::RuntimeConfig) -> Result<()> { // Create cancellation token for graceful shutdown let cancel = CancellationToken::new(); - // Collect rule and notifier names for metric initialization - let rule_names: Vec<&str> = runtime_config + // Collect rule-source pairs and source names for metric initialization. + // + // Multi-source observability (v2.0.0 part 2): every per-rule metric also + // carries `vl_source`, so initialization seeds one series per + // `(enabled rule, resolved source)` pair. The `vl_source_up` gauge is + // per-source only, so we also pass the flat list of declared sources. 
+    let source_names: Vec<&str> = runtime_config
+        .victorialogs
+        .keys()
+        .map(String::as_str)
+        .collect();
+    let rule_source_pairs: Vec<(&str, &str)> = runtime_config
         .rules
         .iter()
         .filter(|r| r.enabled)
-        .map(|r| r.name.as_str())
+        .flat_map(|r| {
+            // Same fan-out semantics the engine uses: empty `vl_sources` means
+            // "every configured source", non-empty restricts to the named
+            // subset (intersected with declared sources for safety).
+            if r.vl_sources.is_empty() {
+                source_names
+                    .iter()
+                    .map(move |s| (r.name.as_str(), *s))
+                    .collect::<Vec<_>>()
+            } else {
+                r.vl_sources
+                    .iter()
+                    .filter(|s| runtime_config.victorialogs.contains_key(s.as_str()))
+                    .map(move |s| (r.name.as_str(), s.as_str()))
+                    .collect::<Vec<_>>()
+            }
+        })
         .collect();
     let notifier_names: Vec<&str> = registry.names().collect();
@@ -373,7 +399,7 @@ async fn run(runtime_config: valerter::config::RuntimeConfig) -> Result<()> {
     }
     // Initialize all known metrics to zero now that recorder is ready
-    valerter::initialize_metrics(&rule_names, &notifier_names);
+    valerter::initialize_metrics(&rule_source_pairs, &source_names, &notifier_names);
     Some(handle)
 } else {
diff --git a/src/metrics.rs b/src/metrics.rs
index d8dd6d3..fde6233 100644
--- a/src/metrics.rs
+++ b/src/metrics.rs
@@ -80,8 +80,9 @@ pub fn register_metric_descriptions() {
         "Unix timestamp of last successful VictoriaLogs query chunk received"
     );
     describe_gauge!(
-        "valerter_victorialogs_up",
-        "VictoriaLogs connection status (1=connected, 0=disconnected or error)"
+        "valerter_vl_source_up",
+        "Per-source VictoriaLogs reachability (1=connected, 0=disconnected). \
+         Replaces the v1.x per-rule `valerter_victorialogs_up`."
); describe_gauge!( "valerter_uptime_seconds", @@ -198,9 +199,17 @@ pub fn is_recorder_installed() -> bool { /// /// # Arguments /// -/// * `rule_names` - List of rule names to initialize per-rule counters -/// * `notifier_names` - List of notifier names to initialize per-notifier counters -pub fn initialize_metrics(rule_names: &[&str], notifier_names: &[&str]) { +/// * `rule_source_pairs` - List of `(rule_name, vl_source)` pairs to initialize +/// per-(rule, source) counters. Same shape the engine spawns tasks for. +/// * `source_names` - List of every configured `vl_source` (independent of +/// which rules tail it). Used to seed the per-source `valerter_vl_source_up` +/// gauge to 0 at startup. +/// * `notifier_names` - List of notifier names to initialize per-notifier counters. +pub fn initialize_metrics( + rule_source_pairs: &[(&str, &str)], + source_names: &[&str], + notifier_names: &[&str], +) { use metrics::{counter, gauge}; // Initialize gauges with their initial values @@ -208,30 +217,92 @@ pub fn initialize_metrics(rule_names: &[&str], notifier_names: &[&str]) { gauge!("valerter_uptime_seconds").set(0.0); gauge!("valerter_queue_size").set(0.0); - // Initialize per-rule gauges - for rule_name in rule_names { - gauge!("valerter_victorialogs_up", "rule_name" => rule_name.to_string()).set(0.0); + // Initialize per-source `vl_source_up` gauge to 0 for every configured + // source. The engine flips it to 1 on the first successful tail connect. 
+ for source_name in source_names { + gauge!("valerter_vl_source_up", "vl_source" => source_name.to_string()).set(0.0); } // Initialize counters without labels (global counters) - counter!("valerter_reconnections_total").absolute(0); - - // Initialize global counters (no labels) counter!("valerter_alerts_dropped_total").absolute(0); - // Initialize per-rule counters - for rule_name in rule_names { - counter!("valerter_logs_matched_total", "rule_name" => rule_name.to_string()).absolute(0); - counter!("valerter_alerts_sent_total", "rule_name" => rule_name.to_string()).absolute(0); - counter!("valerter_alerts_throttled_total", "rule_name" => rule_name.to_string()) - .absolute(0); - counter!("valerter_alerts_passed_total", "rule_name" => rule_name.to_string()).absolute(0); - counter!("valerter_parse_errors_total", "rule_name" => rule_name.to_string()).absolute(0); - counter!("valerter_rule_panics_total", "rule_name" => rule_name.to_string()).absolute(0); - counter!("valerter_rule_errors_total", "rule_name" => rule_name.to_string()).absolute(0); + // Initialize per-(rule, source) counters. Every per-rule metric also + // carries `vl_source` (v2.0.0 multi-source observability). 
+ for (rule_name, vl_source) in rule_source_pairs { + counter!( + "valerter_logs_matched_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_alerts_sent_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_alerts_throttled_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_alerts_passed_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_parse_errors_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_rule_panics_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_rule_errors_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_reconnections_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .absolute(0); + counter!( + "valerter_lines_discarded_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + "reason" => "oversized", + ) + .absolute(0); + // Histograms can't be `absolute(0)` but referencing the handle here + // registers the series so it appears in /metrics from startup. + let _ = metrics::histogram!( + "valerter_query_duration_seconds", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ); + gauge!( + "valerter_last_query_timestamp", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .set(0.0); } - // Initialize per-notifier counters + // Initialize per-notifier counters. 
The actual emit sites carry + // (rule_name, vl_source) labels; this loop pre-registers a per-notifier + // sentinel series so dashboards listing notifiers see them at startup. + // The two label sets coexist legitimately (different aggregations). for notifier_name in notifier_names { counter!("valerter_alerts_failed_total", "notifier" => notifier_name.to_string()) .absolute(0); @@ -240,7 +311,8 @@ pub fn initialize_metrics(rule_names: &[&str], notifier_names: &[&str]) { } tracing::info!( - rule_count = rule_names.len(), + rule_source_pair_count = rule_source_pairs.len(), + source_count = source_names.len(), notifier_count = notifier_names.len(), "Metrics initialized to zero" ); diff --git a/src/notify/email.rs b/src/notify/email.rs index 4a6ab06..ac6ffae 100644 --- a/src/notify/email.rs +++ b/src/notify/email.rs @@ -583,7 +583,8 @@ impl Notifier for EmailNotifier { metrics::counter!( "valerter_email_recipient_errors_total", "rule_name" => alert.rule_name.clone(), - "notifier_name" => self.name.clone() + "vl_source" => alert.vl_source.clone(), + "notifier_name" => self.name.clone(), ) .increment(1); } @@ -604,8 +605,9 @@ impl Notifier for EmailNotifier { metrics::counter!( "valerter_alerts_sent_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "email" + "notifier_type" => "email", ) .increment(1); Ok(()) @@ -620,16 +622,18 @@ impl Notifier for EmailNotifier { metrics::counter!( "valerter_notify_errors_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "email" + "notifier_type" => "email", ) .increment(1); // Permanent failure - all recipients failed metrics::counter!( "valerter_alerts_failed_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "email" + "notifier_type" => 
"email", ) .increment(1); Err(NotifyError::SendFailed(format!( diff --git a/src/notify/mattermost.rs b/src/notify/mattermost.rs index cf2b384..86703d8 100644 --- a/src/notify/mattermost.rs +++ b/src/notify/mattermost.rs @@ -202,8 +202,9 @@ impl Notifier for MattermostNotifier { metrics::counter!( "valerter_alerts_sent_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "mattermost" + "notifier_type" => "mattermost", ) .increment(1); return Ok(()); @@ -218,16 +219,18 @@ impl Notifier for MattermostNotifier { metrics::counter!( "valerter_notify_errors_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "mattermost" + "notifier_type" => "mattermost", ) .increment(1); // Permanent failure - count as failed alert metrics::counter!( "valerter_alerts_failed_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "mattermost" + "notifier_type" => "mattermost", ) .increment(1); return Err(NotifyError::SendFailed(format!("client error: {}", status))); @@ -267,16 +270,18 @@ impl Notifier for MattermostNotifier { metrics::counter!( "valerter_notify_errors_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "mattermost" + "notifier_type" => "mattermost", ) .increment(1); // Permanent failure after retries exhausted metrics::counter!( "valerter_alerts_failed_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "mattermost" + "notifier_type" => "mattermost", ) .increment(1); Err(NotifyError::MaxRetriesExceeded) diff --git a/src/notify/queue.rs b/src/notify/queue.rs index dda6892..92095c6 100644 --- 
a/src/notify/queue.rs +++ b/src/notify/queue.rs @@ -190,7 +190,8 @@ impl NotificationWorker { "valerter_notify_errors_total", "notifier_name" => dest_name.to_string(), "notifier_type" => "unknown", - "rule_name" => payload.rule_name.clone() + "rule_name" => payload.rule_name.clone(), + "vl_source" => payload.vl_source.clone(), ) .increment(1); None diff --git a/src/notify/telegram.rs b/src/notify/telegram.rs index de4c3b1..2f21849 100644 --- a/src/notify/telegram.rs +++ b/src/notify/telegram.rs @@ -436,8 +436,9 @@ impl Notifier for TelegramNotifier { metrics::counter!( "valerter_alerts_sent_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "telegram" + "notifier_type" => "telegram", ) .increment(1); } @@ -450,15 +451,17 @@ impl Notifier for TelegramNotifier { metrics::counter!( "valerter_notify_errors_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "telegram" + "notifier_type" => "telegram", ) .increment(1); metrics::counter!( "valerter_alerts_failed_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "telegram" + "notifier_type" => "telegram", ) .increment(1); } diff --git a/src/notify/webhook.rs b/src/notify/webhook.rs index af45f97..ca16e6d 100644 --- a/src/notify/webhook.rs +++ b/src/notify/webhook.rs @@ -281,8 +281,9 @@ impl Notifier for WebhookNotifier { metrics::counter!( "valerter_alerts_sent_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "webhook" + "notifier_type" => "webhook", ) .increment(1); return Ok(()); @@ -297,16 +298,18 @@ impl Notifier for WebhookNotifier { metrics::counter!( "valerter_notify_errors_total", "rule_name" => alert.rule_name.clone(), + 
"vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "webhook" + "notifier_type" => "webhook", ) .increment(1); // Permanent failure - count as failed alert metrics::counter!( "valerter_alerts_failed_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "webhook" + "notifier_type" => "webhook", ) .increment(1); return Err(NotifyError::SendFailed(format!("client error: {}", status))); @@ -345,16 +348,18 @@ impl Notifier for WebhookNotifier { metrics::counter!( "valerter_notify_errors_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "webhook" + "notifier_type" => "webhook", ) .increment(1); // Permanent failure after retries exhausted metrics::counter!( "valerter_alerts_failed_total", "rule_name" => alert.rule_name.clone(), + "vl_source" => alert.vl_source.clone(), "notifier_name" => self.name.clone(), - "notifier_type" => "webhook" + "notifier_type" => "webhook", ) .increment(1); Err(NotifyError::MaxRetriesExceeded) diff --git a/src/parser.rs b/src/parser.rs index 45cfcec..ab11df2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -26,30 +26,33 @@ use serde_json::{Map, Value}; /// Record a successful log match with metrics. /// /// This function should be called by the pipeline code when parsing succeeds. -/// It increments the `valerter_logs_matched_total` counter with proper labels. -/// This metric counts successful matches BEFORE throttling is applied. +/// It increments the `valerter_logs_matched_total` counter with `rule_name` +/// and `vl_source` labels. This metric counts successful matches BEFORE +/// throttling is applied. 
/// /// # Arguments /// /// * `rule_name` - The name of the rule that matched the log +/// * `vl_source` - The name of the VictoriaLogs source the event came from /// /// # Example /// /// ```ignore /// match parser.parse(&line) { /// Ok(fields) => { -/// record_log_matched(&rule.name); +/// record_log_matched(&rule.name, &vl_source); /// // Continue processing... /// } /// Err(e) => { -/// record_parse_error(&rule.name, &e); +/// record_parse_error(&rule.name, &vl_source, &e); /// } /// } /// ``` -pub fn record_log_matched(rule_name: &str) { +pub fn record_log_matched(rule_name: &str, vl_source: &str) { metrics::counter!( "valerter_logs_matched_total", - "rule_name" => rule_name.to_string() + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), ) .increment(1); } @@ -58,11 +61,13 @@ pub fn record_log_matched(rule_name: &str) { /// /// This function should be called by the pipeline code when parsing fails. /// It logs the error at the appropriate level and increments the -/// `valerter_parse_errors_total` counter with proper labels. +/// `valerter_parse_errors_total` counter with `rule_name`, `vl_source`, and +/// `error_type` labels. 
/// /// # Arguments /// /// * `rule_name` - The name of the rule that encountered the error +/// * `vl_source` - The name of the VictoriaLogs source the event came from /// * `error` - The parse error that occurred /// /// # Example @@ -71,12 +76,12 @@ pub fn record_log_matched(rule_name: &str) { /// match parser.parse(&line) { /// Ok(fields) => { /* process */ } /// Err(e) => { -/// record_parse_error(&rule.name, &e); +/// record_parse_error(&rule.name, &vl_source, &e); /// // Skip this log silently (FR20) /// } /// } /// ``` -pub fn record_parse_error(rule_name: &str, error: &ParseError) { +pub fn record_parse_error(rule_name: &str, vl_source: &str, error: &ParseError) { let error_type = match error { ParseError::NoMatch => "regex_no_match", ParseError::InvalidJson(_) => "invalid_json", @@ -88,6 +93,7 @@ pub fn record_parse_error(rule_name: &str, error: &ParseError) { // NoMatch is normal behavior - log at DEBUG level tracing::debug!( rule_name = %rule_name, + vl_source = %vl_source, "Regex did not match, skipping log" ); } @@ -95,6 +101,7 @@ pub fn record_parse_error(rule_name: &str, error: &ParseError) { // Invalid JSON is unexpected - log at WARN level tracing::warn!( rule_name = %rule_name, + vl_source = %vl_source, error = %msg, "Invalid JSON in log" ); @@ -105,7 +112,8 @@ pub fn record_parse_error(rule_name: &str, error: &ParseError) { metrics::counter!( "valerter_parse_errors_total", "rule_name" => rule_name.to_string(), - "error_type" => error_type + "vl_source" => vl_source.to_string(), + "error_type" => error_type, ) .increment(1); } @@ -754,7 +762,7 @@ mod tests { fn record_parse_error_no_match_does_not_panic() { let error = ParseError::NoMatch; // Should not panic - super::record_parse_error("test_rule", &error); + super::record_parse_error("test_rule", "vlprod", &error); } // Test: record_parse_error does not panic for InvalidJson @@ -762,22 +770,22 @@ mod tests { fn record_parse_error_invalid_json_does_not_panic() { let error = 
ParseError::InvalidJson("unexpected token".to_string()); // Should not panic - super::record_parse_error("test_rule", &error); + super::record_parse_error("test_rule", "vlprod", &error); } // Test: record_log_matched does not panic #[test] fn record_log_matched_does_not_panic() { // Should not panic - super::record_log_matched("test_rule"); + super::record_log_matched("test_rule", "vlprod"); } // Test: record_log_matched can be called multiple times #[test] fn record_log_matched_multiple_calls() { // Should not panic even with multiple calls - super::record_log_matched("rule_a"); - super::record_log_matched("rule_b"); - super::record_log_matched("rule_a"); + super::record_log_matched("rule_a", "vlprod"); + super::record_log_matched("rule_b", "vldev"); + super::record_log_matched("rule_a", "vlprod"); } } diff --git a/src/tail.rs b/src/tail.rs index 09e918a..e2872d1 100644 --- a/src/tail.rs +++ b/src/tail.rs @@ -188,6 +188,7 @@ impl TailClient { pub async fn connect_and_receive( &mut self, rule_name: &str, + vl_source: &str, ) -> Result, StreamError> { let url = self.build_url(); @@ -219,6 +220,7 @@ impl TailClient { if let Err(StreamError::LineTooLarge(size, max)) = self.buffer.push(&chunk) { warn!( rule_name = %rule_name, + vl_source = %vl_source, size_bytes = size, max_bytes = max, "Discarding oversized log line, buffer cleared" @@ -226,7 +228,8 @@ impl TailClient { metrics::counter!( "valerter_lines_discarded_total", "rule_name" => rule_name.to_string(), - "reason" => "oversized" + "vl_source" => vl_source.to_string(), + "reason" => "oversized", ) .increment(1); continue; @@ -235,7 +238,12 @@ impl TailClient { for line in lines { if !line.is_empty() { - trace!(rule_name = %rule_name, line_len = line.len(), "Received log line"); + trace!( + rule_name = %rule_name, + vl_source = %vl_source, + line_len = line.len(), + "Received log line" + ); all_lines.push(line); } } @@ -286,6 +294,7 @@ impl TailClient { /// // Stream with reconnection - runs until cancelled /// 
client.stream_with_reconnect( /// "my_rule", + /// "vlprod", /// None, /// |line| async move { /// println!("Received: {}", line); @@ -298,6 +307,7 @@ impl TailClient { pub async fn stream_with_reconnect( &mut self, rule_name: &str, + vl_source: &str, on_reconnect: Option<&dyn ReconnectCallback>, mut line_handler: F, ) -> Result<(), StreamError> @@ -310,7 +320,12 @@ impl TailClient { loop { let url = self.build_url(); - debug!(rule_name = %rule_name, url = %url, "Connecting to VictoriaLogs tail endpoint"); + debug!( + rule_name = %rule_name, + vl_source = %vl_source, + url = %url, + "Connecting to VictoriaLogs tail endpoint" + ); // Start timing for query_duration metric let request_start = Instant::now(); @@ -318,19 +333,26 @@ impl TailClient { let response = match connect_result { Ok(resp) if resp.status().is_success() => { - info!(rule_name = %rule_name, status = %resp.status(), "Connected to VictoriaLogs"); - // Connection successful - mark VictoriaLogs as up + info!( + rule_name = %rule_name, + vl_source = %vl_source, + status = %resp.status(), + "Connected to VictoriaLogs" + ); + // Connection successful - mark this source as up + // (per-source gauge replaces the v1.x per-rule + // `valerter_victorialogs_up{rule_name}`). metrics::gauge!( - "valerter_victorialogs_up", - "rule_name" => rule_name.to_string() + "valerter_vl_source_up", + "vl_source" => vl_source.to_string(), ) .set(1.0); if had_failure { // We recovered from a failure - log_reconnection_success(rule_name); + log_reconnection_success(rule_name, vl_source); if let Some(callback) = on_reconnect { - callback.on_reconnect(rule_name); + callback.on_reconnect(rule_name, vl_source); } } attempt = 0; @@ -338,39 +360,45 @@ impl TailClient { resp } Ok(resp) => { - // HTTP error (4xx, 5xx) - mark VictoriaLogs as down + // HTTP error (4xx, 5xx) - mark this source as down. 
metrics::gauge!( - "valerter_victorialogs_up", - "rule_name" => rule_name.to_string() + "valerter_vl_source_up", + "vl_source" => vl_source.to_string(), ) .set(0.0); had_failure = true; - let delay = backoff_delay_default(attempt); - log_reconnection_attempt(rule_name, attempt, delay); + let delay = backoff_delay_with_jitter(attempt); + log_reconnection_attempt(rule_name, vl_source, attempt, delay); tokio::time::sleep(delay).await; attempt = attempt.saturating_add(1); warn!( rule_name = %rule_name, + vl_source = %vl_source, status = %resp.status(), "HTTP error from VictoriaLogs" ); continue; } Err(e) => { - // Connection error - mark VictoriaLogs as down + // Connection error - mark this source as down. metrics::gauge!( - "valerter_victorialogs_up", - "rule_name" => rule_name.to_string() + "valerter_vl_source_up", + "vl_source" => vl_source.to_string(), ) .set(0.0); had_failure = true; - let delay = backoff_delay_default(attempt); - log_reconnection_attempt(rule_name, attempt, delay); + let delay = backoff_delay_with_jitter(attempt); + log_reconnection_attempt(rule_name, vl_source, attempt, delay); tokio::time::sleep(delay).await; attempt = attempt.saturating_add(1); - warn!(rule_name = %rule_name, error = %e, "Connection failed"); + warn!( + rule_name = %rule_name, + vl_source = %vl_source, + error = %e, + "Connection failed" + ); continue; } }; @@ -388,7 +416,8 @@ impl TailClient { let duration = request_start.elapsed(); metrics::histogram!( "valerter_query_duration_seconds", - "rule_name" => rule_name.to_string() + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), ) .record(duration.as_secs_f64()); } @@ -400,7 +429,8 @@ impl TailClient { .as_secs_f64(); metrics::gauge!( "valerter_last_query_timestamp", - "rule_name" => rule_name.to_string() + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), ) .set(now); @@ -408,6 +438,7 @@ impl TailClient { { warn!( rule_name = %rule_name, + vl_source = %vl_source, 
size_bytes = size, max_bytes = max, "Discarding oversized log line, buffer cleared" @@ -415,7 +446,8 @@ impl TailClient { metrics::counter!( "valerter_lines_discarded_total", "rule_name" => rule_name.to_string(), - "reason" => "oversized" + "vl_source" => vl_source.to_string(), + "reason" => "oversized", ) .increment(1); continue; @@ -424,25 +456,35 @@ impl TailClient { for line in lines { if !line.is_empty() { - trace!(rule_name = %rule_name, line_len = line.len(), "Received log line"); + trace!( + rule_name = %rule_name, + vl_source = %vl_source, + line_len = line.len(), + "Received log line" + ); line_handler(line).await?; } } } Err(e) => { - // Stream error - mark VictoriaLogs as down and reconnect + // Stream error - mark this source as down and reconnect. metrics::gauge!( - "valerter_victorialogs_up", - "rule_name" => rule_name.to_string() + "valerter_vl_source_up", + "vl_source" => vl_source.to_string(), ) .set(0.0); had_failure = true; - let delay = backoff_delay_default(attempt); - log_reconnection_attempt(rule_name, attempt, delay); + let delay = backoff_delay_with_jitter(attempt); + log_reconnection_attempt(rule_name, vl_source, attempt, delay); tokio::time::sleep(delay).await; attempt = attempt.saturating_add(1); - warn!(rule_name = %rule_name, error = %e, "Stream read error"); + warn!( + rule_name = %rule_name, + vl_source = %vl_source, + error = %e, + "Stream read error" + ); break; // Break inner loop to reconnect } } @@ -451,7 +493,11 @@ impl TailClient { // Stream ended (server closed connection) - reconnect if !had_failure { // Normal stream end, not a failure - still need to reconnect - debug!(rule_name = %rule_name, "Stream ended, reconnecting"); + debug!( + rule_name = %rule_name, + vl_source = %vl_source, + "Stream ended, reconnecting" + ); } had_failure = true; } @@ -480,17 +526,59 @@ pub fn backoff_delay(attempt: u32, base: Duration, max: Duration) -> Duration { /// Calculate exponential backoff delay using default VictoriaLogs parameters. 
/// /// Uses BACKOFF_BASE (1s) and BACKOFF_MAX (60s) as per AD-07. -pub fn backoff_delay_default(attempt: u32) -> Duration { +/// +/// Exposed `pub(crate)` only: production callers must go through +/// [`backoff_delay_with_jitter`] so the jitter clamp is always applied. Direct +/// use bypasses that safety net. +pub(crate) fn backoff_delay_default(attempt: u32) -> Duration { backoff_delay(attempt, BACKOFF_BASE, BACKOFF_MAX) } +/// Minimum reconnect delay floor in milliseconds (post-jitter clamp). +/// +/// The exponential backoff base is 1s = 1000ms, so a -10% jitter on attempt +/// 0 produces 900ms which is well above this floor; the clamp is a defensive +/// safety net for any future change that lowers `BACKOFF_BASE`. +pub const MIN_RECONNECT_DELAY_MS: u64 = 100; + +/// Compute the backoff delay with `±10%` uniform jitter applied per call. +/// +/// Multi-source observability (v2.0.0 part 2): when N sources behind a flapping +/// load balancer all reconnect at the same exponential cadence they form a +/// thundering herd. Per-task uniform jitter spreads attempts in a `[0.9·D, +/// 1.1·D]` window so the herd dissolves over a few cycles without changing +/// the overall reconnect rate. +/// +/// The jitter is uniform over `[-0.10, +0.10]` (inclusive) and the resulting +/// delay is clamped to [`MIN_RECONNECT_DELAY_MS`] so a negative jitter never +/// produces a sub-100ms hot loop. +pub fn backoff_delay_with_jitter(attempt: u32) -> Duration { + use rand::Rng; + + let base = backoff_delay_default(attempt); + let jitter: f64 = rand::thread_rng().gen_range(-0.10..=0.10); + apply_jitter_floor(base.as_millis() as u64, jitter) +} + +/// Apply a `(1 + jitter)` multiplier to a millisecond base and clamp the +/// result to [`MIN_RECONNECT_DELAY_MS`]. Pure helper so the clamp branch is +/// directly exercisable from unit tests with synthetic small bases. 
+fn apply_jitter_floor(base_ms: u64, jitter: f64) -> Duration { + let effective_ms = ((base_ms as f64) * (1.0 + jitter)).max(MIN_RECONNECT_DELAY_MS as f64); + Duration::from_millis(effective_ms as u64) +} + /// Trait for reconnection callbacks. /// /// Implementors receive notification when a connection is restored after failure. /// This is used to reset throttle caches as per FR7. pub trait ReconnectCallback: Send + Sync { /// Called when connection is restored after failure. - fn on_reconnect(&self, rule_name: &str); + /// + /// Receives both the rule name AND the source name so implementors can + /// scope their reaction (e.g. throttle reset) per `(rule, source)` task + /// rather than fan-out across all sources of the same rule. + fn on_reconnect(&self, rule_name: &str, vl_source: &str); } /// Log the reconnection attempt with proper tracing. @@ -498,19 +586,26 @@ pub trait ReconnectCallback: Send + Sync { /// # Arguments /// /// * `rule_name` - Name of the rule for the tracing span +/// * `vl_source` - Name of the VictoriaLogs source for the tracing span /// * `attempt` - Current retry attempt number /// * `delay` - Delay before next retry -pub fn log_reconnection_attempt(rule_name: &str, attempt: u32, delay: Duration) { +pub fn log_reconnection_attempt(rule_name: &str, vl_source: &str, attempt: u32, delay: Duration) { warn!( rule_name = %rule_name, + vl_source = %vl_source, attempt = attempt, delay_secs = delay.as_secs(), "Connection failed, retrying" ); - // Increment reconnection metric - metrics::counter!("valerter_reconnections_total", "rule_name" => rule_name.to_string()) - .increment(1); + // Increment reconnection metric (now per-(rule, source) for multi-source + // observability — v2.0.0 part 2). + metrics::counter!( + "valerter_reconnections_total", + "rule_name" => rule_name.to_string(), + "vl_source" => vl_source.to_string(), + ) + .increment(1); } /// Log successful reconnection after failure. 
@@ -518,9 +613,11 @@ pub fn log_reconnection_attempt(rule_name: &str, attempt: u32, delay: Duration) /// # Arguments /// /// * `rule_name` - Name of the rule for the tracing span -pub fn log_reconnection_success(rule_name: &str) { +/// * `vl_source` - Name of the VictoriaLogs source for the tracing span +pub fn log_reconnection_success(rule_name: &str, vl_source: &str) { info!( rule_name = %rule_name, + vl_source = %vl_source, "Connection restored, throttle cache reset signal sent" ); } @@ -759,6 +856,81 @@ mod tests { assert_eq!(BACKOFF_MAX, Duration::from_secs(60)); } + // ========================================================================== + // Multi-source observability v2.0.0: jitter on reconnect backoff. + // The intent is to break thundering-herd alignment on flapping load + // balancers; we cannot prove statistical independence in a unit test, but + // we can prove the bounds and the floor clamp. + // ========================================================================== + + #[test] + fn jitter_stays_within_plus_minus_ten_percent_of_base_for_attempt_3() { + // Attempt 3 → base 8s = 8000ms. Jittered value must lie in [7200, 8800]. + let base = backoff_delay_default(3); + assert_eq!(base, Duration::from_secs(8)); + let lo = (base.as_millis() as f64 * 0.90).floor() as u128; + let hi = (base.as_millis() as f64 * 1.10).ceil() as u128; + for _ in 0..200 { + let d = backoff_delay_with_jitter(3); + let ms = d.as_millis(); + assert!( + ms >= lo && ms <= hi, + "jittered delay {}ms outside [{}, {}]", + ms, + lo, + hi + ); + } + } + + #[test] + fn jitter_caps_below_min_reconnect_delay_floor() { + // Directly exercise the floor branch via the pure helper. With + // base_ms=50 and jitter=-0.5, the natural product (25ms) is far + // below MIN_RECONNECT_DELAY_MS (100ms) and must be clamped up. 
+ let clamped = apply_jitter_floor(50, -0.5); + assert_eq!( + clamped.as_millis() as u64, + MIN_RECONNECT_DELAY_MS, + "small base + heavy negative jitter must be clamped to the floor" + ); + + // Edge: jitter that would land exactly at the floor still pegs to it. + let exact = apply_jitter_floor(100, 0.0); + assert_eq!(exact.as_millis() as u64, MIN_RECONNECT_DELAY_MS); + + // The default-base path remains unaffected (probabilistic check). + for _ in 0..50 { + let d = backoff_delay_with_jitter(0); + assert!( + d.as_millis() >= MIN_RECONNECT_DELAY_MS as u128, + "default-base jitter must not drop below floor: {}ms", + d.as_millis() + ); + } + } + + #[test] + fn jitter_at_capped_attempt_stays_within_window_around_max() { + // Attempt 100 → backoff is capped to BACKOFF_MAX = 60s. + // Jitter window is computed from the cap, not from 2^100. + let base = backoff_delay_default(100); + assert_eq!(base, BACKOFF_MAX); + let lo = (base.as_millis() as f64 * 0.90).floor() as u128; + let hi = (base.as_millis() as f64 * 1.10).ceil() as u128; + for _ in 0..50 { + let d = backoff_delay_with_jitter(100); + let ms = d.as_millis(); + assert!( + ms >= lo && ms <= hi, + "jittered capped delay {}ms outside [{}, {}]", + ms, + lo, + hi + ); + } + } + #[test] fn test_client_creation_success() { let config = test_config("http://localhost:9428", "test", None); diff --git a/src/throttle.rs b/src/throttle.rs index 9be6af6..21bbd05 100644 --- a/src/throttle.rs +++ b/src/throttle.rs @@ -166,14 +166,18 @@ impl Throttler { "Throttle count updated" ); - // L1: Pre-convert rule_name to String once for metrics (required for 'static) + // L1: Pre-convert rule_name and vl_source to String once for metrics + // (required for 'static label storage). 
let rule_name_str = self.rule_name.to_string(); + let vl_source_str = self.vl_source.to_string(); if count <= self.max_count { - // M3: Increment metric for passed alerts + // M3: Increment metric for passed alerts (gains `vl_source` for + // multi-source observability — v2.0.0 part 2). metrics::counter!( "valerter_alerts_passed_total", - "rule_name" => rule_name_str + "rule_name" => rule_name_str, + "vl_source" => vl_source_str, ) .increment(1); @@ -182,16 +186,19 @@ impl Throttler { // Log at DEBUG level (throttling is normal behavior) tracing::debug!( rule_name = %self.rule_name, + vl_source = %self.vl_source, throttle_key = %key, count = count, max_count = self.max_count, "Alert throttled" ); - // Increment metric (FR23, FR24) + // Increment metric (FR23, FR24); gains `vl_source` to disambiguate + // throttle hot-spots per source. metrics::counter!( "valerter_alerts_throttled_total", - "rule_name" => rule_name_str + "rule_name" => rule_name_str, + "vl_source" => vl_source_str, ) .increment(1); diff --git a/tests/integration_streaming.rs b/tests/integration_streaming.rs index 5da5fb3..4ae8d27 100644 --- a/tests/integration_streaming.rs +++ b/tests/integration_streaming.rs @@ -53,7 +53,10 @@ async fn test_streaming_basic_single_line() { let config = create_config(&mock_server, "_stream:test"); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 1); assert!(lines[0].contains("test log")); @@ -80,7 +83,10 @@ async fn test_streaming_multiple_lines() { let config = create_config(&mock_server, "_stream:multi"); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 3); assert!(lines[0].contains("log 1")); @@ 
-101,7 +107,10 @@ async fn test_streaming_empty_response() { let config = create_config(&mock_server, "_stream:empty"); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert!(lines.is_empty()); } @@ -123,7 +132,7 @@ async fn test_connection_error_http_500() { let config = create_config(&mock_server, "_stream:error"); let mut client = TailClient::new(config).unwrap(); - let result = client.connect_and_receive("test_rule").await; + let result = client.connect_and_receive("test_rule", "default").await; assert!(result.is_err()); match result { @@ -147,7 +156,7 @@ async fn test_connection_error_http_404() { let config = create_config(&mock_server, "_stream:notfound"); let mut client = TailClient::new(config).unwrap(); - let result = client.connect_and_receive("test_rule").await; + let result = client.connect_and_receive("test_rule", "default").await; assert!(result.is_err()); match result { @@ -171,7 +180,7 @@ async fn test_connection_error_http_503() { let config = create_config(&mock_server, "_stream:unavailable"); let mut client = TailClient::new(config).unwrap(); - let result = client.connect_and_receive("test_rule").await; + let result = client.connect_and_receive("test_rule", "default").await; assert!(result.is_err()); match result { @@ -196,7 +205,7 @@ async fn test_connection_error_server_down() { let mut client = TailClient::new(config).unwrap(); - let result = client.connect_and_receive("test_rule").await; + let result = client.connect_and_receive("test_rule", "default").await; assert!(result.is_err()); match result { @@ -231,7 +240,7 @@ async fn test_timeout_detection() { let mut client = TailClient::new(config).unwrap(); // This won't actually timeout since delay is short, but tests the path - let result = client.connect_and_receive("test_rule").await; + let result = 
client.connect_and_receive("test_rule", "default").await; // Should succeed (data received before timeout) // The buffer will hold "incomplete" without newline @@ -287,7 +296,7 @@ async fn test_url_construction_is_correct() { }; let mut client = TailClient::new(config).unwrap(); - let _ = client.connect_and_receive("test_rule").await; + let _ = client.connect_and_receive("test_rule", "default").await; // If we get here without panic, the URL matched } @@ -315,7 +324,7 @@ async fn test_url_with_start_param() { }; let mut client = TailClient::new(config).unwrap(); - let _ = client.connect_and_receive("test_rule").await; + let _ = client.connect_and_receive("test_rule", "default").await; } // ============================================================================= @@ -338,7 +347,7 @@ async fn test_headers_are_set_correctly() { let config = create_config(&mock_server, "_stream:headers"); let mut client = TailClient::new(config).unwrap(); - let _ = client.connect_and_receive("test_rule").await; + let _ = client.connect_and_receive("test_rule", "default").await; } // ============================================================================= @@ -365,7 +374,10 @@ async fn test_streaming_with_utf8_content() { let config = create_config(&mock_server, "_stream:utf8"); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 2); assert!(lines[0].contains("Café")); @@ -395,7 +407,7 @@ impl TestReconnectCallback { } impl ReconnectCallback for TestReconnectCallback { - fn on_reconnect(&self, _rule_name: &str) { + fn on_reconnect(&self, _rule_name: &str, _vl_source: &str) { self.count.fetch_add(1, Ordering::SeqCst); } } @@ -423,7 +435,7 @@ async fn test_stream_with_reconnect_receives_lines() { // Use tokio::time::timeout to prevent infinite loop let result = 
tokio::time::timeout(Duration::from_millis(500), async { client - .stream_with_reconnect("test_rule", None, |line| { + .stream_with_reconnect("test_rule", "default", None, |line| { let lines = Arc::clone(&lines_clone); async move { lines.lock().unwrap().push(line); @@ -474,7 +486,7 @@ async fn test_stream_with_reconnect_retries_on_error() { // Use short timeout - should get at least one retry and one success let _ = tokio::time::timeout(Duration::from_secs(3), async { client - .stream_with_reconnect("test_rule", Some(&callback), |line| { + .stream_with_reconnect("test_rule", "default", Some(&callback), |line| { let lines = Arc::clone(&lines_clone); async move { lines.lock().unwrap().push(line); @@ -505,15 +517,15 @@ async fn test_stream_with_reconnect_retries_on_error() { #[test] fn test_log_reconnection_attempt_does_not_panic() { // Just verify the function can be called without panic - log_reconnection_attempt("test_rule", 0, Duration::from_secs(1)); - log_reconnection_attempt("test_rule", 5, Duration::from_secs(32)); - log_reconnection_attempt("test_rule", 10, Duration::from_secs(60)); + log_reconnection_attempt("test_rule", "default", 0, Duration::from_secs(1)); + log_reconnection_attempt("test_rule", "default", 5, Duration::from_secs(32)); + log_reconnection_attempt("test_rule", "default", 10, Duration::from_secs(60)); } #[test] fn test_log_reconnection_success_does_not_panic() { // Just verify the function can be called without panic - log_reconnection_success("test_rule"); + log_reconnection_success("test_rule", "default"); } #[test] @@ -521,10 +533,10 @@ fn test_reconnect_callback_trait() { let callback = TestReconnectCallback::new(); assert_eq!(callback.reconnect_count(), 0); - callback.on_reconnect("rule1"); + callback.on_reconnect("rule1", "vlprod"); assert_eq!(callback.reconnect_count(), 1); - callback.on_reconnect("rule2"); + callback.on_reconnect("rule2", "vldev"); assert_eq!(callback.reconnect_count(), 2); } @@ -598,7 +610,10 @@ async fn 
test_basic_auth_header_is_sent() { let config = create_config_with_basic_auth(&mock_server, "testuser", "testpass"); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 1); assert!(lines[0].contains("authenticated")); @@ -633,7 +648,10 @@ async fn test_custom_headers_are_sent() { let config = create_config_with_headers(&mock_server, headers); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 1); assert!(lines[0].contains("headers received")); @@ -663,7 +681,10 @@ async fn test_bearer_token_in_header() { let config = create_config_with_headers(&mock_server, headers); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 1); assert!(lines[0].contains("bearer auth ok")); @@ -706,7 +727,10 @@ async fn test_basic_auth_with_custom_headers_combined() { let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 1); assert!(lines[0].contains("combined auth ok")); @@ -734,7 +758,7 @@ async fn test_basic_auth_401_on_wrong_credentials() { let config = create_config_with_basic_auth(&mock_server, "wrong_user", "wrong_pass"); let mut client = TailClient::new(config).unwrap(); - let result = client.connect_and_receive("test_rule").await; + let result = client.connect_and_receive("test_rule", "default").await; assert!(result.is_err()); match result { @@ -763,7 +787,10 @@ 
async fn test_without_auth_no_authorization_header() { let config = create_config(&mock_server, "_stream:noauth"); let mut client = TailClient::new(config).unwrap(); - let lines = client.connect_and_receive("test_rule").await.unwrap(); + let lines = client + .connect_and_receive("test_rule", "default") + .await + .unwrap(); assert_eq!(lines.len(), 1); assert!(lines[0].contains("no auth")); diff --git a/tests/metrics_snapshot.rs b/tests/metrics_snapshot.rs new file mode 100644 index 0000000..b8b5003 --- /dev/null +++ b/tests/metrics_snapshot.rs @@ -0,0 +1,330 @@ +//! `/metrics` snapshot test for the multi-source observability work +//! (v2.0.0 part 2). +//! +//! Spins up a 2-source 1-rule engine plus the real Prometheus metrics +//! exporter on an ephemeral port, exercises every per-rule metric path +//! once (alert sent, alert throttled, parse error, log matched), then +//! scrapes `/metrics` and asserts the **set of metric names + label keys** +//! against an inline expected list. Values and timestamps are intentionally +//! ignored — the test catches accidental metric rename/relabel in future PRs +//! without coupling to runtime numbers. +//! +//! ## Why a separate integration test binary +//! +//! `metrics-exporter-prometheus` installs a global recorder via +//! `PrometheusBuilder::install()`, which can only run once per process. Each +//! integration test binary gets its own process, so this file owns the +//! recorder for its run and does not race with `src/metrics.rs` unit tests +//! or other integration suites. 
+ +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use std::sync::Arc; +use std::time::Duration; + +use serde_json::Value; +use tokio::sync::broadcast; +use tokio_util::sync::CancellationToken; +use valerter::config::{ + CompiledParser, CompiledRule, CompiledTemplate, DEFAULT_MAX_STREAMS, DefaultsConfig, + JsonParserConfig, MetricsConfig, NotifyConfig, RuntimeConfig, ThrottleConfig, VlSourceConfig, +}; +use valerter::notify::{AlertPayload, NotificationQueue}; +use valerter::{MetricsServer, RuleEngine}; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, ResponseTemplate}; + +/// Re-serialize a JSON value into NDJSON (one event + trailing newline). +fn ndjson_body(events: &[&Value]) -> Vec { + let mut out = Vec::new(); + for ev in events { + out.extend_from_slice( + serde_json::to_vec(ev) + .expect("fixture is valid JSON") + .as_slice(), + ); + out.push(b'\n'); + } + out +} + +async fn mount_ndjson(server: &MockServer, events: &[&Value]) { + Mock::given(method("GET")) + .and(path("/select/logsql/tail")) + .respond_with( + ResponseTemplate::new(200).set_body_raw(ndjson_body(events), "application/x-ndjson"), + ) + .mount(server) + .await; +} + +fn rule(name: &str, vl_sources: Vec, throttle_count: u32) -> CompiledRule { + CompiledRule { + name: name.to_string(), + enabled: true, + query: "_stream:test".to_string(), + parser: CompiledParser { + regex: None, + json: Some(JsonParserConfig { + fields: vec!["_msg".to_string()], + }), + }, + throttle: Some(valerter::config::CompiledThrottle { + key_template: None, + count: throttle_count, + window: Duration::from_secs(60), + }), + notify: NotifyConfig { + template: "tpl".to_string(), + mattermost_channel: None, + destinations: vec!["dest".to_string()], + }, + vl_sources, + } +} + +fn vl_source(uri: &str) -> VlSourceConfig { + VlSourceConfig { + url: uri.to_string(), + basic_auth: None, + headers: None, + tls: None, + } +} + +fn runtime(sources: BTreeMap, rules: Vec) -> RuntimeConfig { 
+ let mut templates = std::collections::HashMap::new(); + templates.insert( + "tpl".to_string(), + CompiledTemplate { + title: "{{ rule_name }}@{{ vl_source }}".to_string(), + body: "{{ _msg }}".to_string(), + email_body_html: None, + accent_color: None, + }, + ); + + RuntimeConfig { + victorialogs: sources, + defaults: DefaultsConfig { + throttle: ThrottleConfig { + key: None, + count: 5, + window: Duration::from_secs(60), + }, + timestamp_timezone: "UTC".to_string(), + max_streams: DEFAULT_MAX_STREAMS, + }, + templates, + rules, + metrics: MetricsConfig::default(), + notifiers: None, + config_dir: std::path::PathBuf::from("."), + } +} + +async fn drain(rx: &mut broadcast::Receiver, max: usize, deadline: Duration) { + let mut got = 0; + let _ = tokio::time::timeout(deadline, async { + while got < max { + match rx.recv().await { + Ok(_) => got += 1, + Err(_) => break, + } + } + }) + .await; +} + +/// Parse a Prometheus exposition body and return the set of `name{labelkeys}` +/// strings. Label *values* and metric *values* are stripped; only the metric +/// identifier and the **sorted set of label keys** are kept. This is exactly +/// what we want to catch accidental rename/relabel without coupling to +/// counter values, timestamps, or how many label-value combinations exist. +fn extract_name_label_keys(body: &str) -> BTreeSet { + let mut out = BTreeSet::new(); + for line in body.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + continue; + } + // Extract name and (optional) labels block. Format: `name{k="v",...} value` + // or `name value`. 
+ let (head, _) = match line.split_once(' ') { + Some(parts) => parts, + None => continue, + }; + let (name, label_keys) = if let Some(brace) = head.find('{') { + let name = &head[..brace]; + let labels_str = &head[brace + 1..head.len() - 1]; + let mut keys: Vec<&str> = labels_str + .split(',') + .filter_map(|kv| kv.split_once('=').map(|(k, _)| k)) + .collect(); + keys.sort(); + keys.dedup(); + (name.to_string(), keys.join(",")) + } else { + (head.to_string(), String::new()) + }; + if label_keys.is_empty() { + out.insert(name); + } else { + out.insert(format!("{}{{{}}}", name, label_keys)); + } + } + out +} + +#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +async fn metrics_snapshot_two_sources_one_rule() { + // 1) Mock 2 VL sources. Source `vlprod` serves a parseable event so the + // engine drives the throttle + alert path. Source `vldev` serves a + // line that fails JSON parsing, exercising the parse-error path. + let vlprod = MockServer::start().await; + let vldev = MockServer::start().await; + + let good_event: Value = serde_json::json!({ + "_time": "2026-04-15T10:00:00Z", + "_stream": "{}", + "_msg": "ok", + }); + mount_ndjson(&vlprod, &[&good_event, &good_event, &good_event]).await; + + // For vldev, serve raw garbage so the parser increments + // `valerter_parse_errors_total{rule_name, vl_source, error_type}`. + Mock::given(method("GET")) + .and(path("/select/logsql/tail")) + .respond_with( + ResponseTemplate::new(200) + .set_body_raw(b"this is not json\n".to_vec(), "application/x-ndjson"), + ) + .mount(&vldev) + .await; + + let mut sources = BTreeMap::new(); + sources.insert("vlprod".to_string(), vl_source(&vlprod.uri())); + sources.insert("vldev".to_string(), vl_source(&vldev.uri())); + + // Throttle count=1 on the only rule so the second event on `vlprod` + // also exercises the throttled path. 
+ let rules = vec![rule("snapshot_rule", Vec::new(), 1)]; + let cfg = runtime(sources, rules); + + // 2) Boot the metrics server on an ephemeral port. The recorder install + // is a one-shot global; subsequent tests in this same binary cannot + // install it again, which is why this file is a dedicated integration + // test. + let port = portpicker::pick_unused_port().expect("free port"); + let cancel = CancellationToken::new(); + let (ready_tx, ready_rx) = tokio::sync::oneshot::channel(); + let metrics_cancel = cancel.clone(); + let metrics_handle = tokio::spawn(async move { + let server = MetricsServer::with_ready_signal(port, ready_tx); + let _ = server.run(metrics_cancel).await; + }); + ready_rx.await.expect("metrics server should signal ready"); + + // 3) Initialize all known metric series so the snapshot is deterministic + // even before counters tick. Mirrors the call valerter's main does. + let rule_source_pairs: Vec<(&str, &str)> = + vec![("snapshot_rule", "vlprod"), ("snapshot_rule", "vldev")]; + let source_names: Vec<&str> = vec!["vlprod", "vldev"]; + // Pass at least one notifier so the per-notifier sentinel counters + // (`alerts_failed_total{notifier}` / `notify_errors_total{notifier}`) + // are seeded and the snapshot can assert their presence. + let notifier_names: Vec<&str> = vec!["sentinel"]; + valerter::initialize_metrics(&rule_source_pairs, &source_names, ¬ifier_names); + + // 4) Run the engine briefly so each metric path fires at least once. + let queue = NotificationQueue::new(64); + let mut rx = queue.subscribe(); + let engine = RuleEngine::new(cfg, reqwest::Client::new(), queue.clone()); + let cancel_for_engine = cancel.clone(); + let engine_handle = tokio::spawn(async move { engine.run(cancel_for_engine).await }); + + // Drain a few alerts to make sure the throttle/passed/sent paths run. + drain(&mut rx, 5, Duration::from_secs(2)).await; + + // 5) Scrape /metrics. 
+ let url = format!("http://127.0.0.1:{}/metrics", port); + let body = reqwest::Client::new() + .get(&url) + .send() + .await + .expect("scrape should succeed") + .text() + .await + .expect("body should decode"); + + // 6) Tear down. The engine task runs forever until cancelled. + cancel.cancel(); + let _ = tokio::time::timeout(Duration::from_secs(2), engine_handle).await; + let _ = tokio::time::timeout(Duration::from_secs(1), metrics_handle).await; + + // 7) Assert the snapshot. We check that *every expected* metric series + // (name + label-key set) is present. The actual output may carry + // additional series from per-(rule, source) initialization that we + // explicitly seeded, so we tolerate supersets. + let actual = extract_name_label_keys(&body); + + // Inline expected snapshot. Sorted alphabetically for stable diffs. + // Each entry is `metric_name{label_keys_csv_sorted}` or just + // `metric_name` when unlabeled. + let expected: Arc<[&'static str]> = Arc::from([ + // Per-(rule, source) counters seeded by initialize_metrics. + "valerter_alerts_passed_total{rule_name,vl_source}", + "valerter_alerts_sent_total{rule_name,vl_source}", + "valerter_alerts_throttled_total{rule_name,vl_source}", + "valerter_logs_matched_total{rule_name,vl_source}", + "valerter_parse_errors_total{rule_name,vl_source}", + "valerter_reconnections_total{rule_name,vl_source}", + "valerter_rule_errors_total{rule_name,vl_source}", + "valerter_rule_panics_total{rule_name,vl_source}", + // Per-(rule, source) discarded counter (3-label, reason="oversized"). + "valerter_lines_discarded_total{reason,rule_name,vl_source}", + // Per-(rule, source) gauge for last query timestamp. + "valerter_last_query_timestamp{rule_name,vl_source}", + // Per-(rule, source) histogram exported as a Prometheus summary by + // metrics-exporter-prometheus: emits the metric with `quantile` label + // plus `_sum` and `_count` companion series. 
+ "valerter_query_duration_seconds{quantile,rule_name,vl_source}", + "valerter_query_duration_seconds_count{rule_name,vl_source}", + "valerter_query_duration_seconds_sum{rule_name,vl_source}", + // Per-notifier sentinel counters. + "valerter_alerts_failed_total{notifier}", + "valerter_notify_errors_total{notifier}", + // Global / shared counters & gauges. + "valerter_alerts_dropped_total", + "valerter_queue_size", + "valerter_uptime_seconds", + // Per-source reachability gauge (replaces the old per-rule + // valerter_victorialogs_up). + "valerter_vl_source_up{vl_source}", + // Build info carries only the version label. + "valerter_build_info{version}", + ]); + + let mut missing: Vec = Vec::new(); + for want in expected.iter() { + if !actual.contains(*want) { + missing.push((*want).to_string()); + } + } + assert!( + missing.is_empty(), + "metrics snapshot missing expected series:\n missing = {:#?}\n\n actual = {:#?}\n\n raw body =\n{}", + missing, + actual, + body + ); + + // Hard regression: the v1.x per-rule gauge MUST be gone in v2.0.0. + assert!( + !actual + .iter() + .any(|s| s.starts_with("valerter_victorialogs_up")), + "valerter_victorialogs_up must be removed in v2.0.0 (replaced by valerter_vl_source_up). 
Found in /metrics:\n{}", + body + ); +} diff --git a/tests/multi_source_integration.rs b/tests/multi_source_integration.rs index 8d9e7ff..e964532 100644 --- a/tests/multi_source_integration.rs +++ b/tests/multi_source_integration.rs @@ -115,6 +115,7 @@ fn runtime(sources: BTreeMap, rules: Vec) window: Duration::from_secs(60), }, timestamp_timezone: "UTC".to_string(), + max_streams: valerter::config::DEFAULT_MAX_STREAMS, }, templates, rules, From 4ca7fa77a9d33279f7abe7a26960f73c4f4da35c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois-Xavier=20THIRY?= Date: Thu, 16 Apr 2026 15:04:09 +0200 Subject: [PATCH 4/6] docs(changelog): add v2.0.0 security advisory on raw _msg in email_body_html Documents a pre-existing concern made wider by the v1.2.0 #26 fix: the example config now actively pipes `{{ _msg }}` into `body`, and operators who mirror this pattern in `email_body_html` may inadvertently render unescaped HTML/script content from untrusted log fields. Hardening of the email path is a follow-up; the advisory makes the limit visible at the v2.0.0 release boundary. --- CHANGELOG.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f9f67e6..0d60172 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [2.0.0] - unreleased +### Security advisory + +- **Be cautious when piping raw `_msg` into `email_body_html`.** The example + config switched `body: "{{ _msg }}"` in v1.2.0 (#26 fix), and operators may + reasonably mirror that in `email_body_html`. The email notifier marks `body` + as `safe` (pre-escaped HTML) before injection into the email envelope, so a + log line containing raw HTML or `