diff --git a/Cargo.lock b/Cargo.lock index 26ac5d08ab..5e8002cb05 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1679,9 +1679,11 @@ dependencies = [ "antithesis_sdk", "anyhow", "clap", + "itoa", "num-traits", "rand 0.10.1", "rand_distr", + "ryu", "serde", "serde_json", "serde_yaml", diff --git a/test/antithesis/deploy/Dockerfile b/test/antithesis/deploy/Dockerfile index d8de1bb5c0..05ada256c8 100644 --- a/test/antithesis/deploy/Dockerfile +++ b/test/antithesis/deploy/Dockerfile @@ -5,7 +5,7 @@ # Build context is the repository root. Three named targets: # - adp : agent-data-plane built WITH Antithesis coverage instrumentation + SDK (the SUT) # - intake : datadog-intake mock Datadog intake (dependency) -# - workload : millstone load generator + test templates + setup-complete (the client) +# - workload : DogStatsD driver + test templates + setup-complete (the client) # # ADP is built native x86_64-unknown-linux-gnu (glibc), so no musl cross-compile headers are needed. @@ -67,7 +67,7 @@ RUN --mount=type=cache,target=/adp/target,id=antithesis-adp-target \ echo "Instrumentation symbols present." # --------------------------------------------------------------------------- -# Build the correctness tools (datadog-intake + millstone), uninstrumented. +# Build the correctness tools (datadog-intake) and the test-command binaries, uninstrumented. # These are supporting harness components, not the SUT, so they need no coverage instrumentation. 
# --------------------------------------------------------------------------- FROM build-base AS tools-builder @@ -77,11 +77,10 @@ RUN --mount=type=cache,target=/tools/target,id=antithesis-tools-target \ --mount=type=cache,target=/root/.cargo/registry,id=cargo-registry \ --mount=type=cache,target=/root/.cargo/git,id=cargo-git \ cargo build --release \ - --bin datadog-intake --bin millstone \ + --bin datadog-intake \ --bin parallel_driver_send_dogstatsd --bin finally_verify_delivery --bin eventually_adp_alive \ --bin first_sample_config && \ cp /tools/target/release/datadog-intake /usr/local/bin/datadog-intake && \ - cp /tools/target/release/millstone /usr/local/bin/millstone && \ cp /tools/target/release/parallel_driver_send_dogstatsd /usr/local/bin/parallel_driver_send_dogstatsd && \ cp /tools/target/release/finally_verify_delivery /usr/local/bin/finally_verify_delivery && \ cp /tools/target/release/eventually_adp_alive /usr/local/bin/eventually_adp_alive && \ @@ -91,19 +90,18 @@ RUN --mount=type=cache,target=/tools/target,id=antithesis-tools-target \ # Runtime: Agent Data Plane (SUT). # --------------------------------------------------------------------------- FROM ${APP_IMAGE} AS adp -ENV NO_COLOR=1 \ - RUST_BACKTRACE=1 +ENV NO_COLOR=1 RUN apt-get update && \ apt-get install --no-install-recommends -y ca-certificates openssl && \ rm -rf /var/lib/apt/lists/* COPY --from=adp-builder /usr/local/bin/agent-data-plane /usr/local/bin/agent-data-plane # Expose DWARF/build-id symbols to Antithesis for symbolization (one-hop symlink to the unstripped binary). RUN mkdir -p /symbols && ln -s /usr/local/bin/agent-data-plane /symbols/agent-data-plane -# main.rs requires the bootstrap config file to exist at the default path; ship a minimal standalone -# config as a fallback. The boot wrapper overwrites it with the per-replay config written by the -# `first_sample_config` workload command onto the shared `agent-config` volume. 
+# main.rs requires a config file at the default path. Ship a minimal standalone config as a +# fallback. The boot wrapper overwrites it with the per-timeline config that first_sample_config +# samples onto the shared `agent-config` volume. COPY test/antithesis/deploy/adp/datadog.yaml /etc/datadog-agent/datadog.yaml -# Boot wrapper: waits for the drawn config sentinel, copies the config into place, then execs ADP. +# Boot wrapper waits for the config sentinel, copies the config into place, then execs ADP. COPY --chmod=755 test/antithesis/deploy/adp/entrypoint.sh /entrypoint.sh # ADP's control-plane secure API requires an IPC TLS cert (a single PEM holding both certificate and # private key) that the Core Agent normally generates. In standalone mode there is no Core Agent, so @@ -126,7 +124,7 @@ COPY --from=tools-builder /usr/local/bin/datadog-intake /usr/local/bin/datadog-i ENTRYPOINT ["/usr/local/bin/datadog-intake"] # --------------------------------------------------------------------------- -# Runtime: workload client (millstone load generator + test templates). +# Runtime: workload client (DogStatsD driver + test templates). # --------------------------------------------------------------------------- FROM ${APP_IMAGE} AS workload ENV NO_COLOR=1 @@ -134,7 +132,6 @@ RUN test -d /usr/share/ca-certificates || ( \ apt-get update && \ apt-get install --no-install-recommends -y ca-certificates && \ rm -rf /var/lib/apt/lists/* ) -COPY --from=tools-builder /usr/local/bin/millstone /usr/local/bin/millstone # Antithesis setup-complete helper and test templates (helper files + the "main" template dir). 
COPY --chmod=755 test/antithesis/deploy/workload/setup-complete.sh /opt/antithesis/setup-complete.sh COPY test/antithesis/deploy/workload/test/ /opt/antithesis/test/ diff --git a/test/antithesis/deploy/docker-compose.yaml b/test/antithesis/deploy/docker-compose.yaml index 6e3b3bf9a7..4a54c2b95b 100644 --- a/test/antithesis/deploy/docker-compose.yaml +++ b/test/antithesis/deploy/docker-compose.yaml @@ -33,7 +33,6 @@ services: command: ["run"] environment: NO_COLOR: "1" - RUST_BACKTRACE: "1" DD_API_KEY: "antithesis-test-api-key" DD_DATA_PLANE_ENABLED: "true" DD_DATA_PLANE_STANDALONE_MODE: "true" diff --git a/test/antithesis/harness/Cargo.toml b/test/antithesis/harness/Cargo.toml index 9d35aa056c..c7d13dcd35 100644 --- a/test/antithesis/harness/Cargo.toml +++ b/test/antithesis/harness/Cargo.toml @@ -16,9 +16,11 @@ clap = { workspace = true, features = [ "std", "usage", ] } +itoa = { workspace = true } num-traits = { workspace = true } rand = { workspace = true } rand_distr = { workspace = true } +ryu = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_yaml = { workspace = true } diff --git a/test/antithesis/harness/src/bin/first_sample_config/config.rs b/test/antithesis/harness/src/bin/first_sample_config/config.rs index d84f0756ea..eaff67e535 100644 --- a/test/antithesis/harness/src/bin/first_sample_config/config.rs +++ b/test/antithesis/harness/src/bin/first_sample_config/config.rs @@ -64,17 +64,17 @@ impl Distribution for Probe { #[derive(Debug, Clone, Copy, Serialize)] #[serde(rename_all = "lowercase")] pub(crate) enum LogLevel { - /// Warnings and above. - Warn, - /// Errors only. + /// Errors only — the quietest level that still logs. Error, + /// No logs at all — the floor of the log-output budget. 
+ Off, } impl Distribution for StandardUniform { fn sample(&self, rng: &mut R) -> LogLevel { match rng.random_range(0..2u8) { - 0 => LogLevel::Warn, - _ => LogLevel::Error, + 0 => LogLevel::Error, + _ => LogLevel::Off, } } } diff --git a/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs b/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs index 9f19dd7c59..e0d177a886 100644 --- a/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs +++ b/test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs @@ -1,9 +1,6 @@ -//! Antithesis `parallel_driver_` test command: sends a batch of `DogStatsD` metrics to ADP. -//! -//! Draws a per-timeline cardinality regime (swarm biasing) and a batch size, then sends metrics over -//! UDS. The high-cardinality regime floods distinct aggregation contexts, targeting the -//! `rss-bounded-under-cardinality` property (ADP's memory limiter is disabled by default, so RSS can -//! grow without bound under sustained high cardinality). +//! Feral `DogStatsD` load generator: pick a batch size, then fire that many +//! sampled metric lines at the socket and exit. Antithesis runs many of these +//! in parallel to drive concurrency and push context limits. use std::os::unix::net::UnixDatagram; use std::path::{Path, PathBuf}; @@ -11,10 +8,10 @@ use std::thread::sleep; use std::time::{Duration, Instant}; use antithesis_sdk::prelude::*; -use antithesis_sdk::random::AntithesisRng; -use anyhow::Context as _; +use antithesis_sdk::random::{random_choice, AntithesisRng}; use clap::Parser; -use rand::{rand_core::UnwrapErr, seq::IndexedRandom as _, RngExt as _}; +use harness::payload::dogstatsd; +use rand::{rand_core::UnwrapErr, RngExt}; use serde_json::json; #[derive(Debug, Parser)] @@ -28,11 +25,12 @@ struct Config { dogstatsd_socket: PathBuf, } -#[derive(Clone, Copy, Debug)] -enum Cardinality { - Low, - Medium, - High, +/// Per-batch composition: 50% clean, 25% feral, 25% mixed. 
+#[derive(Clone, Copy)] +enum Batch { + Clean, + Feral, + Mixed, } fn main() -> anyhow::Result<()> { @@ -40,69 +38,72 @@ fn main() -> anyhow::Result<()> { let config = Config::try_parse()?; let mut rng = UnwrapErr(AntithesisRng); - let regimes = [Cardinality::Low, Cardinality::Medium, Cardinality::High]; - let regime = *regimes - .choose(&mut rng) - .context("cardinality regime choices must not be empty")?; - let regime_label = match regime { - Cardinality::Low => "low", - Cardinality::Medium => "medium", - Cardinality::High => "high", - }; - let count: u64 = rng.random_range(50..=2000); - let socket = connect_with_retry(&config.dogstatsd_socket)?; + // Socket unavailable (ADP booting, or a fault). No-op exit, not a failure. + let Some(socket) = connect_with_retry(&config.dogstatsd_socket) else { + return Ok(()); + }; - let names = ["adp.test.foo", "adp.test.bar", "adp.test.balkajsldfkjasdlfkjasdfz"]; - let metric_types = ["c", "g"]; + let batch = match random_choice(&[Batch::Clean, Batch::Clean, Batch::Feral, Batch::Mixed]) { + Some(Batch::Feral) => Batch::Feral, + Some(Batch::Mixed) => Batch::Mixed, + _ => Batch::Clean, + }; + let count = rng.random_range(0..=10_000u64); + let mut line: Vec = Vec::new(); let mut attempted = 0usize; - for i in 0..count { - let name = *names - .choose(&mut rng) - .context("metric name choices must not be empty")?; - let metric_type = *metric_types - .choose(&mut rng) - .context("metric type choices must not be empty")?; - let value: u64 = rng.random_range(0..=1000); - let tag = match regime { - Cardinality::Low => format!("host:h{}", rng.random_range(0..4)), - Cardinality::Medium => format!("host:h{}", rng.random_range(0..256)), - Cardinality::High => format!("uid:{i}-{}", rng.random::()), + for _ in 0..count { + let vibe = match batch { + Batch::Clean => dogstatsd::Vibe::Clean, + Batch::Feral => dogstatsd::Vibe::Feral, + Batch::Mixed => dogstatsd::sample_vibe(), }; - let line = format!("{name}:{value}|{metric_type}|#{tag}\n"); - 
if socket.send(line.as_bytes()).is_ok() { + dogstatsd::send(&mut rng, &mut line, vibe); + if socket.send(&line).is_ok() { attempted += 1; } } assert_reachable!( - "workload sent a dogstatsd batch", - &json!({ - "attempted": attempted, - "regime": regime_label, - "socket": config.dogstatsd_socket.display().to_string(), - }) + "workload ran a dogstatsd batch", + &json!({ "attempted": attempted, "dogstatsd_socket": config.dogstatsd_socket.display().to_string() }) + ); + assert_sometimes!( + attempted > 0, + "workload delivered a dogstatsd line", + &json!({ "attempted": attempted }) ); - - // Confirm timelines sometimes drive a high-cardinality flood (the interesting case for memory). assert_sometimes!( - matches!(regime, Cardinality::High), - "workload drove a high-cardinality dogstatsd flood", + attempted > 0 && matches!(batch, Batch::Clean), + "workload ran a fully clean batch", + &json!({ "attempted": attempted }) + ); + assert_sometimes!( + attempted > 0 && matches!(batch, Batch::Feral), + "workload ran a fully feral batch", + &json!({ "attempted": attempted }) + ); + assert_sometimes!( + attempted > 0 && matches!(batch, Batch::Mixed), + "workload ran a mixed batch", &json!({ "attempted": attempted }) ); Ok(()) } -// Wait for ADP to bind the socket, intentionally naive. -fn connect_with_retry(path: &Path) -> anyhow::Result { +/// Wait for ADP to bind the socket, intentionally naive. 
+fn connect_with_retry(path: &Path) -> Option { let deadline = Instant::now() + Duration::from_secs(30); loop { - let socket = UnixDatagram::unbound()?; - match socket.connect(path) { - Ok(()) => return Ok(socket), - Err(_) if Instant::now() < deadline => sleep(Duration::from_millis(250)), - Err(e) => return Err(e).with_context(|| format!("ADP did not bind {} within 30s", path.display())), + if let Ok(socket) = UnixDatagram::unbound() { + if socket.connect(path).is_ok() { + return Some(socket); + } + } + if Instant::now() >= deadline { + return None; } + sleep(Duration::from_millis(250)); } } diff --git a/test/antithesis/harness/src/lib.rs b/test/antithesis/harness/src/lib.rs index 8c05e117db..ebc75125fa 100644 --- a/test/antithesis/harness/src/lib.rs +++ b/test/antithesis/harness/src/lib.rs @@ -1,4 +1,5 @@ //! Shared helpers for the Antithesis harness, used by the `src/bin/*` test //! commands. +pub mod payload; pub mod rand; diff --git a/test/antithesis/harness/src/payload.rs b/test/antithesis/harness/src/payload.rs new file mode 100644 index 0000000000..515e4ac842 --- /dev/null +++ b/test/antithesis/harness/src/payload.rs @@ -0,0 +1,3 @@ +//! Payload generators for the protocols under test. + +pub mod dogstatsd; diff --git a/test/antithesis/harness/src/payload/dogstatsd.rs b/test/antithesis/harness/src/payload/dogstatsd.rs new file mode 100644 index 0000000000..060bd50b21 --- /dev/null +++ b/test/antithesis/harness/src/payload/dogstatsd.rs @@ -0,0 +1,86 @@ +//! `DogStatsD` payload generation. + +// Here's the basic idea. +// +// Dogstatsd is three message types: +// +// * metric +// * event +// * service check +// +// # Metrics +// +// :||@|#,...|c:|T|e:|card: +// +// Required: :|. +// +// * := [^:|\n]+ +// * := (:)* ':'-packed multi-value, non-set +// | [^|\n]+ raw string, set type +// * := [+-]?(\d+\.?\d*|\.\d+)([eE][+-]?\d+)? 
| [+-]?(inf|infinity|nan)
+// * <TYPE>      := c|g|ms|h|s|d     count gauge timer histogram set distribution
+// * <RATE>      := @<NUM>
+// * <TAG>       := [^,|\n]+         conventionally <KEY>:<VALUE>, the ':' is not required
+// * <CONTAINER> := c:[^|\n]+        e.g. ci-<ID>, in-<INODE>
+// * <TS>        := T\d+             unix seconds
+// * <EXTERNAL>  := e:[^|\n]+        e.g. it-<BOOL>,cn-<NAME>,pu-<UID>
+// * <CARD>      := card:[^|\n]+     recognized: none|low|orchestrator|high
+//
+// # Events
+//
+// _e{<TITLE_LEN>,<TEXT_LEN>}:<TITLE>|<TEXT>|d:<TS>|h:<HOST>|k:<AGGKEY>|p:<PRIO>|s:<SRC>|t:<ALERT>|#<TAGS>
+//
+// Required: _e{<TITLE_LEN>,<TEXT_LEN>}:<TITLE>|<TEXT>. c: / e: / card: are valid here too.
+//
+// * <TITLE_LEN>,
+//   <TEXT_LEN>  := \d+              byte length of TITLE / TEXT
+// * <TITLE>,
+//   <TEXT>      := [^\n]{LEN}       length-delimited, so '|' and ':' are allowed; '\\n' -> newline
+// * <TS>        := d:\d+            unix seconds
+// * <HOST>      := h:[^|\n]+
+// * <AGGKEY>    := k:[^|\n]+
+// * <PRIO>      := p:[^|\n]+        recognized: normal|low (else default)
+// * <SRC>       := s:[^|\n]+
+// * <ALERT>     := t:[^|\n]+        recognized: error|warning|info|success (else default)
+// * <TAGS>      := #<TAG>(,<TAG>)*
+//
+// # Service checks
+//
+// _sc|<NAME>|<STATUS>|d:<TS>|h:<HOST>|#<TAG>,<TAG>...|m:<MESSAGE>
+//
+// Required: _sc|<NAME>|<STATUS>. c: / e: / card: are valid here too.
+//
+// * <NAME>      := [^|\n]+
+// * <STATUS>    := [0-3]            OK warning critical unknown
+// * <TS>        := d:\d+            unix seconds
+// * <HOST>      := h:[^|\n]+
+// * <TAGS>      := #<TAG>(,<TAG>)*
+// * <MESSAGE>   := m:[^|\n]+
+
+use antithesis_sdk::random::random_choice;
+use rand::Rng;
+
+// One private submodule per message type, plus the sampling helpers they share.
+mod common;
+mod events;
+mod metrics;
+mod service_checks;
+
+// The vibe type and its sampler are the only items re-exported to callers.
+pub use common::{sample_vibe, Vibe};
+
+/// The three `DogStatsD` message types.
+#[derive(Clone, Copy)]
+enum Message {
+    Metric,
+    Event,
+    ServiceCheck,
+}
+
+/// Write one `DogStatsD` message of a random type to `buf` at the given vibe.
+///
+/// Despite the name, nothing is put on a socket here: `buf` is cleared and then
+/// filled with the generated bytes, and transmitting them is left to the caller.
+pub fn send<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    // Each call yields exactly one message, so drop whatever the caller left behind.
+    buf.clear();
+    // A missing draw falls back to a metric.
+    let kind = random_choice(&[Message::Metric, Message::Event, Message::ServiceCheck])
+        .copied()
+        .unwrap_or(Message::Metric);
+    match kind {
+        Message::Metric => metrics::write(rng, buf, vibe),
+        Message::Event => events::write(rng, buf, vibe),
+        Message::ServiceCheck => service_checks::write(rng, buf, vibe),
+    }
+}
diff --git a/test/antithesis/harness/src/payload/dogstatsd/common.rs b/test/antithesis/harness/src/payload/dogstatsd/common.rs
new file mode 100644
index 0000000000..1d18ef959b
--- /dev/null
+++ b/test/antithesis/harness/src/payload/dogstatsd/common.rs
@@ -0,0 +1,209 @@
+//! Shared `DogStatsD` payload sampling: vibe, segment and number builders, tags.
+
+use antithesis_sdk::random::random_choice;
+use rand::distr::Distribution;
+use rand::Rng;
+
+use crate::rand::Boundary;
+
+/// How a generated line should look: by the book, or deliberately broken.
+#[derive(Clone, Copy, Debug)]
+pub enum Vibe {
+    /// Draw only from the compliant pools.
+    Clean,
+    /// Draw from the compliant and the aberrant pools.
+    Feral,
+}
+
+/// Pick a vibe for a single line, each variant equally likely.
+#[must_use]
+pub fn sample_vibe() -> Vibe {
+    // A missing draw defaults to the clean vibe.
+    random_choice(&[Vibe::Clean, Vibe::Feral])
+        .copied()
+        .unwrap_or(Vibe::Clean)
+}
+
+/// Separator bytes placed between the segments of a name-like field.
+pub(crate) const NAME_SEPARATORS: &[u8] = b"._- ";
+
+/// Well-formed segments for identifiers: names, hosts, keys, source types.
+pub(crate) const COMPLIANT_WORD: &[&[u8]] = &[
+    b"adp",
+    b"dogstatsd",
+    b"requests",
+    b"latency",
+    b"errors",
+    b"count",
+    b"total",
+    b"bytes",
+    b"queue",
+    b"workers",
+];
+
+/// Aberrant identifier segments: empty, whitespace, NUL, embedded delimiters,
+/// invalid UTF-8, message-type prefixes.
+pub(crate) const ABERRANT_WORD: &[&[u8]] = &[
+    // Empty and whitespace-only segments.
+    b"",
+    b" ",
+    b"\t",
+    // A NUL byte.
+    b"\0",
+    // Segments that embed the protocol's own delimiters and field markers.
+    b"a:b",
+    b"a|b",
+    b"a,b",
+    b"#hash",
+    b"@at",
+    // Message-type prefixes showing up where an identifier is expected.
+    b"_sc",
+    b"_e{1,1}",
+    // Invalid UTF-8: a lone continuation byte, a lead byte with no continuation,
+    // an encoded surrogate (U+D800), an overlong NUL, and bytes that never occur
+    // in UTF-8.
+    b"\x80",
+    b"\xc3",
+    b"\xed\xa0\x80",
+    b"\xc0\x80",
+    b"\xff\xfe",
+    // Well-formed UTF-8 outside ASCII: a four-byte sequence (U+1F4A9).
+    b"emoji\xf0\x9f\x92\xa9",
+];
+
+/// Values that break number parsers, including long encodings and unicode that
+/// looks numeric: infinity, fullwidth and Arabic-Indic digits.
+pub(crate) const ABERRANT_VALUES: &[&[u8]] = &[
+    // Zero and negative zero.
+    b"0",
+    b"-0",
+    // Non-finite spellings.
+    b"inf",
+    b"-inf",
+    b"+inf",
+    b"nan",
+    b"infinity",
+    // Exponents far outside the range of an `f64`.
+    b"1e999999",
+    b"-1e999999",
+    // Literal syntaxes from other languages: hex float, digit separator.
+    b"0x1p4",
+    b"1_000",
+    // Punctuation with no digits at all.
+    b".",
+    b"+",
+    b"-",
+    // Missing fractional part, missing integer part.
+    b"1.",
+    b".5",
+    // A ':'-packed multi-value.
+    b"1:2:3:4:5",
+    // Long encodings: a run of leading zeros, and a long decimal expansion padded with zeros.
+    b"00000000000000000000000000000000000000000000000000000001.5",
+    b"3.141592653589793115997963468544185161590576171875000000000000000000000000",
+    // The infinity sign (U+221E), bare and negated.
+    "\u{221e}".as_bytes(),
+    "-\u{221e}".as_bytes(),
+    // Fullwidth digits U+FF11..U+FF13, then Arabic-Indic digits U+0664 U+0662.
+    "\u{ff11}\u{ff12}\u{ff13}".as_bytes(),
+    "\u{0664}\u{0662}".as_bytes(),
+];
+
+/// Unix-timestamp payloads (the `d:` / `T` fields).
+pub(crate) const COMPLIANT_TS: &[&[u8]] = &[b"1700000000", b"1", b"1609459200"];
+
+// Tag pools. Keys and values are drawn separately, so any key can meet any value.
+const COMPLIANT_TAG_KEYS: &[&[u8]] = &[b"env", b"service", b"region", b"version", b"team", b"host", b"shard"];
+// Empty, whitespace, the tag delimiters themselves, NUL, and a lone continuation byte.
+const ABERRANT_TAG_KEYS: &[&[u8]] = &[b"", b" ", b":", b",", b"#", b"\0", b"\x80"];
+const COMPLIANT_TAG_VALUES: &[&[u8]] = &[
+    b"prod",
+    b"staging",
+    b"adp",
+    b"us-east-1",
+    b"eu-west-1",
+    b"1.2.3",
+    b"web01",
+    b"0",
+];
+// Empty, tag and field delimiters, and byte sequences that are not valid UTF-8.
+const ABERRANT_TAG_VALUES: &[&[u8]] = &[b"", b",", b"|", b":", b"\xff", b"\xed\xa0\x80", b"a,b"];
+
+/// Compact, or a cursed-but-equivalent padded encoding.
+#[derive(Clone, Copy)]
+enum Form {
+    Compact,
+    Expanded,
+}
+
+/// Extend `buf` with one item. Clean draws from `compliant`; feral chooses
+/// between compliant and aberrant — a choice, never a coin flip.
+pub(crate) fn extend_choice(buf: &mut Vec<u8>, vibe: Vibe, compliant: &[&[u8]], aberrant: &[&[u8]]) {
+    // The vibe decides which pools are eligible; the pool is chosen first, then
+    // an item inside it, so a small pool is not drowned out by a large one.
+    let pools: &[&[&[u8]]] = match vibe {
+        Vibe::Clean => &[compliant],
+        Vibe::Feral => &[compliant, aberrant],
+    };
+    // An empty pool yields no item, in which case nothing is appended.
+    if let Some(&pool) = random_choice(pools) {
+        if let Some(&item) = random_choice(pool) {
+            buf.extend_from_slice(item);
+        }
+    }
+}
+
+/// Sample a count of segments and join them with sampled `separators`. A pool of
+/// `N` segments over a count `c` gives `N^c` results.
+///
+/// The count is boundary-sampled and can therefore be zero. A feral draw keeps a
+/// zero count and appends nothing. A clean draw is raised to one segment: the
+/// identifier fields built here (names, hosts, keys, `e:` items) are one-or-more
+/// in the `DogStatsD` grammar, so an empty one would make a clean line malformed.
+pub(crate) fn write_segments<R: Rng + ?Sized>(
+    rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe, compliant: &[&[u8]], aberrant: &[&[u8]], separators: &[u8],
+) {
+    let sampled = Boundary::<u8>::new().sample(rng);
+    // Clamp after sampling so both vibes take the same number of draws from `rng`.
+    let count = match vibe {
+        Vibe::Clean => sampled.max(1),
+        Vibe::Feral => sampled,
+    };
+    for i in 0..count {
+        // Separators go between segments only, never before the first one.
+        if i > 0 {
+            if let Some(&sep) = random_choice(separators) {
+                buf.push(sep);
+            }
+        }
+        extend_choice(buf, vibe, compliant, aberrant);
+    }
+}
+
+/// An identifier (name, host, key, source) built from word segments.
+pub(crate) fn write_words<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    write_segments(rng, buf, vibe, COMPLIANT_WORD, ABERRANT_WORD, NAME_SEPARATORS);
+}
+
+/// Append `|<prefix><item>`, the item chosen for the vibe.
+///
+/// The leading `|` is written here, so `prefix` carries only the field marker
+/// (for example `d:` or `@`).
+pub(crate) fn write_field(buf: &mut Vec<u8>, vibe: Vibe, prefix: &[u8], compliant: &[&[u8]], aberrant: &[&[u8]]) {
+    buf.push(b'|');
+    buf.extend_from_slice(prefix);
+    extend_choice(buf, vibe, compliant, aberrant);
+}
+
+/// A boundary-sampled count of `key:value` tags joined by ','. A count of zero
+/// writes no tags. Clean draws compliant keys and values; feral mixes aberrant
+/// ones in, key and value independently.
+pub(crate) fn write_tags<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    let total = Boundary::<u8>::new().sample(rng);
+    for index in 0..total {
+        // The first tag opens the section with "|#"; every later one is comma-joined.
+        let lead: &[u8] = if index == 0 { b"|#" } else { b"," };
+        buf.extend_from_slice(lead);
+        write_segments(rng, buf, vibe, COMPLIANT_TAG_KEYS, ABERRANT_TAG_KEYS, NAME_SEPARATORS);
+        buf.push(b':');
+        write_segments(rng, buf, vibe, COMPLIANT_TAG_VALUES, ABERRANT_TAG_VALUES, NAME_SEPARATORS);
+    }
+}
+
+/// Copy `digits` into `buf` unchanged, or in an expanded form: zeros inserted
+/// after any leading sign, and also appended when the text has a decimal point
+/// and no exponent. The expanded text denotes the same number.
+pub(crate) fn write_number<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, digits: &[u8]) {
+    let expanded = matches!(random_choice(&[Form::Compact, Form::Expanded]), Some(Form::Expanded));
+    if !expanded {
+        buf.extend_from_slice(digits);
+        return;
+    }
+
+    // Keep a leading sign in front of the zero padding.
+    let sign_len = usize::from(matches!(digits.first(), Some(&(b'-' | b'+'))));
+    let (sign, magnitude) = digits.split_at(sign_len);
+    buf.extend_from_slice(sign);
+    pad_zeros(rng, buf);
+    buf.extend_from_slice(magnitude);
+
+    // Trailing zeros are value-preserving only after a decimal point, and only
+    // when no exponent follows it.
+    let has_point = magnitude.contains(&b'.');
+    let has_exponent = magnitude.iter().any(|&c| matches!(c, b'e' | b'E'));
+    if has_point && !has_exponent {
+        pad_zeros(rng, buf);
+    }
+}
+
+/// Push a run of '0' bytes onto `buf`; the run length is boundary-sampled.
+fn pad_zeros<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>) {
+    let run = Boundary::<u8>::new().sample(rng);
+    buf.extend(std::iter::repeat(b'0').take(usize::from(run)));
+}
diff --git a/test/antithesis/harness/src/payload/dogstatsd/events.rs b/test/antithesis/harness/src/payload/dogstatsd/events.rs
new file mode 100644
index 0000000000..8e14be614c
--- /dev/null
+++ b/test/antithesis/harness/src/payload/dogstatsd/events.rs
@@ -0,0 +1,85 @@
+//! Feral `DogStatsD` event generation.
+
+use antithesis_sdk::random::random_choice;
+use rand::distr::Distribution;
+use rand::Rng;
+
+use super::common::{self, Vibe};
+use crate::rand::Boundary;
+
+/// Priority payloads (the `p:` field).
+const COMPLIANT_PRIO: &[&[u8]] = &[b"normal", b"low"];
+
+/// Alert-type payloads (the `t:` field).
+const COMPLIANT_ALERT: &[&[u8]] = &[b"error", b"warning", b"info", b"success"];
+
+/// The optional fields an event may carry after its text.
+#[derive(Clone, Copy)]
+enum Opt {
+    Timestamp,
+    Hostname,
+    AggKey,
+    Priority,
+    Source,
+    Alert,
+}
+
+/// Append one event `_e{<TLEN>,<XLEN>}:<TITLE>|<TEXT>[|opt...]` to `buf`.
+///
+/// Title and text are generated into scratch buffers first, because the header
+/// that precedes them carries their byte lengths.
+pub(crate) fn write<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    let mut title = Vec::new();
+    common::write_words(rng, &mut title, vibe);
+    let mut text = Vec::new();
+    common::write_words(rng, &mut text, vibe);
+
+    // Header: `_e{<title len>,<text len>}:`.
+    buf.extend_from_slice(b"_e{");
+    write_len(rng, buf, vibe, title.len());
+    buf.push(b',');
+    write_len(rng, buf, vibe, text.len());
+    buf.extend_from_slice(b"}:");
+
+    // Body: `<title>|<text>`.
+    buf.extend_from_slice(&title);
+    buf.push(b'|');
+    buf.extend_from_slice(&text);
+
+    // A boundary-sampled number of optional fields; repeats are possible.
+    let optional_fields = Boundary::<u8>::new().sample(rng);
+    for _ in 0..optional_fields {
+        let field = random_choice(&[
+            Opt::Timestamp,
+            Opt::Hostname,
+            Opt::AggKey,
+            Opt::Priority,
+            Opt::Source,
+            Opt::Alert,
+        ])
+        .copied()
+        .unwrap_or(Opt::Alert);
+        match field {
+            Opt::Timestamp => {
+                common::write_field(buf, vibe, b"d:", common::COMPLIANT_TS, common::ABERRANT_VALUES);
+            }
+            Opt::Priority => common::write_field(buf, vibe, b"p:", COMPLIANT_PRIO, common::ABERRANT_WORD),
+            Opt::Alert => common::write_field(buf, vibe, b"t:", COMPLIANT_ALERT, common::ABERRANT_WORD),
+            // The word-valued fields share one shape: a marker, then an identifier.
+            Opt::Hostname | Opt::AggKey | Opt::Source => {
+                let marker: &[u8] = match field {
+                    Opt::Hostname => b"|h:",
+                    Opt::AggKey => b"|k:",
+                    _ => b"|s:",
+                };
+                buf.extend_from_slice(marker);
+                common::write_words(rng, buf, vibe);
+            }
+        }
+    }
+
+    common::write_tags(rng, buf, vibe);
+    buf.push(b'\n');
+}
+
+/// The event header length. Clean writes the true byte length; feral writes a
+/// boundary-sampled lie — the malformed-event surface.
+fn write_len<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe, actual: usize) {
+    let mut digits = itoa::Buffer::new();
+    // Only the feral arm draws from `rng`; the clean arm formats the real length.
+    let rendered = match vibe {
+        Vibe::Clean => digits.format(actual),
+        Vibe::Feral => digits.format(Boundary::<u64>::new().sample(rng)),
+    };
+    buf.extend_from_slice(rendered.as_bytes());
+}
diff --git a/test/antithesis/harness/src/payload/dogstatsd/metrics.rs b/test/antithesis/harness/src/payload/dogstatsd/metrics.rs
new file mode 100644
index 0000000000..6660ceb0a3
--- /dev/null
+++ b/test/antithesis/harness/src/payload/dogstatsd/metrics.rs
@@ -0,0 +1,116 @@
+//! Feral `DogStatsD` metric-line generation.
+
+use antithesis_sdk::random::random_choice;
+use rand::distr::Distribution;
+use rand::Rng;
+
+use super::common::{self, Vibe};
+use crate::rand::{Boundary, Probe};
+
+/// Metric type codes; the same pool serves both vibes.
+const METRIC_TYPES: &[&[u8]] = &[b"c", b"g", b"ms", b"h", b"s", b"d"];
+
+/// Well-formed payloads for the `@` sample-rate field.
+const COMPLIANT_RATE: &[&[u8]] = &[b"1", b"0.5", b"0.25", b"0.1", b"0.001"];
+
+/// Well-formed payloads for the `c:` container-id field.
+const COMPLIANT_CONTAINER: &[&[u8]] = &[b"ci-0a1b2c3d4e5f", b"cid-deadbeef", b"in-4026531840"];
+
+/// Well-formed items for the `e:` external-data field; several are joined by ',' at runtime.
+const COMPLIANT_EXT: &[&[u8]] = &[
+    b"it-true",
+    b"it-false",
+    b"cn-redis",
+    b"cn-web",
+    b"pu-810fe89d",
+    b"pu-abc",
+];
+
+/// Well-formed payloads for the `card:` cardinality field.
+const COMPLIANT_CARD: &[&[u8]] = &[b"none", b"low", b"orchestrator", b"high"];
+
+/// Separator between the items of an `e:` field.
+const EXT_SEPARATORS: &[u8] = b",";
+
+/// The ways a feral value can be produced.
+#[derive(Clone, Copy)]
+enum ValueKind {
+    Aberrant,
+    Int,
+    Float,
+}
+
+/// The extension fields that may follow a metric's tags.
+#[derive(Clone, Copy)]
+enum Ext {
+    Rate,
+    Container,
+    Timestamp,
+    External,
+    Cardinality,
+}
+
+/// Append one metric line `<NAME>:<VALUE>|<TYPE>[|ext...]` to `buf`.
+pub(crate) fn write<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    // `<NAME>:<VALUE>`
+    common::write_words(rng, buf, vibe);
+    buf.push(b':');
+    write_value(rng, buf, vibe);
+
+    // `|<TYPE>`; a missing draw leaves the type empty.
+    buf.push(b'|');
+    let type_code = random_choice(METRIC_TYPES).copied().unwrap_or_default();
+    buf.extend_from_slice(type_code);
+
+    // Tags first, then extension fields, then the line terminator.
+    common::write_tags(rng, buf, vibe);
+    write_extensions(rng, buf, vibe);
+    buf.push(b'\n');
+}
+
+/// Write the metric value. A clean value is a plain integer. A feral value is
+/// either an aberrant literal or an integer/float handed to
+/// `common::write_number`, which picks the compact or the expanded encoding.
+fn write_value<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    if matches!(vibe, Vibe::Clean) {
+        let value = Boundary::<i64>::new().sample(rng);
+        buf.extend_from_slice(itoa::Buffer::new().format(value).as_bytes());
+        return;
+    }
+
+    // Feral from here on. A missing draw falls back to the integer form.
+    let kind = random_choice(&[ValueKind::Aberrant, ValueKind::Int, ValueKind::Float])
+        .copied()
+        .unwrap_or(ValueKind::Int);
+    match kind {
+        ValueKind::Aberrant => {
+            if let Some(&literal) = random_choice(common::ABERRANT_VALUES) {
+                buf.extend_from_slice(literal);
+            }
+        }
+        ValueKind::Float => {
+            let value: f64 = Probe.sample(rng);
+            let mut text = ryu::Buffer::new();
+            common::write_number(rng, buf, text.format(value).as_bytes());
+        }
+        ValueKind::Int => {
+            let value = Boundary::<i64>::new().sample(rng);
+            let mut text = itoa::Buffer::new();
+            common::write_number(rng, buf, text.format(value).as_bytes());
+        }
+    }
+}
+
+/// A boundary-sampled count of extension fields, each a random kind. Repeats and
+/// zero are allowed.
+fn write_extensions<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    let total = Boundary::<u8>::new().sample(rng);
+    for _ in 0..total {
+        // A missing draw falls back to the cardinality field.
+        let ext = random_choice(&[
+            Ext::Rate,
+            Ext::Container,
+            Ext::Timestamp,
+            Ext::External,
+            Ext::Cardinality,
+        ])
+        .copied()
+        .unwrap_or(Ext::Cardinality);
+        match ext {
+            Ext::Rate => common::write_field(buf, vibe, b"@", COMPLIANT_RATE, common::ABERRANT_VALUES),
+            Ext::Container => common::write_field(buf, vibe, b"c:", COMPLIANT_CONTAINER, common::ABERRANT_WORD),
+            Ext::Timestamp => common::write_field(buf, vibe, b"T", common::COMPLIANT_TS, common::ABERRANT_VALUES),
+            Ext::Cardinality => common::write_field(buf, vibe, b"card:", COMPLIANT_CARD, common::ABERRANT_WORD),
+            // External data is a ','-joined list rather than a single item.
+            Ext::External => {
+                buf.extend_from_slice(b"|e:");
+                common::write_segments(rng, buf, vibe, COMPLIANT_EXT, common::ABERRANT_WORD, EXT_SEPARATORS);
+            }
+        }
+    }
+}
diff --git a/test/antithesis/harness/src/payload/dogstatsd/service_checks.rs b/test/antithesis/harness/src/payload/dogstatsd/service_checks.rs
new file mode 100644
index 0000000000..ee8ba784a9
--- /dev/null
+++ b/test/antithesis/harness/src/payload/dogstatsd/service_checks.rs
@@ -0,0 +1,47 @@
+//! Feral `DogStatsD` service-check generation.
+
+use antithesis_sdk::random::random_choice;
+use rand::distr::Distribution;
+use rand::Rng;
+
+use super::common::{self, Vibe};
+use crate::rand::Boundary;
+
+/// Well-formed status codes: OK, warning, critical, unknown.
+const COMPLIANT_STATUS: &[&[u8]] = &[b"0", b"1", b"2", b"3"];
+
+/// The optional fields a service check may carry after its status.
+#[derive(Clone, Copy)]
+enum Opt {
+    Timestamp,
+    Hostname,
+    Message,
+}
+
+/// Append one service check `_sc|<NAME>|<STATUS>[|opt...]` to `buf`.
+pub(crate) fn write<R: Rng + ?Sized>(rng: &mut R, buf: &mut Vec<u8>, vibe: Vibe) {
+    // `_sc|<NAME>|<STATUS>`
+    buf.extend_from_slice(b"_sc|");
+    common::write_words(rng, buf, vibe);
+    buf.push(b'|');
+    common::extend_choice(buf, vibe, COMPLIANT_STATUS, common::ABERRANT_VALUES);
+
+    // A boundary-sampled number of optional fields; repeats are possible.
+    let optional_fields = Boundary::<u8>::new().sample(rng);
+    for _ in 0..optional_fields {
+        // A missing draw falls back to the message field.
+        let field = random_choice(&[Opt::Timestamp, Opt::Hostname, Opt::Message])
+            .copied()
+            .unwrap_or(Opt::Message);
+        if matches!(field, Opt::Timestamp) {
+            common::write_field(buf, vibe, b"d:", common::COMPLIANT_TS, common::ABERRANT_VALUES);
+            continue;
+        }
+        // Hostname and message share one shape: a marker, then an identifier.
+        let marker: &[u8] = if matches!(field, Opt::Hostname) { b"|h:" } else { b"|m:" };
+        buf.extend_from_slice(marker);
+        common::write_words(rng, buf, vibe);
+    }
+
+    common::write_tags(rng, buf, vibe);
+    buf.push(b'\n');
+}
diff --git a/test/antithesis/harness/src/rand.rs b/test/antithesis/harness/src/rand.rs
index 125babecfa..674710a626 100644
--- a/test/antithesis/harness/src/rand.rs
+++ b/test/antithesis/harness/src/rand.rs
@@ -1,11 +1,18 @@
 //! Randomness utilities.
 
+use std::marker::PhantomData;
+
 use rand::distr::Distribution;
 use rand::{Rng, RngExt};
 use rand_distr::LogNormal;
 
-/// Boundary values for the u64 field.
-const BOUNDARIES: &[u64] = &[
+// ===========================================================================
+// Probe — a boundary-biased magnitude sampler. ~1/8 of draws are a boundary
+// value, the rest a typical log-normal magnitude.
+// ===========================================================================
+
+/// `u64` boundary values: 0, 1, and each fixed-width max ±1.
+const BOUNDARIES_U64: &[u64] = &[
     0,
     1,
     i8::MAX as u64 - 1,
@@ -33,21 +40,82 @@ const BOUNDARIES: &[u64] = &[
     u64::MAX,
 ];
 
-/// Produces `u64` values that are generally 'normal' and with some being
-/// boundary values.
+/// `i64` boundary values: 0, ±1, and each signed-width min/max.
+const BOUNDARIES_I64: &[i64] = &[ + i64::MIN, + i64::MIN + 1, + i32::MIN as i64, + i16::MIN as i64, + i8::MIN as i64, + -1, + 0, + 1, + i8::MAX as i64, + i16::MAX as i64, + i32::MAX as i64, + i64::MAX - 1, + i64::MAX, +]; + +/// `f64` boundary values (no NaN/inf — those break frame parsing and belong to a +/// dedicated malformed-input driver). +const BOUNDARIES_F64: &[f64] = &[ + 0.0, + 1.0, + -1.0, + f64::MIN_POSITIVE, + -f64::MIN_POSITIVE, + f64::MAX, + f64::MIN, +]; + +/// A boundary-biased distribution: ~1/8 of draws are a boundary value, the rest a +/// "typical" log-normal magnitude. Generic over the numeric output type so a draw +/// site reads `let v: i64 = Probe.sample(rng)` and gets type-appropriate +/// boundaries. `i64`/`f64` draws carry a random sign. #[derive(Debug, Clone, Copy)] pub struct Probe; impl Distribution<u64> for Probe { fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u64 { if rng.random_ratio(1, 8) { - BOUNDARIES[rng.random_range(0..BOUNDARIES.len())] + BOUNDARIES_U64[rng.random_range(0..BOUNDARIES_U64.len())] } else { typical(rng) } } } +impl Distribution<i64> for Probe { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> i64 { + if rng.random_ratio(1, 8) { + BOUNDARIES_I64[rng.random_range(0..BOUNDARIES_I64.len())] + } else { + let magnitude = num_traits::cast::<u64, i64>(typical(rng)).unwrap_or(i64::MAX); + if rng.random_ratio(1, 2) { + -magnitude + } else { + magnitude + } + } + } +} + +impl Distribution<f64> for Probe { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> f64 { + if rng.random_ratio(1, 8) { + BOUNDARIES_F64[rng.random_range(0..BOUNDARIES_F64.len())] + } else { + let magnitude = num_traits::cast::<u64, f64>(typical(rng)).unwrap_or(f64::MAX); + if rng.random_ratio(1, 2) { + -magnitude + } else { + magnitude + } + } + } +} + /// Approximate probability of a typical draw landing in each range: /// /// | Value range | Probability | @@ -63,3 +131,83 @@ fn typical<R: Rng + ?Sized>(rng: &mut R) -> u64 { let dist = 
LogNormal::new(1024.0_f64.ln(), 4.0).expect("median > 0 and sigma >= 0"); num_traits::cast::<f64, u64>(dist.sample(rng).round()).unwrap_or(u64::MAX) } + +// =========================================================================== +// Boundary<T> — a finite edge-value sampler for one type: zero and its neighbours, +// narrower-width limits, half-range midpoints ±1, and the type's extremes; every draw is an edge. +// =========================================================================== + +/// A boundary-value sampler for `T`, drawing uniformly from a fixed per-type table of edge +/// values (see the `BOUNDARY_*` constants). `Boundary::<T>::new().sample(rng)` returns one. +#[derive(Clone, Copy, Debug, Default)] +pub struct Boundary<T>(PhantomData<T>); + +impl<T> Boundary<T> { + /// A boundary sampler for `T`. + #[must_use] + pub const fn new() -> Self { + Boundary(PhantomData) + } +} + +const BOUNDARY_U8: &[u8] = &[0, 1, 2, 126, 127, 128, 129, 254, 255]; + +impl Distribution<u8> for Boundary<u8> { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u8 { + BOUNDARY_U8[rng.random_range(0..BOUNDARY_U8.len())] + } +} + +const BOUNDARY_U64: &[u64] = &[ + 0, + 1, + 2, + u8::MAX as u64 - 1, + u8::MAX as u64, + u8::MAX as u64 + 1, + u16::MAX as u64 - 1, + u16::MAX as u64, + u16::MAX as u64 + 1, + u32::MAX as u64 - 1, + u32::MAX as u64, + u32::MAX as u64 + 1, + u64::MAX / 2 - 1, + u64::MAX / 2, + u64::MAX / 2 + 1, + u64::MAX - 1, + u64::MAX, +]; + +impl Distribution<u64> for Boundary<u64> { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> u64 { + BOUNDARY_U64[rng.random_range(0..BOUNDARY_U64.len())] + } +} + +const BOUNDARY_I64: &[i64] = &[ + i64::MIN, + i64::MIN + 1, + i64::MIN / 2 - 1, + i64::MIN / 2, + i64::MIN / 2 + 1, + i32::MIN as i64, + i16::MIN as i64, + i8::MIN as i64, + -1, + 0, + 1, + i8::MAX as i64, + i16::MAX as i64, + i32::MAX as i64, + i64::MAX / 2 - 1, + i64::MAX / 2, + i64::MAX / 2 + 1, + i64::MAX - 1, + i64::MAX, +]; + +impl Distribution<i64> for Boundary<i64> { + fn sample<R: Rng + ?Sized>(&self, rng: &mut R) 
-> i64 { + BOUNDARY_I64[rng.random_range(0..BOUNDARY_I64.len())] + } +} diff --git a/test/antithesis/scratchbook/bug-ledger.md b/test/antithesis/scratchbook/bug-ledger.md index 75cd0b8ce3..ab63265666 100644 --- a/test/antithesis/scratchbook/bug-ledger.md +++ b/test/antithesis/scratchbook/bug-ledger.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: fc4bb29728814ddf9321572b954ec28f58faeb53 -updated: 2026-05-30 +commit: 21b2072b4743ddbf4c84891d93abac7299dc4ce8 +updated: 2026-06-01 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees that frame these defects as bugs. @@ -31,6 +31,18 @@ bug happens. Run all five (expect five FAILURES — the failing tests are the demonstrations): `cargo nextest run --no-fail-fast -E 'test(/bug_nan_sample_poisons_sum_and_avg|bug_corrupt_length_prefix_silently_drops_following_records|bug_forward_clock_jump_floods_zero_value_points|bug_default_heap_fallback_makes_context_resolution_unbounded|bug_config_ready_hangs_forever_without_snapshot/)'` +> **Workload reach (2026-06-01):** the live `parallel_driver_send_dogstatsd` feral DSD-line generator +> exercises only **one** of these five repros under a run — #4, the high-cardinality interner +> heap-fallback (`rss-bounded-under-cardinality`) — and even that needs a memory-capped `adp` container +> or a SUT-side RSS assertion to be *caught* (neither yet wired). The other four are off the DSD-socket +> input path: +> - **#1 `ddsketch-no-nan-poison`** — DSD drops non-finite at the codec; needs a `checks_ipc` gRPC +> Histogram feeder. +> - **#2 `replay-corruption-not-silent-eof`** — needs the `agent-data-plane dogstatsd replay` CLI plus +> crafted capture files. +> - **#3 `aggregate-clock-skew-stable` (forward-jump)** — needs a clock-skip fault. +> - **#5 `config-stall-no-deadlock`** — needs a config-stream stub that withholds the snapshot. 
+ ## Resolved upstream on main (repro now stale) - **`aggregate-no-panic-any-window` — sub-second window `% 0` panic (was bug #1).** Fixed on main: @@ -46,7 +58,7 @@ Run all five (expect five FAILURES — the failing tests are the demonstrations) ## Burned into an Antithesis triage shot (submitted run) -- **`rss-bounded-under-cardinality` (behavioral)** and **`forwarder-eventual-delivery` (baseline liveness)** — run id (redacted; tracked internally) (test-name `saluki-adp-bug-hunt`, 30 min, submitted 2026-05-29). The `parallel_driver_send_dogstatsd` high-cardinality regime drives memory growth; `finally_verify_delivery` checks delivery. Triage with the `antithesis-triage` skill once it completes. +- **`rss-bounded-under-cardinality` (behavioral)** and **`forwarder-eventual-delivery` (baseline liveness)** — run id (redacted; tracked internally) (test-name `saluki-adp-bug-hunt`, 30 min, submitted 2026-05-29). The `parallel_driver_send_dogstatsd` driver (a sampled batch of feral DSD lines whose names/tags/values are built combinatorially from finite segment pools) floods distinct contexts and drives memory growth; `finally_verify_delivery` checks delivery. Triage with the `antithesis-triage` skill once it completes. **Caveat:** `rss-bounded-under-cardinality` only becomes a *caught* failure with a memory-capped `adp` container (OOM ⇒ `eventually_adp_alive`) or a SUT-side RSS assertion — neither yet wired. ## Antithesis-shot-only — blocked on harness infrastructure (not locally reproducible) diff --git a/test/antithesis/scratchbook/deployment-topology.md b/test/antithesis/scratchbook/deployment-topology.md index fcef2890c0..d0ba82ad5f 100644 --- a/test/antithesis/scratchbook/deployment-topology.md +++ b/test/antithesis/scratchbook/deployment-topology.md @@ -44,7 +44,7 @@ containers so every link is faultable. 
```text +------------------------+ DogStatsD +------------------------+ HTTP (Datadog +------------------------+ | workload-client | (UDP/TCP, faultable) | adp | intake API, | mock-intake | -| - millstone load gen | ------------------------> | agent-data-plane | faultable, retryable) | datadog-intake | +| - dogstatsd driver | ------------------------> | agent-data-plane | faultable, retryable) | datadog-intake | | - Antithesis SDK | | (standalone mode) | ----------------------> | (mock fakeintake) | | - test template | <------------------------ | UDP/TCP/UDS listeners | <---------------------- | records payloads, | +------------------------+ backpressure / health +------------------------+ acks / 5xx / hang | queryable for asserts | @@ -55,22 +55,24 @@ containers so every link is faultable. |---|---|---|---|---|---| | `adp` | Service (SUT) | reuse `docker/Dockerfile.agent-data-plane` (standalone build) | `agent-data-plane run` in **standalone mode** (`DD_DATA_PLANE_STANDALONE_MODE=true`, `DD_DATA_PLANE_DOGSTATSD_ENABLED=true`), no Core Agent dependency | receives DogStatsD from `workload-client`; forwards to `mock-intake` over HTTP | 1 | | `mock-intake` | Dependency | reuse `docker/Dockerfile.correctness-tools` (the `datadog-intake` binary) | mock Datadog intake; record + count forwarded payloads; expose a query API the workload reads for assertions | receives ADP forwarder traffic; queried by `workload-client` | 1 | -| `workload-client` | Client (test driver) | new thin Dockerfile layering the `millstone` binary + test template + Antithesis Rust SDK | emits `setup_complete`, then test commands drive `millstone` load and run assertions against `mock-intake` | sends DogStatsD to `adp`; queries `mock-intake` | 1 | +| `workload-client` | Client (test driver) | thin Dockerfile layering the compiled test-command binaries + test templates + Antithesis Rust SDK | emits `setup_complete`, then `parallel_driver_send_dogstatsd` samples DogStatsD load (the 
`harness::payload::dogstatsd` feral/clean generator) and `finally_verify_delivery` checks the intake | sends DogStatsD to `adp`; queries `mock-intake` | 1 | Notes: -- **Use UDP or TCP, not UDS, between `workload-client` and `adp`.** UDS requires a shared volume - (same fate / no faulting), and it couples origin-detection credentials. UDP/TCP keeps the intake - *and* the DSD-intake links independently faultable and lets `malformed-dsd-no-crash` exercise the - network listeners. (UDS-specific listener behavior can be a secondary case with a shared-volume - sidecar — see "Listener-coverage variant".) +- **The DSD link between `workload-client` and `adp` currently uses UDS** via a shared + `dogstatsd-socket` volume (`DSD_SOCKET`). The tradeoff: the ingress link is no longer independently + faultable (shared volume, same fate) and UDS couples origin-detection credentials. A UDP/TCP + variant would keep the egress (intake) *and* the DSD-ingress links independently faultable and let + `malformed-dsd-no-crash` exercise the network listeners; track it as a follow-up (see + "Listener-coverage variant"). - **Point ADP's forwarder at `mock-intake`** via `DD_URL` / forwarder endpoint config; set a real (fake) API key. This is the link that unlocks the entire egress data-loss cluster. -- `millstone` already supports deterministic seeds and fixed payload counts (`millstone.yaml`), - so the workload is reproducible; Antithesis adds the fault dimension on top. +- The driver samples all randomness through `AntithesisRng` (boundary-biased `Probe`/`Boundary` + samples and `random_choice` selections), so the workload is deterministic and simulator-steerable; + Antithesis adds the fault dimension on top. 
### What the primary topology covers -- **Memory & resource bounds (Cat A):** high-cardinality / many-timestamp `millstone` corpus + +- **Memory & resource bounds (Cat A):** high-cardinality / many-timestamp load from the driver + `memory_mode`/`memory_limit` set on `adp`; node-throttling on `adp` to stress the limiter timing; observe RSS vs grant. `rss-bounded-under-cardinality`, `aggregate-context-limit-enforced`, `interner-full-bounded`, `memory-limiter-survives-rss-read-failure` (needs `/proc` fault — see @@ -91,8 +93,8 @@ Notes: SUT-side assertions (`interner-reclamation-no-corruption`, `non-finite-values-handled-consistently`). - **Events & service-checks (Cat B/E additions):** the workload must emit well-formed *and* malformed events + service-checks so `events-sc-no-silent-loss`, `malformed-event-sc-no-crash`, and - the anti-vacuity anchor `events-sc-pipeline-reachable` are exercised — a metrics-only `millstone` - corpus leaves these vacuous. + the anti-vacuity anchor `events-sc-pipeline-reachable` are exercised — a metrics-only workload + leaves these vacuous. - **Transformer correctness (Cat G, primary-runnable subset):** `mapper-interner-bounded` rides a high-cardinality flood of distinct *mappable* names against a small `dogstatsd_mapper_string_interner_size`. The differential Cat G properties (`mapper-output-matches-agent`, `prefix-filter-ordering-matches-agent`) @@ -209,7 +211,7 @@ needs a script. Confirm whether the existing binary supports this or needs a small extension. - **A minimal Core Agent config-stub** must be built (or the full `datadog-agent` image adapted) to send adversarial config the real Agent wouldn't — needed for Add-on 1. 
-- Whether the workload can drive DogStatsD over **UDP/TCP at the volume `millstone` targets** without +- Whether the workload can drive DogStatsD over **UDP/TCP at the volume the driver targets** without loss confounding the assertions (UDP is lossy by nature; for no-loss assertions prefer TCP/UDS, and scope UDP cases to no-crash rather than no-loss). - The `checks_ipc` Histogram NaN bypass (`ddsketch-no-nan-poison`) needs a **checks-IPC producer** in diff --git a/test/antithesis/scratchbook/existing-assertions.md b/test/antithesis/scratchbook/existing-assertions.md index 479c17494f..540b34c450 100644 --- a/test/antithesis/scratchbook/existing-assertions.md +++ b/test/antithesis/scratchbook/existing-assertions.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 -updated: 2026-05-31 +commit: 21b2072b4743ddbf4c84891d93abac7299dc4ce8 +updated: 2026-06-01 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: Datadog ADP Confluence space (design notes, weekly summaries, gap analyses) consulted for grounding. @@ -16,19 +16,22 @@ external_references: ## Summary **A bootstrap-and-workload assertion set exists, now with the first liveness instrumentation.** It -comprises **8 SDK call sites**: one lifecycle init and one bootstrap reachability probe in ADP, two -workload-side `assert_reachable!`/`assert_sometimes!` pairs in the harness drivers, and — added -2026-05-31 — the external `eventually_adp_alive` liveness `assert_always!` plus the **first in-SUT -property assertion**, an `assert_sometimes!` at the forwarder 2xx site in `saluki-components`. All -ADP/`saluki-components` sites are gated behind an `antithesis` cargo feature (no-op in production). 
-The bootstrap probe and the two driver anchors remain **integration probes / anti-vacuity anchors**; -the two new sites are real liveness instrumentation (Category H `adp-stays-alive` and the -good-function half of `adp-keeps-delivering` / in-SUT seed of `forwarder-eventual-delivery`). +comprises **11 SDK call sites**: one lifecycle init and one bootstrap reachability probe in ADP, a +`finally_verify_delivery` `assert_reachable!`/`assert_sometimes!` pair, the +`parallel_driver_send_dogstatsd` anchors (one `assert_reachable!` plus four `assert_sometimes!` — +delivered, clean, feral, mixed batch composition), the external `eventually_adp_alive` liveness +`assert_always!`, and the **first in-SUT property assertion**, an `assert_sometimes!` at the +forwarder 2xx site in `saluki-components`. All ADP/`saluki-components` sites are gated behind an +`antithesis` cargo feature (no-op in production). The bootstrap probe and the driver anchors remain +**integration probes / anti-vacuity anchors**; the liveness sites are real liveness instrumentation +(Category H `adp-stays-alive` and the good-function half of `adp-keeps-delivering` / in-SUT seed of +`forwarder-eventual-delivery`). > [!NOTE] > History: an early version of this file claimed no SDK assertions existed (true before the harness -> commit; corrected 2026-05-30). Updated again 2026-05-31 when the liveness pieces landed (6 → 8 -> sites). +> commit; corrected 2026-05-30). Updated 2026-05-31 when the liveness pieces landed (6 → 8 sites), +> and again when `parallel_driver_send_dogstatsd` added the clean/feral/mixed batch assertions +> (8 → 11 sites). ## Assertions present @@ -38,10 +41,25 @@ good-function half of `adp-keeps-delivering` / in-SUT seed of `forwarder-eventua | `bin/agent-data-plane/src/main.rs:100` | `assert_reachable!` | "agent-data-plane completed bootstrap" | `#[cfg(feature = "antithesis")]` | Bootstrap-integration probe — proves the SDK is linked, cataloging works, the instrumentation path is wired. 
| | `test/antithesis/harness/src/bin/finally_verify_delivery.rs:54` | `assert_reachable!` | "intake metrics dump query succeeded" | harness binary | Confirms the delivery-verification query path ran. | | `test/antithesis/harness/src/bin/finally_verify_delivery.rs:59` | `assert_sometimes!` | "metrics delivered end-to-end to the intake" (`delivered > 0`) | harness binary | Workload-side liveness anchor — partially seeds `forwarder-eventual-delivery`. | -| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:77` | `assert_reachable!` | "workload sent a dogstatsd batch" | harness binary | Confirms the DSD driver actually emitted load. | -| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:87` | `assert_sometimes!` | "workload drove a high-cardinality dogstatsd flood" (`regime == High`) | harness binary | Anti-vacuity anchor that timelines reach the high-cardinality regime — seeds `rss-bounded-under-cardinality`. | -| `test/antithesis/harness/src/bin/eventually_adp_alive.rs:62` | `assert_always!` | "ADP booted: API reachable and DogStatsD socket present" | harness binary (`eventually_`, faults-paused) | Death-liveness for `adp-stays-alive` — fails the branch when ADP self-crashed (config panic / load) but stayed down through the quiet period. | -| `lib/saluki-components/src/common/datadog/io.rs:553` | `assert_sometimes!` | "ADP forwarded a payload to the intake" (`{ domain }`) | `#[cfg(feature = "antithesis")]` | First in-SUT property assertion — good-function liveness (the full pipeline ran to a 2xx) + replay checkpoint; good-function half of `adp-keeps-delivering`, in-SUT seed of `forwarder-eventual-delivery`. | +| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:67` | `assert_reachable!` | "workload ran a dogstatsd batch" | harness binary | Confirms the DSD driver ran a batch; details carry the attempted-line count and socket path. 
| +| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:68` | `assert_sometimes!` | "workload delivered a dogstatsd line" (`attempted > 0`) | harness binary | Anti-vacuity anchor: a batch can sample count == 0, so "ran" does not imply "sent"; this proves a timeline sometimes actually delivers a line, else delivery checks are vacuous. | +| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:73` | `assert_sometimes!` | "workload ran a fully clean batch" (`attempted > 0 && Clean`) | harness binary | Composition anchor: proves the clean branch is sometimes exercised, so the clean delivery surface is non-vacuous. | +| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:78` | `assert_sometimes!` | "workload ran a fully feral batch" (`attempted > 0 && Feral`) | harness binary | Composition anchor: proves the feral branch is sometimes exercised. | +| `test/antithesis/harness/src/bin/parallel_driver_send_dogstatsd.rs:83` | `assert_sometimes!` | "workload ran a mixed batch" (`attempted > 0 && Mixed`) | harness binary | Composition anchor: proves the mixed branch is sometimes exercised. | +| `test/antithesis/harness/src/bin/eventually_adp_alive.rs:63` | `assert_always!` | "ADP booted: API reachable and DogStatsD socket present" | harness binary (`eventually_`, faults-paused) | Death-liveness for `adp-stays-alive` — fails the branch when ADP self-crashed (config panic / load) but stayed down through the quiet period. | +| `lib/saluki-components/src/common/datadog/io.rs:556` | `assert_sometimes!` | "ADP forwarded a payload to the intake" (`{ domain }`) | `#[cfg(feature = "antithesis")]` | First in-SUT property assertion — good-function liveness (the full pipeline ran to a 2xx) + replay checkpoint; good-function half of `adp-keeps-delivering`, in-SUT seed of `forwarder-eventual-delivery`. 
| + +> **Load driver (2026-06-01):** `parallel_driver_send_dogstatsd` replaced the `parallel_driver_load` +> driver (the four-profile C1–C4 ladder and the `harness::load_gen` Generator/Profile module are gone). +> The driver samples a batch size (`random_range(0..=10_000)`), and for each line calls +> `harness::payload::dogstatsd::send`, which picks a message type via `random_choice` and dispatches to +> `metrics`/`events`/`service_checks`, then writes the bytes to the DSD UDS socket and exits. The +> generator builds names, tags, values, and headers combinatorially from finite segment pools joined by +> sampled separators (`harness::payload::dogstatsd::common`), with counts from the finite +> `harness::rand::Boundary` sampler. A per-message `Vibe` toggle is either clean (by-the-book) or feral +> (aberrant bytes, cursed-but-equivalent number encodings, skewed `_e{len,len}` event header lengths). +> Its five assertions above are the `assert_reachable!` batch anchor plus four `assert_sometimes!` +> anchors (delivered, and the clean/feral/mixed batch-composition checks). Dependency wiring: ADP gains the SDK only under the `antithesis` feature (`bin/agent-data-plane/Cargo.toml:14` → `dep:antithesis_sdk`, `antithesis_sdk/full`, @@ -59,14 +77,15 @@ Searched the repository with ripgrep over `*.rs` and `*.toml`: - `rg -li "antithesis" -g '*.rs' -g '*.toml'` — matches in ADP `main.rs`, the two harness binaries, and the `Cargo.toml` files above. 
- `rg "assert_always|assert_sometimes|assert_reachable|assert_unreachable|antithesis_sdk" -g '*.rs'` - — the 6 call sites tabled above; **no `assert_always!` and no `assert_unreachable!` anywhere yet.** + — the 11 call sites tabled above (`assert_always!` now present in `eventually_adp_alive`); **no + `assert_unreachable!` anywhere yet.** ## Implication for property work Most catalog invariants are still **net-new instrumentation**, but the pattern is now proven in-SUT: - `forwarder-eventual-delivery` now has an **in-SUT** `Sometimes(forwarded a payload)` at the 2xx - site (io.rs:553) in addition to the workload-side `Sometimes(delivered > 0)`. The full no-loss + site (io.rs:556) in addition to the workload-side `Sometimes(delivered > 0)`. The full no-loss `Always`/accounting reconciliation (delivered == accepted-and-retryable after a transient outage) is still net-new. - `rss-bounded-under-cardinality` has its high-cardinality `Sometimes` anchor but no SUT-side RSS or diff --git a/test/antithesis/scratchbook/property-catalog.md b/test/antithesis/scratchbook/property-catalog.md index 73160b962a..76f8a02cfe 100644 --- a/test/antithesis/scratchbook/property-catalog.md +++ b/test/antithesis/scratchbook/property-catalog.md @@ -1,7 +1,7 @@ --- sut_path: /home/ssm-user/src/saluki -commit: 2e4ae1b8be45143882f0dbeb5e74998021c5faf9 -updated: 2026-05-31 +commit: 21b2072b4743ddbf4c84891d93abac7299dc4ce8 +updated: 2026-06-01 external_references: - path: https://datadoghq.atlassian.net/wiki/spaces/DADP/ why: ADP Confluence space — headline guarantees and gap analyses that seed properties. @@ -33,6 +33,16 @@ assertion** — an `assert_sometimes!` at the forwarder 2xx site in `saluki-comp fail by design** under default config (memory limiter disabled, interner heap-fallback enabled, disk persistence off) — these are flagged; they are the highest-value findings, not catalog errors. 
+> **Workload-reach note (2026-06-01):** the live `parallel_driver_send_dogstatsd` feral DSD-line +> generator exercises only **one** of the five still-unfixed reproduced bugs (branch +> `blt/antithesis-bug-tests`): the high-cardinality interner heap-fallback +> (`rss-bounded-under-cardinality`). The other four are off the DSD-socket input path entirely: +> `ddsketch-no-nan-poison` needs a `checks_ipc` gRPC Histogram feeder (DSD drops non-finite at the +> codec); `replay-corruption-not-silent-eof` needs the `agent-data-plane dogstatsd replay` CLI plus +> crafted capture files; `aggregate-clock-skew-stable` (forward-jump) needs a clock-skip fault; +> `config-stall-no-deadlock` needs a config-stream stub that withholds the snapshot. The sub-second +> window `%0` panic is fixed upstream. See `bug-ledger.md`. + Provenance tags `[Fn]` after each slug name the discovery focus that surfaced it: `[RB]` resource boundaries, `[DL]` data-loss/recovery, `[AG]` aggregation/sketch, `[LC]` lifecycle/config, `[RC]` replay/codec/concurrency, `[WC]` wildcard (from SUT analysis). @@ -47,12 +57,16 @@ limiter is advisory (≤25ms backoff, 250ms sampling, cooperative), disabled by interner spills to the heap by default. This category probes whether RSS is *actually* bounded. 
### rss-bounded-under-cardinality — RSS bounded under high cardinality -> **Status (2026-05-29): WORKLOAD WIRED + ROOT CAUSE REPRO'D** — `parallel_driver_send_dogstatsd` -> (high-cardinality regime) floods distinct contexts in the Antithesis harness to drive this -> behavioral bug under a run; and the root cause is reproduced as a unit test in -> `lib/saluki-context/src/resolver.rs` +> **Status (2026-06-01): WORKLOAD WIRED + ROOT CAUSE REPRO'D** — `parallel_driver_send_dogstatsd` +> sends a sampled batch of feral DSD lines whose names/tags/values are built combinatorially from +> finite segment pools, flooding distinct contexts in ADP to drive this behavioral bug under a run; and +> the root cause is reproduced as a unit test in `lib/saluki-context/src/resolver.rs` > `tests::bug_default_heap_fallback_makes_context_resolution_unbounded` (default heap fallback ⇒ -> resolution never refuses ⇒ unbounded memory). Not fixed. +> resolution never refuses ⇒ unbounded memory). Not fixed. **This is the only one of the five +> still-unfixed reproduced bugs that the live DSD-line generator exercises**, and even here it becomes a +> *caught* failure only with a memory-capped `adp` container (OOM ⇒ `eventually_adp_alive` fails) or a +> SUT-side RSS assertion — neither yet wired. The other four bugs are off the DSD-socket input path +> (see the workload-reach note near the top of this catalog, and `bug-ledger.md`). | | | |---|---| | **Type** | Safety (expected to FAIL by design under default config) |