From 8f1ba322c8fe33deaf38cd2343ce43547f8ec8d0 Mon Sep 17 00:00:00 2001 From: jskoiz <20649937+jskoiz@users.noreply.github.com> Date: Sat, 6 Jun 2026 08:47:31 -1000 Subject: [PATCH 1/3] Add serde-saphyr comparison benchmark Benchmark saneyaml against serde-saphyr across dynamic-value and typed-struct deserialization in a new example, and document the results in BENCHMARKS.md. Add serde-saphyr as a dev-dependency. --- Cargo.lock | 157 +++++++++++++ Cargo.toml | 1 + docs/BENCHMARKS.md | 69 ++++++ examples/serde_saphyr_headtohead.rs | 327 ++++++++++++++++++++++++++++ 4 files changed, 554 insertions(+) create mode 100644 examples/serde_saphyr_headtohead.rs diff --git a/Cargo.lock b/Cargo.lock index e35fe4a..2715a17 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,6 +17,36 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "annotate-snippets" +version = "0.12.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f211a51805bc641f3ad5b7664c77d2547af685cc33b4cd8d31964027a46f13f1" +dependencies = [ + "anstyle", + "memchr", + "unicode-width", +] + +[[package]] +name = "anstyle" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" + [[package]] name = "anyhow" version = "1.0.102" @@ -50,6 +80,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + [[package]] name = "bit-set" version = "0.8.0" @@ -80,6 +116,12 @@ dependencies = [ "generic-array", ] +[[package]] +name = "bumpalo" +version = "3.20.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72f5acc6cb2ba439de613abc23857ec3d78374d8ed5ac84e9d11336e87da8649" + [[package]] name = "cfg-if" version = "1.0.4" @@ -140,6 +182,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "encoding_rs_io" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" +dependencies = [ + "encoding_rs", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -197,9 +248,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" dependencies = [ "cfg-if", + "js-sys", "libc", "r-efi 5.3.0", "wasip2", + "wasm-bindgen", ] [[package]] @@ -221,6 +274,16 @@ version = "0.32.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" +[[package]] +name = "granit-parser" +version = "0.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f50ba32164f9e098d5da618776a32afbb32270adcbe3d3d006107dae11e37c91" +dependencies = [ + "arraydeque", + "smallvec", +] + [[package]] name = "hashbrown" version = "0.15.5" @@ -293,6 +356,17 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" +[[package]] +name = "js-sys" +version = "0.3.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "142bc4740e452c1e57ade0cbc129f139c9093e354346f0872ef985f4f5cf5f11" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen", +] + [[package]] name = "lazy_static" version 
= "1.5.0" @@ -353,6 +427,12 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c505b3e17ed6b70a7ed2e67fbb2c560ee327353556120d6e72f5232b6880d536" +[[package]] +name = "nohash-hasher" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bf50223579dc7cdcfb3bfcacf7069ff68243f8c363f62ffa99cf000a6b9c451" + [[package]] name = "num-traits" version = "0.2.19" @@ -561,6 +641,12 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + [[package]] name = "rusty-fork" version = "0.3.1" @@ -589,6 +675,7 @@ dependencies = [ "saphyr", "saphyr-parser", "serde", + "serde-saphyr", "serde_json", "serde_yaml", "sha2", @@ -641,6 +728,25 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-saphyr" +version = "0.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5897b4c3faadadd35fdb6689f015641f3bc481d5adaaac56231ea15aeb243db3" +dependencies = [ + "ahash", + "annotate-snippets", + "base64", + "encoding_rs_io", + "getrandom 0.3.4", + "granit-parser", + "nohash-hasher", + "num-traits", + "serde", + "smallvec", + "zmij", +] + [[package]] name = "serde_core" version = "1.0.228" @@ -802,6 +908,12 @@ version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.6" @@ -847,6 +959,51 @@ dependencies = [ "wit-bindgen 0.51.0", ] +[[package]] +name = "wasm-bindgen" +version = "0.2.122" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ed04576f974d2b2fba0f38c51dbc5518011e38c36bf1143164be765528fd409" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "916151b09da36bd82f6615cbf3a419e2f0ba23a03c6160e8e92eb6bd4aa1dec6" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "299047362ccbfce148b67ab7e73349f77748e00c8296f9542adfad2ad82c5c5e" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.122" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a929b2c61f11ba3e9bc35b50c1f25cb38e0e892c0c231ae2b8cf78d5dad4437" +dependencies = [ + "unicode-ident", +] + [[package]] name = "wasm-encoder" version = "0.244.0" diff --git a/Cargo.toml b/Cargo.toml index 10558dd..914854f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,3 +61,4 @@ sha2 = "0.10" toml = "0.8" yaml-rust2 = "0.11.0" dhat = "0.3.3" +serde-saphyr = "0.0.27" diff --git a/docs/BENCHMARKS.md b/docs/BENCHMARKS.md index af2726c..2b7ca4f 100644 --- a/docs/BENCHMARKS.md +++ b/docs/BENCHMARKS.md @@ -305,6 +305,73 @@ is not a sole leader there. Its differentiation is the combination of full spec conformance with tree-policy rejection of the duplicate-key/tree-error cases that `saphyr` accepts, while `serde_yaml` trails the spec set at 333/400. 
+## serde-saphyr Head-To-Head + +[`serde-saphyr`](https://crates.io/crates/serde-saphyr) is the closest active +competitor: a pure-Rust, Serde-based YAML layer (built on `granit-parser`) that +deserializes straight into Rust types without first building an intermediate +node tree. Because both crates drive Serde, the fair comparison feeds identical +bytes into identical target types through each crate's idiomatic `from_str` +(one warm-up pass, then a timed `Instant::now()` loop), isolating the YAML +layer. Captured 2026-06-06 on an Apple M4 Pro against +`serde-saphyr 0.0.27`, both libraries on their shipping defaults: + +```sh +cargo run --release --example serde_saphyr_headtohead +``` + +**Axis 1 — dynamic value (`serde_json::Value`) over the real-world corpus.** +The neutral target both crates deserialize into; 27 of the 30 single-document +corpus fixtures are accepted by both (see the tag caveat below). + +| load path | iterations | bytes/iter | ns/byte | +|---|---:|---:|---:| +| `saneyaml::from_str::<serde_json::Value>` | 500 | 19,346 | 20.26 | +| `serde_saphyr::from_str::<serde_json::Value>` | 500 | 19,346 | 38.78 | +| `saneyaml::parse_documents` (native tree, reference) | 500 | 19,346 | 15.99 | + +**Axis 2 — typed deserialize into a nested `Config` struct** (generated, 4,000 +services, 983,795 bytes). + +| load path | iterations | bytes/iter | ns/byte | +|---|---:|---:|---:| +| `saneyaml::from_str::<Config>` | 40 | 983,795 | 22.20 | +| `serde_saphyr::from_str::<Config>` | 40 | 983,795 | 39.98 | + +**Axis 3 — typed deserialize into a flat `Vec<Record>`** (generated, 15,000 +records, 912,130 bytes) — the "load a large homogeneous file straight into +structs" shape `serde-saphyr` is designed around.
+ +| load path | iterations | bytes/iter | ns/byte | +|---|---:|---:|---:| +| `saneyaml::from_str::<Vec<Record>>` | 40 | 912,130 | 23.62 | +| `serde_saphyr::from_str::<Vec<Record>>` | 40 | 912,130 | 51.98 | + +Result: on this machine and these inputs `saneyaml` is ~1.9x faster on the +dynamic-value corpus, ~1.8x on the nested typed struct, and ~2.2x on the flat +typed records — the widest margin on the very shape `serde-saphyr` targets. As +everywhere in this document the trustworthy signal is the same-run ratio, not +the absolute ns/byte, which is machine- and structure-dependent. (`serde-saphyr`'s +own published ~11 ns/byte figure was measured on a single large homogeneous file +and, at 25 MiB, would have required raising its default node budget — so it is +not directly comparable to these default-settings captures.) + +Two non-speed differences surfaced and are worth recording: + +- **Custom tags.** Three corpus fixtures (`ansible/vault-and-unsafe-tags.yaml`, + `cloudformation/sam-api.yaml`, `symfony/services.yaml`) carry application tags + (`!vault`, `!Ref`, Symfony service tags). `saneyaml` surfaces these as typed + tagged values that `serde_json::Value` cannot represent and rejects them + (`invalid type: enum`); `serde-saphyr` discards the tag and accepts the + underlying value. Neither is "wrong" — `saneyaml` is the more tag-faithful of + the two — but it is why the dynamic-value axis is timed on the 27-fixture + intersection both accept. +- **Default resource caps differ.** Both crates are safe-by-default but tuned + differently: `saneyaml` caps collections at 16,384 items (plus input-byte, + scalar, and nesting limits); `serde-saphyr` caps total nodes at ~250,000. The + generated inputs above stay under the tighter of the two so both run on + defaults; neither is unbounded out of the box. + ## Reproduction & Tooling Every number in this document comes from an in-repo example, run under Cargo's @@ -324,6 +391,7 @@ single capture as indicative rather than authoritative.
| Allocator-backed memory (dhat) | `cargo run --release --example dhat_memory -- --all` | | dhat single (library, corpus) pair | `cargo run --release --example dhat_memory -- saneyaml-borrowed multidoc` | | Conformance (402 curated cases) | `cargo run --release --example conformance_compare` | +| serde-saphyr head-to-head | `cargo run --release --example serde_saphyr_headtohead` | Iteration counts default to 200 for `real_world_benchmark` (`YAML_BENCH_ITERS`) and 20 for `large_input_benchmark` (`YAML_LARGE_BENCH_ITERS`). The @@ -342,6 +410,7 @@ dev-dependency versions (see `Cargo.toml`): | `saphyr` | 0.0.6 | | `saphyr-parser` | 0.0.6 | | `yaml-rust2` | 0.11.0 | +| `serde-saphyr` | 0.0.27 | | `dhat` | 0.3.3 | To reproduce against the exact pinned set, build with the checked-in diff --git a/examples/serde_saphyr_headtohead.rs b/examples/serde_saphyr_headtohead.rs new file mode 100644 index 0000000..e9e830c --- /dev/null +++ b/examples/serde_saphyr_headtohead.rs @@ -0,0 +1,327 @@ +//! Head-to-head benchmark: `saneyaml` vs `serde-saphyr`. +//! +//! Both crates are Serde-based, so the fair comparison feeds the *same bytes* +//! into the *same target type* with each library, isolating the YAML layer. +//! +//! Three axes are measured: +//! 1. Dynamic value (`serde_json::Value`) over the real-world config corpus +//! — the "parse arbitrary YAML into a tree" case. +//! 2. Typed nested `Config` struct over a generated config — serde-saphyr's advertised sweet +//! spot (deserialize straight into structs, no intermediate tree), and saneyaml's primary use case. +//! 3. Typed flat `Vec<Record>` over generated records — serde-saphyr's home-turf shape. +//! +//! Run: +//! cargo run --release --example serde_saphyr_headtohead +//!
YAML_BENCH_ITERS=1000 SVC_COUNT=5000 cargo run --release --example serde_saphyr_headtohead + +use serde::Deserialize; +use std::collections::BTreeMap; +use std::hint::black_box; +use std::path::{Path, PathBuf}; +use std::time::{Duration, Instant}; + +struct Fixture { + name: String, + input: String, + docs: usize, +} + +fn collect_yaml(dir: &Path, out: &mut Vec) { + let Ok(entries) = std::fs::read_dir(dir) else { + return; + }; + let mut paths: Vec = entries.flatten().map(|e| e.path()).collect(); + paths.sort(); + for path in paths { + if path.is_dir() { + collect_yaml(&path, out); + } else if matches!( + path.extension().and_then(|e| e.to_str()), + Some("yaml") | Some("yml") + ) { + out.push(path); + } + } +} + +fn measure u64>(iters: usize, mut run: F) -> (Duration, u64) { + // Warm up so the first allocation/branch-predict pass is not timed. + black_box(run()); + let start = Instant::now(); + let mut acc = 0u64; + for _ in 0..iters { + acc ^= black_box(run()); + } + (start.elapsed(), acc) +} + +fn ns_per_byte(elapsed: Duration, iters: usize, bytes: usize) -> f64 { + elapsed.as_nanos() as f64 / (iters * bytes) as f64 +} + +fn row(label: &str, iters: usize, bytes: usize, elapsed: Duration) { + println!( + "| {label} | {iters} | {bytes} | {:.3} | {:.2} |", + elapsed.as_secs_f64() * 1000.0, + ns_per_byte(elapsed, iters, bytes), + ); +} + +fn table_header() { + println!("| load path | iterations | bytes/iter | elapsed ms | ns/byte |"); + println!("|---|---:|---:|---:|---:|"); +} + +// Both libraries are safe-by-default with resource caps (saneyaml: +// max_collection_items = 16384; serde-saphyr: ~250000 total nodes). The +// generated inputs are sized just under the tighter cap so BOTH run on their +// shipping defaults — no limits are lifted on either side. 
+ +// ---- Typed-struct workload ------------------------------------------------- + +// Fields are populated by Serde during deserialization; the benchmark never +// reads them back, so silence the dead-code lint for this fixture type. +#[allow(dead_code)] +#[derive(Debug, Deserialize)] +struct Service { + name: String, + image: String, + replicas: u32, + enabled: bool, + weight: f64, + ports: Vec, + tags: Vec, + env: BTreeMap, +} + +#[allow(dead_code)] +#[derive(Debug, Deserialize)] +struct Config { + version: String, + services: Vec, +} + +// A flat, homogeneous record — the shape closest to serde-saphyr's own +// published "big file of simple records" benchmark, to check the ratio is not +// an artifact of the nested/BTreeMap shape above. +#[allow(dead_code)] +#[derive(Debug, Deserialize)] +struct Record { + id: u64, + name: String, + active: bool, + score: f64, +} + +fn generate_records(n: usize) -> String { + let mut s = String::with_capacity(n * 64); + for i in 0..n { + s.push_str(&format!( + "- id: {i}\n name: item-{i}\n active: {}\n score: {}.{}\n", + i % 2 == 0, + i % 100, + i % 1000, + )); + } + s +} + +fn generate_typed(service_count: usize) -> String { + let mut s = String::with_capacity(service_count * 220); + s.push_str("version: \"3.8\"\nservices:\n"); + for i in 0..service_count { + s.push_str(&format!( + " - name: service-{i}\n \ + image: registry.example.com/team/app:{i}\n \ + replicas: {}\n \ + enabled: {}\n \ + weight: {}.{}\n \ + ports: [{}, {}]\n \ + tags: [web, prod, region-{}]\n \ + env:\n \ + LOG_LEVEL: info\n \ + PORT: \"{}\"\n \ + INDEX: \"{i}\"\n", + (i % 8) + 1, + i % 2 == 0, + i % 10, + i % 100, + 8000 + (i % 1000), + 9000 + (i % 1000), + i % 16, + 8000 + (i % 1000), + )); + } + s +} + +fn main() { + let iters = std::env::var("YAML_BENCH_ITERS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(500usize); + let large_iters = std::env::var("YAML_LARGE_BENCH_ITERS") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(40usize); + let 
service_count = std::env::var("SVC_COUNT") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(4000usize); + + println!( + "saneyaml {} vs serde-saphyr (see Cargo.lock for resolved version)\n", + env!("CARGO_PKG_VERSION"), + ); + + // ---- Load corpus ------------------------------------------------------- + let root = Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/fixtures/real-world"); + let mut paths = Vec::new(); + collect_yaml(&root, &mut paths); + + let mut fixtures: Vec = Vec::new(); + for path in &paths { + let input = std::fs::read_to_string(path).expect("read fixture"); + // Authoritative document count from saneyaml's parser. + let docs = match saneyaml::parse_documents(&input) { + Ok(d) => d.len(), + Err(_) => continue, + }; + let name = path + .strip_prefix(&root) + .unwrap_or(path) + .to_string_lossy() + .into_owned(); + fixtures.push(Fixture { name, input, docs }); + } + println!( + "Loaded {} fixtures ({} bytes) from {}\n", + fixtures.len(), + fixtures.iter().map(|f| f.input.len()).sum::(), + root.display(), + ); + + // ---- Axis 1: dynamic value (serde_json::Value), single-document -------- + // + // Only fixtures both libraries accept into serde_json::Value are timed, so + // the byte totals are identical for the two rows. Exclusions are reported. 
+ let mut both_ok: Vec<&Fixture> = Vec::new(); + let mut excluded: Vec<(String, String)> = Vec::new(); + for f in fixtures.iter().filter(|f| f.docs == 1) { + let sane = saneyaml::from_str::(&f.input); + let saph = serde_saphyr::from_str::(&f.input); + match (&sane, saph.is_ok()) { + (Ok(_), true) => both_ok.push(f), + (Err(e), true) => excluded.push((f.name.clone(), format!("saneyaml rejected: {e}"))), + (Ok(_), false) => excluded.push((f.name.clone(), "serde-saphyr rejected".into())), + (Err(e), false) => excluded.push((f.name.clone(), format!("both rejected: {e}"))), + } + } + let dyn_bytes: usize = both_ok.iter().map(|f| f.input.len()).sum(); + + println!("## Axis 1 — dynamic value into serde_json::Value (single-doc corpus)\n"); + println!( + "{} single-doc fixtures accepted by both ({} bytes); {} excluded.\n", + both_ok.len(), + dyn_bytes, + excluded.len() + ); + if !excluded.is_empty() { + for (name, why) in &excluded { + println!("- excluded: {name} ({why})"); + } + println!(); + } + + table_header(); + let (e, c) = measure(iters, || { + let mut n = 0u64; + for f in &both_ok { + n += saneyaml::from_str::(&f.input).unwrap().is_object() as u64; + } + n + }); + black_box(c); + row("saneyaml::from_str::", iters, dyn_bytes, e); + + let (e, c) = measure(iters, || { + let mut n = 0u64; + for f in &both_ok { + n += serde_saphyr::from_str::(&f.input) + .unwrap() + .is_object() as u64; + } + n + }); + black_box(c); + row("serde_saphyr::from_str::", iters, dyn_bytes, e); + + // saneyaml native tree, for reference (not a head-to-head row). 
+ let (e, c) = measure(iters, || { + let mut n = 0u64; + for f in &both_ok { + n += saneyaml::parse_documents(&f.input).unwrap().len() as u64; + } + n + }); + black_box(c); + row("saneyaml::parse_documents (native, ref)", iters, dyn_bytes, e); + + // ---- Axis 2: typed struct, generated config ---------------------------- + let typed = generate_typed(service_count); + let typed_bytes = typed.len(); + + // Validate both produce the expected shape before timing. + let sane_cfg: Config = saneyaml::from_str(&typed).expect("saneyaml typed parse"); + let saph_cfg: Config = serde_saphyr::from_str(&typed).expect("serde-saphyr typed parse"); + assert_eq!(sane_cfg.services.len(), service_count); + assert_eq!(saph_cfg.services.len(), service_count); + + println!( + "\n## Axis 2 — typed deserialize into Config (generated, {service_count} services, {typed_bytes} bytes; both on defaults)\n" + ); + table_header(); + let (e, c) = measure(large_iters, || { + let cfg: Config = saneyaml::from_str(&typed).unwrap(); + cfg.services.len() as u64 + }); + black_box(c); + row("saneyaml::from_str::", large_iters, typed_bytes, e); + + let (e, c) = measure(large_iters, || { + let cfg: Config = serde_saphyr::from_str(&typed).unwrap(); + cfg.services.len() as u64 + }); + black_box(c); + row("serde_saphyr::from_str::", large_iters, typed_bytes, e); + + // ---- Axis 3: typed flat records (serde-saphyr's home-turf shape) ------- + let record_count = std::env::var("REC_COUNT") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(15000usize); + let records = generate_records(record_count); + let rec_bytes = records.len(); + let sane_recs: Vec = saneyaml::from_str(&records).expect("saneyaml records"); + let saph_recs: Vec = serde_saphyr::from_str(&records).expect("serde-saphyr records"); + assert_eq!(sane_recs.len(), record_count); + assert_eq!(saph_recs.len(), record_count); + + println!( + "\n## Axis 3 — typed flat Vec (generated, {record_count} records, {rec_bytes} bytes; both on defaults)\n" + ); 
+ table_header(); + let (e, c) = measure(large_iters, || { + let v: Vec = saneyaml::from_str(&records).unwrap(); + v.len() as u64 + }); + black_box(c); + row("saneyaml::from_str::>", large_iters, rec_bytes, e); + + let (e, c) = measure(large_iters, || { + let v: Vec = serde_saphyr::from_str(&records).unwrap(); + v.len() as u64 + }); + black_box(c); + row("serde_saphyr::from_str::>", large_iters, rec_bytes, e); +} From da82f371eaf20fe64229ef9dda08e277ca384c93 Mon Sep 17 00:00:00 2001 From: jskoiz <20649937+jskoiz@users.noreply.github.com> Date: Sat, 6 Jun 2026 09:25:56 -1000 Subject: [PATCH 2/3] Add Spanned to capture source locations of deserialized values Spanned pairs a deserialized value with the source span (byte offsets plus line and column) it was read from, built on the existing node span tree via a private deserializer protocol. It is available on the from_str, from_slice, and from_node read paths and nested struct fields, serializes transparently, and adds no dependencies. The from_value path remains spanless. 
--- docs/PUBLIC_API.txt | 29 ++++ src/de.rs | 36 ++++- src/lib.rs | 2 + src/spanned.rs | 362 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 424 insertions(+), 5 deletions(-) create mode 100644 src/spanned.rs diff --git a/docs/PUBLIC_API.txt b/docs/PUBLIC_API.txt index 56d10d1..3f11030 100644 --- a/docs/PUBLIC_API.txt +++ b/docs/PUBLIC_API.txt @@ -2241,6 +2241,35 @@ pub saneyaml::Span::start: usize impl saneyaml::Span pub fn saneyaml::Span::new(usize, usize, usize, usize) -> Self pub fn saneyaml::Span::point(usize, usize, usize) -> Self +pub struct saneyaml::Spanned +impl saneyaml::Spanned +pub fn saneyaml::Spanned::column(&self) -> usize +pub fn saneyaml::Spanned::end(&self) -> usize +pub fn saneyaml::Spanned::get_mut(&mut self) -> &mut T +pub fn saneyaml::Spanned::get_ref(&self) -> &T +pub fn saneyaml::Spanned::into_inner(self) -> T +pub fn saneyaml::Spanned::line(&self) -> usize +pub fn saneyaml::Spanned::new(saneyaml::Span, T) -> Self +pub fn saneyaml::Spanned::span(&self) -> saneyaml::Span +pub fn saneyaml::Spanned::start(&self) -> usize +impl<'de, T> serde_core::de::Deserialize<'de> for saneyaml::Spanned where T: serde_core::de::Deserialize<'de> +pub fn saneyaml::Spanned::deserialize(D) -> core::result::Result::Error> where D: serde_core::de::Deserializer<'de> +impl core::cmp::Eq for saneyaml::Spanned +impl core::cmp::Ord for saneyaml::Spanned +pub fn saneyaml::Spanned::cmp(&self, &Self) -> core::cmp::Ordering +impl core::cmp::PartialEq for saneyaml::Spanned +pub fn saneyaml::Spanned::eq(&self, &Self) -> bool +impl core::cmp::PartialOrd for saneyaml::Spanned +pub fn saneyaml::Spanned::partial_cmp(&self, &Self) -> core::option::Option +impl core::hash::Hash for saneyaml::Spanned +pub fn saneyaml::Spanned::hash(&self, &mut H) +impl serde_core::ser::Serialize for saneyaml::Spanned +pub fn saneyaml::Spanned::serialize(&self, S) -> core::result::Result<::Ok, ::Error> where S: serde_core::ser::Serializer +impl core::convert::AsRef for 
saneyaml::Spanned +pub fn saneyaml::Spanned::as_ref(&self) -> &T +impl core::ops::deref::Deref for saneyaml::Spanned +pub type saneyaml::Spanned::Target = T +pub fn saneyaml::Spanned::deref(&self) -> &T pub struct saneyaml::Tag pub saneyaml::Tag::handle: alloc::string::String pub saneyaml::Tag::suffix: alloc::string::String diff --git a/src/de.rs b/src/de.rs index 811b92f..41a0646 100644 --- a/src/de.rs +++ b/src/de.rs @@ -1895,13 +1895,16 @@ impl<'de, 'tree> de::Deserializer<'de> for InputNode<'tree, 'de> { fn deserialize_struct( self, - _name: &'static str, + name: &'static str, _fields: &'static [&'static str], visitor: V, ) -> Result where V: Visitor<'de>, { + if name == crate::spanned::NAME { + return crate::spanned::deserialize_spanned(self.node.span, self, visitor); + } self.deserialize_map(visitor) } @@ -2391,13 +2394,16 @@ impl<'de> de::Deserializer<'de> for &'de Node { fn deserialize_struct( self, - _name: &'static str, + name: &'static str, _fields: &'static [&'static str], visitor: V, ) -> Result where V: Visitor<'de>, { + if name == crate::spanned::NAME { + return crate::spanned::deserialize_spanned(self.span, self, visitor); + } self.deserialize_map(visitor) } @@ -2868,13 +2874,17 @@ impl<'de> de::Deserializer<'de> for Node { fn deserialize_struct( self, - _name: &'static str, + name: &'static str, _fields: &'static [&'static str], visitor: V, ) -> Result where V: Visitor<'de>, { + if name == crate::spanned::NAME { + let span = self.span; + return crate::spanned::deserialize_spanned(span, self, visitor); + } self.deserialize_map(visitor) } @@ -3302,13 +3312,21 @@ impl<'de> de::Deserializer<'de> for Value { fn deserialize_struct( self, - _name: &'static str, + name: &'static str, _fields: &'static [&'static str], visitor: V, ) -> Result where V: Visitor<'de>, { + if name == crate::spanned::NAME { + // The Value tree is spanless; report a default (zero) span. 
+ return crate::spanned::deserialize_spanned( + crate::error::Span::default(), + self, + visitor, + ); + } self.deserialize_map(visitor) } @@ -4720,7 +4738,15 @@ impl<'de> de::Deserializer<'de> for &'de Value { where V: Visitor<'de>, { - let _ = (name, fields); + let _ = fields; + if name == crate::spanned::NAME { + // The Value tree is spanless; report a default (zero) span. + return crate::spanned::deserialize_spanned( + crate::error::Span::default(), + self, + visitor, + ); + } self.deserialize_map(visitor) } diff --git a/src/lib.rs b/src/lib.rs index 16b100e..3a2f4cd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,6 +34,7 @@ pub mod lossless; mod parse; mod schema; mod ser; +mod spanned; mod yaml11; /// Serde helper modules matching selected `serde_yaml::with` paths. @@ -102,3 +103,4 @@ pub use schema::{ pub use ser::{ Serializer, to_string, to_string_with_options, to_value, to_writer, to_writer_with_options, }; +pub use spanned::Spanned; diff --git a/src/spanned.rs b/src/spanned.rs new file mode 100644 index 0000000..3cbb663 --- /dev/null +++ b/src/spanned.rs @@ -0,0 +1,362 @@ +//! [`Spanned`] captures the source [`Span`] a value occupied during +//! deserialization, so callers can report *where* a configuration value came +//! from — not only what it was. +//! +//! Error spans only appear on failure; `Spanned` exposes the location of a +//! *successful* read, which is what config linters, language servers, and +//! "this setting came from line N" tooling need. It is built directly on this +//! crate's spanful [`Node`](crate::Node) tree: when a span-bearing deserializer +//! is asked for a private marker struct, it hands back the current node's span +//! alongside the normally deserialized value. No second parse and no retained +//! source buffer are required, because [`Node`](crate::Node) already carries +//! line, column, and byte offsets. +//! +//! ``` +//! use serde::Deserialize; +//! use saneyaml::Spanned; +//! +//! #[derive(Deserialize)] +//! 
struct Config { +//! name: Spanned, +//! } +//! +//! let yaml = "name: api\n"; +//! let config: Config = saneyaml::from_str(yaml)?; +//! let name = config.name; +//! assert_eq!(name.line(), 1); +//! assert_eq!(&yaml[name.start()..name.end()], "api"); +//! assert_eq!(name.into_inner(), "api"); +//! # Ok::<(), saneyaml::Error>(()) +//! ``` +//! +//! Supported on the span-bearing read paths: [`from_str`](crate::from_str), +//! [`from_slice`](crate::from_slice), and [`from_node`](crate::from_node), +//! including nested struct fields. On the spanless [`from_value`](crate::from_value) +//! path the value still deserializes, but the span is [`Span::default`] (line 0). + +use std::cmp::Ordering; +use std::fmt; +use std::hash::{Hash, Hasher}; +use std::marker::PhantomData; +use std::ops::Deref; + +use serde::de::{ + self, Deserialize, DeserializeSeed, Deserializer, IntoDeserializer, SeqAccess, Visitor, +}; +use serde::ser::{Serialize, Serializer}; + +use crate::error::{Error, Span}; + +/// Private marker-struct name a [`Spanned`] read asks for. Spelled so it can +/// never collide with a real Rust type name, mirroring how `toml`/`serde-spanned` +/// smuggle a position request through Serde's type-erased API. +pub(crate) const NAME: &str = "$saneyaml::private::Spanned"; + +/// Field names paired with [`NAME`]; present only to satisfy the +/// `deserialize_struct` contract. The four span components plus the value are +/// produced positionally as a sequence, so the names are never matched. +pub(crate) const FIELDS: &[&str] = &[ + "$saneyaml::private::start", + "$saneyaml::private::end", + "$saneyaml::private::line", + "$saneyaml::private::column", + "$saneyaml::private::value", +]; + +/// A value deserialized from YAML, paired with the source [`Span`] it came from. +/// +/// Equality, ordering, and hashing consider only the inner value, never the +/// span, so wrapping a field in `Spanned` does not change its identity as a +/// mapping key or its comparison behavior. 
+#[derive(Clone, Copy, Debug, Default)] +pub struct Spanned { + span: Span, + value: T, +} + +impl Spanned { + /// Creates a spanned value from an explicit span and value. + pub fn new(span: Span, value: T) -> Self { + Self { span, value } + } + + /// Returns the source span the value occupied. + pub fn span(&self) -> Span { + self.span + } + + /// Returns the zero-based byte offset where the value starts. + pub fn start(&self) -> usize { + self.span.start + } + + /// Returns the zero-based byte offset just past the value. + pub fn end(&self) -> usize { + self.span.end + } + + /// Returns the one-based source line of the value start. + pub fn line(&self) -> usize { + self.span.line + } + + /// Returns the one-based UTF-8 byte column of the value start. + pub fn column(&self) -> usize { + self.span.column + } + + /// Returns a reference to the inner value. + pub fn get_ref(&self) -> &T { + &self.value + } + + /// Returns a mutable reference to the inner value. + pub fn get_mut(&mut self) -> &mut T { + &mut self.value + } + + /// Consumes the wrapper, returning the inner value. 
+ pub fn into_inner(self) -> T { + self.value + } +} + +impl Deref for Spanned { + type Target = T; + + fn deref(&self) -> &T { + &self.value + } +} + +impl AsRef for Spanned { + fn as_ref(&self) -> &T { + &self.value + } +} + +impl PartialEq for Spanned { + fn eq(&self, other: &Self) -> bool { + self.value == other.value + } +} + +impl Eq for Spanned {} + +impl PartialOrd for Spanned { + fn partial_cmp(&self, other: &Self) -> Option { + self.value.partial_cmp(&other.value) + } +} + +impl Ord for Spanned { + fn cmp(&self, other: &Self) -> Ordering { + self.value.cmp(&other.value) + } +} + +impl Hash for Spanned { + fn hash(&self, state: &mut H) { + self.value.hash(state); + } +} + +impl<'de, T> Deserialize<'de> for Spanned +where + T: Deserialize<'de>, +{ + fn deserialize(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + deserializer.deserialize_struct(NAME, FIELDS, SpannedVisitor(PhantomData)) + } +} + +/// Serializes transparently as the inner value; the span is a read-side concern. +impl Serialize for Spanned { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + self.value.serialize(serializer) + } +} + +struct SpannedVisitor(PhantomData); + +impl<'de, T> Visitor<'de> for SpannedVisitor +where + T: Deserialize<'de>, +{ + type Value = Spanned; + + fn expecting(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + formatter.write_str("a spanned YAML value") + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + let start = seq + .next_element::()? + .ok_or_else(|| de::Error::invalid_length(0, &self))?; + let end = seq + .next_element::()? + .ok_or_else(|| de::Error::invalid_length(1, &self))?; + let line = seq + .next_element::()? + .ok_or_else(|| de::Error::invalid_length(2, &self))?; + let column = seq + .next_element::()? + .ok_or_else(|| de::Error::invalid_length(3, &self))?; + let value = seq + .next_element::()? 
+ .ok_or_else(|| de::Error::invalid_length(4, &self))?; + Ok(Spanned { + span: Span::new(start as usize, end as usize, line as usize, column as usize), + value, + }) + } +} + +/// Answers a [`Spanned`] read from a span-bearing deserializer. +/// +/// Each [`Deserializer`](serde::Deserializer) that carries spans calls this from +/// its `deserialize_struct` when it sees [`NAME`], passing the current node span +/// and itself as the value deserializer. +pub(crate) fn deserialize_spanned<'de, D, V>( + span: Span, + value: D, + visitor: V, +) -> Result +where + D: Deserializer<'de, Error = Error>, + V: Visitor<'de>, +{ + visitor.visit_seq(SpannedSeq { + span, + value: Some(value), + index: 0, + _marker: PhantomData, + }) +} + +/// A five-element [`SeqAccess`] yielding `start, end, line, column, value`, +/// where the value comes from the wrapped deserializer `D`. +struct SpannedSeq<'de, D> { + span: Span, + value: Option, + index: u8, + _marker: PhantomData<&'de ()>, +} + +impl<'de, D> SeqAccess<'de> for SpannedSeq<'de, D> +where + D: Deserializer<'de, Error = Error>, +{ + type Error = Error; + + fn next_element_seed(&mut self, seed: S) -> Result, Error> + where + S: DeserializeSeed<'de>, + { + let index = self.index; + self.index = self.index.saturating_add(1); + match index { + 0 => seed + .deserialize(u64_deserializer(self.span.start as u64)) + .map(Some), + 1 => seed + .deserialize(u64_deserializer(self.span.end as u64)) + .map(Some), + 2 => seed + .deserialize(u64_deserializer(self.span.line as u64)) + .map(Some), + 3 => seed + .deserialize(u64_deserializer(self.span.column as u64)) + .map(Some), + 4 => { + let value = self + .value + .take() + .expect("spanned value deserializer is consumed exactly once"); + seed.deserialize(value).map(Some) + } + _ => Ok(None), + } + } + + fn size_hint(&self) -> Option { + Some(usize::from(5u8.saturating_sub(self.index))) + } +} + +fn u64_deserializer(value: u64) -> serde::de::value::U64Deserializer { + value.into_deserializer() 
+} + +#[cfg(test)] +mod tests { + use super::*; + use serde::{Deserialize, Serialize}; + + #[derive(Deserialize, Serialize)] + struct Config { + name: Spanned, + port: Spanned, + } + + #[test] + fn captures_field_spans_from_str() { + let yaml = "name: api\nport: 8080\n"; + let config: Config = crate::from_str(yaml).unwrap(); + + assert_eq!(config.name.get_ref().as_str(), "api"); + assert_eq!(config.name.line(), 1); + assert_eq!(&yaml[config.name.start()..config.name.end()], "api"); + + assert_eq!(*config.port.get_ref(), 8080); + assert_eq!(config.port.line(), 2); + assert_eq!(&yaml[config.port.start()..config.port.end()], "8080"); + } + + #[test] + fn deref_and_into_inner() { + let yaml = "name: web\nport: 80\n"; + let config: Config = crate::from_str(yaml).unwrap(); + // Deref to the inner value. + assert_eq!(config.name.len(), 3); + assert_eq!(config.name.into_inner(), "web"); + } + + #[test] + fn equality_ignores_span() { + let left: Spanned = crate::from_str("a\n").unwrap(); + let right = Spanned::new(Span::new(99, 100, 7, 7), "a".to_string()); + // Different spans, same value: equal. + assert_eq!(left, right); + } + + #[test] + fn serializes_transparently() { + let yaml = "name: api\nport: 8080\n"; + let config: Config = crate::from_str(yaml).unwrap(); + let emitted = crate::to_string(&config).unwrap(); + assert!(emitted.contains("name: api"), "got: {emitted}"); + assert!(emitted.contains("port: 8080"), "got: {emitted}"); + // No span leakage in the emitted form. + assert!(!emitted.contains("private"), "got: {emitted}"); + } + + #[test] + fn from_value_is_spanless_but_reads_value() { + let value = crate::Value::from("api"); + let spanned: Spanned = crate::from_value(value).unwrap(); + assert_eq!(spanned.get_ref().as_str(), "api"); + // Spanless path: default (zero) span. 
+ assert_eq!(spanned.line(), 0); + assert_eq!(spanned.span(), Span::default()); + } +} From ae3585bdc0e47853032e8409613f750b54fbf56c Mon Sep 17 00:00:00 2001 From: jskoiz <20649937+jskoiz@users.noreply.github.com> Date: Sat, 6 Jun 2026 09:26:27 -1000 Subject: [PATCH 3/3] Apply rustfmt to serde-saphyr benchmark example --- examples/serde_saphyr_headtohead.rs | 46 ++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/examples/serde_saphyr_headtohead.rs b/examples/serde_saphyr_headtohead.rs index e9e830c..1aa27d4 100644 --- a/examples/serde_saphyr_headtohead.rs +++ b/examples/serde_saphyr_headtohead.rs @@ -237,12 +237,19 @@ fn main() { let (e, c) = measure(iters, || { let mut n = 0u64; for f in &both_ok { - n += saneyaml::from_str::(&f.input).unwrap().is_object() as u64; + n += saneyaml::from_str::(&f.input) + .unwrap() + .is_object() as u64; } n }); black_box(c); - row("saneyaml::from_str::", iters, dyn_bytes, e); + row( + "saneyaml::from_str::", + iters, + dyn_bytes, + e, + ); let (e, c) = measure(iters, || { let mut n = 0u64; @@ -254,7 +261,12 @@ fn main() { n }); black_box(c); - row("serde_saphyr::from_str::", iters, dyn_bytes, e); + row( + "serde_saphyr::from_str::", + iters, + dyn_bytes, + e, + ); // saneyaml native tree, for reference (not a head-to-head row). 
let (e, c) = measure(iters, || { @@ -265,7 +277,12 @@ fn main() { n }); black_box(c); - row("saneyaml::parse_documents (native, ref)", iters, dyn_bytes, e); + row( + "saneyaml::parse_documents (native, ref)", + iters, + dyn_bytes, + e, + ); // ---- Axis 2: typed struct, generated config ---------------------------- let typed = generate_typed(service_count); @@ -293,7 +310,12 @@ fn main() { cfg.services.len() as u64 }); black_box(c); - row("serde_saphyr::from_str::", large_iters, typed_bytes, e); + row( + "serde_saphyr::from_str::", + large_iters, + typed_bytes, + e, + ); // ---- Axis 3: typed flat records (serde-saphyr's home-turf shape) ------- let record_count = std::env::var("REC_COUNT") @@ -316,12 +338,22 @@ fn main() { v.len() as u64 }); black_box(c); - row("saneyaml::from_str::>", large_iters, rec_bytes, e); + row( + "saneyaml::from_str::>", + large_iters, + rec_bytes, + e, + ); let (e, c) = measure(large_iters, || { let v: Vec = serde_saphyr::from_str(&records).unwrap(); v.len() as u64 }); black_box(c); - row("serde_saphyr::from_str::>", large_iters, rec_bytes, e); + row( + "serde_saphyr::from_str::>", + large_iters, + rec_bytes, + e, + ); }