From 75d6be44c056954cb8295a3cd342ef149be3575a Mon Sep 17 00:00:00 2001 From: Chris Lundquist Date: Tue, 10 Mar 2026 02:21:37 -0700 Subject: [PATCH 1/4] refactor: eliminate double-conversion in SortLZ paths MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `demux_tokens()` taking `&[LzToken]` directly, skip intermediate `Vec` in GPU SortLZ coordinator - Migrate LzSeq SortLZ CPU path from `encode_match_sequence` to `parse_matches` → `encode_from_tokens` - Migrate `encode_optimal` to use `matches_to_tokens` → `encode_from_tokens` - Delete `matches_to_lz77_greedy`, `matches_to_lz77_lazy` (~110 lines) - Delete `encode_match_sequence` (~55 lines) - Replace sortlz LZ77 roundtrip tests with token-based equivalents Co-Authored-By: Claude Opus 4.6 --- src/lzseq/mod.rs | 65 +---------------- src/pipeline/demux.rs | 34 +++++---- src/pipeline/parallel.rs | 9 ++- src/sortlz.rs | 154 ++++----------------------------------- 4 files changed, 45 insertions(+), 217 deletions(-) diff --git a/src/lzseq/mod.rs b/src/lzseq/mod.rs index ed9105f..6b6e00b 100644 --- a/src/lzseq/mod.rs +++ b/src/lzseq/mod.rs @@ -622,66 +622,6 @@ fn emit_match( length_extra_writer.write_bits(lev, leb); } -/// Encode a pre-computed match sequence into LzSeq token streams. -/// -/// Used by `encode_optimal` after the backward DP has selected matches, -/// and by the SortLZ match finder to feed pre-computed matches into the -/// LzSeq 6-stream format. Applies repeat offset encoding. 
-pub fn encode_match_sequence( - _input: &[u8], - matches: &[crate::lz77::Match], - _config: &SeqConfig, -) -> PzResult { - let mut repeats = RepeatOffsets::new(); - let mut flags_vec: Vec = Vec::new(); - let mut literals: Vec = Vec::new(); - let mut offset_codes: Vec = Vec::new(); - let mut length_codes: Vec = Vec::new(); - let mut offset_extra_writer = BitWriter::new(); - let mut length_extra_writer = BitWriter::new(); - - for m in matches { - if m.length == 0 { - // Literal token from optimal parser - flags_vec.push(true); - literals.push(m.next); - } else { - emit_match( - m.offset as u32, - m.length, - &mut repeats, - &mut flags_vec, - &mut offset_codes, - &mut offset_extra_writer, - &mut length_codes, - &mut length_extra_writer, - ); - // Emit the 'next' byte as a separate literal token. - // The DP forward trace produces Match structs where each match covers - // `length` bytes plus a "next" byte. The cost model accounts for the - // next byte in the match cost, but the actual token stream must emit - // the next byte as a literal to avoid data loss. - flags_vec.push(true); - literals.push(m.next); - } - } - - let num_tokens = flags_vec.len() as u32; - let num_matches = offset_codes.len() as u32; - let flags = pack_flags(&flags_vec); - - Ok(SeqEncoded { - flags, - literals, - offset_codes, - offset_extra: offset_extra_writer.finish(), - length_codes, - length_extra: length_extra_writer.finish(), - num_tokens, - num_matches, - }) -} - /// Encode a universal `LzToken` stream into LzSeq's 6-stream format. /// /// Like `encode_match_sequence` but takes `LzToken` directly instead of @@ -774,8 +714,9 @@ pub fn encode_optimal(input: &[u8], config: &SeqConfig) -> PzResult ); let matches = crate::optimal::optimal_parse_lzseq(input, &table)?; - // Encode the optimal match sequence into LzSeq token streams. - encode_match_sequence(input, &matches, config) + // Convert optimal parse output to universal tokens, then encode. 
+ let tokens = crate::lz_token::matches_to_tokens(&matches); + encode_from_tokens(&tokens, config) } /// Compress input using LzSeq with lazy matching and configurable window. diff --git a/src/pipeline/demux.rs b/src/pipeline/demux.rs index 991d5eb..3f889f5 100644 --- a/src/pipeline/demux.rs +++ b/src/pipeline/demux.rs @@ -84,12 +84,8 @@ fn encoder_for_demuxer(demuxer: &LzDemuxer) -> Box { /// Demux pre-computed LZ77 matches into encoder streams for the GPU coordinator. /// -/// Used by the GPU coordinator to demux matches returned from -/// `find_matches_batched()` without re-running match-finding. -/// Converts `Vec<Match>` → `Vec<LzToken>` → encoder.encode(). -/// -/// `input` is the original block data — needed by `Lz77Encoder` to look up -/// trailing literal bytes when consecutive matches appear. +/// Used by the batched LZ77 GPU path (`find_matches_batched`) which returns +/// `Vec<Match>`. Converts Match → LzToken → encoder.encode(). #[cfg(feature = "webgpu")] pub(crate) fn demux_lz77_matches( input: &[u8], @@ -97,9 +93,22 @@ pub(crate) fn demux_lz77_matches( pipeline: super::Pipeline, ) -> PzResult { let tokens = lz_token::matches_to_tokens(&matches); + demux_tokens(input, &tokens, pipeline) +} + +/// Demux a universal token stream into encoder streams. +/// +/// Used by GPU SortLZ coordinator and LzSeq SortLZ CPU path, which produce +/// `Vec<LzToken>` directly via `sortlz::parse_matches()`. +#[cfg(feature = "webgpu")] +pub(crate) fn demux_tokens( + input: &[u8], + tokens: &[lz_token::LzToken], + pipeline: super::Pipeline, +) -> PzResult { let demuxer = demuxer_for_pipeline(pipeline).ok_or(PzError::InvalidInput)?; let encoder = encoder_for_demuxer(&demuxer); - Ok(encoder.encode(input, &tokens)?.into()) + Ok(encoder.encode(input, tokens)?.into()) } impl StreamDemuxer for LzDemuxer { @@ -167,8 +176,8 @@ impl StreamDemuxer for LzDemuxer { }) } LzDemuxer::LzSeq => { - // SortLZ match finder path: use sort-based matches converted - // to LzSeq's 6-stream format via encode_match_sequence. 
+ // SortLZ match finder path: sort-based matches → LzToken → + // LzSeq 6-stream format via encode_from_tokens. // Prefers GPU radix sort when available for large inputs. if options.match_finder == super::MatchFinder::SortLz { let window = options @@ -195,10 +204,9 @@ impl StreamDemuxer for LzDemuxer { #[cfg(not(feature = "webgpu"))] let raw_matches = crate::sortlz::find_matches(input, &config); - let lz_matches = crate::sortlz::matches_to_lz77_lazy(input, &raw_matches); - let enc = lzseq::encode_match_sequence( - input, - &lz_matches, + let tokens = crate::sortlz::parse_matches(input, &raw_matches, true); + let enc = lzseq::encode_from_tokens( + &tokens, &lzseq::SeqConfig { max_window: window, ..lzseq::SeqConfig::default() diff --git a/src/pipeline/parallel.rs b/src/pipeline/parallel.rs index f7956ff..5b14410 100644 --- a/src/pipeline/parallel.rs +++ b/src/pipeline/parallel.rs @@ -543,7 +543,7 @@ fn compress_parallel_unified( // Process Stage 0 batch last to avoid starving queued StageN/Fused // continuations when bursts arrive together. 
if !stage0_batch.is_empty() && uses_sortlz_match_finder { - // SortLZ GPU match finding: per-block dispatch with LZ77 conversion + // SortLZ GPU match finding: per-block dispatch → parse_matches → demux_tokens let sortlz_config = crate::sortlz::SortLzConfig::for_lz77( opts.max_match_len.unwrap_or(crate::lz77::LZ77_MAX_MATCH), ); @@ -552,13 +552,14 @@ fn compress_parallel_unified( let result = engine .sortlz_find_matches(blocks[block_idx], &sortlz_config) .and_then(|raw_matches| { - let lz_matches = crate::sortlz::matches_to_lz77_lazy( + let tokens = crate::sortlz::parse_matches( blocks[block_idx], &raw_matches, + true, // lazy parsing ); - let demux = super::demux::demux_lz77_matches( + let demux = super::demux::demux_tokens( blocks[block_idx], - lz_matches, + &tokens, pipeline, )?; Ok(StageBlock { diff --git a/src/sortlz.rs b/src/sortlz.rs index 82ce566..17d5c69 100644 --- a/src/sortlz.rs +++ b/src/sortlz.rs @@ -364,128 +364,6 @@ pub(crate) fn parse_matches( // SortLZ → LZ77 Match conversion (for feeding into LZ77 pipelines) // --------------------------------------------------------------------------- -/// Convert position-indexed matches to an LZ77 match sequence using greedy parsing. -/// -/// Takes the longest match at every position, emitting `lz77::Match` structs -/// with the `next` byte. Produces the same format as `lz77::compress_lazy_to_matches()`. 
-pub fn matches_to_lz77_greedy( - input: &[u8], - matches: &[Option<(u16, u16)>], -) -> Vec { - let n = input.len(); - let mut result = Vec::with_capacity(n / 4); - let mut pos = 0; - - while pos < n { - if let Some((offset, length)) = matches.get(pos).copied().flatten() { - let end = pos + length as usize; - if end < n { - result.push(crate::lz77::Match { - offset, - length, - next: input[end], - }); - pos = end + 1; - } else { - // Match extends to or past end of input; truncate to leave room for next byte - let adj_len = (n - 1 - pos) as u16; - if adj_len >= crate::lz77::MIN_MATCH { - result.push(crate::lz77::Match { - offset, - length: adj_len, - next: input[pos + adj_len as usize], - }); - pos = pos + adj_len as usize + 1; - } else { - result.push(crate::lz77::Match { - offset: 0, - length: 0, - next: input[pos], - }); - pos += 1; - } - } - } else { - result.push(crate::lz77::Match { - offset: 0, - length: 0, - next: input[pos], - }); - pos += 1; - } - } - - result -} - -/// Convert position-indexed matches to an LZ77 match sequence using lazy parsing. -/// -/// If the next position has a longer match, emits a literal for the current -/// position and takes the longer match instead (gzip-style lazy evaluation). 
-pub fn matches_to_lz77_lazy( - input: &[u8], - matches: &[Option<(u16, u16)>], -) -> Vec { - let n = input.len(); - let mut result = Vec::with_capacity(n / 4); - let mut pos = 0; - - while pos < n { - if let Some((offset, length)) = matches.get(pos).copied().flatten() { - // Lazy check: if next position has a longer match, emit literal here - if pos + 1 < n { - if let Some((_, next_len)) = matches.get(pos + 1).copied().flatten() { - if next_len > length { - result.push(crate::lz77::Match { - offset: 0, - length: 0, - next: input[pos], - }); - pos += 1; - continue; - } - } - } - - let end = pos + length as usize; - if end < n { - result.push(crate::lz77::Match { - offset, - length, - next: input[end], - }); - pos = end + 1; - } else { - let adj_len = (n - 1 - pos) as u16; - if adj_len >= crate::lz77::MIN_MATCH { - result.push(crate::lz77::Match { - offset, - length: adj_len, - next: input[pos + adj_len as usize], - }); - pos = pos + adj_len as usize + 1; - } else { - result.push(crate::lz77::Match { - offset: 0, - length: 0, - next: input[pos], - }); - pos += 1; - } - } - } else { - result.push(crate::lz77::Match { - offset: 0, - length: 0, - next: input[pos], - }); - pos += 1; - } - } - - result -} - /// Compress using the SortLZ pipeline. 
/// /// Wire format (v2 — LzSeq-encoded streams + FSE): @@ -749,33 +627,33 @@ mod tests { } #[test] - fn test_lz77_greedy_roundtrip() { + fn test_parse_matches_greedy_roundtrip() { let input = test_data(); let config = SortLzConfig::for_lz77(crate::lz77::LZ77_MAX_MATCH); let matches = find_matches(&input, &config); - let lz_matches = matches_to_lz77_greedy(&input, &matches); - - // Verify matches reconstruct the input via LZ77 decompress - let mut lz_bytes = Vec::new(); - for m in &lz_matches { - lz_bytes.extend_from_slice(&m.to_bytes()); - } - let decoded = crate::lz77::decompress(&lz_bytes).unwrap(); + let tokens = parse_matches(&input, &matches, false); // greedy + + // Verify tokens reconstruct the input via Lz77Encoder round-trip. + let encoder = crate::lz_token::Lz77Encoder; + let encoded = encoder.encode(&input, &tokens).unwrap(); + let decoded = encoder + .decode(encoded.streams, &encoded.meta, input.len()) + .unwrap(); assert_eq!(decoded, input); } #[test] - fn test_lz77_lazy_roundtrip() { + fn test_parse_matches_lazy_roundtrip() { let input = test_data(); let config = SortLzConfig::for_lz77(crate::lz77::LZ77_MAX_MATCH); let matches = find_matches(&input, &config); - let lz_matches = matches_to_lz77_lazy(&input, &matches); + let tokens = parse_matches(&input, &matches, true); // lazy - let mut lz_bytes = Vec::new(); - for m in &lz_matches { - lz_bytes.extend_from_slice(&m.to_bytes()); - } - let decoded = crate::lz77::decompress(&lz_bytes).unwrap(); + let encoder = crate::lz_token::Lz77Encoder; + let encoded = encoder.encode(&input, &tokens).unwrap(); + let decoded = encoder + .decode(encoded.streams, &encoded.meta, input.len()) + .unwrap(); assert_eq!(decoded, input); } From 502adf34d8cd665248bbe909c0d8960e0c84f6da Mon Sep 17 00:00:00 2001 From: Chris Lundquist Date: Tue, 10 Mar 2026 02:25:42 -0700 Subject: [PATCH 2/4] refactor: unify LzSeq Optimal+SortLz through tokenize() Route SortLz and Optimal parse strategies through the shared tokenize() entry point 
for LzSeq pipelines. This gives LzSeqR/LzSeqH access to GPU match finding and unified parse strategy dispatch for these modes. Keep encode_with_config() for the default lazy/greedy CPU path (tuned adaptive chain depth, hash4 prefix, repeat-offset-aware matching). Co-Authored-By: Claude Opus 4.6 --- src/pipeline/demux.rs | 53 +++++++++++++------------------------------ 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/src/pipeline/demux.rs b/src/pipeline/demux.rs index 3f889f5..1bf157d 100644 --- a/src/pipeline/demux.rs +++ b/src/pipeline/demux.rs @@ -176,46 +176,27 @@ impl StreamDemuxer for LzDemuxer { }) } LzDemuxer::LzSeq => { - // SortLZ match finder path: sort-based matches → LzToken → - // LzSeq 6-stream format via encode_from_tokens. - // Prefers GPU radix sort when available for large inputs. - if options.match_finder == super::MatchFinder::SortLz { - let window = options - .seq_window_size - .unwrap_or_else(|| lzseq::SeqConfig::default().max_window); - let config = crate::sortlz::SortLzConfig { - max_window: window, - ..crate::sortlz::SortLzConfig::default() - }; - - // GPU path for SortLZ match finding - #[cfg(feature = "webgpu")] - let raw_matches = if let Some(ref engine) = options.webgpu_engine { - if input.len() >= crate::webgpu::MIN_GPU_INPUT_SIZE - && input.len() <= engine.max_dispatch_input_size() - { - engine.sortlz_find_matches(input, &config)? - } else { - crate::sortlz::find_matches(input, &config) - } - } else { - crate::sortlz::find_matches(input, &config) - }; - #[cfg(not(feature = "webgpu"))] - let raw_matches = crate::sortlz::find_matches(input, &config); - - let tokens = crate::sortlz::parse_matches(input, &raw_matches, true); + // SortLz and Optimal strategies route through the shared + // tokenize() entry point, which handles GPU/CPU dispatch, + // SortLz match finding, and optimal parsing uniformly. 
+ let use_tokenize = options.match_finder == super::MatchFinder::SortLz + || options.parse_strategy == ParseStrategy::Optimal; + + if use_tokenize { + let tokens = super::tokenize(input, options)?; + let defaults = lzseq::SeqConfig::default(); let enc = lzseq::encode_from_tokens( &tokens, &lzseq::SeqConfig { - max_window: window, - ..lzseq::SeqConfig::default() + max_window: options.seq_window_size.unwrap_or(defaults.max_window), + max_match_len: options.max_match_len.unwrap_or(defaults.max_match_len), + ..defaults }, )?; return Ok(seq_encoded_to_demux(enc)); } - // GPU path: match finding + demux on-device + // GPU path: fused match finding + demux on-device #[cfg(feature = "webgpu")] if let Backend::WebGpu = options.backend { if let Some(ref engine) = options.webgpu_engine { @@ -228,17 +209,15 @@ impl StreamDemuxer for LzDemuxer { } } - // CPU path + // Default CPU path: encode_with_config (tuned lazy matching + // with repeat-offset awareness, adaptive chain depth, hash4). let defaults = lzseq::SeqConfig::default(); let config = lzseq::SeqConfig { max_window: options.seq_window_size.unwrap_or(defaults.max_window), max_match_len: options.max_match_len.unwrap_or(defaults.max_match_len), ..defaults }; - let enc = match options.parse_strategy { - ParseStrategy::Optimal => lzseq::encode_optimal(input, &config)?, - _ => lzseq::encode_with_config(input, &config)?, - }; + let enc = lzseq::encode_with_config(input, &config)?; Ok(seq_encoded_to_demux(enc)) } } From 241cf450f8ccf942ad2a98726ea6c42effdfcc21 Mon Sep 17 00:00:00 2001 From: Chris Lundquist Date: Tue, 10 Mar 2026 02:38:43 -0700 Subject: [PATCH 3/4] docs: update ARCHITECTURE.md and add wire-formats.md Update multi-stream section to reflect TokenEncoder architecture (LzSeqEncoder 6-stream, LzssEncoder 4-stream). Add "Active architecture" section documenting GPU vs CPU design decisions. Add comprehensive wire format reference covering container V2, per-block framing, entropy coders, and pipeline ID table. 
Co-Authored-By: Claude Opus 4.6 --- ARCHITECTURE.md | 103 ++++++------- .../PLAN-interleaved-rans.md | 0 .../PLAN-p0a-gpu-rans-vertical-slice.md | 0 .../PLAN-unified-scheduler-north-star.md | 0 .../agent-harness-implementation.md | 0 docs/wire-formats.md | 140 ++++++++++++++++++ 6 files changed, 187 insertions(+), 56 deletions(-) rename docs/exec-plans/{active => completed}/PLAN-interleaved-rans.md (100%) rename docs/exec-plans/{active => completed}/PLAN-p0a-gpu-rans-vertical-slice.md (100%) rename docs/exec-plans/{active => completed}/PLAN-unified-scheduler-north-star.md (100%) rename docs/exec-plans/{active => completed}/agent-harness-implementation.md (100%) create mode 100644 docs/wire-formats.md diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 4841764..4d95809 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -25,59 +25,32 @@ For day-to-day development instructions, see `CLAUDE.md`. ## Multi-stream entropy coding -The Lzf and LzSeqR pipelines use **multi-stream entropy coding** to improve -compression ratio by separating LZ77 output into independent byte streams with -tighter symbol distributions. Instead of feeding one mixed stream to the entropy -coder, the encoder deinterleaves tokens into three streams: +All LZ-based pipelines use **multi-stream entropy coding**: the match finder +produces a universal `LzToken` stream (literals + matches), and a pluggable +`TokenEncoder` splits it into independent byte streams with tighter symbol +distributions. Each stream gets its own entropy coder, yielding lower +per-stream entropy than a single combined stream. 
-| Stream | Contents | Why it helps | -|--------|----------|-------------| -| **Offsets** | High bytes of match offsets (offset >> 8) | Offsets cluster in a narrow range; dedicated Huffman/RC table exploits this | -| **Lengths** | Match lengths (capped to u8) | Length distribution is highly skewed (short matches dominate) | -| **Literals** | Literal bytes + low offset bytes + next bytes | Natural-language / binary byte distribution | +### Wire encoders (`src/lz_token.rs`) -Each stream gets its own FSE table (Lzf) or rANS context (LzSeqR), -yielding lower per-stream entropy than a single combined stream. +| Encoder | Streams | Used by | Contents | +|---------|---------|---------|----------| +| `LzSeqEncoder` | 6 | Lzf, LzSeqR, LzSeqH, SortLz | flags, literals, offset_codes, offset_extra, length_codes, length_extra | +| `LzssEncoder` | 4 | Lzfi, LzssR | flags (1-bit per token), literals, offsets (u16 LE), lengths (u16 LE) | -### Encoding format +`LzSeqEncoder` uses log2-coded offsets and lengths with repeat offset tracking, +achieving the best ratio (~32% on Canterbury+large). `LzssEncoder` uses raw u16 +values with flag bits, trading ratio for simplicity. -Multi-stream data is stored with a `0x02` stream-format flag in the container -header, followed by three length-prefixed compressed sub-streams: +### Architecture ``` -[stream_format: u8 = 0x02] -[offsets_len: u32 LE] [offsets compressed data...] -[lengths_len: u32 LE] [lengths compressed data...] -[literals_len: u32 LE] [literals compressed data...] +Input → tokenize() → Vec<LzToken> → TokenEncoder::encode() → EncodedStreams → entropy coding ``` -The decoder reads the flag, decompresses each sub-stream independently, then -reinterleaves them back into the original LZ77 token sequence. Single-stream -format (`0x01`) is used as fallback for small inputs (< 256 bytes) or when -multi-stream produces larger output. 
- -### Benchmark results - -Comparison on Canterbury + Large corpus (14 files, 13.3 MB total), averaged over -3 iterations. "Before" = single-stream, "After" = multi-stream: - -**Compression (size and throughput):** - -| Pipeline | Before (bytes) | After (bytes) | Size delta | Throughput delta | -|----------|---------------|--------------|------------|-----------------| -| Lzf | 6,199,044 | 5,107,601 | **-17.6%** | +2.8% faster | - -**Decompression throughput:** - -| Pipeline | Throughput delta | -|----------|-----------------| -| Lzf | **+2.4%** faster | - -Multi-stream is a pure win: better compression **and** faster speed. The largest -gains are on big files (E.coli: -21% size, +11% decode throughput; bible.txt: --14% size, +16% decode throughput). Small files (< 4 KB) may see slight -expansion due to the overhead of three separate stream headers — the encoder -automatically falls back to single-stream when multi-stream would be larger. +Match finding and parsing (`tokenize()` in `src/pipeline/mod.rs`) are decoupled +from wire encoding. The `tokenize()` entry point handles GPU/CPU dispatch, +SortLz vs HashChain match finding, and parse strategy (greedy/lazy/optimal). ## rANS entropy coder @@ -163,21 +136,20 @@ match verification. Zero atomics, fully deterministic — ideal for GPU executio When used as a `MatchFinder`, SortLZ is transparent to the wire format — the output is 100% compatible with the host pipeline (Lzf, LzSeqR, LzSeqH). The consumer and decompressor see no difference. -### Pipeline::SortLz wire format (per block) +### Pipeline::SortLz wire format v2 (per block) ``` -[num_tokens: u32 LE] total token count (literals + matches) -[num_literals: u32 LE] literal count -[flags_len: u32 LE] ceil(num_tokens / 8) -[flags: flags_len bytes] bitfield (1 = literal, 0 = match, MSB-first) -[fse_lit_len: u32 LE] [fse_literals: ...] FSE-encoded literal bytes -[fse_off_len: u32 LE] [fse_offsets: ...] 
FSE-encoded u16 LE offsets -[fse_len_len: u32 LE] [fse_lengths: ...] FSE-encoded u16 LE lengths +[meta_len: u16 LE] LzSeq metadata length +[meta: meta_len bytes] num_tokens + num_matches (u32 each) +[num_streams: u8] number of streams (6 for LzSeq) +per stream: + [orig_len: u32 LE] uncompressed stream length + [fse_len: u32 LE] FSE-compressed length + [fse_data: fse_len bytes] ``` -This is NOT wire-compatible with any other pipeline. It uses FSE entropy coding -on three raw byte streams (literals, offsets as u16 LE, lengths as u16 LE), -with a bitfield flag stream to interleave them during decompression. +SortLz uses `LzSeqEncoder` for wire encoding + FSE for entropy. The 6 streams +are: flags, literals, offset_codes, offset_extra, length_codes, length_extra. ### Algorithm @@ -280,6 +252,25 @@ GPU path. See `docs/design-docs/gpu-strategy.md` for full analysis and `CLAUDE.md` "Known dead ends" for the complete list of GPU optimization attempts that failed. +## Active architecture: what GPU does / doesn't do + +### GPU-accelerated paths (shipping) +- **LZ77 match finding** — GPU hash-table kernel for Lzf/LzSeq pipelines (2x faster at 256KB+) +- **SortLZ match finding** — GPU radix sort + match verification (10.6x faster at 4MB) +- **BWT suffix array** — GPU radix sort with prefix-doubling +- **Interleaved FSE** — GPU-accelerated encode/decode for Lzfi pipeline +- **rANS Recoil decode** — GPU-accelerated parallel rANS decode using split-point metadata + +### CPU-only paths (by design) +- **All entropy coding in the streaming CLI** — `streaming::compress_stream` uses CPU rANS/FSE +- **rANS/FSE/Huffman encode/decode** — serial state machines, GPU is 0.54-0.77x CPU speed +- **LZ77 lazy evaluation** — sequential dependency (next match depends on current) +- **LzSeq `encode_with_config`** — repeat offset tracking, adaptive chain depth, hash4 prefix + +### Threshold gates +- `GPU_ENTROPY_THRESHOLD` (256KB) > `DEFAULT_GPU_BLOCK_SIZE` (128KB) — prevents entropy routing to GPU 
+- `MIN_GPU_INPUT_SIZE` — minimum block size for GPU dispatch (avoids setup overhead) + ## Remaining GPU bottlenecks 1. **GPU BWT still slower than CPU SA-IS** — Radix sort improved 7-14x over bitonic diff --git a/docs/exec-plans/active/PLAN-interleaved-rans.md b/docs/exec-plans/completed/PLAN-interleaved-rans.md similarity index 100% rename from docs/exec-plans/active/PLAN-interleaved-rans.md rename to docs/exec-plans/completed/PLAN-interleaved-rans.md diff --git a/docs/exec-plans/active/PLAN-p0a-gpu-rans-vertical-slice.md b/docs/exec-plans/completed/PLAN-p0a-gpu-rans-vertical-slice.md similarity index 100% rename from docs/exec-plans/active/PLAN-p0a-gpu-rans-vertical-slice.md rename to docs/exec-plans/completed/PLAN-p0a-gpu-rans-vertical-slice.md diff --git a/docs/exec-plans/active/PLAN-unified-scheduler-north-star.md b/docs/exec-plans/completed/PLAN-unified-scheduler-north-star.md similarity index 100% rename from docs/exec-plans/active/PLAN-unified-scheduler-north-star.md rename to docs/exec-plans/completed/PLAN-unified-scheduler-north-star.md diff --git a/docs/exec-plans/active/agent-harness-implementation.md b/docs/exec-plans/completed/agent-harness-implementation.md similarity index 100% rename from docs/exec-plans/active/agent-harness-implementation.md rename to docs/exec-plans/completed/agent-harness-implementation.md diff --git a/docs/wire-formats.md b/docs/wire-formats.md new file mode 100644 index 0000000..76bc0d6 --- /dev/null +++ b/docs/wire-formats.md @@ -0,0 +1,140 @@ +# Wire Formats Reference + +Pre-release format (pre-1.0). All formats subject to change. 
+ +## Container Format (V2, multi-block) + +``` +[magic: 2 bytes = "PZ"] +[version: u8 = 2] +[pipeline_id: u8] see Pipeline ID Table below +[original_len: u32 LE] total uncompressed size +[num_blocks: u32 LE] number of blocks +Block table (num_blocks entries): + [compressed_len: u32 LE] + [original_len: u32 LE] +Block data: concatenated compressed block bytes +``` + +**Framed (streaming) mode:** `num_blocks = 0xFFFFFFFF`, blocks are +length-delimited pairs `[compressed_len: u32][original_len: u32][data]` +terminated by `compressed_len = 0`. + +## Per-Block: Multi-stream Entropy Container + +Each block stores its pre-entropy streams in a multi-stream container: + +``` +[num_streams: u8] +[pre_entropy_len: u32 LE] total pre-entropy byte count (for metadata) +[meta_len: u16 LE] +[meta: meta_len bytes] encoder-specific metadata (round-trips through entropy) +Per-stream framing (depends on entropy coder): + [orig_len: u32 LE] uncompressed stream length + [compressed_len: u32 LE] compressed length (high bits may carry flags) + [payload: compressed_len bytes] +``` + +### Per-stream flags (rANS pipelines) + +The `compressed_len` field's high bits carry variant flags: + +| Bit | Flag | Meaning | +|-----|------|---------| +| 31 | `RANS_INTERLEAVED_FLAG` | N-way interleaved rANS payload | +| 30 | `RANS_RECOIL_FLAG` | Recoil split-point metadata appended | +| 29 | `RANS_SHARED_STREAM_FLAG` | Shared-stream rANS (ryg_rans-style) | + +## Pre-entropy Stream Formats (TokenEncoder) + +### LzSeqEncoder (6 streams) + +Used by: **Lzf**, **LzSeqR**, **LzSeqH**, **SortLz** (as MatchFinder) + +Log2-coded offsets/lengths with repeat offset tracking. Best ratio. 
+ +| Stream | Contents | +|--------|----------| +| flags | Packed bits MSB-first (1=literal, 0=match) | +| literals | u8 per literal token | +| offset_codes | u8 per match (0-2 = repeat offset, 3+ = literal offset code) | +| offset_extra | LSB-first packed bitstream (extra bits per offset code) | +| length_codes | u8 per match | +| length_extra | LSB-first packed bitstream (extra bits per length code) | + +**Meta** (8 bytes): `[num_tokens: u32 LE][num_matches: u32 LE]` + +### LzssEncoder (4 streams) + +Used by: **Lzfi**, **LzssR** + +Flag bits + raw u16 offsets/lengths. + +| Stream | Contents | +|--------|----------| +| flags | Packed bits MSB-first (1=literal, 0=match) | +| literals | u8 per literal token | +| offsets | u16 LE per match | +| lengths | u16 LE per match | + +**Meta** (4 bytes): `[num_tokens: u32 LE]` + +## Entropy Coders + +### FSE (Finite State Entropy) + +Used by: Lzf (stage 1), Lzfi (interleaved), Bw (stage 3), SortLz + +Per-stream: `[orig_len: u32 LE][compressed_len: u32 LE][fse_data]` + +### rANS (range ANS) + +Used by: LzSeqR, LzssR + +Single-stream format: +``` +[scale_bits: u8] [freq_table: 256 x u16 LE] [final_state: u32 LE] +[num_words: u32 LE] [words: num_words x u16 LE] +``` + +Interleaved N-way format: +``` +[scale_bits: u8] [freq_table: 256 x u16 LE] [num_states: u8] +[final_states: N x u32 LE] [num_words: N x u32 LE] +[stream_0_words] [stream_1_words] ... 
[stream_N-1_words] +``` + +### Huffman + +Used by: LzSeqH + +Per-stream: `[data_len: u32 LE][total_bits: u32 LE][freq_table: 256 x u32 LE][data]` + +## SortLz Standalone Wire Format (v2) + +Pipeline::SortLz (ID 10) uses its own framing with `LzSeqEncoder` + FSE: + +``` +[meta_len: u16 LE] +[meta: meta_len bytes] LzSeq metadata (num_tokens + num_matches) +[num_streams: u8] 6 (LzSeq streams) +Per stream: + [orig_len: u32 LE] uncompressed stream length + [fse_len: u32 LE] FSE-compressed length + [fse_data: fse_len bytes] +``` + +## Pipeline ID Table + +| ID | Pipeline | Pre-entropy | Entropy | Streams | +|----|----------|-------------|---------|---------| +| 1 | Bw | BWT+MTF+RLE | FSE | 1 | +| 2 | Bbw | BBWT+MTF+RLE | FSE | 1 | +| 4 | Lzf | LzSeqEncoder | FSE | 6 | +| 5 | Lzfi | LzssEncoder | Interleaved FSE | 4 | +| 6 | LzssR | LzssEncoder | rANS | 4 | +| 8 | LzSeqR | LzSeqEncoder | rANS | 6 | +| 9 | LzSeqH | LzSeqEncoder | Huffman | 6 | +| 10 | SortLz | LzSeqEncoder (own framing) | FSE | 6 | + +**Retired IDs:** 0 (Deflate), 3 (Lzr), 7 (Lz78R) From f7a077d18ed616da51ad8b527ac8aa1cf0083e3f Mon Sep 17 00:00:00 2001 From: Chris Lundquist Date: Tue, 10 Mar 2026 02:41:28 -0700 Subject: [PATCH 4/4] docs: add investigation TODOs and clean up exec plan index Add TODO-gpu-rans-6stream-bug.md (GPU rANS fails with 6-stream LzSeqR) and TODO-benchmark-lzfi-vs-lzssr.md (consolidation candidate). Move 4 stale/closed plans to completed/ and reorganize index with Investigation TODOs section. 
Co-Authored-By: Claude Opus 4.6 --- .../active/TODO-benchmark-lzfi-vs-lzssr.md | 39 +++++++++++++++++++ .../active/TODO-gpu-rans-6stream-bug.md | 38 ++++++++++++++++++ docs/exec-plans/active/index.md | 24 +++++------- 3 files changed, 87 insertions(+), 14 deletions(-) create mode 100644 docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md create mode 100644 docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md diff --git a/docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md b/docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md new file mode 100644 index 0000000..c0957d9 --- /dev/null +++ b/docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md @@ -0,0 +1,39 @@ +# TODO: Benchmark Lzfi vs LzssR — consolidation candidate + +## Question + +Are both Lzfi and LzssR worth keeping? They use the same demuxer (LZSS, 4 +streams) and differ only in entropy coder (interleaved FSE vs rANS). + +## Current state + +| Property | Lzfi | LzssR | +|----------|------|-------| +| Demuxer | LzssEncoder (4 streams) | LzssEncoder (4 streams) | +| Entropy | Interleaved FSE | rANS | +| Auto-selected | Yes (high entropy + matches) | Never | +| Pipeline ID | 5 | 6 | +| GPU entropy | Yes (interleaved FSE) | Yes (rANS Recoil) | + +## Known data + +- FSE decode is ~2.2x faster than rANS decode (596 vs 266 MB/s, Criterion) +- FSE encode is comparable to rANS encode (~357 vs 359 MB/s) +- Lzfi auto-selected when: match_density > 0.4 + byte_entropy > 6.0, + or match_density > 0.2 + byte_entropy > 5.0 +- LzssR is only exercised via trial compression or explicit user selection + +## Action items + +1. Run `./scripts/bench.sh` comparing Lzfi vs LzssR on Canterbury+Silesia corpus +2. Run Criterion benchmarks: `cargo bench -- lzfi lzssr` for per-stage timing +3. If LzssR shows no ratio or throughput advantage over Lzfi, consider removing + it to reduce pipeline surface area (similar to Lzr removal) +4. 
If rANS interleaved or Recoil decode gives LzssR better GPU decode throughput, + document the use case and keep it + +## Files + +- `src/pipeline/stages.rs` — stage dispatch for both pipelines +- `src/pipeline/mod.rs` — `auto_select_pipeline`, `select_pipeline_trial` +- `src/pipeline/blocks.rs` — entropy encode/decode dispatch diff --git a/docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md b/docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md new file mode 100644 index 0000000..b5a1d6d --- /dev/null +++ b/docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md @@ -0,0 +1,38 @@ +# TODO: GPU rANS interleaved decode fails with 6-stream LzSeqR + +## Problem + +GPU rANS interleaved decode works correctly for 4-stream pipelines (LzssR) +but fails for 6-stream pipelines (LzSeqR). CPU rANS interleaved decode +works correctly for both 4 and 6 streams. + +## Evidence + +- `test_gpu_rans_interleaved_decode_round_trip` originally used `Pipeline::Lzr` + (3 streams). After Lzr removal, switching to `Pipeline::LzSeqR` (6 streams) + caused the test to fail with `InvalidInput`. +- Switching to `Pipeline::LzssR` (4 streams) passes. +- CPU rANS interleaved encode/decode with LzSeqR works fine. +- The rANS encode/decode code in `src/pipeline/stages.rs` is stream-count + agnostic — each stream is encoded/decoded independently. + +## Workaround + +Test uses `Pipeline::LzssR` (4-stream) instead of `Pipeline::LzSeqR` (6-stream). +See `src/pipeline/tests.rs:test_gpu_rans_interleaved_decode_round_trip`. + +## Investigation directions + +- LzSeq's `offset_extra` and `length_extra` streams can be very small or empty. + GPU buffer sizing or dispatch dimensions may misbehave with near-zero streams. +- Check if the GPU rANS decode path (`stage_rans_decode_webgpu`) has alignment + assumptions that break with 6 streams. +- Compare the per-stream byte sizes between LzssR (4 streams, all non-trivial) + and LzSeqR (6 streams, some potentially empty) to find the divergence point. 
+- Test with synthetic 6-stream data where all streams are non-trivially sized. + +## Files + +- `src/pipeline/stages.rs` — `stage_rans_decode_webgpu`, `stage_rans_encode_with_options` +- `src/pipeline/tests.rs` — `test_gpu_rans_interleaved_decode_round_trip` +- `src/webgpu/rans.rs` — GPU rANS implementation diff --git a/docs/exec-plans/active/index.md b/docs/exec-plans/active/index.md index 4571b6c..2626192 100644 --- a/docs/exec-plans/active/index.md +++ b/docs/exec-plans/active/index.md @@ -1,6 +1,6 @@ # Active Execution Plans -**Last Updated:** 2026-03-09 +**Last Updated:** 2026-03-10 ## Active Plans @@ -10,27 +10,23 @@ ### [PLAN-unified-scheduler-perf-validation.md](PLAN-unified-scheduler-perf-validation.md) **Status:** In Progress (Phases 0-1 landed; local baseline captured; Phase 2 optimization started) | **Priority:** P0 -## Parked Plans +## Investigation TODOs -### [PLAN-interleaved-rans.md](PLAN-interleaved-rans.md) -**Status:** PARKED — Phase A merged (PR #91); Phase D cancelled (GPU rANS dead end); Phases B–C need new owner | **Priority:** P1 +### [TODO-gpu-rans-6stream-bug.md](TODO-gpu-rans-6stream-bug.md) +**Status:** Open — GPU rANS interleaved decode fails with 6-stream LzSeqR; works with 4-stream LzssR | **Priority:** P1 -### [PLAN-unified-scheduler-north-star.md](PLAN-unified-scheduler-north-star.md) -**Status:** PARKED — Phases 3–4 done and in production; Phases 2+5 blocked indefinitely (GPU entropy not competitive) | **Priority:** P1 +### [TODO-benchmark-lzfi-vs-lzssr.md](TODO-benchmark-lzfi-vs-lzssr.md) +**Status:** Open — Benchmark whether LzssR is worth keeping vs Lzfi consolidation | **Priority:** P2 ### [TODO-huffman-sync-decode.md](TODO-huffman-sync-decode.md) **Status:** PARKED — valid approach, zero implementation progress, awaiting LzSeq encoding work | **Priority:** P2 -### [agent-harness-implementation.md](agent-harness-implementation.md) -**Status:** PARKED — Phase 1 complete; Phases 2–8 deferred | **Priority:** P1 - -## Closed Plans 
- -### [PLAN-p0a-gpu-rans-vertical-slice.md](PLAN-p0a-gpu-rans-vertical-slice.md) -**Status:** CLOSED — Slice 4 perf gate failed; GPU rANS 0.54–0.77x CPU after 29+ iterations; structural dead end | **Priority:** was P0 - ## Completed Plans (in ../completed/) +- `PLAN-p0a-gpu-rans-vertical-slice.md` — GPU chunked rANS vertical slice (CLOSED: structural dead end) +- `PLAN-unified-scheduler-north-star.md` — Unified scheduler north star (PARKED: GPU entropy blocked) +- `PLAN-interleaved-rans.md` — Interleaved rANS (PARKED: Phase A merged, Phase D cancelled) +- `agent-harness-implementation.md` — Agent harness (PARKED: Phase 1 complete, rest deferred) - `PLAN-gpu-backpressure-impl.md` — GPU ring buffer batching - `lz77_merge.md` — Cooperative-stitch kernel consolidation - `upgrade-wgpu-to-27.md` — wgpu 24→27 upgrade