From 75d6be44c056954cb8295a3cd342ef149be3575a Mon Sep 17 00:00:00 2001
From: Chris Lundquist <rampantdurandal@gmail.com>
Date: Tue, 10 Mar 2026 02:21:37 -0700
Subject: [PATCH 1/4] refactor: eliminate double-conversion in SortLZ paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add `demux_tokens()` taking `&[LzToken]` directly, skip intermediate
  `Vec<lz77::Match>` in GPU SortLZ coordinator
- Migrate LzSeq SortLZ CPU path from `encode_match_sequence` to
  `parse_matches` → `encode_from_tokens`
- Migrate `encode_optimal` to use `matches_to_tokens` → `encode_from_tokens`
- Delete `matches_to_lz77_greedy`, `matches_to_lz77_lazy` (~110 lines)
- Delete `encode_match_sequence` (~55 lines)
- Replace sortlz LZ77 roundtrip tests with token-based equivalents

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/lzseq/mod.rs         |  65 +----------------
 src/pipeline/demux.rs    |  34 +++++----
 src/pipeline/parallel.rs |   9 ++-
 src/sortlz.rs            | 154 ++++-----------------------------------
 4 files changed, 45 insertions(+), 217 deletions(-)
diff --git a/src/lzseq/mod.rs b/src/lzseq/mod.rs
index ed9105f..6b6e00b 100644
--- a/src/lzseq/mod.rs
+++ b/src/lzseq/mod.rs
@@ -622,66 +622,6 @@ fn emit_match(
     length_extra_writer.write_bits(lev, leb);
 }
 
-/// Encode a pre-computed match sequence into LzSeq token streams.
-///
-/// Used by `encode_optimal` after the backward DP has selected matches,
-/// and by the SortLZ match finder to feed pre-computed matches into the
-/// LzSeq 6-stream format. Applies repeat offset encoding.
-pub fn encode_match_sequence(
-    _input: &[u8],
-    matches: &[crate::lz77::Match],
-    _config: &SeqConfig,
-) -> PzResult<SeqEncoded> {
-    let mut repeats = RepeatOffsets::new();
-    let mut flags_vec: Vec<bool> = Vec::new();
-    let mut literals: Vec<u8> = Vec::new();
-    let mut offset_codes: Vec<u8> = Vec::new();
-    let mut length_codes: Vec<u8> = Vec::new();
-    let mut offset_extra_writer = BitWriter::new();
-    let mut length_extra_writer = BitWriter::new();
-
-    for m in matches {
-        if m.length == 0 {
-            // Literal token from optimal parser
-            flags_vec.push(true);
-            literals.push(m.next);
-        } else {
-            emit_match(
-                m.offset as u32,
-                m.length,
-                &mut repeats,
-                &mut flags_vec,
-                &mut offset_codes,
-                &mut offset_extra_writer,
-                &mut length_codes,
-                &mut length_extra_writer,
-            );
-            // Emit the 'next' byte as a separate literal token.
-            // The DP forward trace produces Match structs where each match covers
-            // `length` bytes plus a "next" byte. The cost model accounts for the
-            // next byte in the match cost, but the actual token stream must emit
-            // the next byte as a literal to avoid data loss.
-            flags_vec.push(true);
-            literals.push(m.next);
-        }
-    }
-
-    let num_tokens = flags_vec.len() as u32;
-    let num_matches = offset_codes.len() as u32;
-    let flags = pack_flags(&flags_vec);
-
-    Ok(SeqEncoded {
-        flags,
-        literals,
-        offset_codes,
-        offset_extra: offset_extra_writer.finish(),
-        length_codes,
-        length_extra: length_extra_writer.finish(),
-        num_tokens,
-        num_matches,
-    })
-}
-
 /// Encode a universal `LzToken` stream into LzSeq's 6-stream format.
 ///
 /// Like `encode_match_sequence` but takes `LzToken` directly instead of
@@ -774,8 +714,9 @@ pub fn encode_optimal(input: &[u8], config: &SeqConfig) -> PzResult<SeqEncoded>
     );
     let matches = crate::optimal::optimal_parse_lzseq(input, &table)?;
 
-    // Encode the optimal match sequence into LzSeq token streams.
-    encode_match_sequence(input, &matches, config)
+    // Convert optimal parse output to universal tokens, then encode.
+    let tokens = crate::lz_token::matches_to_tokens(&matches);
+    encode_from_tokens(&tokens, config)
 }
 
 /// Compress input using LzSeq with lazy matching and configurable window.
diff --git a/src/pipeline/demux.rs b/src/pipeline/demux.rs
index 991d5eb..3f889f5 100644
--- a/src/pipeline/demux.rs
+++ b/src/pipeline/demux.rs
@@ -84,12 +84,8 @@ fn encoder_for_demuxer(demuxer: &LzDemuxer) -> Box<dyn lz_token::TokenEncoder> {
 
 /// Demux pre-computed LZ77 matches into encoder streams for the GPU coordinator.
 ///
-/// Used by the GPU coordinator to demux matches returned from
-/// `find_matches_batched()` without re-running match-finding.
-/// Converts `Vec<Match>` → `Vec<LzToken>` → encoder.encode().
-///
-/// `input` is the original block data — needed by `Lz77Encoder` to look up
-/// trailing literal bytes when consecutive matches appear.
+/// Used by the batched LZ77 GPU path (`find_matches_batched`) which returns
+/// `Vec<lz77::Match>`. Converts Match → LzToken → encoder.encode().
 #[cfg(feature = "webgpu")]
 pub(crate) fn demux_lz77_matches(
     input: &[u8],
@@ -97,9 +93,22 @@ pub(crate) fn demux_lz77_matches(
     pipeline: super::Pipeline,
 ) -> PzResult<DemuxOutput> {
     let tokens = lz_token::matches_to_tokens(&matches);
+    demux_tokens(input, &tokens, pipeline)
+}
+
+/// Demux a universal token stream into encoder streams.
+///
+/// Used by the GPU SortLZ coordinator, which produces `Vec<LzToken>` directly
+/// via `sortlz::parse_matches()`, and by `demux_lz77_matches` after conversion.
+#[cfg(feature = "webgpu")]
+pub(crate) fn demux_tokens(
+    input: &[u8],
+    tokens: &[lz_token::LzToken],
+    pipeline: super::Pipeline,
+) -> PzResult<DemuxOutput> {
     let demuxer = demuxer_for_pipeline(pipeline).ok_or(PzError::InvalidInput)?;
     let encoder = encoder_for_demuxer(&demuxer);
-    Ok(encoder.encode(input, &tokens)?.into())
+    Ok(encoder.encode(input, tokens)?.into())
 }
 
 impl StreamDemuxer for LzDemuxer {
@@ -167,8 +176,8 @@ impl StreamDemuxer for LzDemuxer {
                 })
             }
             LzDemuxer::LzSeq => {
-                // SortLZ match finder path: use sort-based matches converted
-                // to LzSeq's 6-stream format via encode_match_sequence.
+                // SortLZ match finder path: sort-based matches → LzToken →
+                // LzSeq 6-stream format via encode_from_tokens.
                 // Prefers GPU radix sort when available for large inputs.
                 if options.match_finder == super::MatchFinder::SortLz {
                     let window = options
@@ -195,10 +204,9 @@ impl StreamDemuxer for LzDemuxer {
                     #[cfg(not(feature = "webgpu"))]
                     let raw_matches = crate::sortlz::find_matches(input, &config);
 
-                    let lz_matches = crate::sortlz::matches_to_lz77_lazy(input, &raw_matches);
-                    let enc = lzseq::encode_match_sequence(
-                        input,
-                        &lz_matches,
+                    let tokens = crate::sortlz::parse_matches(input, &raw_matches, true);
+                    let enc = lzseq::encode_from_tokens(
+                        &tokens,
                         &lzseq::SeqConfig {
                             max_window: window,
                             ..lzseq::SeqConfig::default()
diff --git a/src/pipeline/parallel.rs b/src/pipeline/parallel.rs
index f7956ff..5b14410 100644
--- a/src/pipeline/parallel.rs
+++ b/src/pipeline/parallel.rs
@@ -543,7 +543,7 @@ fn compress_parallel_unified(
                     // Process Stage 0 batch last to avoid starving queued StageN/Fused
                     // continuations when bursts arrive together.
                     if !stage0_batch.is_empty() && uses_sortlz_match_finder {
-                        // SortLZ GPU match finding: per-block dispatch with LZ77 conversion
+                        // SortLZ GPU match finding: per-block dispatch → parse_matches → demux_tokens
                         let sortlz_config = crate::sortlz::SortLzConfig::for_lz77(
                             opts.max_match_len.unwrap_or(crate::lz77::LZ77_MAX_MATCH),
                         );
@@ -552,13 +552,14 @@ fn compress_parallel_unified(
                             let result = engine
                                 .sortlz_find_matches(blocks[block_idx], &sortlz_config)
                                 .and_then(|raw_matches| {
-                                    let lz_matches = crate::sortlz::matches_to_lz77_lazy(
+                                    let tokens = crate::sortlz::parse_matches(
                                         blocks[block_idx],
                                         &raw_matches,
+                                        true, // lazy parsing
                                     );
-                                    let demux = super::demux::demux_lz77_matches(
+                                    let demux = super::demux::demux_tokens(
                                         blocks[block_idx],
-                                        lz_matches,
+                                        &tokens,
                                         pipeline,
                                     )?;
                                     Ok(StageBlock {
diff --git a/src/sortlz.rs b/src/sortlz.rs
index 82ce566..17d5c69 100644
--- a/src/sortlz.rs
+++ b/src/sortlz.rs
@@ -364,128 +364,6 @@ pub(crate) fn parse_matches(
 // SortLZ → LZ77 Match conversion (for feeding into LZ77 pipelines)
 // ---------------------------------------------------------------------------
 
-/// Convert position-indexed matches to an LZ77 match sequence using greedy parsing.
-///
-/// Takes the longest match at every position, emitting `lz77::Match` structs
-/// with the `next` byte. Produces the same format as `lz77::compress_lazy_to_matches()`.
-pub fn matches_to_lz77_greedy(
-    input: &[u8],
-    matches: &[Option<(u16, u16)>],
-) -> Vec<crate::lz77::Match> {
-    let n = input.len();
-    let mut result = Vec::with_capacity(n / 4);
-    let mut pos = 0;
-
-    while pos < n {
-        if let Some((offset, length)) = matches.get(pos).copied().flatten() {
-            let end = pos + length as usize;
-            if end < n {
-                result.push(crate::lz77::Match {
-                    offset,
-                    length,
-                    next: input[end],
-                });
-                pos = end + 1;
-            } else {
-                // Match extends to or past end of input; truncate to leave room for next byte
-                let adj_len = (n - 1 - pos) as u16;
-                if adj_len >= crate::lz77::MIN_MATCH {
-                    result.push(crate::lz77::Match {
-                        offset,
-                        length: adj_len,
-                        next: input[pos + adj_len as usize],
-                    });
-                    pos = pos + adj_len as usize + 1;
-                } else {
-                    result.push(crate::lz77::Match {
-                        offset: 0,
-                        length: 0,
-                        next: input[pos],
-                    });
-                    pos += 1;
-                }
-            }
-        } else {
-            result.push(crate::lz77::Match {
-                offset: 0,
-                length: 0,
-                next: input[pos],
-            });
-            pos += 1;
-        }
-    }
-
-    result
-}
-
-/// Convert position-indexed matches to an LZ77 match sequence using lazy parsing.
-///
-/// If the next position has a longer match, emits a literal for the current
-/// position and takes the longer match instead (gzip-style lazy evaluation).
-pub fn matches_to_lz77_lazy(
-    input: &[u8],
-    matches: &[Option<(u16, u16)>],
-) -> Vec<crate::lz77::Match> {
-    let n = input.len();
-    let mut result = Vec::with_capacity(n / 4);
-    let mut pos = 0;
-
-    while pos < n {
-        if let Some((offset, length)) = matches.get(pos).copied().flatten() {
-            // Lazy check: if next position has a longer match, emit literal here
-            if pos + 1 < n {
-                if let Some((_, next_len)) = matches.get(pos + 1).copied().flatten() {
-                    if next_len > length {
-                        result.push(crate::lz77::Match {
-                            offset: 0,
-                            length: 0,
-                            next: input[pos],
-                        });
-                        pos += 1;
-                        continue;
-                    }
-                }
-            }
-
-            let end = pos + length as usize;
-            if end < n {
-                result.push(crate::lz77::Match {
-                    offset,
-                    length,
-                    next: input[end],
-                });
-                pos = end + 1;
-            } else {
-                let adj_len = (n - 1 - pos) as u16;
-                if adj_len >= crate::lz77::MIN_MATCH {
-                    result.push(crate::lz77::Match {
-                        offset,
-                        length: adj_len,
-                        next: input[pos + adj_len as usize],
-                    });
-                    pos = pos + adj_len as usize + 1;
-                } else {
-                    result.push(crate::lz77::Match {
-                        offset: 0,
-                        length: 0,
-                        next: input[pos],
-                    });
-                    pos += 1;
-                }
-            }
-        } else {
-            result.push(crate::lz77::Match {
-                offset: 0,
-                length: 0,
-                next: input[pos],
-            });
-            pos += 1;
-        }
-    }
-
-    result
-}
-
 /// Compress using the SortLZ pipeline.
 ///
 /// Wire format (v2 — LzSeq-encoded streams + FSE):
@@ -749,33 +627,33 @@ mod tests {
     }
 
     #[test]
-    fn test_lz77_greedy_roundtrip() {
+    fn test_parse_matches_greedy_roundtrip() {
         let input = test_data();
         let config = SortLzConfig::for_lz77(crate::lz77::LZ77_MAX_MATCH);
         let matches = find_matches(&input, &config);
-        let lz_matches = matches_to_lz77_greedy(&input, &matches);
-
-        // Verify matches reconstruct the input via LZ77 decompress
-        let mut lz_bytes = Vec::new();
-        for m in &lz_matches {
-            lz_bytes.extend_from_slice(&m.to_bytes());
-        }
-        let decoded = crate::lz77::decompress(&lz_bytes).unwrap();
+        let tokens = parse_matches(&input, &matches, false); // greedy
+
+        // Verify tokens reconstruct the input via Lz77Encoder round-trip.
+        let encoder = crate::lz_token::Lz77Encoder;
+        let encoded = encoder.encode(&input, &tokens).unwrap();
+        let decoded = encoder
+            .decode(encoded.streams, &encoded.meta, input.len())
+            .unwrap();
         assert_eq!(decoded, input);
     }
 
     #[test]
-    fn test_lz77_lazy_roundtrip() {
+    fn test_parse_matches_lazy_roundtrip() {
         let input = test_data();
         let config = SortLzConfig::for_lz77(crate::lz77::LZ77_MAX_MATCH);
         let matches = find_matches(&input, &config);
-        let lz_matches = matches_to_lz77_lazy(&input, &matches);
+        let tokens = parse_matches(&input, &matches, true); // lazy
 
-        let mut lz_bytes = Vec::new();
-        for m in &lz_matches {
-            lz_bytes.extend_from_slice(&m.to_bytes());
-        }
-        let decoded = crate::lz77::decompress(&lz_bytes).unwrap();
+        let encoder = crate::lz_token::Lz77Encoder;
+        let encoded = encoder.encode(&input, &tokens).unwrap();
+        let decoded = encoder
+            .decode(encoded.streams, &encoded.meta, input.len())
+            .unwrap();
         assert_eq!(decoded, input);
     }
 

From 502adf34d8cd665248bbe909c0d8960e0c84f6da Mon Sep 17 00:00:00 2001
From: Chris Lundquist <rampantdurandal@gmail.com>
Date: Tue, 10 Mar 2026 02:25:42 -0700
Subject: [PATCH 2/4] refactor: unify LzSeq Optimal+SortLz through tokenize()

Route the SortLz match finder and the Optimal parse strategy through the
shared tokenize() entry point for LzSeq pipelines. This gives LzSeqR/LzSeqH
access to GPU match finding and unified parse-strategy dispatch in these modes.

Keep encode_with_config() for the default lazy/greedy CPU path (tuned
adaptive chain depth, hash4 prefix, repeat-offset-aware matching).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/pipeline/demux.rs | 53 +++++++++++++------------------------------
 1 file changed, 16 insertions(+), 37 deletions(-)

diff --git a/src/pipeline/demux.rs b/src/pipeline/demux.rs
index 3f889f5..1bf157d 100644
--- a/src/pipeline/demux.rs
+++ b/src/pipeline/demux.rs
@@ -176,46 +176,27 @@ impl StreamDemuxer for LzDemuxer {
                 })
             }
             LzDemuxer::LzSeq => {
-                // SortLZ match finder path: sort-based matches → LzToken →
-                // LzSeq 6-stream format via encode_from_tokens.
-                // Prefers GPU radix sort when available for large inputs.
-                if options.match_finder == super::MatchFinder::SortLz {
-                    let window = options
-                        .seq_window_size
-                        .unwrap_or_else(|| lzseq::SeqConfig::default().max_window);
-                    let config = crate::sortlz::SortLzConfig {
-                        max_window: window,
-                        ..crate::sortlz::SortLzConfig::default()
-                    };
-
-                    // GPU path for SortLZ match finding
-                    #[cfg(feature = "webgpu")]
-                    let raw_matches = if let Some(ref engine) = options.webgpu_engine {
-                        if input.len() >= crate::webgpu::MIN_GPU_INPUT_SIZE
-                            && input.len() <= engine.max_dispatch_input_size()
-                        {
-                            engine.sortlz_find_matches(input, &config)?
-                        } else {
-                            crate::sortlz::find_matches(input, &config)
-                        }
-                    } else {
-                        crate::sortlz::find_matches(input, &config)
-                    };
-                    #[cfg(not(feature = "webgpu"))]
-                    let raw_matches = crate::sortlz::find_matches(input, &config);
-
-                    let tokens = crate::sortlz::parse_matches(input, &raw_matches, true);
+                // The SortLz match finder and the Optimal parse strategy both
+                // route through the shared tokenize() entry point, which handles
+                // GPU/CPU dispatch, SortLz match finding, and optimal parsing.
+                let use_tokenize = options.match_finder == super::MatchFinder::SortLz
+                    || options.parse_strategy == ParseStrategy::Optimal;
+
+                if use_tokenize {
+                    let tokens = super::tokenize(input, options)?;
+                    let defaults = lzseq::SeqConfig::default();
                     let enc = lzseq::encode_from_tokens(
                         &tokens,
                         &lzseq::SeqConfig {
-                            max_window: window,
-                            ..lzseq::SeqConfig::default()
+                            max_window: options.seq_window_size.unwrap_or(defaults.max_window),
+                            max_match_len: options.max_match_len.unwrap_or(defaults.max_match_len),
+                            ..defaults
                         },
                     )?;
                     return Ok(seq_encoded_to_demux(enc));
                 }
 
-                // GPU path: match finding + demux on-device
+                // GPU path: fused match finding + demux on-device
                 #[cfg(feature = "webgpu")]
                 if let Backend::WebGpu = options.backend {
                     if let Some(ref engine) = options.webgpu_engine {
@@ -228,17 +209,15 @@ impl StreamDemuxer for LzDemuxer {
                     }
                 }
 
-                // CPU path
+                // Default CPU path: encode_with_config (tuned lazy matching
+                // with repeat-offset awareness, adaptive chain depth, hash4).
                 let defaults = lzseq::SeqConfig::default();
                 let config = lzseq::SeqConfig {
                     max_window: options.seq_window_size.unwrap_or(defaults.max_window),
                     max_match_len: options.max_match_len.unwrap_or(defaults.max_match_len),
                     ..defaults
                 };
-                let enc = match options.parse_strategy {
-                    ParseStrategy::Optimal => lzseq::encode_optimal(input, &config)?,
-                    _ => lzseq::encode_with_config(input, &config)?,
-                };
+                let enc = lzseq::encode_with_config(input, &config)?;
                 Ok(seq_encoded_to_demux(enc))
             }
         }

From 241cf450f8ccf942ad2a98726ea6c42effdfcc21 Mon Sep 17 00:00:00 2001
From: Chris Lundquist <rampantdurandal@gmail.com>
Date: Tue, 10 Mar 2026 02:38:43 -0700
Subject: [PATCH 3/4] docs: update ARCHITECTURE.md and add wire-formats.md

Update multi-stream section to reflect TokenEncoder architecture
(LzSeqEncoder 6-stream, LzssEncoder 4-stream). Add "Active architecture"
section documenting GPU vs CPU design decisions. Add comprehensive
wire format reference covering container V2, per-block framing, entropy
coders, and pipeline ID table.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 ARCHITECTURE.md                               | 103 ++++++-------
 .../PLAN-interleaved-rans.md                  |   0
 .../PLAN-p0a-gpu-rans-vertical-slice.md       |   0
 .../PLAN-unified-scheduler-north-star.md      |   0
 .../agent-harness-implementation.md           |   0
 docs/wire-formats.md                          | 140 ++++++++++++++++++
 6 files changed, 187 insertions(+), 56 deletions(-)
 rename docs/exec-plans/{active => completed}/PLAN-interleaved-rans.md (100%)
 rename docs/exec-plans/{active => completed}/PLAN-p0a-gpu-rans-vertical-slice.md (100%)
 rename docs/exec-plans/{active => completed}/PLAN-unified-scheduler-north-star.md (100%)
 rename docs/exec-plans/{active => completed}/agent-harness-implementation.md (100%)
 create mode 100644 docs/wire-formats.md

diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
index 4841764..4d95809 100644
--- a/ARCHITECTURE.md
+++ b/ARCHITECTURE.md
@@ -25,59 +25,32 @@ For day-to-day development instructions, see `CLAUDE.md`.
 
 ## Multi-stream entropy coding
 
-The Lzf and LzSeqR pipelines use **multi-stream entropy coding** to improve
-compression ratio by separating LZ77 output into independent byte streams with
-tighter symbol distributions. Instead of feeding one mixed stream to the entropy
-coder, the encoder deinterleaves tokens into three streams:
+All LZ-based pipelines use **multi-stream entropy coding**: the match finder
+produces a universal `LzToken` stream (literals + matches), and a pluggable
+`TokenEncoder` splits it into independent byte streams with tighter symbol
+distributions. Each stream gets its own entropy coder, yielding lower
+per-stream entropy than a single combined stream.
 
-| Stream | Contents | Why it helps |
-|--------|----------|-------------|
-| **Offsets** | High bytes of match offsets (offset >> 8) | Offsets cluster in a narrow range; dedicated Huffman/RC table exploits this |
-| **Lengths** | Match lengths (capped to u8) | Length distribution is highly skewed (short matches dominate) |
-| **Literals** | Literal bytes + low offset bytes + next bytes | Natural-language / binary byte distribution |
+### Wire encoders (`src/lz_token.rs`)
 
-Each stream gets its own FSE table (Lzf) or rANS context (LzSeqR),
-yielding lower per-stream entropy than a single combined stream.
+| Encoder | Streams | Used by | Contents |
+|---------|---------|---------|----------|
+| `LzSeqEncoder` | 6 | Lzf, LzSeqR, LzSeqH, SortLz | flags, literals, offset_codes, offset_extra, length_codes, length_extra |
+| `LzssEncoder` | 4 | Lzfi, LzssR | flags (1-bit per token), literals, offsets (u16 LE), lengths (u16 LE) |
 
-### Encoding format
+`LzSeqEncoder` uses log2-coded offsets and lengths with repeat offset tracking,
+achieving the best ratio (~32% on Canterbury+large). `LzssEncoder` uses raw u16
+values with flag bits, trading ratio for simplicity.
 
-Multi-stream data is stored with a `0x02` stream-format flag in the container
-header, followed by three length-prefixed compressed sub-streams:
+### Architecture
 
 ```
-[stream_format: u8 = 0x02]
-[offsets_len: u32 LE] [offsets compressed data...]
-[lengths_len: u32 LE] [lengths compressed data...]
-[literals_len: u32 LE] [literals compressed data...]
+Input → tokenize() → Vec<LzToken> → TokenEncoder::encode() → EncodedStreams → entropy coding
 ```
 
-The decoder reads the flag, decompresses each sub-stream independently, then
-reinterleaves them back into the original LZ77 token sequence. Single-stream
-format (`0x01`) is used as fallback for small inputs (< 256 bytes) or when
-multi-stream produces larger output.
-
-### Benchmark results
-
-Comparison on Canterbury + Large corpus (14 files, 13.3 MB total), averaged over
-3 iterations. "Before" = single-stream, "After" = multi-stream:
-
-**Compression (size and throughput):**
-
-| Pipeline | Before (bytes) | After (bytes) | Size delta | Throughput delta |
-|----------|---------------|--------------|------------|-----------------|
-| Lzf      | 6,199,044     | 5,107,601    | **-17.6%** | +2.8% faster    |
-
-**Decompression throughput:**
-
-| Pipeline | Throughput delta |
-|----------|-----------------|
-| Lzf      | **+2.4%** faster |
-
-Multi-stream is a pure win: better compression **and** faster speed. The largest
-gains are on big files (E.coli: -21% size, +11% decode throughput; bible.txt:
--14% size, +16% decode throughput). Small files (< 4 KB) may see slight
-expansion due to the overhead of three separate stream headers — the encoder
-automatically falls back to single-stream when multi-stream would be larger.
+Match finding and parsing (`tokenize()` in `src/pipeline/mod.rs`) are decoupled
+from wire encoding. The `tokenize()` entry point handles GPU/CPU dispatch,
+SortLz vs HashChain match finding, and parse strategy (greedy/lazy/optimal).
 
 ## rANS entropy coder
 
@@ -163,21 +136,20 @@ match verification. Zero atomics, fully deterministic — ideal for GPU executio
 When used as a `MatchFinder`, SortLZ is transparent to the wire format — the
 output is 100% compatible with the host pipeline (Lzf, LzSeqR, LzSeqH). The consumer and decompressor see no difference.
 
-### Pipeline::SortLz wire format (per block)
+### Pipeline::SortLz wire format v2 (per block)
 
 ```
-[num_tokens: u32 LE]       total token count (literals + matches)
-[num_literals: u32 LE]     literal count
-[flags_len: u32 LE]        ceil(num_tokens / 8)
-[flags: flags_len bytes]   bitfield (1 = literal, 0 = match, MSB-first)
-[fse_lit_len: u32 LE]      [fse_literals: ...]   FSE-encoded literal bytes
-[fse_off_len: u32 LE]      [fse_offsets: ...]    FSE-encoded u16 LE offsets
-[fse_len_len: u32 LE]      [fse_lengths: ...]    FSE-encoded u16 LE lengths
+[meta_len: u16 LE]         LzSeq metadata length
+[meta: meta_len bytes]     num_tokens + num_matches (u32 each)
+[num_streams: u8]          number of streams (6 for LzSeq)
+per stream:
+  [orig_len: u32 LE]       uncompressed stream length
+  [fse_len: u32 LE]        FSE-compressed length
+  [fse_data: fse_len bytes]
 ```
 
-This is NOT wire-compatible with any other pipeline. It uses FSE entropy coding
-on three raw byte streams (literals, offsets as u16 LE, lengths as u16 LE),
-with a bitfield flag stream to interleave them during decompression.
+SortLz uses `LzSeqEncoder` for wire encoding + FSE for entropy. The 6 streams
+are: flags, literals, offset_codes, offset_extra, length_codes, length_extra.
 
 ### Algorithm
 
@@ -280,6 +252,25 @@ GPU path.
 See `docs/design-docs/gpu-strategy.md` for full analysis and `CLAUDE.md` "Known
 dead ends" for the complete list of GPU optimization attempts that failed.
 
+## Active architecture: what GPU does / doesn't do
+
+### GPU-accelerated paths (shipping)
+- **LZ77 match finding** — GPU hash-table kernel for Lzf/LzSeq pipelines (2x faster at 256KB+)
+- **SortLZ match finding** — GPU radix sort + match verification (10.6x faster at 4MB)
+- **BWT suffix array** — GPU radix sort with prefix-doubling
+- **Interleaved FSE** — GPU-accelerated encode/decode for Lzfi pipeline
+- **rANS Recoil decode** — GPU-accelerated parallel rANS decode using split-point metadata
+
+### CPU-only paths (by design)
+- **All entropy coding in the streaming CLI** — `streaming::compress_stream` uses CPU rANS/FSE
+- **rANS/FSE/Huffman encode/decode** — serial state machines, GPU is 0.54-0.77x CPU speed
+- **LZ77 lazy evaluation** — sequential dependency (next match depends on current)
+- **LzSeq `encode_with_config`** — repeat offset tracking, adaptive chain depth, hash4 prefix
+
+### Threshold gates
+- `GPU_ENTROPY_THRESHOLD` (256KB) is kept larger than `DEFAULT_GPU_BLOCK_SIZE` (128KB), so default-sized blocks never route entropy coding to the GPU
+- `MIN_GPU_INPUT_SIZE` — minimum block size for GPU dispatch (avoids setup overhead)
+
 ## Remaining GPU bottlenecks
 
 1. **GPU BWT still slower than CPU SA-IS** — Radix sort improved 7-14x over bitonic
diff --git a/docs/exec-plans/active/PLAN-interleaved-rans.md b/docs/exec-plans/completed/PLAN-interleaved-rans.md
similarity index 100%
rename from docs/exec-plans/active/PLAN-interleaved-rans.md
rename to docs/exec-plans/completed/PLAN-interleaved-rans.md
diff --git a/docs/exec-plans/active/PLAN-p0a-gpu-rans-vertical-slice.md b/docs/exec-plans/completed/PLAN-p0a-gpu-rans-vertical-slice.md
similarity index 100%
rename from docs/exec-plans/active/PLAN-p0a-gpu-rans-vertical-slice.md
rename to docs/exec-plans/completed/PLAN-p0a-gpu-rans-vertical-slice.md
diff --git a/docs/exec-plans/active/PLAN-unified-scheduler-north-star.md b/docs/exec-plans/completed/PLAN-unified-scheduler-north-star.md
similarity index 100%
rename from docs/exec-plans/active/PLAN-unified-scheduler-north-star.md
rename to docs/exec-plans/completed/PLAN-unified-scheduler-north-star.md
diff --git a/docs/exec-plans/active/agent-harness-implementation.md b/docs/exec-plans/completed/agent-harness-implementation.md
similarity index 100%
rename from docs/exec-plans/active/agent-harness-implementation.md
rename to docs/exec-plans/completed/agent-harness-implementation.md
diff --git a/docs/wire-formats.md b/docs/wire-formats.md
new file mode 100644
index 0000000..76bc0d6
--- /dev/null
+++ b/docs/wire-formats.md
@@ -0,0 +1,140 @@
+# Wire Formats Reference
+
+Pre-release format (pre-1.0). All formats subject to change.
+
+## Container Format (V2, multi-block)
+
+```
+[magic: 2 bytes = "PZ"]
+[version: u8 = 2]
+[pipeline_id: u8]              see Pipeline ID Table below
+[original_len: u32 LE]         total uncompressed size
+[num_blocks: u32 LE]           number of blocks
+Block table (num_blocks entries):
+  [compressed_len: u32 LE]
+  [original_len: u32 LE]
+Block data: concatenated compressed block bytes
+```
+
+**Framed (streaming) mode:** `num_blocks = 0xFFFFFFFF`; blocks are written as
+length-prefixed records `[compressed_len: u32][original_len: u32][data]`,
+and the sequence is terminated by `compressed_len = 0`.
+
+## Per-Block: Multi-stream Entropy Container
+
+Each block stores its pre-entropy streams in a multi-stream container:
+
+```
+[num_streams: u8]
+[pre_entropy_len: u32 LE]      total pre-entropy byte count (for metadata)
+[meta_len: u16 LE]
+[meta: meta_len bytes]         encoder-specific metadata (round-trips through entropy)
+Per-stream framing (depends on entropy coder):
+  [orig_len: u32 LE]           uncompressed stream length
+  [compressed_len: u32 LE]     compressed length (high bits may carry flags)
+  [payload: compressed_len bytes]
+```
+
+### Per-stream flags (rANS pipelines)
+
+The `compressed_len` field's high bits carry variant flags:
+
+| Bit | Flag | Meaning |
+|-----|------|---------|
+| 31 | `RANS_INTERLEAVED_FLAG` | N-way interleaved rANS payload |
+| 30 | `RANS_RECOIL_FLAG` | Recoil split-point metadata appended |
+| 29 | `RANS_SHARED_STREAM_FLAG` | Shared-stream rANS (ryg_rans-style) |
+
+## Pre-entropy Stream Formats (TokenEncoder)
+
+### LzSeqEncoder (6 streams)
+
+Used by: **Lzf**, **LzSeqR**, **LzSeqH**, **SortLz** (as MatchFinder)
+
+Log2-coded offsets/lengths with repeat offset tracking. Best ratio.
+
+| Stream | Contents |
+|--------|----------|
+| flags | Packed bits MSB-first (1=literal, 0=match) |
+| literals | u8 per literal token |
+| offset_codes | u8 per match (0-2 = repeat offset, 3+ = literal offset code) |
+| offset_extra | LSB-first packed bitstream (extra bits per offset code) |
+| length_codes | u8 per match |
+| length_extra | LSB-first packed bitstream (extra bits per length code) |
+
+**Meta** (8 bytes): `[num_tokens: u32 LE][num_matches: u32 LE]`
+
+### LzssEncoder (4 streams)
+
+Used by: **Lzfi**, **LzssR**
+
+Flag bits + raw u16 offsets/lengths.
+
+| Stream | Contents |
+|--------|----------|
+| flags | Packed bits MSB-first (1=literal, 0=match) |
+| literals | u8 per literal token |
+| offsets | u16 LE per match |
+| lengths | u16 LE per match |
+
+**Meta** (4 bytes): `[num_tokens: u32 LE]`
+
+## Entropy Coders
+
+### FSE (Finite State Entropy)
+
+Used by: Lzf (stage 1), Lzfi (interleaved), Bw (stage 3), Bbw, SortLz
+
+Per-stream: `[orig_len: u32 LE][compressed_len: u32 LE][fse_data]`
+
+### rANS (range ANS)
+
+Used by: LzSeqR, LzssR
+
+Single-stream format:
+```
+[scale_bits: u8] [freq_table: 256 x u16 LE] [final_state: u32 LE]
+[num_words: u32 LE] [words: num_words x u16 LE]
+```
+
+Interleaved N-way format:
+```
+[scale_bits: u8] [freq_table: 256 x u16 LE] [num_states: u8]
+[final_states: N x u32 LE] [num_words: N x u32 LE]
+[stream_0_words] [stream_1_words] ... [stream_N-1_words]
+```
+
+### Huffman
+
+Used by: LzSeqH
+
+Per-stream: `[data_len: u32 LE][total_bits: u32 LE][freq_table: 256 x u32 LE][data]`
+
+## SortLz Standalone Wire Format (v2)
+
+Pipeline::SortLz (ID 10) uses its own framing with `LzSeqEncoder` + FSE:
+
+```
+[meta_len: u16 LE]
+[meta: meta_len bytes]         LzSeq metadata (num_tokens + num_matches)
+[num_streams: u8]              6 (LzSeq streams)
+Per stream:
+  [orig_len: u32 LE]           uncompressed stream length
+  [fse_len: u32 LE]            FSE-compressed length
+  [fse_data: fse_len bytes]
+```
+
+## Pipeline ID Table
+
+| ID | Pipeline | Pre-entropy | Entropy | Streams |
+|----|----------|-------------|---------|---------|
+| 1 | Bw | BWT+MTF+RLE | FSE | 1 |
+| 2 | Bbw | BBWT+MTF+RLE | FSE | 1 |
+| 4 | Lzf | LzSeqEncoder | FSE | 6 |
+| 5 | Lzfi | LzssEncoder | Interleaved FSE | 4 |
+| 6 | LzssR | LzssEncoder | rANS | 4 |
+| 8 | LzSeqR | LzSeqEncoder | rANS | 6 |
+| 9 | LzSeqH | LzSeqEncoder | Huffman | 6 |
+| 10 | SortLz | LzSeqEncoder (own framing) | FSE | 6 |
+
+**Retired IDs:** 0 (Deflate), 3 (Lzr), 7 (Lz78R)

From f7a077d18ed616da51ad8b527ac8aa1cf0083e3f Mon Sep 17 00:00:00 2001
From: Chris Lundquist <rampantdurandal@gmail.com>
Date: Tue, 10 Mar 2026 02:41:28 -0700
Subject: [PATCH 4/4] docs: add investigation TODOs and clean up exec plan
 index

Add TODO-gpu-rans-6stream-bug.md (GPU rANS fails with 6-stream LzSeqR)
and TODO-benchmark-lzfi-vs-lzssr.md (consolidation candidate). Move 4
stale/closed plans to completed/ and reorganize index with Investigation
TODOs section.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../active/TODO-benchmark-lzfi-vs-lzssr.md    | 39 +++++++++++++++++++
 .../active/TODO-gpu-rans-6stream-bug.md       | 38 ++++++++++++++++++
 docs/exec-plans/active/index.md               | 24 +++++-------
 3 files changed, 87 insertions(+), 14 deletions(-)
 create mode 100644 docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md
 create mode 100644 docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md

diff --git a/docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md b/docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md
new file mode 100644
index 0000000..c0957d9
--- /dev/null
+++ b/docs/exec-plans/active/TODO-benchmark-lzfi-vs-lzssr.md
@@ -0,0 +1,39 @@
+# TODO: Benchmark Lzfi vs LzssR — consolidation candidate
+
+## Question
+
+Are both Lzfi and LzssR worth keeping? They use the same demuxer (LZSS, 4
+streams) and differ only in entropy coder (interleaved FSE vs rANS).
+
+## Current state
+
+| Property | Lzfi | LzssR |
+|----------|------|-------|
+| Demuxer | LzssEncoder (4 streams) | LzssEncoder (4 streams) |
+| Entropy | Interleaved FSE | rANS |
+| Auto-selected | Yes (high entropy + matches) | Never |
+| Pipeline ID | 5 | 6 |
+| GPU entropy | Yes (interleaved FSE) | Yes (rANS Recoil) |
+
+## Known data
+
+- FSE decode is ~2.2x faster than rANS decode (596 vs 266 MB/s, Criterion)
+- FSE encode is comparable to rANS encode (~357 vs 359 MB/s)
+- Lzfi auto-selected when: match_density > 0.4 and byte_entropy > 6.0,
+  or match_density > 0.2 and byte_entropy > 5.0
+- LzssR is only exercised via trial compression or explicit user selection
+
+## Action items
+
+1. Run `./scripts/bench.sh` comparing Lzfi vs LzssR on the Canterbury and Silesia corpora
+2. Run Criterion benchmarks: `cargo bench -- lzfi lzssr` for per-stage timing
+3. If LzssR shows no ratio or throughput advantage over Lzfi, consider removing
+   it to reduce pipeline surface area (similar to Lzr removal)
+4. If rANS interleaved or Recoil decode gives LzssR better GPU decode throughput,
+   document the use case and keep it
+
+## Files
+
+- `src/pipeline/stages.rs` — stage dispatch for both pipelines
+- `src/pipeline/mod.rs` — `auto_select_pipeline`, `select_pipeline_trial`
+- `src/pipeline/blocks.rs` — entropy encode/decode dispatch
diff --git a/docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md b/docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md
new file mode 100644
index 0000000..b5a1d6d
--- /dev/null
+++ b/docs/exec-plans/active/TODO-gpu-rans-6stream-bug.md
@@ -0,0 +1,38 @@
+# TODO: GPU rANS interleaved decode fails with 6-stream LzSeqR
+
+## Problem
+
+GPU rANS interleaved decode works correctly for 4-stream pipelines (LzssR)
+but fails for 6-stream pipelines (LzSeqR). CPU rANS interleaved decode
+works correctly for both 4 and 6 streams.
+
+## Evidence
+
+- `test_gpu_rans_interleaved_decode_round_trip` originally used `Pipeline::Lzr`
+  (3 streams). After Lzr removal, switching to `Pipeline::LzSeqR` (6 streams)
+  caused the test to fail with `InvalidInput`.
+- Switching to `Pipeline::LzssR` (4 streams) passes.
+- CPU rANS interleaved encode/decode with LzSeqR works fine.
+- The rANS encode/decode code in `src/pipeline/stages.rs` is stream-count
+  agnostic — each stream is encoded/decoded independently.
+
+## Workaround
+
+Test uses `Pipeline::LzssR` (4-stream) instead of `Pipeline::LzSeqR` (6-stream).
+See `src/pipeline/tests.rs:test_gpu_rans_interleaved_decode_round_trip`.
+
+## Investigation directions
+
+- LzSeq's `offset_extra` and `length_extra` streams can be very small or empty.
+  GPU buffer sizing or dispatch dimensions may misbehave with near-zero streams.
+- Check if the GPU rANS decode path (`stage_rans_decode_webgpu`) has alignment
+  assumptions that break with 6 streams.
+- Compare the per-stream byte sizes between LzssR (4 streams, all non-trivial)
+  and LzSeqR (6 streams, some potentially empty) to find the divergence point.
+- Test with synthetic 6-stream data where all streams are non-trivially sized.
+
+## Files
+
+- `src/pipeline/stages.rs` — `stage_rans_decode_webgpu`, `stage_rans_encode_with_options`
+- `src/pipeline/tests.rs` — `test_gpu_rans_interleaved_decode_round_trip`
+- `src/webgpu/rans.rs` — GPU rANS implementation
diff --git a/docs/exec-plans/active/index.md b/docs/exec-plans/active/index.md
index 4571b6c..2626192 100644
--- a/docs/exec-plans/active/index.md
+++ b/docs/exec-plans/active/index.md
@@ -1,6 +1,6 @@
 # Active Execution Plans
 
-**Last Updated:** 2026-03-09
+**Last Updated:** 2026-03-10
 
 ## Active Plans
 
@@ -10,27 +10,23 @@
 ### [PLAN-unified-scheduler-perf-validation.md](PLAN-unified-scheduler-perf-validation.md)
 **Status:** In Progress (Phases 0-1 landed; local baseline captured; Phase 2 optimization started) | **Priority:** P0
 
-## Parked Plans
+## Investigation TODOs
 
-### [PLAN-interleaved-rans.md](PLAN-interleaved-rans.md)
-**Status:** PARKED — Phase A merged (PR #91); Phase D cancelled (GPU rANS dead end); Phases B–C need new owner | **Priority:** P1
+### [TODO-gpu-rans-6stream-bug.md](TODO-gpu-rans-6stream-bug.md)
+**Status:** Open — GPU rANS interleaved decode fails with 6-stream LzSeqR; works with 4-stream LzssR | **Priority:** P1
 
-### [PLAN-unified-scheduler-north-star.md](PLAN-unified-scheduler-north-star.md)
-**Status:** PARKED — Phases 3–4 done and in production; Phases 2+5 blocked indefinitely (GPU entropy not competitive) | **Priority:** P1
+### [TODO-benchmark-lzfi-vs-lzssr.md](TODO-benchmark-lzfi-vs-lzssr.md)
+**Status:** Open — Benchmark whether LzssR is worth keeping or should be consolidated into Lzfi | **Priority:** P2
 
 ### [TODO-huffman-sync-decode.md](TODO-huffman-sync-decode.md)
 **Status:** PARKED — valid approach, zero implementation progress, awaiting LzSeq encoding work | **Priority:** P2
 
-### [agent-harness-implementation.md](agent-harness-implementation.md)
-**Status:** PARKED — Phase 1 complete; Phases 2–8 deferred | **Priority:** P1
-
-## Closed Plans
-
-### [PLAN-p0a-gpu-rans-vertical-slice.md](PLAN-p0a-gpu-rans-vertical-slice.md)
-**Status:** CLOSED — Slice 4 perf gate failed; GPU rANS 0.54–0.77x CPU after 29+ iterations; structural dead end | **Priority:** was P0
-
 ## Completed Plans (in ../completed/)
 
+- `PLAN-p0a-gpu-rans-vertical-slice.md` — GPU chunked rANS vertical slice (CLOSED: structural dead end)
+- `PLAN-unified-scheduler-north-star.md` — Unified scheduler north star (PARKED: GPU entropy blocked)
+- `PLAN-interleaved-rans.md` — Interleaved rANS (PARKED: Phase A merged, Phase D cancelled)
+- `agent-harness-implementation.md` — Agent harness (PARKED: Phase 1 complete, rest deferred)
 - `PLAN-gpu-backpressure-impl.md` — GPU ring buffer batching
 - `lz77_merge.md` — Cooperative-stitch kernel consolidation
 - `upgrade-wgpu-to-27.md` — wgpu 24→27 upgrade