diff --git a/Cargo.toml b/Cargo.toml index 7ff85b4..3ebdb98 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,10 @@ harness = false name = "throughput_lzfi" harness = false +[[bench]] +name = "throughput_lzseqr" +harness = false + [[bench]] name = "stages_bwt" harness = false @@ -61,10 +65,6 @@ harness = false name = "stages_lzss" harness = false -[[bench]] -name = "stages_lz78" -harness = false - [[bench]] name = "stages_lz_comparison" harness = false diff --git a/benches/stages_lz78.rs b/benches/stages_lz78.rs deleted file mode 100644 index 7ea04ad..0000000 --- a/benches/stages_lz78.rs +++ /dev/null @@ -1,33 +0,0 @@ -#[path = "stages_common.rs"] -mod stages_common; - -use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; -use stages_common::{cap, get_test_data}; - -const SIZES_ALL: &[usize] = &[8192, 65536, 4_194_304]; - -fn bench_lz78(c: &mut Criterion) { - let mut group = c.benchmark_group("lz78"); - cap(&mut group); - for &size in SIZES_ALL { - let data = get_test_data(size); - group.throughput(Throughput::Bytes(size as u64)); - - group.bench_with_input(BenchmarkId::new("encode", size), &data, |b, data| { - b.iter(|| pz::lz78::encode(data).unwrap()); - }); - - let compressed = pz::lz78::encode(&data).unwrap(); - group.bench_with_input( - BenchmarkId::new("decode", size), - &compressed, - |b, compressed| { - b.iter(|| pz::lz78::decode(compressed).unwrap()); - }, - ); - } - group.finish(); -} - -criterion_group!(benches, bench_lz78); -criterion_main!(benches); diff --git a/benches/stages_lz_comparison.rs b/benches/stages_lz_comparison.rs index 1d4971d..f5fcc19 100644 --- a/benches/stages_lz_comparison.rs +++ b/benches/stages_lz_comparison.rs @@ -18,13 +18,9 @@ fn bench_lz_comparison(c: &mut Criterion) { group.bench_with_input(BenchmarkId::new("lzss_compress", size), &data, |b, data| { b.iter(|| pz::lzss::encode(data).unwrap()); }); - group.bench_with_input(BenchmarkId::new("lz78_compress", size), &data, |b, data| { - b.iter(|| pz::lz78::encode(data).unwrap()); - }); let lz77_c = pz::lz77::compress_lazy(&data).unwrap(); let lzss_c = pz::lzss::encode(&data).unwrap(); - let lz78_c = pz::lz78::encode(&data).unwrap(); group.bench_with_input( BenchmarkId::new("lz77_decompress", size), @@ -40,13 +36,6 @@ fn bench_lz_comparison(c: &mut Criterion) { b.iter(|| pz::lzss::decode(c).unwrap()); }, ); - group.bench_with_input( - BenchmarkId::new("lz78_decompress", size), - &lz78_c, - |b, c| { - b.iter(|| pz::lz78::decode(c).unwrap()); - }, - ); group.finish(); } diff --git a/benches/stages_lz_plus_fse.rs b/benches/stages_lz_plus_fse.rs index bc437d4..e7b96bb 100644 --- a/benches/stages_lz_plus_fse.rs +++ b/benches/stages_lz_plus_fse.rs @@ -34,17 +34,6 @@ fn bench_lz_plus_fse(c: &mut Criterion) { }, ); - group.bench_with_input( - BenchmarkId::new("lz78_fse_compress", size), - &data, - |b, data| { - b.iter(|| { - let lz = pz::lz78::encode(data).unwrap(); - pz::fse::encode(&lz) - }); - }, - ); - group.finish(); } diff --git a/benches/throughput_lzseqr.rs b/benches/throughput_lzseqr.rs new file mode 100644 index 0000000..2a5cb60 --- /dev/null +++ b/benches/throughput_lzseqr.rs @@ -0,0 +1,23 @@ +#[path = "throughput_common.rs"] +mod throughput_common; + +use criterion::{criterion_group, criterion_main, Criterion}; +use pz::pipeline::Pipeline; +use throughput_common::{run_throughput_benches, ThroughputBenchSpec}; + +const SPEC: ThroughputBenchSpec = ThroughputBenchSpec { + id: "lzseqr", + pipeline: Pipeline::LzSeqR, + parallel: true, + large: true, + decompress_large: true, + webgpu: false, + webgpu_large: false, +}; + +fn bench(c: &mut Criterion) { + run_throughput_benches(c, &SPEC); +} + +criterion_group!(benches, bench); +criterion_main!(benches); diff --git a/docs/exec-plans/tech-debt-tracker.md b/docs/exec-plans/tech-debt-tracker.md index 27170e2..ed4b805 100644 --- a/docs/exec-plans/tech-debt-tracker.md +++ b/docs/exec-plans/tech-debt-tracker.md @@ -29,7 +29,7 @@ Catalog of known issues, gaps, and technical debt in libpz. Items are prioritize - `fuzz_pipeline_roundtrip` — all 9 pipelines compress/decompress roundtrip - `fuzz_decompress` — arbitrary bytes to decompress (crash resistance) - `fuzz_bwt`, `fuzz_rans`, `fuzz_fse`, `fuzz_huffman` — entropy coding roundtrips - - `fuzz_lz77`, `fuzz_lz78`, `fuzz_lzss`, `fuzz_lzseq` — LZ family roundtrips + - `fuzz_lz77`, `fuzz_lzss`, `fuzz_lzseq` — LZ family roundtrips - `fuzz_rle`, `fuzz_mtf` — simple codec roundtrips - Each target tests encode→decode roundtrip AND arbitrary-bytes decode crash resistance - Requires: `rustup toolchain install nightly && cargo install cargo-fuzz` diff --git a/examples/explore_pipelines.rs b/examples/explore_pipelines.rs index 5ec7456..785aa55 100644 --- a/examples/explore_pipelines.rs +++ b/examples/explore_pipelines.rs @@ -313,16 +313,14 @@ fn main() { (Pipeline::Lzf, ParseStrategy::Optimal, 1, Backend::Cpu), (Pipeline::Bw, ParseStrategy::Auto, 1, Backend::Cpu), (Pipeline::Bbw, ParseStrategy::Auto, 1, Backend::Cpu), - // Experimental: LZSS and LZ78 pipelines + // Experimental: LZSS pipeline (Pipeline::LzssR, ParseStrategy::Auto, 1, Backend::Cpu), - (Pipeline::Lz78R, ParseStrategy::Auto, 1, Backend::Cpu), // Multi-threaded CPU (Pipeline::Deflate, ParseStrategy::Lazy, 0, Backend::Cpu), (Pipeline::Lzf, ParseStrategy::Lazy, 0, Backend::Cpu), (Pipeline::Bw, ParseStrategy::Auto, 0, Backend::Cpu), (Pipeline::Bbw, ParseStrategy::Auto, 0, Backend::Cpu), (Pipeline::LzssR, ParseStrategy::Auto, 0, Backend::Cpu), - (Pipeline::Lz78R, ParseStrategy::Auto, 0, Backend::Cpu), ]; // WebGPU GPU variants (if available) diff --git a/examples/pipeline_comparison.rs b/examples/pipeline_comparison.rs new file mode 100644 index 0000000..357bc41 --- /dev/null +++ b/examples/pipeline_comparison.rs @@ -0,0 +1,77 @@ +use pz::pipeline::{self, Pipeline}; +use std::time::Instant; + +fn main() { + println!("\n=== Pipeline Compression Comparison ===\n"); + + // Test with different data patterns + test_pipeline_comparison("repetitive", generate_repetitive(256 * 1024)); + test_pipeline_comparison("sequential", generate_sequential(256 * 1024)); + test_pipeline_comparison("structured", generate_structured(256 * 1024)); +} + +fn test_pipeline_comparison(name: &str, data: Vec) { + println!("Test data: {} ({} bytes)", name, data.len()); + println!("{:-<80}", ""); + + let pipelines = vec![ + ("Lzr (LZ77+rANS)", Pipeline::Lzr), + ("Lzf (LZ77+FSE)", Pipeline::Lzf), + ("LzSeqR (LzSeq+rANS)", Pipeline::LzSeqR), + ("Deflate (LZ77+Huffman)", Pipeline::Deflate), + ]; + + println!( + "{:<25} {:<15} {:<15} {:<15}", + "Pipeline", "Compressed", "Ratio %", "Time (ms)" + ); + println!("{:-<70}", ""); + + for (label, pipeline) in pipelines { + let start = Instant::now(); + match pipeline::compress(&data, pipeline) { + Ok(compressed) => { + let elapsed = start.elapsed().as_secs_f64() * 1000.0; + let ratio = (compressed.len() as f64 / data.len() as f64) * 100.0; + println!( + "{:<25} {:<15} {:<15.2} {:<15.3}", + label, + format!("{} bytes", compressed.len()), + ratio, + elapsed + ); + } + Err(e) => { + println!("{:<25} ERROR: {}", label, e); + } + } + } + println!(); +} + +fn generate_repetitive(size: usize) -> Vec { + let pattern = + b"The quick brown fox jumps over the lazy dog. This is a test pattern for compression. "; + let mut data = Vec::with_capacity(size); + while data.len() < size { + let remaining = size - data.len(); + let chunk = remaining.min(pattern.len()); + data.extend_from_slice(&pattern[..chunk]); + } + data +} + +fn generate_sequential(size: usize) -> Vec { + (0..size).map(|i| (i % 256) as u8).collect() +} + +fn generate_structured(size: usize) -> Vec { + let json_pattern = br#"{"key":"value","number":123,"nested":{"field":"data"}}"#; + let mut data = Vec::with_capacity(size); + while data.len() < size { + let remaining = size - data.len(); + let chunk = remaining.min(json_pattern.len()); + data.extend_from_slice(&json_pattern[..chunk]); + } + data +} diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index 8c00088..56dd65c 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -56,11 +56,6 @@ name = "fuzz_lz77" path = "fuzz_targets/fuzz_lz77.rs" doc = false -[[bin]] -name = "fuzz_lz78" -path = "fuzz_targets/fuzz_lz78.rs" -doc = false - [[bin]] name = "fuzz_lzss" path = "fuzz_targets/fuzz_lzss.rs" diff --git a/fuzz/fuzz_targets/fuzz_lz78.rs b/fuzz/fuzz_targets/fuzz_lz78.rs deleted file mode 100644 index 71578f2..0000000 --- a/fuzz/fuzz_targets/fuzz_lz78.rs +++ /dev/null @@ -1,23 +0,0 @@ -#![no_main] -use libfuzzer_sys::fuzz_target; -use pz::lz78; - -/// LZ78 encode/decode roundtrip and decode crash resistance. -fuzz_target!(|data: &[u8]| { - if data.is_empty() { - return; - } - let input = if data.len() > 64 * 1024 { &data[..64 * 1024] } else { data }; - - // Roundtrip - let encoded = match lz78::encode(input) { - Ok(e) => e, - Err(_) => return, - }; - let decoded = lz78::decode(&encoded) - .expect("LZ78 decode failed on valid encoded data"); - assert_eq!(input, decoded.as_slice(), "LZ78 roundtrip mismatch"); - - // Crash resistance - let _ = lz78::decode(data); -}); diff --git a/fuzz/fuzz_targets/fuzz_pipeline_roundtrip.rs b/fuzz/fuzz_targets/fuzz_pipeline_roundtrip.rs index e910d4a..1114eb7 100644 --- a/fuzz/fuzz_targets/fuzz_pipeline_roundtrip.rs +++ b/fuzz/fuzz_targets/fuzz_pipeline_roundtrip.rs @@ -18,7 +18,6 @@ fuzz_target!(|data: &[u8]| { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, Pipeline::LzSeqR, ]; diff --git a/src/bin/pz.rs b/src/bin/pz.rs index 5ed9d6e..4cce780 100644 --- a/src/bin/pz.rs +++ b/src/bin/pz.rs @@ -81,7 +81,6 @@ fn list_pipelines() { ("lzf", "4", "LZ77 + FSE (zstd-style entropy coding)"), ("lzfi", "5", "LZSS + interleaved FSE (fast CPU decode)"), ("lzssr", "6", "LZSS + rANS (experimental)"), - ("lz78r", "7", "LZ78 + rANS (experimental)"), ("lzseqr", "8", "LzSeq + rANS (zstd-style code+extra-bits)"), ("lzseqh", "9", "LzSeq + Huffman (fast decode)"), ("sortlz", "10", "Sort-based LZ77 + FSE (GPU experiment)"), @@ -245,7 +244,6 @@ fn parse_args() -> Opts { "lzf" | "4" => Pipeline::Lzf, "lzfi" | "5" => Pipeline::Lzfi, "lzssr" | "6" => Pipeline::LzssR, - "lz78r" | "7" => Pipeline::Lz78R, "lzseqr" | "8" => Pipeline::LzSeqR, "lzseqh" | "9" => Pipeline::LzSeqH, "sortlz" | "10" => Pipeline::SortLz, @@ -441,7 +439,6 @@ fn list_file(path: &str, data: &[u8]) -> Result<(), String> { 4 => "lzf", 5 => "lzfi", 6 => "lzssr", - 7 => "lz78r", 8 => "lzseqr", 9 => "lzseqh", 10 => "sortlz", diff --git a/src/lib.rs b/src/lib.rs index 52cc680..cd5dcdb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -7,7 +7,6 @@ pub mod fse; pub mod gzip; pub mod huffman; pub mod lz77; -pub mod lz78; pub mod lzseq; pub mod lzss; pub mod mtf; diff --git a/src/lz78.rs b/src/lz78.rs deleted file mode 100644 index bbfc806..0000000 --- a/src/lz78.rs +++ /dev/null @@ -1,315 +0,0 @@ -/// LZ78 (Lempel-Ziv 1978) compression and decompression. -/// -/// Builds an incremental dictionary (trie) during compression. At each step, -/// finds the longest prefix of the remaining input that exists in the -/// dictionary, emits a (dictionary_index, next_byte) token, and adds the -/// extended string to the dictionary. -/// -/// Wire format: -/// ```text -/// [original_len: u32 LE] [max_dict_size: u16 LE] [num_tokens: u32 LE] -/// [tokens: (index: u16 LE, next: u8) × num_tokens] -/// ``` -/// -/// Each token is 3 bytes. Dictionary index 0 represents the root (empty string). -/// When the dictionary fills to max_dict_size, no new entries are added (freeze). -use std::collections::HashMap; - -use crate::{PzError, PzResult}; - -/// Default maximum dictionary entries. -const DEFAULT_DICT_SIZE: u16 = 16384; - -/// Header size: original_len (4) + max_dict_size (2) + num_tokens (4). -const HEADER_SIZE: usize = 10; - -/// Bytes per serialized token: index (2) + next (1). -const TOKEN_SIZE: usize = 3; - -/// Compress input using LZ78 with the default dictionary size (16384). -pub fn encode(input: &[u8]) -> PzResult> { - encode_with_dict_size(input, DEFAULT_DICT_SIZE) -} - -/// Compress input using LZ78 with a specific maximum dictionary size. -/// -/// `max_dict_size` must be >= 2 (at least root + one entry). -pub fn encode_with_dict_size(input: &[u8], max_dict_size: u16) -> PzResult> { - if input.is_empty() { - return Ok(Vec::new()); - } - - if max_dict_size < 2 { - return Err(PzError::InvalidInput); - } - - // Trie: (parent_index, byte) -> child_index - // Index 0 = root (empty string) - let mut trie: HashMap<(u16, u8), u16> = HashMap::new(); - let mut next_index: u16 = 1; - let mut tokens: Vec<(u16, u8)> = Vec::new(); - let mut pos = 0; - - while pos < input.len() { - let mut current_index: u16 = 0; // start at root - - // Walk the trie, finding the longest known prefix - while pos < input.len() { - let byte = input[pos]; - if let Some(&child) = trie.get(&(current_index, byte)) { - current_index = child; - pos += 1; - } else { - break; - } - } - - if pos < input.len() { - let byte = input[pos]; - tokens.push((current_index, byte)); - - // Add new entry to dictionary if not full - if next_index < max_dict_size { - trie.insert((current_index, byte), next_index); - next_index += 1; - } - pos += 1; - } else { - // Reached end of input while matching a prefix. - // Emit token with a dummy next byte; decoder uses original_len - // to truncate output to the correct length. - tokens.push((current_index, 0)); - } - } - - // Serialize - let total_size = HEADER_SIZE + tokens.len() * TOKEN_SIZE; - let mut output = Vec::with_capacity(total_size); - output.extend_from_slice(&(input.len() as u32).to_le_bytes()); - output.extend_from_slice(&max_dict_size.to_le_bytes()); - output.extend_from_slice(&(tokens.len() as u32).to_le_bytes()); - - for &(index, next) in &tokens { - output.extend_from_slice(&index.to_le_bytes()); - output.push(next); - } - - Ok(output) -} - -/// Decompress LZ78-compressed data. -pub fn decode(input: &[u8]) -> PzResult> { - if input.is_empty() { - return Ok(Vec::new()); - } - - if input.len() < HEADER_SIZE { - return Err(PzError::InvalidInput); - } - - let original_len = u32::from_le_bytes(input[0..4].try_into().unwrap()) as usize; - let max_dict_size = u16::from_le_bytes(input[4..6].try_into().unwrap()) as usize; - let num_tokens = u32::from_le_bytes(input[6..10].try_into().unwrap()) as usize; - - let expected_data_len = num_tokens * TOKEN_SIZE; - if input.len() < HEADER_SIZE + expected_data_len { - return Err(PzError::InvalidInput); - } - - // Dictionary: index -> full string - // Index 0 = root = empty string - let mut dict: Vec> = Vec::with_capacity(max_dict_size.min(65536)); - dict.push(Vec::new()); // root - - let mut output = Vec::with_capacity(original_len); - let token_data = &input[HEADER_SIZE..]; - - for i in 0..num_tokens { - let base = i * TOKEN_SIZE; - let index = u16::from_le_bytes([token_data[base], token_data[base + 1]]) as usize; - let next = token_data[base + 2]; - - if index >= dict.len() { - return Err(PzError::InvalidInput); - } - - // Output = dict[index] + next_byte - let prefix = &dict[index]; - output.extend_from_slice(prefix); - output.push(next); - - // Add new dictionary entry: prefix + next - if dict.len() < max_dict_size { - let mut new_entry = prefix.clone(); - new_entry.push(next); - dict.push(new_entry); - } - } - - // Truncate to original length (handles end-of-stream edge case) - output.truncate(original_len); - Ok(output) -} - -/// Compress into a caller-allocated buffer. Returns bytes written. -pub fn encode_to_buf(input: &[u8], output: &mut [u8]) -> PzResult { - let encoded = encode(input)?; - if encoded.len() > output.len() { - return Err(PzError::BufferTooSmall); - } - output[..encoded.len()].copy_from_slice(&encoded); - Ok(encoded.len()) -} - -/// Decompress into a caller-allocated buffer. Returns bytes written. -pub fn decode_to_buf(input: &[u8], output: &mut [u8]) -> PzResult { - if input.is_empty() { - return Ok(0); - } - - if input.len() < HEADER_SIZE { - return Err(PzError::InvalidInput); - } - - let original_len = u32::from_le_bytes(input[0..4].try_into().unwrap()) as usize; - if original_len > output.len() { - return Err(PzError::BufferTooSmall); - } - - let decoded = decode(input)?; - output[..decoded.len()].copy_from_slice(&decoded); - Ok(decoded.len()) -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_round_trip_text() { - let input = b"To be, or not to be, that is the question: \ - Whether 'tis nobler in the mind to suffer \ - The slings and arrows of outrageous fortune, \ - Or to take arms against a sea of troubles" - .to_vec(); - let compressed = encode(&input).unwrap(); - let decompressed = decode(&compressed).unwrap(); - assert_eq!(decompressed, input); - } - - #[test] - fn test_round_trip_binary() { - let input: Vec = (0..1024).map(|i| (i % 256) as u8).collect(); - let compressed = encode(&input).unwrap(); - let decompressed = decode(&compressed).unwrap(); - assert_eq!(decompressed, input); - } - - #[test] - fn test_small_dict() { - let input = b"aaabbbaaabbbaaabbb".to_vec(); - let compressed = encode_with_dict_size(&input, 8).unwrap(); - let decompressed = decode(&compressed).unwrap(); - assert_eq!(decompressed, input); - } - - #[test] - fn test_large_dict() { - let pattern = b"the quick brown fox jumps over the lazy dog. "; - let mut input = Vec::new(); - for _ in 0..50 { - input.extend_from_slice(pattern); - } - let compressed = encode_with_dict_size(&input, 65535).unwrap(); - let decompressed = decode(&compressed).unwrap(); - assert_eq!(decompressed, input); - } - - #[test] - fn test_dict_fills_up() { - // With a tiny dictionary, it should fill up quickly but still work - let mut input = Vec::new(); - for i in 0..500u16 { - input.push((i % 256) as u8); - input.push(((i / 256) % 256) as u8); - } - let compressed = encode_with_dict_size(&input, 16).unwrap(); - let decompressed = decode(&compressed).unwrap(); - assert_eq!(decompressed, input); - } - - #[test] - fn test_invalid_dict_size() { - let input = b"hello".to_vec(); - let result = encode_with_dict_size(&input, 1); - assert_eq!(result, Err(PzError::InvalidInput)); - } - - #[test] - fn test_decode_invalid_short() { - let result = decode(&[0, 1, 2]); - assert_eq!(result, Err(PzError::InvalidInput)); - } - - #[test] - fn test_decode_invalid_truncated() { - let mut data = vec![0u8; HEADER_SIZE]; - // original_len = 10, max_dict_size = 256, num_tokens = 100 (too many for data) - data[0..4].copy_from_slice(&10u32.to_le_bytes()); - data[4..6].copy_from_slice(&256u16.to_le_bytes()); - data[6..10].copy_from_slice(&100u32.to_le_bytes()); - let result = decode(&data); - assert_eq!(result, Err(PzError::InvalidInput)); - } - - #[test] - fn test_decode_invalid_index() { - // Craft a token with an out-of-range dictionary index - let mut data = vec![0u8; HEADER_SIZE + TOKEN_SIZE]; - data[0..4].copy_from_slice(&1u32.to_le_bytes()); // original_len = 1 - data[4..6].copy_from_slice(&256u16.to_le_bytes()); // max_dict_size - data[6..10].copy_from_slice(&1u32.to_le_bytes()); // num_tokens = 1 - // Token: index = 999 (invalid), next = 'a' - data[10..12].copy_from_slice(&999u16.to_le_bytes()); - data[12] = b'a'; - let result = decode(&data); - assert_eq!(result, Err(PzError::InvalidInput)); - } - - #[test] - fn test_encode_to_buf() { - let input = b"hello hello hello".to_vec(); - let encoded = encode(&input).unwrap(); - let mut buf = vec![0u8; encoded.len() + 10]; - let written = encode_to_buf(&input, &mut buf).unwrap(); - assert_eq!(written, encoded.len()); - assert_eq!(&buf[..written], &encoded[..]); - } - - #[test] - fn test_encode_to_buf_too_small() { - let input = vec![b'a'; 100]; - let mut buf = vec![0u8; 1]; - let result = encode_to_buf(&input, &mut buf); - assert_eq!(result, Err(PzError::BufferTooSmall)); - } - - #[test] - fn test_decode_to_buf() { - let input = b"test data for decode_to_buf".to_vec(); - let encoded = encode(&input).unwrap(); - let mut buf = vec![0u8; input.len() + 10]; - let written = decode_to_buf(&encoded, &mut buf).unwrap(); - assert_eq!(written, input.len()); - assert_eq!(&buf[..written], &input[..]); - } - - #[test] - fn test_decode_to_buf_too_small() { - let input = vec![b'z'; 100]; - let encoded = encode(&input).unwrap(); - let mut buf = vec![0u8; 1]; - let result = decode_to_buf(&encoded, &mut buf); - assert_eq!(result, Err(PzError::BufferTooSmall)); - } -} diff --git a/src/pipeline/blocks.rs b/src/pipeline/blocks.rs index 1b39ba6..1ee8d8f 100644 --- a/src/pipeline/blocks.rs +++ b/src/pipeline/blocks.rs @@ -1,6 +1,6 @@ //! Per-pipeline single-block compress and decompress implementations. //! -//! LZ-based pipelines (Deflate, Lzr, Lzf, Lzfi, LzssR, Lz78R) use a unified path: +//! LZ-based pipelines (Deflate, Lzr, Lzf, Lzfi, LzssR) use a unified path: //! compress: demux → entropy_encode //! decompress: entropy_decode → demux //! @@ -142,7 +142,7 @@ fn entropy_encode( let _ = (input_len, options); stage_huffman_encode(block) } - Pipeline::Lzr | Pipeline::LzssR | Pipeline::Lz78R | Pipeline::LzSeqR => { + Pipeline::Lzr | Pipeline::LzssR | Pipeline::LzSeqR => { let _ = (input_len, options); stage_rans_encode_with_options(block, options) } @@ -180,7 +180,7 @@ fn entropy_decode( ) -> PzResult { match pipeline { Pipeline::Deflate => stage_huffman_decode(block), - Pipeline::Lzr | Pipeline::LzssR | Pipeline::Lz78R | Pipeline::LzSeqR => { + Pipeline::Lzr | Pipeline::LzssR | Pipeline::LzSeqR => { #[cfg(feature = "webgpu")] { if let Backend::WebGpu = options.backend { diff --git a/src/pipeline/demux.rs b/src/pipeline/demux.rs index e3a3ac2..e1a10f3 100644 --- a/src/pipeline/demux.rs +++ b/src/pipeline/demux.rs @@ -2,7 +2,6 @@ //! for entropy coding, and re-merges them on decompression. use crate::lz77; -use crate::lz78; use crate::lzseq; use crate::lzss; use crate::{PzError, PzResult}; @@ -21,7 +20,7 @@ pub(crate) struct DemuxOutput { pub meta: Vec, } -/// Describes how a pre-entropy stage (LZ77, LZSS, LZ78, etc.) splits its +/// Describes how a pre-entropy stage (LZ77, LZSS, LzSeq, etc.) splits its /// output into independent byte streams for entropy coding, and merges /// them back on decompression. pub(crate) trait StreamDemuxer { @@ -46,8 +45,6 @@ pub(crate) enum LzDemuxer { Lz77, /// LZSS: 4 streams (flags, literals, offsets, lengths). Lzss, - /// LZ78: 1 stream (flat blob, no splitting). - Lz78, /// LzSeq: 6 streams (flags, literals, offset_codes, offset_extra, length_codes, length_extra). LzSeq, } @@ -60,7 +57,6 @@ pub(crate) fn demuxer_for_pipeline(pipeline: super::Pipeline) -> Option Some(LzDemuxer::Lzss), - super::Pipeline::Lz78R => Some(LzDemuxer::Lz78), super::Pipeline::LzSeqR | super::Pipeline::LzSeqH => Some(LzDemuxer::LzSeq), super::Pipeline::Bw | super::Pipeline::Bbw => None, super::Pipeline::SortLz => None, @@ -95,7 +91,6 @@ impl StreamDemuxer for LzDemuxer { match self { LzDemuxer::Lz77 => 3, LzDemuxer::Lzss => 4, - LzDemuxer::Lz78 => 1, LzDemuxer::LzSeq => 6, } } @@ -170,15 +165,6 @@ impl StreamDemuxer for LzDemuxer { meta: num_tokens.to_le_bytes().to_vec(), }) } - LzDemuxer::Lz78 => { - let encoded = lz78::encode(input)?; - let pre_entropy_len = encoded.len(); - Ok(DemuxOutput { - streams: vec![encoded], - pre_entropy_len, - meta: Vec::new(), - }) - } LzDemuxer::LzSeq => { // SortLZ match finder path: use sort-based matches converted // to LzSeq's 6-stream format via encode_match_sequence. @@ -355,16 +341,6 @@ impl StreamDemuxer for LzDemuxer { } Ok(decoded) } - LzDemuxer::Lz78 => { - if streams.len() != 1 { - return Err(PzError::InvalidInput); - } - let decoded = lz78::decode(&streams[0])?; - if decoded.len() != original_len { - return Err(PzError::InvalidInput); - } - Ok(decoded) - } LzDemuxer::LzSeq => { if streams.len() != 6 { return Err(PzError::InvalidInput); diff --git a/src/pipeline/mod.rs b/src/pipeline/mod.rs index 039b36d..67e8371 100644 --- a/src/pipeline/mod.rs +++ b/src/pipeline/mod.rs @@ -15,7 +15,6 @@ //! | `Lzf` | LZ77 → FSE | zstd-like | //! | `Lzfi` | LZSS → interleaved FSE | fast CPU decode | //! | `LzssR` | LZSS → rANS | experimental | -//! | `Lz78R` | LZ78 → rANS | experimental | //! | `LzSeqR` | LzSeq → rANS | zstd-style | //! | `LzSeqH` | LzSeq → Huffman | fast decode | //! | `SortLz` | SortLZ → FSE | GPU match find | @@ -28,7 +27,7 @@ //! Each compressed stream starts with a header: //! - Magic bytes: `PZ` (2 bytes) //! - Version: 2 (1 byte) -//! - Pipeline ID: 0=Deflate, 1=Bw, 3=Lzr, 4=Lzf, 5=Lzfi, 6=LzssR, 7=Lz78R, 8=LzSeqR, 9=LzSeqH, 10=SortLz (1 byte) +//! - Pipeline ID: 0=Deflate, 1=Bw, 3=Lzr, 4=Lzf, 5=Lzfi, 6=LzssR, 8=LzSeqR, 9=LzSeqH, 10=SortLz (1 byte) //! - Original length: u32 little-endian (4 bytes) //! - num_blocks: u32 little-endian (4 bytes) //! - Block table: \[compressed_len: u32, original_len: u32\] \* num_blocks @@ -340,8 +339,8 @@ pub enum Pipeline { Lzfi = 5, /// LZSS + rANS (flag-bit LZ + arithmetic ANS, experimental) LzssR = 6, - /// LZ78 + rANS (incremental trie + rANS, experimental) - Lz78R = 7, + // ID 7 was Lz78R (LZ78 + rANS) — removed as uncompetitive (8-13x worse ratio, + // 34% slower than Lzr). See lz78r_benchmark_report.md. /// LzSeq + rANS (code+extra-bits sequence encoding, zstd-style) LzSeqR = 8, /// LzSeq + Huffman (fast decode, simpler entropy coding) @@ -364,7 +363,7 @@ impl TryFrom for Pipeline { 4 => Ok(Self::Lzf), 5 => Ok(Self::Lzfi), 6 => Ok(Self::LzssR), - 7 => Ok(Self::Lz78R), + // 7 was Lz78R — removed 8 => Ok(Self::LzSeqR), 9 => Ok(Self::LzSeqH), 10 => Ok(Self::SortLz), @@ -699,7 +698,6 @@ pub fn select_pipeline_trial( Pipeline::Lzr, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, Pipeline::LzSeqR, Pipeline::LzSeqH, Pipeline::SortLz, @@ -712,14 +710,12 @@ pub fn select_pipeline_trial( for &pipeline in &candidates { // SortLz pipeline has its own match finder, only test default - let finders: &[MatchFinder] = if matches!( - pipeline, - Pipeline::Bw | Pipeline::Bbw | Pipeline::Lz78R | Pipeline::SortLz - ) { - &[MatchFinder::HashChain] - } else { - &match_finders - }; + let finders: &[MatchFinder] = + if matches!(pipeline, Pipeline::Bw | Pipeline::Bbw | Pipeline::SortLz) { + &[MatchFinder::HashChain] + } else { + &match_finders + }; for &finder in finders { let opts = CompressOptions { diff --git a/src/pipeline/stages.rs b/src/pipeline/stages.rs index de5b7d2..84797a6 100644 --- a/src/pipeline/stages.rs +++ b/src/pipeline/stages.rs @@ -911,8 +911,6 @@ pub(crate) fn run_compress_stage( } (Pipeline::LzssR, 0) => stage_demux_compress(block, &LzDemuxer::Lzss, options), (Pipeline::LzssR, 1) => stage_rans_encode_with_options(block, options), - (Pipeline::Lz78R, 0) => stage_demux_compress(block, &LzDemuxer::Lz78, options), - (Pipeline::Lz78R, 1) => stage_rans_encode_with_options(block, options), (Pipeline::LzSeqR, 0) => stage_demux_compress(block, &LzDemuxer::LzSeq, options), (Pipeline::LzSeqR, 1) => { #[cfg(feature = "webgpu")] diff --git a/src/pipeline/tests.rs b/src/pipeline/tests.rs index aab7f1f..8fa0f8b 100644 --- a/src/pipeline/tests.rs +++ b/src/pipeline/tests.rs @@ -50,7 +50,6 @@ fn test_all_pipelines_banana() { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, Pipeline::Lzfi, ] { let compressed = compress(input, pipeline).unwrap(); @@ -72,7 +71,6 @@ fn test_all_pipelines_medium_text() { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, Pipeline::Lzfi, ] { let compressed = compress(&input, pipeline).unwrap(); diff --git a/src/validation.rs b/src/validation.rs index 43aa571..4e8e833 100644 --- a/src/validation.rs +++ b/src/validation.rs @@ -13,7 +13,6 @@ mod tests { use crate::frequency; use crate::huffman::HuffmanTree; use crate::lz77; - use crate::lz78; use crate::lzss; use crate::mtf; use crate::pipeline::{self, Pipeline}; @@ -132,14 +131,6 @@ mod tests { let decompressed = lzss::decode(&compressed).unwrap(); assert_eq!(decompressed, input, "lzss round-trip failed"); } - - #[test] - fn lz78() { - let input = $data; - let compressed = lz78::encode(&input).unwrap(); - let decompressed = lz78::decode(&compressed).unwrap(); - assert_eq!(decompressed, input, "lz78 round-trip failed"); - } } }; } @@ -246,19 +237,6 @@ mod tests { assert_eq!(inv_lzss, input); } - #[test] - fn lz78_then_fse() { - use crate::fse; - let input = data_repeating_text(); - let lz78_data = lz78::encode(&input).unwrap(); - let fse_data = fse::encode(&lz78_data); - // Inverse - let inv_fse = fse::decode(&fse_data, lz78_data.len()).unwrap(); - assert_eq!(inv_fse, lz78_data); - let inv_lz78 = lz78::decode(&inv_fse).unwrap(); - assert_eq!(inv_lz78, input); - } - #[test] fn mtf_then_rle_then_huffman() { // MTF + RLE + Huffman (without BWT) @@ -305,7 +283,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -322,7 +299,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -339,7 +315,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -356,7 +331,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -373,7 +347,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -390,7 +363,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -407,7 +379,6 @@ mod tests { Pipeline::Lzf, Pipeline::Lzfi, Pipeline::LzssR, - Pipeline::Lz78R, ] { assert_pipeline_round_trip(&input, p); } @@ -543,22 +514,10 @@ mod tests { lz77_size ); } - - #[test] - fn lz78_dictionary_grows() { - let input = data_repeating_text(); - let compressed = lz78::encode(&input).unwrap(); - assert!( - compressed.len() < input.len(), - "LZ78 should compress repetitive text: {} >= {}", - compressed.len(), - input.len() - ); - } } // --------------------------------------------------------------- - // Compression ratio comparison (LZ77 vs LZSS vs LZ78) + // Compression ratio comparison (LZ77 vs LZSS) // --------------------------------------------------------------- mod lz_comparison { @@ -568,33 +527,27 @@ mod tests { fn ratio_report(name: &str, input: &[u8]) { let lz77_raw = lz77::compress_lazy(input).unwrap(); let lzss_raw = lzss::encode(input).unwrap(); - let lz78_raw = lz78::encode(input).unwrap(); let lz77_fse = fse::encode(&lz77_raw); let lzss_fse = fse::encode(&lzss_raw); - let lz78_fse = fse::encode(&lz78_raw); eprintln!( - " {:20} {:>6}B | LZ77 {:>6} ({:5.1}%) | LZSS {:>6} ({:5.1}%) | LZ78 {:>6} ({:5.1}%)", + " {:20} {:>6}B | LZ77 {:>6} ({:5.1}%) | LZSS {:>6} ({:5.1}%)", name, input.len(), lz77_raw.len(), 100.0 * lz77_raw.len() as f64 / input.len() as f64, lzss_raw.len(), 100.0 * lzss_raw.len() as f64 / input.len() as f64, - lz78_raw.len(), - 100.0 * lz78_raw.len() as f64 / input.len() as f64, ); eprintln!( - " {:20} {:>6} | +FSE {:>6} ({:5.1}%) | +FSE {:>6} ({:5.1}%) | +FSE {:>6} ({:5.1}%)", + " {:20} {:>6} | +FSE {:>6} ({:5.1}%) | +FSE {:>6} ({:5.1}%)", "", "", lz77_fse.len(), 100.0 * lz77_fse.len() as f64 / input.len() as f64, lzss_fse.len(), 100.0 * lzss_fse.len() as f64 / input.len() as f64, - lz78_fse.len(), - 100.0 * lz78_fse.len() as f64 / input.len() as f64, ); } @@ -649,7 +602,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, pipe).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -756,7 +708,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(input, pipe).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -784,7 +735,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(input, pipe).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -815,7 +765,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, p).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -834,7 +783,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, p).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -853,7 +801,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, p).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -880,7 +827,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, p).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -898,7 +844,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, p).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap(); @@ -917,7 +862,6 @@ mod tests { Pipeline::Lzr, Pipeline::Lzf, Pipeline::LzssR, - Pipeline::Lz78R, ] { let compressed = pipeline::compress(&input, p).unwrap(); let decompressed = pipeline::decompress(&compressed).unwrap();