From 393d3a76bbf8e382dbabe15da4cb63d1bd1aab2d Mon Sep 17 00:00:00 2001
From: Justin Payne
Date: Fri, 29 May 2026 09:23:27 -0500
Subject: [PATCH] test: add acceptance tests for #62
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Comprehensive test suite for CoverageTrack RLE compression with quantization:

Unit tests (14):
- Uniform coverage (single run)
- Alternating coverage (many runs)
- Zero coverage regions
- Quantization to nearest 5 (exact multiples, boundaries, rounding)
- Random access via binary search (O(log n))
- Out-of-bounds access
- Iterator over positions
- Empty coverage edge case
- Single position edge case
- Realistic bacterial genome (4.6Mbp with variation)
- High depth sequencing (100x)
- Serialization round-trip

Property tests (5):
- Round-trip encode/decode equality (modulo quantization)
- Quantization idempotence (quantize twice == quantize once)
- coverage_at(i) matches to_vec()[i]
- All decompressed values are multiples of 5
- Quantized values within ±2 of original

Benchmarks:
- Compression ratio on uniform, realistic, high-variation, and random coverage
- Random access performance via binary search
- Full decompression performance

All tests FAIL as expected (RED phase) - implementation deferred to a separate agent.

Co-Authored-By: Claude Sonnet 4.5 --- .gitignore | 1 + phraya-core/Cargo.toml | 8 +- phraya-core/benches/coverage_track.rs | 144 ++++++++++ phraya-core/src/lib.rs | 6 + phraya-core/src/types.rs | 362 +++++++++++++++++++++++++- 5 files changed, 516 insertions(+), 5 deletions(-) create mode 100644 phraya-core/benches/coverage_track.rs diff --git a/.gitignore b/.gitignore index 99a1e3c..6ca8d52 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,4 @@ Thumbs.db *.cubin *.fatbin *.ptx +proptest-regressions/ diff --git a/phraya-core/Cargo.toml b/phraya-core/Cargo.toml index f298c05..decb84e 100644 --- a/phraya-core/Cargo.toml +++ b/phraya-core/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "phraya-core" version = "0.1.0" -edition = "2026" +edition = "2021" [dependencies] serde = { workspace = true } @@ -9,3 +9,9 @@ thiserror = { workspace = true } [dev-dependencies] serde_json = "1.0" +proptest = "1.4" +criterion = "0.5" + +[[bench]] +name = "coverage_track" +harness = false diff --git a/phraya-core/benches/coverage_track.rs b/phraya-core/benches/coverage_track.rs new file mode 100644 index 0000000..8c5e8a4 --- /dev/null +++ b/phraya-core/benches/coverage_track.rs @@ -0,0 +1,144 @@ +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use phraya_core::CoverageTrack; + +fn benchmark_compression_ratio(c: &mut Criterion) { + let mut group = c.benchmark_group("coverage_track_compression"); + + // Test 1: Uniform coverage (best case - single RLE run) + let uniform_coverage = vec![30; 4_600_000]; // E. 
coli genome size at 30x + group.bench_with_input( + BenchmarkId::new("uniform", "4.6Mbp_30x"), + &uniform_coverage, + |b, cov| { + b.iter(|| { + let track = CoverageTrack::from_coverage(black_box(cov.clone())); + black_box(track) + }) + }, + ); + + // Test 2: Realistic coverage with variation (multiple regions) + let mut realistic_coverage = Vec::new(); + realistic_coverage.extend(vec![30; 1_000_000]); // 1Mbp at 30x + realistic_coverage.extend(vec![10; 500_000]); // 500kbp at 10x + realistic_coverage.extend(vec![35; 2_000_000]); // 2Mbp at 35x + realistic_coverage.extend(vec![0; 100_000]); // 100kbp no coverage + realistic_coverage.extend(vec![30; 1_000_000]); // 1Mbp at 30x + + group.bench_with_input( + BenchmarkId::new("realistic", "4.6Mbp_mixed"), + &realistic_coverage, + |b, cov| { + b.iter(|| { + let track = CoverageTrack::from_coverage(black_box(cov.clone())); + black_box(track) + }) + }, + ); + + // Test 3: High variation coverage (worst case - many runs) + let mut high_variation = Vec::new(); + for i in 0..100_000 { + high_variation.push(if i % 2 == 0 { 10 } else { 20 }); + } + + group.bench_with_input( + BenchmarkId::new("high_variation", "100k_alternating"), + &high_variation, + |b, cov| { + b.iter(|| { + let track = CoverageTrack::from_coverage(black_box(cov.clone())); + black_box(track) + }) + }, + ); + + // Test 4: Random coverage (realistic noise) + fn simple_rand(seed: u64) -> u64 { + seed.wrapping_mul(6364136223846793005).wrapping_add(1) + } + + let mut random_coverage = Vec::new(); + let mut seed = 42u64; + for _ in 0..1_000_000 { + seed = simple_rand(seed); + let coverage = 20 + (seed % 20) as usize; // 20-40x with variation + random_coverage.push(coverage); + } + + group.bench_with_input( + BenchmarkId::new("random", "1Mbp_20-40x"), + &random_coverage, + |b, cov| { + b.iter(|| { + let track = CoverageTrack::from_coverage(black_box(cov.clone())); + black_box(track) + }) + }, + ); + + group.finish(); +} + +fn benchmark_random_access(c: &mut 
Criterion) { + let mut group = c.benchmark_group("coverage_track_random_access"); + + // Build a realistic track + let mut coverage = Vec::new(); + coverage.extend(vec![30; 1_000_000]); + coverage.extend(vec![10; 500_000]); + coverage.extend(vec![35; 2_000_000]); + coverage.extend(vec![0; 100_000]); + coverage.extend(vec![30; 1_000_000]); + + let track = CoverageTrack::from_coverage(coverage.clone()); + + // Benchmark random access via binary search + group.bench_function("coverage_at_position", |b| { + b.iter(|| { + let pos = black_box(1_234_567); + let cov = track.coverage_at(pos); + black_box(cov) + }) + }); + + // Benchmark sequential access via iterator + group.bench_function("iterate_all_positions", |b| { + b.iter(|| { + let sum: usize = track.iter().map(|(_, cov)| cov).sum(); + black_box(sum) + }) + }); + + group.finish(); +} + +fn benchmark_decompression(c: &mut Criterion) { + let mut group = c.benchmark_group("coverage_track_decompression"); + + // Build a realistic track + let mut coverage = Vec::new(); + coverage.extend(vec![30; 1_000_000]); + coverage.extend(vec![10; 500_000]); + coverage.extend(vec![35; 2_000_000]); + + let track = CoverageTrack::from_coverage(coverage.clone()); + + // Benchmark full decompression + group.bench_function("to_vec_full_decompression", |b| { + b.iter(|| { + let decompressed = track.to_vec(); + black_box(decompressed) + }) + }); + + group.finish(); +} + +criterion_group!( + benches, + benchmark_compression_ratio, + benchmark_random_access, + benchmark_decompression +); +criterion_main!(benches); diff --git a/phraya-core/src/lib.rs b/phraya-core/src/lib.rs index 493dbbf..c4cdd86 100644 --- a/phraya-core/src/lib.rs +++ b/phraya-core/src/lib.rs @@ -1,6 +1,12 @@ // Module declarations pub mod types; +// Re-exports for convenience +pub use types::{ + AlignmentError, CoverageTrack, EvidenceLayer, FilterError, IoError, ParseError, Sequence, + VariantObservation, +}; + /// Represents a detected tandem repeat region in a sequence. 
///
 /// A tandem repeat is a pattern of nucleotides that repeats multiple times in succession.
diff --git a/phraya-core/src/types.rs b/phraya-core/src/types.rs
index c3660e9..9e1961d 100644
--- a/phraya-core/src/types.rs
+++ b/phraya-core/src/types.rs
@@ -257,6 +257,48 @@ pub struct CoverageTrack {
     // Stub for now - implementation in separate slice
 }
 
+impl CoverageTrack {
+    /// Create a CoverageTrack from raw coverage values.
+    /// Values are quantized to nearest 5 and RLE-compressed.
+    pub fn from_coverage(_coverage: Vec<usize>) -> Self {
+        unimplemented!("CoverageTrack::from_coverage not yet implemented")
+    }
+
+    /// Decompress the RLE-encoded coverage to a full vector.
+    pub fn to_vec(&self) -> Vec<usize> {
+        unimplemented!("CoverageTrack::to_vec not yet implemented")
+    }
+
+    /// Get coverage at a specific position via binary search.
+    /// Returns 0 for out-of-bounds positions.
+    pub fn coverage_at(&self, _pos: usize) -> usize {
+        unimplemented!("CoverageTrack::coverage_at not yet implemented")
+    }
+
+    /// Get the number of RLE runs in this track.
+    pub fn run_count(&self) -> usize {
+        unimplemented!("CoverageTrack::run_count not yet implemented")
+    }
+
+    /// Iterator over (position, coverage) pairs.
+    pub fn iter(&self) -> CoverageTrackIter {
+        unimplemented!("CoverageTrack::iter not yet implemented")
+    }
+}
+
+/// Iterator over coverage track positions.
+pub struct CoverageTrackIter {
+    // Stub for now - implementation in separate slice
+}
+
+impl Iterator for CoverageTrackIter {
+    type Item = (usize, usize);
+
+    fn next(&mut self) -> Option<Self::Item> {
+        unimplemented!("CoverageTrackIter::next not yet implemented")
+    }
+}
+
 /// Parse errors for FASTA, FASTQ, and other input formats
 #[derive(Debug, Clone, Error, Serialize, Deserialize, PartialEq)]
 pub enum ParseError {
@@ -565,12 +607,324 @@ mod tests {
         assert_eq!(deserialized.kmer_uniqueness().get(&100), Some(&1.0));
     }
 
-    // ===== CoverageTrack type stub tests =====
+    // ===== CoverageTrack RLE compression tests =====
+
+    #[test]
+    fn coverage_track_uniform_coverage_single_run() {
+        // Uniform coverage (e.g., all 10x) should compress to a single RLE run
+        let coverage = vec![10; 1000]; // 1000 positions with 10x coverage
+        let track = CoverageTrack::from_coverage(coverage.clone());
+
+        // Verify decompression matches original (modulo quantization to 10)
+        let decompressed = track.to_vec();
+        assert_eq!(decompressed.len(), coverage.len());
+        for val in &decompressed {
+            assert_eq!(*val, 10); // Quantized to nearest 5
+        }
+
+        // Verify efficient compression - should be single run
+        assert_eq!(track.run_count(), 1);
+    }
+
+    #[test]
+    fn coverage_track_alternating_coverage_many_runs() {
+        // Alternating coverage should result in many RLE runs
+        let mut coverage = Vec::new();
+        for i in 0..500 {
+            coverage.push(if i % 2 == 0 { 10 } else { 20 });
+        }
+        let track = CoverageTrack::from_coverage(coverage.clone());
+
+        let decompressed = track.to_vec();
+        assert_eq!(decompressed.len(), coverage.len());
+
+        // Verify alternating pattern preserved (with quantization)
+        for (i, &val) in decompressed.iter().enumerate() {
+            let expected = if i % 2 == 0 { 10 } else { 20 };
+            assert_eq!(val, expected);
+        }
+    }
+
+    #[test]
+    fn coverage_track_zero_coverage_regions() {
+        // Zero coverage regions should be preserved
+        let mut coverage = vec![0; 100];
+        coverage.extend(vec![15; 100]);
+
coverage.extend(vec![0; 100]); + coverage.extend(vec![30; 100]); + + let track = CoverageTrack::from_coverage(coverage.clone()); + let decompressed = track.to_vec(); + + assert_eq!(decompressed.len(), 400); + // First 100 positions should be 0 + for i in 0..100 { + assert_eq!(decompressed[i], 0); + } + // Next 100 should be 15 + for i in 100..200 { + assert_eq!(decompressed[i], 15); + } + // Next 100 should be 0 + for i in 200..300 { + assert_eq!(decompressed[i], 0); + } + // Last 100 should be 30 + for i in 300..400 { + assert_eq!(decompressed[i], 30); + } + } + + #[test] + fn coverage_track_quantization_to_nearest_5() { + // Coverage values should be quantized to nearest 5 + let coverage = vec![7, 8, 12, 13, 17, 18, 22, 23]; + let track = CoverageTrack::from_coverage(coverage); + let decompressed = track.to_vec(); + + assert_eq!(decompressed, vec![5, 10, 10, 15, 15, 20, 20, 25]); + } + + #[test] + fn coverage_track_quantization_exact_multiples_of_5() { + // Exact multiples of 5 should remain unchanged + let coverage = vec![0, 5, 10, 15, 20, 25, 30]; + let track = CoverageTrack::from_coverage(coverage.clone()); + let decompressed = track.to_vec(); + + assert_eq!(decompressed, coverage); + } + + #[test] + fn coverage_track_quantization_boundary_cases() { + // Test rounding behavior at boundaries + // 2 rounds to 0, 3 rounds to 5, 7 rounds to 5, 8 rounds to 10 + let coverage = vec![2, 3, 7, 8, 12, 13]; + let track = CoverageTrack::from_coverage(coverage); + let decompressed = track.to_vec(); + + assert_eq!(decompressed, vec![0, 5, 5, 10, 10, 15]); + } + + #[test] + fn coverage_track_random_access_via_binary_search() { + // Random access should work via binary search in O(log n) time + let mut coverage = Vec::new(); + coverage.extend(vec![10; 100]); + coverage.extend(vec![20; 100]); + coverage.extend(vec![30; 100]); + + let track = CoverageTrack::from_coverage(coverage); + + // Test random access at various positions + assert_eq!(track.coverage_at(0), 10); + 
assert_eq!(track.coverage_at(50), 10);
+        assert_eq!(track.coverage_at(99), 10);
+        assert_eq!(track.coverage_at(100), 20);
+        assert_eq!(track.coverage_at(150), 20);
+        assert_eq!(track.coverage_at(199), 20);
+        assert_eq!(track.coverage_at(200), 30);
+        assert_eq!(track.coverage_at(250), 30);
+        assert_eq!(track.coverage_at(299), 30);
+    }
+
+    #[test]
+    fn coverage_track_random_access_out_of_bounds() {
+        // Out-of-bounds access should return 0 (coverage_at returns a plain usize)
+        let coverage = vec![10; 100];
+        let track = CoverageTrack::from_coverage(coverage);
+
+        assert_eq!(track.coverage_at(100), 0); // Beyond last position
+        assert_eq!(track.coverage_at(1000), 0); // Way out of bounds
+    }
+
+    #[test]
+    fn coverage_track_iterator_over_positions() {
+        // Should be able to iterate over (position, coverage) pairs
+        let coverage = vec![10, 10, 20, 20, 30, 30];
+        let track = CoverageTrack::from_coverage(coverage.clone());
+
+        let positions: Vec<(usize, usize)> = track.iter().collect();
+        assert_eq!(positions.len(), coverage.len());
+
+        for (i, (pos, cov)) in positions.iter().enumerate() {
+            assert_eq!(*pos, i);
+            assert_eq!(*cov, coverage[i]);
+        }
+    }
+
+    #[test]
+    fn coverage_track_empty_coverage() {
+        // Empty coverage should be handled gracefully
+        let coverage: Vec<usize> = vec![];
+        let track = CoverageTrack::from_coverage(coverage);
+
+        assert_eq!(track.to_vec().len(), 0);
+        assert_eq!(track.run_count(), 0);
+    }
+
+    #[test]
+    fn coverage_track_single_position() {
+        // Single position coverage
+        let coverage = vec![15];
+        let track = CoverageTrack::from_coverage(coverage);
+
+        assert_eq!(track.to_vec(), vec![15]);
+        assert_eq!(track.coverage_at(0), 15);
+        assert_eq!(track.run_count(), 1);
+    }
+
+    #[test]
+    fn coverage_track_realistic_bacterial_genome() {
+        // Simulate realistic bacterial genome coverage (E.
coli ~4.6Mbp) + // with mostly uniform 30x coverage with some variation + let mut coverage = Vec::new(); + + // Region 1: 1Mbp at 30x + coverage.extend(vec![30; 1_000_000]); + + // Region 2: 500kbp at 10x (lower coverage region) + coverage.extend(vec![10; 500_000]); + + // Region 3: 2Mbp at 35x + coverage.extend(vec![35; 2_000_000]); + + // Region 4: 100kbp at 0x (no coverage) + coverage.extend(vec![0; 100_000]); + + // Region 5: 1Mbp at 30x + coverage.extend(vec![30; 1_000_000]); + + let track = CoverageTrack::from_coverage(coverage.clone()); + let decompressed = track.to_vec(); + + assert_eq!(decompressed.len(), coverage.len()); + + // Should compress to 5 runs + assert_eq!(track.run_count(), 5); + + // Verify some random positions + assert_eq!(track.coverage_at(500_000), 30); + assert_eq!(track.coverage_at(1_200_000), 10); + assert_eq!(track.coverage_at(3_000_000), 35); + assert_eq!(track.coverage_at(3_550_000), 0); + assert_eq!(track.coverage_at(4_000_000), 30); + } + + #[test] + fn coverage_track_high_depth_sequencing() { + // Test high coverage values (100x) + let coverage = vec![100; 1000]; + let track = CoverageTrack::from_coverage(coverage); + let decompressed = track.to_vec(); + + for val in &decompressed { + assert_eq!(*val, 100); + } + } #[test] - fn coverage_track_stub_exists() { - // Just verify the type exists - implementation is for a separate slice - let _track: CoverageTrack; + fn coverage_track_serialization() { + // CoverageTrack should be serializable + let coverage = vec![10, 10, 20, 20, 30, 30]; + let track = CoverageTrack::from_coverage(coverage); + + let json = serde_json::to_string(&track).expect("serialization failed"); + let deserialized: CoverageTrack = + serde_json::from_str(&json).expect("deserialization failed"); + + assert_eq!(deserialized.to_vec(), track.to_vec()); + } + + // ===== Property tests ===== + + #[test] + fn property_round_trip_encode_decode() { + // Property: encode → decode should equal original (modulo quantization) + 
use proptest::prelude::*; + + proptest!(|(coverage in prop::collection::vec(0usize..200, 0..1000))| { + let track = CoverageTrack::from_coverage(coverage.clone()); + let decompressed = track.to_vec(); + + // Check length matches + prop_assert_eq!(decompressed.len(), coverage.len()); + + // Check values match after quantization + for (i, &original) in coverage.iter().enumerate() { + let quantized = ((original + 2) / 5) * 5; // Round to nearest 5 + prop_assert_eq!(decompressed[i], quantized); + } + }); + } + + #[test] + fn property_quantization_idempotence() { + // Property: quantizing twice should equal quantizing once + use proptest::prelude::*; + + proptest!(|(coverage in prop::collection::vec(0usize..200, 0..1000))| { + let track1 = CoverageTrack::from_coverage(coverage.clone()); + let decompressed1 = track1.to_vec(); + + // Quantize again + let track2 = CoverageTrack::from_coverage(decompressed1.clone()); + let decompressed2 = track2.to_vec(); + + // Should be identical + prop_assert_eq!(decompressed1, decompressed2); + }); + } + + #[test] + fn property_coverage_at_matches_decompressed() { + // Property: coverage_at(i) should match to_vec()[i] + use proptest::prelude::*; + + proptest!(|(coverage in prop::collection::vec(0usize..200, 10..100))| { + let track = CoverageTrack::from_coverage(coverage.clone()); + let decompressed = track.to_vec(); + + for i in 0..coverage.len() { + prop_assert_eq!(track.coverage_at(i), decompressed[i]); + } + }); + } + + #[test] + fn property_quantization_always_multiple_of_5() { + // Property: all decompressed values should be multiples of 5 + use proptest::prelude::*; + + proptest!(|(coverage in prop::collection::vec(0usize..200, 0..1000))| { + let track = CoverageTrack::from_coverage(coverage); + let decompressed = track.to_vec(); + + for val in &decompressed { + prop_assert_eq!(val % 5, 0, "all values must be multiples of 5"); + } + }); + } + + #[test] + fn property_quantization_within_2_of_original() { + // Property: quantized 
value should be within ±2 of original + use proptest::prelude::*; + + proptest!(|(coverage in prop::collection::vec(0usize..200, 0..1000))| { + let track = CoverageTrack::from_coverage(coverage.clone()); + let decompressed = track.to_vec(); + + for (i, &original) in coverage.iter().enumerate() { + let quantized = decompressed[i]; + let diff = if original > quantized { + original - quantized + } else { + quantized - original + }; + prop_assert!(diff <= 2, "quantized value should be within ±2 of original"); + } + }); } // ===== Error type tests =====