Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion phraya-core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "phraya-core"
version = "0.1.0"
edition = "2026"
edition = "2021"

[dependencies]
serde = { workspace = true }
Expand Down
5 changes: 5 additions & 0 deletions phraya-core/src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ impl Sequence {
}
})
}

/// Get the raw DNA bases as a byte slice.
///
/// The returned slice borrows from this sequence's `bases` field; nothing is
/// copied, and the borrow lasts as long as the borrow of `self`.
pub fn bases(&self) -> &[u8] {
&self.bases
}
}

/// Variant observation at a genomic position with full alignment metadata.
Expand Down
1 change: 1 addition & 0 deletions phraya-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ version = "0.1.0"
edition = "2024"

[dependencies]
phraya-core = { workspace = true }
2 changes: 1 addition & 1 deletion phraya-index/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
pub mod minimizer;

pub use minimizer::{sketch, MinimimizerSketch};
pub use minimizer::{sketch, sketch_default, Sketch, MinimimizerSketch};
75 changes: 73 additions & 2 deletions phraya-index/src/minimizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
//! - k = 21: standard for bacterial genomics, good balance of specificity and coverage
//! - w = 11: window length, results in ~1 minimizer per k bases on average for random sequence

use phraya_core::types::Sequence;

#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MinimimizerSketch {
/// Sorted list of (minimizer_value, position) pairs
Expand All @@ -32,6 +34,38 @@ pub struct MinimimizerSketch {
pub w: usize,
}

/// Public wrapper type for k-mer sketching of Sequence objects.
///
/// This is the primary API for sketching bacterial sequences with k-mer minimizers.
/// It provides methods to query k, w, length, and check if empty.
///
/// The wrapped [`MinimimizerSketch`] lives in a private field, so code outside this
/// module obtains a `Sketch` from [`sketch`] or [`sketch_default`] rather than
/// constructing one field-by-field. Equality, cloning, and debug formatting are
/// derived and therefore defer to the wrapped sketch.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Sketch {
/// Internal minimizer sketch representation (private: not reachable by callers).
inner: MinimimizerSketch,
}

impl Sketch {
/// Get the k-mer length used for this sketch
pub fn k(&self) -> usize {
self.inner.k
}

/// Get the window length used for this sketch
pub fn w(&self) -> usize {
self.inner.w
}

/// Get the number of minimizers in the sketch
pub fn len(&self) -> usize {
self.inner.len()
}

/// Check if the sketch is empty
pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
}

impl MinimimizerSketch {
/// Find shared minimizers between this sketch and another.
///
Expand Down Expand Up @@ -120,7 +154,44 @@ fn canonical_kmer(value: u64, k: usize) -> u64 {
std::cmp::min(value, rc)
}

/// Construct a minimizer sketch from a sequence.
/// Sketch a [`Sequence`] with caller-chosen k-mer and window lengths.
///
/// The bases of `sequence` are handed to the byte-level sketching routine and the
/// resulting minimizer sketch is wrapped in a [`Sketch`].
///
/// # Arguments
///
/// * `sequence` - Sequence whose DNA bases are sketched
/// * `k` - K-mer length (typical: 21 for bacterial genomics)
/// * `w` - Window length (typical: 11)
///
/// # Returns
///
/// A [`Sketch`] holding the minimizers found in the sequence.
///
/// # Determinism
///
/// Only `sequence.bases()`, `k`, and `w` are forwarded to the underlying routine,
/// so metadata such as ID, description, or quality scores cannot influence the
/// result; the same bases and parameters yield the same sketch.
///
/// # Panics
///
/// Panics if `k` is 0 (the check is performed by the byte-level routine).
pub fn sketch(sequence: &Sequence, k: usize, w: usize) -> Sketch {
    Sketch {
        inner: sketch_bytes(sequence.bases(), k, w),
    }
}

/// Sketch a [`Sequence`] with the default parameters `k = 21` and `w = 11`.
///
/// These defaults are intended for bacterial genomics (E. coli genome size
/// ~4.6Mbp). This is a convenience wrapper that forwards to [`sketch`] with
/// those two values.
///
/// # Arguments
///
/// * `sequence` - Sequence whose DNA bases are sketched
///
/// # Returns
///
/// A [`Sketch`] holding the minimizers found in the sequence.
pub fn sketch_default(sequence: &Sequence) -> Sketch {
    // Named locally so the call site documents which argument is which.
    const DEFAULT_K: usize = 21;
    const DEFAULT_W: usize = 11;

    sketch(sequence, DEFAULT_K, DEFAULT_W)
}

/// Internal function to construct a minimizer sketch from raw bytes.
///
/// # Arguments
///
Expand All @@ -135,7 +206,7 @@ fn canonical_kmer(value: u64, k: usize) -> u64 {
/// # Panics
///
/// Panics if k > w or if k is 0.
pub fn sketch(sequence: &[u8], k: usize, w: usize) -> MinimimizerSketch {
fn sketch_bytes(sequence: &[u8], k: usize, w: usize) -> MinimimizerSketch {
assert!(k > 0, "k must be greater than 0");

// NOTE: The contract specifies panicking when w < k, but this conflicts with
Expand Down
Loading