From 4d6a91eeb4bfd4b008f2a5fd63b73697d5d0ed59 Mon Sep 17 00:00:00 2001 From: Michael Lazear Date: Sun, 3 May 2026 18:59:48 -0700 Subject: [PATCH] feat: convert spectrum representation to SoA --- crates/sage-cli/src/output.rs | 77 ++------- crates/sage-cli/src/runner.rs | 48 ++---- crates/sage-cli/tests/integration.rs | 2 +- crates/sage/src/lfq.rs | 110 ++++++------ crates/sage/src/scoring.rs | 76 +++++---- crates/sage/src/spectrum.rs | 246 +++++++++++++++++---------- crates/sage/src/tmt.rs | 27 ++- 7 files changed, 302 insertions(+), 284 deletions(-) diff --git a/crates/sage-cli/src/output.rs b/crates/sage-cli/src/output.rs index 1efdcaab..23383494 100644 --- a/crates/sage-cli/src/output.rs +++ b/crates/sage-cli/src/output.rs @@ -1,14 +1,23 @@ use rayon::prelude::*; -use sage_core::spectrum::MS1Spectra; +use sage_core::spectrum::ProcessedSpectrum; use sage_core::{scoring::Feature, tmt::TmtQuant}; #[derive(Default)] pub struct SageResults { - pub ms1: MS1Spectra, + pub ms1: Vec, pub features: Vec, pub quant: Vec, } +impl SageResults { + fn fold(mut self, other: SageResults) -> Self { + self.ms1.extend(other.ms1); + self.features.extend(other.features); + self.quant.extend(other.quant); + self + } +} + impl FromParallelIterator for SageResults { fn from_par_iter(par_iter: I) -> Self where @@ -16,39 +25,7 @@ impl FromParallelIterator for SageResults { { par_iter .into_par_iter() - .reduce(SageResults::default, |mut acc, x| { - acc.features.extend(x.features); - acc.quant.extend(x.quant); - match (acc.ms1, x.ms1) { - (MS1Spectra::NoMobility(mut a), MS1Spectra::NoMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::NoMobility(a); - } - (MS1Spectra::WithMobility(mut a), MS1Spectra::WithMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::Empty; - } - (MS1Spectra::Empty, MS1Spectra::WithMobility(a)) - | (MS1Spectra::WithMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::NoMobility(a)) - | (MS1Spectra::NoMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::NoMobility(a); - } - _ => { - // In theory this can happen if someone is searching - // together files of different types, mixing the ones - // that support IMS and the ones that dont. - // ... I dont think this should be run-time recoverable. - unreachable!("Found combination of MS1 spectra with and without mobility.") - } - }; - acc - }) + .reduce(SageResults::default, SageResults::fold) } } @@ -59,34 +36,6 @@ impl FromIterator for SageResults { { par_iter .into_iter() - .fold(SageResults::default(), |mut acc, x| { - acc.features.extend(x.features); - acc.quant.extend(x.quant); - match (acc.ms1, x.ms1) { - (MS1Spectra::NoMobility(mut a), MS1Spectra::NoMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::NoMobility(a); - } - (MS1Spectra::WithMobility(mut a), MS1Spectra::WithMobility(b)) => { - a.extend(b); - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::Empty; - } - (MS1Spectra::Empty, MS1Spectra::WithMobility(a)) - | (MS1Spectra::WithMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::WithMobility(a); - } - (MS1Spectra::Empty, MS1Spectra::NoMobility(a)) - | (MS1Spectra::NoMobility(a), MS1Spectra::Empty) => { - acc.ms1 = MS1Spectra::NoMobility(a); - } - _ => { - unreachable!("Found combination of MS1 spectra with and without mobility.") - } - }; - acc - }) + .fold(SageResults::default(), SageResults::fold) } } diff --git a/crates/sage-cli/src/runner.rs b/crates/sage-cli/src/runner.rs index f5f04d98..046dcbbe 100644 --- a/crates/sage-cli/src/runner.rs +++ b/crates/sage-cli/src/runner.rs @@ -14,7 +14,7 @@ use sage_core::mass::Tolerance; use sage_core::peptide::Peptide; use sage_core::scoring::Fragments; use sage_core::scoring::{Feature, Scorer}; -use sage_core::spectrum::{MS1Spectra, ProcessedSpectrum, RawSpectrum, SpectrumProcessor}; +use sage_core::spectrum::{ProcessedSpectrum, RawSpectrum, SpectrumProcessor}; use sage_core::tmt::TmtQuant; use std::collections::{HashMap, HashSet}; use std::time::Instant; @@ -140,7 +140,7 @@ impl Runner { } pub fn prefilter_peptides(self, parallel: usize, fasta: Fasta) -> Vec { - let spectra: Option>> = + let spectra: Option> = match parallel >= self.parameters.mzml_paths.len() { true => Some( self.read_processed_spectra(&self.parameters.mzml_paths, 0, 0) @@ -241,7 +241,7 @@ impl Runner { fn peptide_filter_processed_spectra( &self, scorer: &Scorer, - spectra: &Vec>, + spectra: &[ProcessedSpectrum], keep: &[std::sync::atomic::AtomicBool], ) { use std::sync::atomic::{AtomicUsize, Ordering}; @@ -250,7 +250,7 @@ impl Runner { spectra .par_iter() - .filter(|spec| spec.peaks.len() >= self.parameters.min_peaks && spec.level == 2) + .filter(|spec| spec.masses.len() >= self.parameters.min_peaks && spec.level == 2) .for_each(|spectrum| { let prev = counter.fetch_add(1, Ordering::Relaxed); if prev > 0 && prev % 10_000 == 0 { @@ -301,7 +301,7 @@ impl Runner { fn search_processed_spectra( &self, scorer: &Scorer, - msn_spectra: &Vec>, + msn_spectra: &[ProcessedSpectrum], ) -> Vec { use std::sync::atomic::{AtomicUsize, Ordering}; let counter = AtomicUsize::new(0); @@ -309,7 +309,7 @@ impl Runner { let features: Vec<_> = msn_spectra .par_iter() - .filter(|spec| spec.peaks.len() >= self.parameters.min_peaks && spec.level == 2) + .filter(|spec| spec.masses.len() >= self.parameters.min_peaks && spec.level == 2) .map(|x| { let prev = counter.fetch_add(1, Ordering::Relaxed); if prev > 0 && prev % 10_000 == 0 { @@ -332,8 +332,8 @@ impl Runner { fn complete_features( &self, - msn_spectra: Vec>, - ms1_spectra: MS1Spectra, + msn_spectra: Vec, + ms1_spectra: Vec, features: Vec, ) -> SageResults { let quant = self @@ -378,10 +378,7 @@ impl Runner { chunk: &[Url], chunk_idx: usize, batch_size: usize, - ) -> ( - MS1Spectra, - Vec>, - ) { + ) -> (Vec, Vec) { // Read all of the spectra at once - this can help prevent memory over-consumption issues info!( "processing files {} .. {} ", @@ -462,26 +459,17 @@ impl Runner { .map(|s| sp.process(s)) .collect::>(); - // If all the MS1 spectra contain IMS, then we can process them - // we use the IMS! otherwise we dont. - // Note: Empty iterators return true. - let all_contain_ims = spectra.ms1.iter().all(|x| x.mobility.is_some()); - let ms1_empty = spectra.ms1.is_empty(); - let ms1_spectra = if ms1_empty { + let has_ims = spectra.ms1.iter().any(|x| x.mobility.is_some()); + let ms1_spectra = if spectra.ms1.is_empty() { log::trace!("no MS1 spectra found"); - MS1Spectra::Empty - } else if all_contain_ims { - log::trace!("Processing MS1 spectra with IMS"); - let spectra = spectra - .ms1 - .into_iter() - .map(|x| sp.process_with_mobility(x)) - .collect(); - MS1Spectra::WithMobility(spectra) + Vec::new() } else { - log::trace!("Processing MS1 spectra without IMS"); - let spectra = spectra.ms1.into_iter().map(|s| sp.process(s)).collect(); - MS1Spectra::NoMobility(spectra) + if has_ims { + log::trace!("Processing MS1 spectra with IMS columns"); + } else { + log::trace!("Processing MS1 spectra without IMS"); + } + spectra.ms1.into_par_iter().map(|s| sp.process(s)).collect() }; let io_time = Instant::now() - start; diff --git a/crates/sage-cli/tests/integration.rs b/crates/sage-cli/tests/integration.rs index 53370172..2b9bcf29 100644 --- a/crates/sage-cli/tests/integration.rs +++ b/crates/sage-cli/tests/integration.rs @@ -15,7 +15,7 @@ fn integration() -> anyhow::Result<()> { let sp = SpectrumProcessor::new(100, true, 0.0); let processed = sp.process(spectra[0].clone()); - assert!(processed.peaks.len() <= 300); + assert!(processed.masses.len() <= 300); let scorer = Scorer { db: &database, diff --git a/crates/sage/src/lfq.rs b/crates/sage/src/lfq.rs index 2be08c8c..4f708f6a 100644 --- a/crates/sage/src/lfq.rs +++ b/crates/sage/src/lfq.rs @@ -2,7 +2,7 @@ use crate::database::{binary_search_slice, IndexedDatabase, PeptideIx}; use crate::mass::{composition, Composition, Tolerance, NEUTRON}; use crate::ml::{matrix::Matrix, retention_alignment::Alignment}; use crate::scoring::Feature; -use crate::spectrum::MS1Spectra; +use crate::spectrum::ProcessedSpectrum; use dashmap::DashMap; use rayon::prelude::*; use serde::{Deserialize, Serialize}; @@ -226,80 +226,66 @@ impl FeatureMap { pub fn quantify( &self, db: &IndexedDatabase, - spectra: &MS1Spectra, + spectra: &[ProcessedSpectrum], alignments: &[Alignment], ) -> HashMap<(PrecursorId, bool), (Peak, Vec), fnv::FnvBuildHasher> { let scores: DashMap<(PrecursorId, bool), Grid, fnv::FnvBuildHasher> = DashMap::default(); log::info!("tracing MS1 features"); - // TODO: find a good way to abstract this ... I think a macro would be the way to go. - match spectra { - MS1Spectra::NoMobility(spectra) => spectra.par_iter().for_each(|spectrum| { + if spectra.is_empty() { + log::warn!("no MS1 spectra found for quantification"); + } else { + spectra.par_iter().for_each(|spectrum| { let a = alignments[spectrum.file_id]; let rt = (spectrum.scan_start_time / a.max_rt) * a.slope + a.intercept; let query = self.rt_slice(rt, RT_TOL); - for peak in &spectrum.peaks { - for entry in query.mass_lookup(peak.mass) { - let id = match self.settings.combine_charge_states { - true => PrecursorId::Combined(entry.peptide), - false => PrecursorId::Charged((entry.peptide, entry.charge)), - }; - - let mut grid = scores.entry((id, entry.decoy)).or_insert_with(|| { - let p = &db[entry.peptide]; - let composition = p - .sequence - .iter() - .map(|r| composition(*r)) - .sum::(); - let dist = crate::isotopes::peptide_isotopes( - composition.carbon, - composition.sulfur, - ); - Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE) - }); - - grid.add_entry(rt, entry.isotope, spectrum.file_id, peak.intensity); - } - } - }), - MS1Spectra::WithMobility(spectra) => spectra.par_iter().for_each(|spectrum| { - let a = alignments[spectrum.file_id]; - let rt = (spectrum.scan_start_time / a.max_rt) * a.slope + a.intercept; - let query = self.rt_slice(rt, RT_TOL); + let add_entry = |entry: &PrecursorRange, intensity: f32| { + let id = match self.settings.combine_charge_states { + true => PrecursorId::Combined(entry.peptide), + false => PrecursorId::Charged((entry.peptide, entry.charge)), + }; - for peak in &spectrum.peaks { - for entry in query.mass_mobility_lookup(peak.mass, peak.mobility) { - let id = match self.settings.combine_charge_states { - true => PrecursorId::Combined(entry.peptide), - false => PrecursorId::Charged((entry.peptide, entry.charge)), - }; - - let mut grid = scores.entry((id, entry.decoy)).or_insert_with(|| { - let p = &db[entry.peptide]; - let composition = p - .sequence - .iter() - .map(|r| composition(*r)) - .sum::(); - let dist = crate::isotopes::peptide_isotopes( - composition.carbon, - composition.sulfur, - ); - Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE) - }); - - grid.add_entry(rt, entry.isotope, spectrum.file_id, peak.intensity); + let mut grid = scores.entry((id, entry.decoy)).or_insert_with(|| { + let p = &db[entry.peptide]; + let composition = p + .sequence + .iter() + .map(|r| composition(*r)) + .sum::(); + let dist = crate::isotopes::peptide_isotopes( + composition.carbon, + composition.sulfur, + ); + Grid::new(entry, RT_TOL, dist, alignments.len(), GRID_SIZE) + }); + + grid.add_entry(rt, entry.isotope, spectrum.file_id, intensity); + }; + + if spectrum.mobilities.is_empty() { + for (&mass, &intensity) in + spectrum.masses.iter().zip(spectrum.intensities.iter()) + { + for entry in query.mass_lookup(mass) { + add_entry(entry, intensity); + } + } + } else { + for ((&mass, &intensity), &mobility) in spectrum + .masses + .iter() + .zip(spectrum.intensities.iter()) + .zip(spectrum.mobilities.iter()) + { + for entry in query.mass_mobility_lookup(mass, mobility) { + add_entry(entry, intensity); + } } } - }), - MS1Spectra::Empty => { - // Should never be called if no MS1 spectra are present - log::warn!("no MS1 spectra found for quantification"); - } - }; + }); + } log::info!("integrating MS1 features"); diff --git a/crates/sage/src/scoring.rs b/crates/sage/src/scoring.rs index 523a078a..ccc2109d 100644 --- a/crates/sage/src/scoring.rs +++ b/crates/sage/src/scoring.rs @@ -2,7 +2,7 @@ use crate::database::{IndexedDatabase, PeptideIx}; use crate::heap::bounded_min_heapify; use crate::ion_series::{IonSeries, Kind}; use crate::mass::{Tolerance, NEUTRON, PROTON}; -use crate::spectrum::{Peak, Precursor, ProcessedSpectrum}; +use crate::spectrum::{Precursor, ProcessedSpectrum}; use serde::{Deserialize, Serialize}; use std::ops::AddAssign; use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; @@ -254,7 +254,7 @@ impl<'db> Scorer<'db> { /// * `keep`: A vector of atomic bools is used to maintain an identification list across scans pub fn quick_score( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, prefilter_low_memory: bool, keep: &[AtomicBool], ) { @@ -297,7 +297,7 @@ impl<'db> Scorer<'db> { } } - pub fn score(&self, query: &ProcessedSpectrum) -> Vec { + pub fn score(&self, query: &ProcessedSpectrum) -> Vec { assert_eq!( query.level, 2, "internal bug, trying to score a non-MS2 scan!" @@ -334,7 +334,7 @@ impl<'db> Scorer<'db> { /// in sorted order. fn matched_peaks_with_isotope( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, precursor_mass: f32, precursor_charge: u8, precursor_tol: Tolerance, @@ -355,9 +355,9 @@ impl<'db> Scorer<'db> { preliminary: vec![PreScore::default(); potential], }; - for peak in query.peaks.iter() { + for peak_mass in query.masses.iter() { for charge in 1..max_fragment_charge { - let mass = peak.mass * charge as f32; + let mass = peak_mass * charge as f32; for frag in candidates.page_search(mass) { let idx = frag.peptide_index.0 as usize - candidates.pre_idx_lo; let sc = &mut hits.preliminary[idx]; @@ -383,7 +383,7 @@ impl<'db> Scorer<'db> { fn matched_peaks( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, precursor_mass: f32, precursor_charge: u8, precursor_tol: Tolerance, @@ -415,7 +415,7 @@ impl<'db> Scorer<'db> { } } - fn initial_hits(&self, query: &ProcessedSpectrum, precursor: &Precursor) -> InitialHits { + fn initial_hits(&self, query: &ProcessedSpectrum, precursor: &Precursor) -> InitialHits { // Sage operates on masses without protons; [M] instead of [MH+] let mz = precursor.mz - PROTON; @@ -462,7 +462,7 @@ impl<'db> Scorer<'db> { } /// Score a single [`ProcessedSpectrum`] against the database - pub fn score_standard(&self, query: &ProcessedSpectrum) -> Vec { + pub fn score_standard(&self, query: &ProcessedSpectrum) -> Vec { let precursor = query.precursors.first().unwrap_or_else(|| { panic!("missing MS1 precursor for {}", query.id); }); @@ -477,7 +477,7 @@ impl<'db> Scorer<'db> { /// best PSMs ([`Feature`]) fn build_features( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, precursor: &Precursor, hits: &InitialHits, report_psms: usize, @@ -595,7 +595,7 @@ impl<'db> Scorer<'db> { } /// Remove peaks matching a PSM from a query spectrum - fn remove_matched_peaks(&self, query: &mut ProcessedSpectrum, psm: &Feature) { + fn remove_matched_peaks(&self, query: &mut ProcessedSpectrum, psm: &Feature) { let peptide = &self.db[psm.peptide_idx]; let fragments = self .db @@ -610,28 +610,42 @@ impl<'db> Scorer<'db> { for frag in fragments { for charge in 1..max_fragment_charge { // Experimental peaks are multipled by charge, therefore theoretical are divided - if let Some(peak) = crate::spectrum::select_most_intense_peak( - &query.peaks, + if let Some(peak_idx) = crate::spectrum::select_most_intense_peak( + &query.masses, + &query.intensities, frag.monoisotopic_mass / charge as f32, self.fragment_tol, None, ) { - to_remove.push(*peak); + to_remove.push((query.masses[peak_idx], query.intensities[peak_idx])); } } } - query.peaks = query - .peaks - .drain(..) - .filter(|peak| !to_remove.contains(peak)) - .collect(); - query.total_ion_current = query.peaks.iter().map(|peak| peak.intensity).sum::(); + let mut masses = Vec::with_capacity(query.masses.len()); + let mut intensities = Vec::with_capacity(query.intensities.len()); + let mut mobilities = Vec::with_capacity(query.mobilities.len()); + + for idx in 0..query.masses.len() { + let peak = (query.masses[idx], query.intensities[idx]); + if !to_remove.contains(&peak) { + masses.push(query.masses[idx]); + intensities.push(query.intensities[idx]); + if !query.mobilities.is_empty() { + mobilities.push(query.mobilities[idx]); + } + } + } + + query.masses = masses; + query.intensities = intensities; + query.mobilities = mobilities; + query.total_ion_current = query.intensities.iter().sum::(); } /// Return multiple PSMs for each spectra - first is the best match, second PSM is the best match /// after all theoretical peaks assigned to the best match are removed, etc - pub fn score_chimera_fast(&self, query: &ProcessedSpectrum) -> Vec { + pub fn score_chimera_fast(&self, query: &ProcessedSpectrum) -> Vec { let precursor = query.precursors.first().unwrap_or_else(|| { panic!("missing MS1 precursor for {}", query.id); }); @@ -660,7 +674,7 @@ impl<'db> Scorer<'db> { /// Calculate full hyperscore for a given PSM fn score_candidate( &self, - query: &ProcessedSpectrum, + query: &ProcessedSpectrum, pre_score: &PreScore, ) -> (Score, Option) { let mut score = Score { @@ -692,27 +706,31 @@ impl<'db> Scorer<'db> { // Experimental peaks are multipled by charge, therefore theoretical are divided let mz = frag.monoisotopic_mass / charge as f32; - if let Some(peak) = crate::spectrum::select_most_intense_peak( - &query.peaks, + if let Some(peak_idx) = crate::spectrum::select_most_intense_peak( + &query.masses, + &query.intensities, mz, self.fragment_tol, None, ) { + let peak_mass = query.masses[peak_idx]; + let peak_intensity = query.intensities[peak_idx]; + score.ppm_difference += - peak.intensity * (mz - peak.mass).abs() * 2E6 / (mz + peak.mass); + peak_intensity * (mz - peak_mass).abs() * 2E6 / (mz + peak_mass); - let exp_mz = peak.mass + PROTON; + let exp_mz = peak_mass + PROTON; let calc_mz = mz + PROTON; match frag.kind { Kind::A | Kind::B | Kind::C => { score.matched_b += 1; - score.summed_b += peak.intensity; + score.summed_b += peak_intensity; b_run.matched(idx); } Kind::X | Kind::Y | Kind::Z => { score.matched_y += 1; - score.summed_y += peak.intensity; + score.summed_y += peak_intensity; y_run.matched(idx); } } @@ -729,7 +747,7 @@ impl<'db> Scorer<'db> { fragments_details.mz_experimental.push(exp_mz); fragments_details.mz_calculated.push(calc_mz); fragments_details.fragment_ordinals.push(idx); - fragments_details.intensities.push(peak.intensity); + fragments_details.intensities.push(peak_intensity); } } } diff --git a/crates/sage/src/spectrum.rs b/crates/sage/src/spectrum.rs index 308c88af..a96f0eae 100644 --- a/crates/sage/src/spectrum.rs +++ b/crates/sage/src/spectrum.rs @@ -24,31 +24,6 @@ impl Ord for Peak { } } -/// A charge-less peak at monoisotopic mass with ion mobility -#[derive(PartialEq, Copy, Clone, Default, Debug)] -pub struct IMPeak { - pub intensity: f32, - pub mass: f32, - pub mobility: f32, // I would use f16 but its not stable -} - -impl Eq for IMPeak {} - -impl PartialOrd for IMPeak { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for IMPeak { - fn cmp(&self, other: &Self) -> std::cmp::Ordering { - self.intensity - .total_cmp(&other.intensity) - .then_with(|| self.mass.total_cmp(&other.mass)) - .then_with(|| self.mobility.total_cmp(&other.mobility)) - } -} - /// A de-isotoped peak, that might have some charge state information #[derive(PartialEq, PartialOrd, Debug, Copy, Clone)] pub struct Deisotoped { @@ -80,7 +55,7 @@ pub struct Precursor { } #[derive(Clone, Default, Debug)] -pub struct ProcessedSpectrum { +pub struct ProcessedSpectrum { /// MSn level pub level: u8, /// Scan ID @@ -93,8 +68,12 @@ pub struct ProcessedSpectrum { pub ion_injection_time: f32, /// Selected ions for precursors, if `level > 1` pub precursors: Vec, - /// MS peaks, sorted by mass in ascending order - pub peaks: Vec, + /// MS peak masses, sorted in ascending order + pub masses: Vec, + /// MS peak intensities, parallel to `masses` + pub intensities: Vec, + /// Ion mobility values, parallel to `masses` for IMS spectra and empty otherwise + pub mobilities: Vec, /// Total ion current pub total_ion_current: f32, } @@ -144,14 +123,6 @@ pub enum Representation { Centroid, } -#[derive(Default)] -pub enum MS1Spectra { - NoMobility(Vec>), - WithMobility(Vec>), - #[default] - Empty, -} - /// Binary search followed by linear search to select the most intense peak within `tolerance` window /// * `offset` - this parameter allows for a static adjustment to the lower and upper bounds of the search window. /// @@ -161,28 +132,27 @@ pub enum MS1Spectra { /// measurements with ProteomeDiscoverer, FragPipe, etc, we need to account for this minor difference (which has an impact /// perhaps 0.01% of the time) pub fn select_most_intense_peak( - peaks: &[Peak], + masses: &[f32], + intensities: &[f32], center: f32, tolerance: Tolerance, offset: Option, -) -> Option<&Peak> { +) -> Option { + debug_assert_eq!(masses.len(), intensities.len()); let (lo, hi) = tolerance.bounds(center); let (lo, hi) = ( lo + offset.unwrap_or_default(), hi + offset.unwrap_or_default(), ); - let (i, j) = binary_search_slice(peaks, |peak, query| peak.mass.total_cmp(query), lo, hi); + let (i, j) = binary_search_slice(masses, |mass, query| mass.total_cmp(query), lo, hi); let mut best_peak = None; let mut max_int = 0.0; - for peak in peaks[i..j] - .iter() - .filter(|peak| peak.mass >= lo && peak.mass <= hi) - { - if peak.intensity >= max_int { - max_int = peak.intensity; - best_peak = Some(peak); + for idx in (i..j).filter(|&idx| masses[idx] >= lo && masses[idx] <= hi) { + if intensities[idx] >= max_int { + max_int = intensities[idx]; + best_peak = Some(idx); } } best_peak @@ -268,7 +238,15 @@ pub fn path_compression(peaks: &mut [Deisotoped]) { } } -impl ProcessedSpectrum { +impl ProcessedSpectrum { + pub fn len(&self) -> usize { + self.masses.len() + } + + pub fn is_empty(&self) -> bool { + self.masses.is_empty() + } + pub fn extract_ms1_precursor(&self) -> Option<(f32, u8)> { let precursor = self.precursors.first()?; let charge = precursor.charge?; @@ -357,7 +335,48 @@ impl SpectrumProcessor { } } - pub fn process(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { + pub fn process(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { + debug_assert_eq!(spectrum.mz.len(), spectrum.intensity.len()); + if let Some(mobilities) = spectrum.mobility.as_ref() { + debug_assert_eq!(spectrum.mz.len(), mobilities.len()); + } + + if spectrum.ms_level == 1 && spectrum.mobility.is_some() { + let raw_mobilities = spectrum.mobility.as_ref().expect("checked above"); + let mut peaks = spectrum + .mz + .iter() + .zip(spectrum.intensity.iter().zip(raw_mobilities.iter())) + .map(|(&mass, (&intensity, &mobility))| (mass - PROTON, intensity, mobility)) + .collect::>(); + + peaks.sort_by(|a, b| a.0.total_cmp(&b.0)); + + let mut masses = Vec::with_capacity(peaks.len()); + let mut intensities = Vec::with_capacity(peaks.len()); + let mut mobilities = Vec::with_capacity(peaks.len()); + for (mass, intensity, mobility) in peaks { + masses.push(mass); + intensities.push(intensity); + mobilities.push(mobility); + } + + let total_ion_current = intensities.iter().sum::(); + + return ProcessedSpectrum { + level: spectrum.ms_level, + id: spectrum.id, + file_id: spectrum.file_id, + scan_start_time: spectrum.scan_start_time, + ion_injection_time: spectrum.ion_injection_time, + precursors: spectrum.precursors, + masses, + intensities, + mobilities, + total_ion_current, + }; + } + let mut peaks = match spectrum.ms_level { 2 => self.process_ms2(self.deisotope, &spectrum), _ => spectrum @@ -372,46 +391,11 @@ impl SpectrumProcessor { }; peaks.sort_by(|a, b| a.mass.total_cmp(&b.mass)); - let total_ion_current = peaks.iter().map(|peak| peak.intensity).sum::(); - - ProcessedSpectrum { - level: spectrum.ms_level, - id: spectrum.id, - file_id: spectrum.file_id, - scan_start_time: spectrum.scan_start_time, - ion_injection_time: spectrum.ion_injection_time, - precursors: spectrum.precursors, - peaks, - total_ion_current, - } - } - - pub fn process_with_mobility(&self, spectrum: RawSpectrum) -> ProcessedSpectrum { - assert!( - spectrum.ms_level == 1, - "Logic error, mobility processing should only be used for MS1" - ); - let mut peaks = spectrum - .mz - .iter() - .zip( - spectrum - .intensity - .iter() - .zip(spectrum.mobility.unwrap().iter()), - ) - .map(|(&mass, (&intensity, &mobility))| { - let mass = (mass - PROTON) * 1.0; - IMPeak { - mass, - intensity, - mobility, - } - }) - .collect::>(); - - peaks.sort_by(|a, b| a.mass.total_cmp(&b.mass)); - let total_ion_current = peaks.iter().map(|peak| peak.intensity).sum::(); + let (masses, intensities): (Vec<_>, Vec<_>) = peaks + .into_iter() + .map(|peak| (peak.mass, peak.intensity)) + .unzip(); + let total_ion_current = intensities.iter().sum::(); ProcessedSpectrum { level: spectrum.ms_level, @@ -420,7 +404,9 @@ impl SpectrumProcessor { scan_start_time: spectrum.scan_start_time, ion_injection_time: spectrum.ion_injection_time, precursors: spectrum.precursors, - peaks, + masses, + intensities, + mobilities: Vec::new(), total_ion_current, } } @@ -580,4 +566,86 @@ mod test { ] ); } + + #[test] + fn select_most_intense_peak_uses_parallel_columns() { + let masses = vec![99.0, 100.0, 100.01, 100.02, 101.0]; + let intensities = vec![10.0, 20.0, 50.0, 30.0, 100.0]; + + let idx = select_most_intense_peak( + &masses, + &intensities, + 100.01, + Tolerance::Da(-0.02, 0.02), + None, + ) + .expect("peak in tolerance"); + + assert_eq!(idx, 2); + assert_eq!(masses[idx], 100.01); + assert_eq!(intensities[idx], 50.0); + } + + #[test] + fn select_most_intense_peak_applies_offset() { + let label = 126.127726; + let masses = vec![label - PROTON - 0.01, label - PROTON, label - PROTON + 0.01]; + let intensities = vec![10.0, 100.0, 50.0]; + + let idx = select_most_intense_peak( + &masses, + &intensities, + label, + Tolerance::Da(-0.005, 0.005), + Some(-PROTON), + ) + .expect("offset peak in tolerance"); + + assert_eq!(idx, 1); + } + + #[test] + fn process_ms1_without_mobility_builds_empty_mobility_column() { + let processor = SpectrumProcessor::new(10, false, 0.0); + let spectrum = RawSpectrum { + ms_level: 1, + mz: vec![102.0, 100.0, 101.0], + intensity: vec![30.0, 10.0, 20.0], + ..RawSpectrum::default_with_file_id(7) + }; + + let processed = processor.process(spectrum); + + assert_eq!(processed.file_id, 7); + assert_eq!( + processed.masses, + vec![100.0 - PROTON, 101.0 - PROTON, 102.0 - PROTON] + ); + assert_eq!(processed.intensities, vec![10.0, 20.0, 30.0]); + assert!(processed.mobilities.is_empty()); + assert_eq!(processed.total_ion_current, 60.0); + } + + #[test] + fn process_ms1_with_mobility_sorts_all_columns_by_mass() { + let processor = SpectrumProcessor::new(10, false, 0.0); + let spectrum = RawSpectrum { + ms_level: 1, + mz: vec![102.0, 100.0, 101.0], + intensity: vec![30.0, 10.0, 20.0], + mobility: Some(vec![3.0, 1.0, 2.0]), + ..RawSpectrum::default_with_file_id(7) + }; + + let processed = processor.process(spectrum); + + assert_eq!( + processed.masses, + vec![100.0 - PROTON, 101.0 - PROTON, 102.0 - PROTON] + ); + assert_eq!(processed.intensities, vec![10.0, 20.0, 30.0]); + assert_eq!(processed.mobilities, vec![1.0, 2.0, 3.0]); + assert_eq!(processed.masses.len(), processed.intensities.len()); + assert_eq!(processed.masses.len(), processed.mobilities.len()); + } } diff --git a/crates/sage/src/tmt.rs b/crates/sage/src/tmt.rs index f965d770..268c293a 100644 --- a/crates/sage/src/tmt.rs +++ b/crates/sage/src/tmt.rs @@ -181,24 +181,32 @@ pub struct Quant<'ms3> { /// SPS precursor purity for the chimeric hit pub chimera_purity: Option, /// Quanitified TMT reporter ion intensities - pub intensities: Vec>, + pub intensities: Vec>, /// MS3 spectrum - pub spectrum: &'ms3 ProcessedSpectrum, + pub spectrum: &'ms3 ProcessedSpectrum, } /// Return a vector containing the peaks closest to the m/zs defined in /// `labels`, within a given tolerance window. /// This function is MS-level agnostic, so it can be used for either MS2 or MS3 /// quant. -pub fn find_reporter_ions<'a>( - peaks: &'a [Peak], +pub fn find_reporter_ions( + masses: &[f32], + intensities: &[f32], labels: &[f32], label_tolerance: Tolerance, -) -> Vec> { +) -> Vec> { labels .iter() .map(|&label| { - spectrum::select_most_intense_peak(peaks, label, label_tolerance, Some(-PROTON)) + spectrum::select_most_intense_peak( + masses, + intensities, + label, + label_tolerance, + Some(-PROTON), + ) + .map(|idx| intensities[idx]) }) .collect() } @@ -304,7 +312,7 @@ pub struct TmtQuant { /// * `isobaric_tolerance`: specify label tolerance /// * `level`: MSn level to extract isobaric peaks from pub fn quantify( - spectra: &[ProcessedSpectrum], + spectra: &[ProcessedSpectrum], isobaric_labels: &Isobaric, isobaric_tolerance: Tolerance, level: u8, @@ -324,12 +332,13 @@ pub fn quantify( }; let peaks = find_reporter_ions( - &spectrum.peaks, + &spectrum.masses, + &spectrum.intensities, isobaric_labels.reporter_masses(), isobaric_tolerance, ) .into_iter() - .map(|peak| peak.map(|p| p.intensity).unwrap_or_default()) + .map(|peak| peak.unwrap_or_default()) .collect(); Some(TmtQuant {