From 88d50a1d114dbf5b335a25454d696a4c7a2af8e5 Mon Sep 17 00:00:00 2001 From: theGreatHerrLebert Date: Wed, 18 Feb 2026 12:32:54 +0100 Subject: [PATCH] fix(validate): escape regex quantifiers in Sage sequence parsing The + sign inside the bracket patterns was not escaped, so it was interpreted as a regex quantifier on the preceding \[ (one or more '[') instead of as a literal character. The - sign is already literal outside a character class; it is now escaped as well purely for consistency. This meant none of the positive-mass Sage annotations (e.g. [+57.021465]) were being converted to UNIMOD format, inflating false positive counts by ~10K per run. Also corrected mass prefixes for acetyl (42.0106 -> 42.0105) and water loss (-18.0106 -> -18.0105): the monoisotopic masses are 42.010565 and -18.010565, so the old four-decimal prefixes could never match the annotated values. Co-Authored-By: Claude Opus 4.6 --- .../imspy_simulation/timsim/validate/parsing.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/packages/imspy-simulation/src/imspy_simulation/timsim/validate/parsing.py b/packages/imspy-simulation/src/imspy_simulation/timsim/validate/parsing.py index e1de8d10..6e96af1a 100644 --- a/packages/imspy-simulation/src/imspy_simulation/timsim/validate/parsing.py +++ b/packages/imspy-simulation/src/imspy_simulation/timsim/validate/parsing.py @@ -443,13 +443,15 @@ def format_sage_sequence(peptide: str) -> str: The formatted peptide sequence with UNIMOD annotations. 
""" # Common Sage mass-to-UNIMOD mappings + # NOTE: The +/- signs inside brackets must be escaped (\+ / \-) + # otherwise '+' acts as a regex quantifier on '[' mass_replacements = [ - (r"\[+57\.0214\d*\]", "[UNIMOD:4]"), # Carbamidomethyl (C) - (r"\[+15\.9949\d*\]", "[UNIMOD:35]"), # Oxidation (M) - (r"\[+42\.0106\d*\]", "[UNIMOD:1]"), # Acetyl (Protein N-term) - (r"\[+79\.9663\d*\]", "[UNIMOD:21]"), # Phospho (STY) - (r"\[-17\.0265\d*\]", "[UNIMOD:385]"), # Ammonia loss (N-term Q) - (r"\[-18\.0106\d*\]", "[UNIMOD:23]"), # Water loss (N-term E) + (r"\[\+57\.0214\d*\]", "[UNIMOD:4]"), # Carbamidomethyl (C) 57.021464 + (r"\[\+15\.9949\d*\]", "[UNIMOD:35]"), # Oxidation (M) 15.994915 + (r"\[\+42\.0105\d*\]", "[UNIMOD:1]"), # Acetyl (Protein N-term) 42.010565 + (r"\[\+79\.9663\d*\]", "[UNIMOD:21]"), # Phospho (STY) 79.966331 + (r"\[\-17\.0265\d*\]", "[UNIMOD:385]"), # Ammonia loss (N-term Q) -17.026549 + (r"\[\-18\.0105\d*\]", "[UNIMOD:23]"), # Water loss (N-term E) -18.010565 ] result = peptide