From aebfd99628a86a83594d3870a4139748893b0f71 Mon Sep 17 00:00:00 2001 From: "Z.-L. Deng" Date: Tue, 7 Oct 2025 17:38:06 +0200 Subject: [PATCH 1/3] Support escaped characters in mutate substitutions --- Cargo.lock | 2 +- src/mutate.rs | 109 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 96 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fb1215b..8fe21a0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,7 +603,7 @@ dependencies = [ [[package]] name = "tsvkit" -version = "0.9.0" +version = "0.9.2" dependencies = [ "anyhow", "calamine", diff --git a/src/mutate.rs b/src/mutate.rs index 4a3bc2e..398b24e 100644 --- a/src/mutate.rs +++ b/src/mutate.rs @@ -371,27 +371,22 @@ fn parse_substitution_expression( let content = content .strip_suffix('/') .with_context(|| "substitution expression must end with '/'")?; - let mut parts = content.splitn(3, '/'); - let selector_part = parts - .next() - .with_context(|| "missing selector list in substitution expression")?; - let pattern_part = parts - .next() - .with_context(|| "missing pattern in substitution expression")?; - let replacement_part = parts - .next() - .with_context(|| "missing replacement in substitution expression")?; - - let selectors = parse_selector_list(&normalize_selector_spec(selector_part))?; + let (selector_part, pattern_part, replacement_part) = + split_substitution_components(content).with_context(|| { + "substitution expression must use s/selectors/pattern/replacement/ syntax" + })?; + + let selectors = parse_selector_list(&normalize_selector_spec(selector_part.trim()))?; if selectors.is_empty() { bail!("substitution requires at least one target column"); } let indices = resolve_selectors(headers, &selectors, no_header)?; - let regex = Regex::new(pattern_part).with_context(|| "invalid regex in substitution")?; + let regex_pattern = unescape_substitution_component(pattern_part); + let regex = Regex::new(&regex_pattern).with_context(|| "invalid regex in 
substitution")?; Ok(MutateOp::Substitute { columns: indices, pattern: regex, - replacement: replacement_part.to_string(), + replacement: unescape_substitution_component(replacement_part), }) } @@ -399,6 +394,62 @@ fn normalize_selector_spec(spec: &str) -> String { spec.replace('$', "") } +fn split_substitution_components(content: &str) -> Option<(&str, &str, &str)> { + let mut split_points = Vec::with_capacity(2); + let mut escaped = false; + + for (idx, ch) in content.char_indices() { + if ch == '/' && !escaped { + split_points.push(idx); + if split_points.len() == 2 { + break; + } + } + + if ch == '\\' && !escaped { + escaped = true; + } else { + escaped = false; + } + } + + if split_points.len() != 2 { + return None; + } + + let first = &content[..split_points[0]]; + let second = &content[split_points[0] + 1..split_points[1]]; + let third = &content[split_points[1] + 1..]; + Some((first, second, third)) +} + +fn unescape_substitution_component(raw: &str) -> String { + let mut result = String::new(); + let mut chars = raw.chars(); + while let Some(ch) = chars.next() { + if ch == '\\' { + if let Some(next) = chars.next() { + match next { + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + '\\' => result.push('\\'), + '/' => result.push('/'), + _ => { + result.push('\\'); + result.push(next); + } + } + } else { + result.push('\\'); + } + } else { + result.push(ch); + } + } + result +} + fn split_args(input: &str) -> Vec<String> { let mut args = Vec::new(); let mut current = String::new(); @@ -569,4 +620,34 @@ mod tests { assert_eq!(row[5], "4"); assert_eq!(row[6], "2"); } + + #[test] + fn substitution_replacement_supports_escape_sequences() { + let headers = vec!["col1".to_string()]; + let ops = parse_operations( + &vec!["s/$col1/\\t/ /".to_string()], + &headers, + false, + ) + .unwrap(); + + let mut row = vec!["field\tvalue".to_string()]; + process_row(&mut row, &ops).unwrap(); + assert_eq!(row[0], "field value"); + } + + #[test] + fn 
substitution_allows_escaped_slashes() { + let headers = vec!["col1".to_string()]; + let ops = parse_operations( + &vec![r"s/$col1/foo\/bar/hello\/world/".to_string()], + &headers, + false, + ) + .unwrap(); + + let mut row = vec!["foo/bar".to_string()]; + process_row(&mut row, &ops).unwrap(); + assert_eq!(row[0], "hello/world"); + } } From f740d3edfabf5f973988979231aef448e9198018 Mon Sep 17 00:00:00 2001 From: "Z.-L. Deng" Date: Tue, 7 Oct 2025 17:50:23 +0200 Subject: [PATCH 2/3] Preserve escape sequences in filter string literals --- src/expression.rs | 32 +++++++++++++++++++++++++++++++- src/filter.rs | 22 ++++++++++++++++++++++ src/mutate.rs | 32 +++++++++++++++++++++++++------- 3 files changed, 78 insertions(+), 8 deletions(-) diff --git a/src/expression.rs b/src/expression.rs index b4e6279..6f283de 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -681,6 +681,24 @@ mod tests { let row = vec!["value".to_string()]; assert!(evaluate(&bound, &row)); } + + #[test] + fn value_expression_supports_common_escape_sequences() { + let expr = parse_value_expression("\"line\\nfeed\\tend\"").unwrap(); + match expr { + ValueExpr::String(text) => assert_eq!(text, "line\nfeed\tend"), + other => panic!("expected string literal, got {:?}", other), + } + } + + #[test] + fn value_expression_preserves_unknown_escape_sequences() { + let expr = parse_value_expression("\"\\\\.\\\\|\\\\(\\\\)\"").unwrap(); + match expr { + ValueExpr::String(text) => assert_eq!(text, "\\.\\|\\(\\)"), + other => panic!("expected string literal, got {:?}", other), + } + } } impl<'a> Lexer<'a> { @@ -896,7 +914,18 @@ impl<'a> Lexer<'a> { if self.pos >= self.chars.len() { bail!("unterminated escape sequence in string literal"); } - value.push(self.chars[self.pos] as char); + let escaped = self.chars[self.pos]; + match escaped { + b'"' => value.push('"'), + b'\\' => value.push('\\'), + b'n' => value.push('\n'), + b'r' => value.push('\r'), + b't' => value.push('\t'), + other => { + value.push('\\'); + 
value.push(other as char); + } + } } b'"' => { self.pos += 1; @@ -1297,3 +1326,4 @@ enum TokenKind { Slash, Caret, } + diff --git a/src/filter.rs b/src/filter.rs index 4e1a814..dfd6783 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -189,4 +189,26 @@ mod tests { let record = StringRecord::from(vec!["4", "3"]); assert!(evaluate(&bound, &record)); } + + #[test] + fn regex_literal_escape_preserved_for_special_characters() { + let expr = parse_expression("$1 ~ \"\\\\.\"").unwrap(); + let headers = vec!["col1".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let matching = StringRecord::from(vec!["value.with.dot"]); + let non_matching = StringRecord::from(vec!["value without dot"]); + assert!(evaluate(&bound, &matching)); + assert!(!evaluate(&bound, &non_matching)); + } + + #[test] + fn regex_literal_escape_handles_vertical_bar() { + let expr = parse_expression("$1 ~ \"\\\\|\"").unwrap(); + let headers = vec!["col1".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let matching = StringRecord::from(vec!["left|right"]); + let non_matching = StringRecord::from(vec!["left/right"]); + assert!(evaluate(&bound, &matching)); + assert!(!evaluate(&bound, &non_matching)); + } } diff --git a/src/mutate.rs b/src/mutate.rs index 398b24e..745cf48 100644 --- a/src/mutate.rs +++ b/src/mutate.rs @@ -482,13 +482,19 @@ fn parse_string_literal(value: &str) -> Result<String> { while let Some(ch) = chars.next() { if ch == '\\' { if let Some(next) = chars.next() { - result.push(match next { - 'n' => '\n', - 't' => '\t', - '"' => '"', - '\\' => '\\', - other => other, - }); + match next { + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + '"' => result.push('"'), + '\\' => result.push('\\'), + other => { + result.push('\\'); + result.push(other); + } + } + } else { + result.push('\\'); } } else { result.push(ch); @@ -650,4 +656,16 @@ mod tests { process_row(&mut row, &ops).unwrap(); assert_eq!(row[0], 
"hello/world"); } + + #[test] + fn parse_string_literal_preserves_regex_escapes() { + let literal = parse_string_literal("\"\\\\.\\\\*\"").unwrap(); + assert_eq!(literal, "\\.\\*"); + } + + #[test] + fn parse_string_literal_handles_common_escapes() { + let literal = parse_string_literal("\"line\\nfeed\\tend\"").unwrap(); + assert_eq!(literal, "line\nfeed\tend"); + } } From f21fbf6323914cee109f24b8b9858190f747bcf6 Mon Sep 17 00:00:00 2001 From: "Z.-L. Deng" Date: Wed, 8 Oct 2025 00:04:18 +0800 Subject: [PATCH 3/3] Update Cargo.toml --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index ad869c3..baf0318 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tsvkit" -version = "0.9.2" +version = "0.9.3" edition = "2024" [dependencies]