Skip to content
Merged

Dev #24

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tsvkit"
version = "0.9.2"
version = "0.9.3"
edition = "2024"

[dependencies]
Expand Down
32 changes: 31 additions & 1 deletion src/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -681,6 +681,24 @@ mod tests {
let row = vec!["value".to_string()];
assert!(evaluate(&bound, &row));
}

#[test]
fn value_expression_supports_common_escape_sequences() {
    // The input is a quoted literal containing the two-character escapes `\n`
    // and `\t`; the parsed string is expected to hold a real newline and tab.
    let parsed = parse_value_expression("\"line\\nfeed\\tend\"").unwrap();
    if let ValueExpr::String(text) = &parsed {
        assert_eq!(*text, "line\nfeed\tend");
    } else {
        panic!("expected string literal, got {:?}", parsed);
    }
}

#[test]
fn value_expression_preserves_unknown_escape_sequences() {
    // Each `\\` pair in the quoted input is expected to collapse to a single
    // backslash, leaving `\.`, `\|`, `\(` and `\)` in the parsed string.
    let parsed = parse_value_expression("\"\\\\.\\\\|\\\\(\\\\)\"").unwrap();
    if let ValueExpr::String(text) = &parsed {
        assert_eq!(*text, "\\.\\|\\(\\)");
    } else {
        panic!("expected string literal, got {:?}", parsed);
    }
}
}

impl<'a> Lexer<'a> {
Expand Down Expand Up @@ -896,7 +914,18 @@ impl<'a> Lexer<'a> {
if self.pos >= self.chars.len() {
bail!("unterminated escape sequence in string literal");
}
value.push(self.chars[self.pos] as char);
let escaped = self.chars[self.pos];
match escaped {
b'"' => value.push('"'),
b'\\' => value.push('\\'),
b'n' => value.push('\n'),
b'r' => value.push('\r'),
b't' => value.push('\t'),
other => {
value.push('\\');
value.push(other as char);
}
}
}
b'"' => {
self.pos += 1;
Expand Down Expand Up @@ -1297,3 +1326,4 @@ enum TokenKind {
Slash,
Caret,
}

22 changes: 22 additions & 0 deletions src/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,26 @@ mod tests {
let record = StringRecord::from(vec!["4", "3"]);
assert!(evaluate(&bound, &record));
}

#[test]
fn regex_literal_escape_preserved_for_special_characters() {
    let headers = vec![String::from("col1")];
    // The expression text is `$1 ~ "\\."`; the test expects it to match only
    // values that contain a literal dot.
    let parsed = parse_expression("$1 ~ \"\\\\.\"").unwrap();
    let bound = bind_expression(parsed, &headers, false).unwrap();

    let with_dot = StringRecord::from(vec!["value.with.dot"]);
    assert!(evaluate(&bound, &with_dot));

    let without_dot = StringRecord::from(vec!["value without dot"]);
    assert!(!evaluate(&bound, &without_dot));
}

#[test]
fn regex_literal_escape_handles_vertical_bar() {
    let headers = vec![String::from("col1")];
    // The expression text is `$1 ~ "\\|"`; the test expects it to match only
    // values that contain a literal `|`.
    let parsed = parse_expression("$1 ~ \"\\\\|\"").unwrap();
    let bound = bind_expression(parsed, &headers, false).unwrap();

    let with_bar = StringRecord::from(vec!["left|right"]);
    assert!(evaluate(&bound, &with_bar));

    let without_bar = StringRecord::from(vec!["left/right"]);
    assert!(!evaluate(&bound, &without_bar));
}
}
141 changes: 120 additions & 21 deletions src/mutate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -371,34 +371,85 @@ fn parse_substitution_expression(
let content = content
.strip_suffix('/')
.with_context(|| "substitution expression must end with '/'")?;
let mut parts = content.splitn(3, '/');
let selector_part = parts
.next()
.with_context(|| "missing selector list in substitution expression")?;
let pattern_part = parts
.next()
.with_context(|| "missing pattern in substitution expression")?;
let replacement_part = parts
.next()
.with_context(|| "missing replacement in substitution expression")?;

let selectors = parse_selector_list(&normalize_selector_spec(selector_part))?;
let (selector_part, pattern_part, replacement_part) =
split_substitution_components(content).with_context(|| {
"substitution expression must use s/selectors/pattern/replacement/ syntax"
})?;

let selectors = parse_selector_list(&normalize_selector_spec(selector_part.trim()))?;
if selectors.is_empty() {
bail!("substitution requires at least one target column");
}
let indices = resolve_selectors(headers, &selectors, no_header)?;
let regex = Regex::new(pattern_part).with_context(|| "invalid regex in substitution")?;
let regex_pattern = unescape_substitution_component(pattern_part);
let regex = Regex::new(&regex_pattern).with_context(|| "invalid regex in substitution")?;
Ok(MutateOp::Substitute {
columns: indices,
pattern: regex,
replacement: replacement_part.to_string(),
replacement: unescape_substitution_component(replacement_part),
})
}

/// Returns `spec` with every `$` character removed; all other characters are
/// kept in their original order.
fn normalize_selector_spec(spec: &str) -> String {
    // Splitting on `$` and concatenating the pieces drops exactly the `$` signs.
    spec.split('$').collect()
}

/// Splits `content` into three parts at its first two unescaped `/` characters.
///
/// A `/` counts as escaped when it directly follows a backslash that is not
/// itself escaped. Backslashes are left in the returned slices untouched, and
/// everything after the second cut (including any further `/`) belongs to the
/// third part.
///
/// Returns `None` when `content` holds fewer than two unescaped `/`.
fn split_substitution_components(content: &str) -> Option<(&str, &str, &str)> {
    // Byte offset of the first `/` in `text` that is not backslash-escaped.
    fn first_unescaped_slash(text: &str) -> Option<usize> {
        let mut after_backslash = false;
        for (offset, symbol) in text.char_indices() {
            match symbol {
                '/' if !after_backslash => return Some(offset),
                // A backslash opens an escape only when it is not escaped itself.
                '\\' => after_backslash = !after_backslash,
                _ => after_backslash = false,
            }
        }
        None
    }

    let head_end = first_unescaped_slash(content)?;
    // `/` is a single byte, so stepping one byte past it stays on a char boundary.
    let tail = &content[head_end + 1..];
    let middle_end = first_unescaped_slash(tail)?;

    Some((
        &content[..head_end],
        &tail[..middle_end],
        &tail[middle_end + 1..],
    ))
}

/// Decodes backslash escapes in one component of a substitution expression.
///
/// `\n`, `\r` and `\t` become newline, carriage return and tab; `\\` becomes a
/// single backslash and `\/` becomes `/`. Any other escaped character is kept
/// together with its backslash, and a lone trailing backslash is kept as is.
fn unescape_substitution_component(raw: &str) -> String {
    let mut decoded = String::new();
    // True while the previous character was a backslash that has not yet been
    // paired with the character it escapes.
    let mut escape_open = false;

    for symbol in raw.chars() {
        if escape_open {
            match symbol {
                'n' => decoded.push('\n'),
                'r' => decoded.push('\r'),
                't' => decoded.push('\t'),
                '\\' => decoded.push('\\'),
                '/' => decoded.push('/'),
                other => {
                    // Unknown escape: keep both characters.
                    decoded.push('\\');
                    decoded.push(other);
                }
            }
            escape_open = false;
        } else if symbol == '\\' {
            escape_open = true;
        } else {
            decoded.push(symbol);
        }
    }

    // Input ended right after a backslash: emit it unchanged.
    if escape_open {
        decoded.push('\\');
    }
    decoded
}

fn split_args(input: &str) -> Vec<String> {
let mut args = Vec::new();
let mut current = String::new();
Expand Down Expand Up @@ -431,13 +482,19 @@ fn parse_string_literal(value: &str) -> Result<String> {
while let Some(ch) = chars.next() {
if ch == '\\' {
if let Some(next) = chars.next() {
result.push(match next {
'n' => '\n',
't' => '\t',
'"' => '"',
'\\' => '\\',
other => other,
});
match next {
'n' => result.push('\n'),
'r' => result.push('\r'),
't' => result.push('\t'),
'"' => result.push('"'),
'\\' => result.push('\\'),
other => {
result.push('\\');
result.push(other);
}
}
} else {
result.push('\\');
}
} else {
result.push(ch);
Expand Down Expand Up @@ -569,4 +626,46 @@ mod tests {
assert_eq!(row[5], "4");
assert_eq!(row[6], "2");
}

#[test]
fn substitution_replacement_supports_escape_sequences() {
    // The operation text is `s/$col1/\t/ /`: the pattern is the escape `\t`
    // and the replacement is one space.
    let specs = vec![String::from("s/$col1/\\t/ /")];
    let headers = vec![String::from("col1")];
    let ops = parse_operations(&specs, &headers, false).unwrap();

    // The cell holds a real tab, which is expected to be replaced by the space.
    let mut cells = vec![String::from("field\tvalue")];
    process_row(&mut cells, &ops).unwrap();
    assert_eq!(cells[0], "field value");
}

#[test]
fn substitution_allows_escaped_slashes() {
    // `\/` inside the pattern and the replacement is expected to stand for a
    // literal slash rather than a component separator.
    let specs = vec![String::from(r"s/$col1/foo\/bar/hello\/world/")];
    let headers = vec![String::from("col1")];
    let ops = parse_operations(&specs, &headers, false).unwrap();

    let mut cells = vec![String::from("foo/bar")];
    process_row(&mut cells, &ops).unwrap();
    assert_eq!(cells[0], "hello/world");
}

#[test]
fn parse_string_literal_preserves_regex_escapes() {
    // The quoted input text is `"\\.\\*"`; each doubled backslash is expected
    // to decode to a single one, yielding `\.\*`.
    let quoted = "\"\\\\.\\\\*\"";
    let decoded = parse_string_literal(quoted).unwrap();
    assert_eq!(decoded, "\\.\\*");
}

#[test]
fn parse_string_literal_handles_common_escapes() {
    // The quoted input carries the two-character escapes `\n` and `\t`; the
    // decoded value is expected to contain a real newline and tab.
    let quoted = "\"line\\nfeed\\tend\"";
    let decoded = parse_string_literal(quoted).unwrap();
    assert_eq!(decoded, "line\nfeed\tend");
}
}