diff --git a/Cargo.lock b/Cargo.lock index 3ac7c72..022c69f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,7 +603,7 @@ dependencies = [ [[package]] name = "tsvkit" -version = "0.9.6" +version = "0.9.9" dependencies = [ "anyhow", "calamine", diff --git a/Cargo.toml b/Cargo.toml index 6be690c..f79fb48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tsvkit" -version = "0.9.6" +version = "0.9.9" edition = "2024" [dependencies] diff --git a/README.md b/README.md index 3625c63..5969f0c 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,9 @@ - [`melt`](#melt) - [`pivot`](#pivot) - [`slice`](#slice) + - [`head`](#head) - [`pretty`](#pretty) + - [`transpose`](#transpose) - [`excel`](#excel) - [`csv`](#csv) - [Additional tips](#additional-tips) @@ -104,7 +106,9 @@ The list below provides a one-line description of every `tsvkit` subcommand. Eac - [`melt`](#melt) — convert wide tables into tidy long form with `variable/value` pairs. - [`pivot`](#pivot) — convert long form back to wide with optional fill value for missing cells. - [`slice`](#slice) — extract rows by 1-based indices or ranges. +- [`head`](#head) — print the first rows as plain TSV (single or multiple files; multi-file output adds a per-file banner). - [`pretty`](#pretty) — render aligned, boxed tables for quick inspection or sharing. +- [`transpose`](#transpose) — transpose rows and columns. - [`excel`](#excel) — inspect, preview, export, or build `.xlsx` workbooks. - [`csv`](#csv) — convert delimited text to TSV with custom separators. @@ -174,6 +178,9 @@ List literals use square brackets: `[1,2,3]`, `["case","control"]`, `[IL6:IL10]` | `log2(expr)` | Base-2 logarithm | | `len(expr)` | Character count using Unicode code points. | | `is_na(expr)` | Returns `1` when the expression is blank/`NA`/`NaN`, otherwise `0`. | +| `upper(expr)` | Convert text to uppercase. | +| `lower(expr)` | Convert text to lowercase. | +| `cap(expr)` | Capitalize only the first character. 
| Functions accept column references (`abs($purity - 1)`), constants, or subexpressions. Empty or non-numeric values yield blanks. @@ -241,8 +248,63 @@ Regex selectors pick up columns whose headers match a pattern. Combine them with tsvkit cut -f '1,group,~"^IL",~"_pct$"' examples/qc.tsv ``` +Injecting the source file basename (`__base__`) or filename with path (`__file__`): + +```bash +tsvkit cut -f '__base__,1:' examples/qc*.tsv +``` + +Explanation: + +- `__base__` injects the source filename as the first column +- `1:` selects all existing columns from each input file + +You can now inject file-derived values directly in `-f` with template selectors: + +```bash +tsvkit cut -f '{file},{base:},1:2' examples/qc.tsv +tsvkit cut -f 'sample={base:#sample_!lower},1:' sample_A.tsv +``` + +Use `--inject-col-names` (aliases: `--file-col`, `--fc`) to rename injected columns. +If you have multiple injected selectors in `-f`, pass comma-separated names in order: + +```bash +tsvkit cut --inject-col-names sample -f '__base__,1:' examples/qc*.tsv +tsvkit cut --inject-col-names file_name,sample -f '{base:},sample={base:#sample_!upper},1:2' sample_A.tsv +``` + +Template tokens: + +- `{file}` / `{__file__}` full path +- `{base}` / `{__base__}` basename +- `{dir}` / `{__dir__}` parent dir +- `{base:}` basename without all extensions +- `{base.}` basename without last extension +- `{file%}` basename of `{file}` +- `{file/}` directory of `{file}` +- `{file^suffix}` remove a literal trailing suffix when present +- `{base:#prefix}` remove a literal prefix when present (example: `{base:#sample_}`) +- case controls: append `!upper`, `!lower`, or `!cap` (for example `{base:!upper}`) + +> Shell tip: `!` is interpreted by many shells in double quotes. Use **single quotes** for selector/template expressions like `'{base:!lower},1:3'` or `'sample={base:!lower},1:'`. If you must use double quotes, escape it: `"{base:\!lower}"`. 
+ +Negative selectors in `cut -f`: + +- `-1` = last column +- `-2` = second-last column +- `-2:` = from second-last to the final column + + Matches deduplicate by default; add `-D/--allow-dups` to keep every occurrence when multiple selectors target the same column. +Useful operational flags: + +- `-H/--no-header` for headerless TSVs (selectors are index-based) +- `-C/--comment-char` to skip comment lines (default `#`) +- `-E/--ignore-empty-row` to skip blank rows +- `-I/--ignore-illegal-row` to skip rows with inconsistent column counts + ### `filter` Filter rows with boolean logic, arithmetic, column ranges, regexes, and list membership tests. @@ -254,6 +316,13 @@ tsvkit filter -e '$group == "case" & $purity >= 0.94' examples/samples.tsv tsvkit filter -e '$status !in ["fail","missing","error"] & $tech ~ "sRNA"' examples/samples.tsv ``` +Case helpers are supported in filter expressions: + +```bash +tsvkit filter -e 'cap($1) == "Hello"' data.tsv +tsvkit filter -e 'upper($group) == "CASE"' examples/samples.tsv +``` + **Expression building blocks for `filter`** | Building block | Examples | Notes | @@ -287,6 +356,34 @@ tsvkit join -f subject_id examples/samples.tsv examples/subjects.tsv Control join type with `-k` (`-k 0` = full outer). Use `-F/--select` to specify output columns (defaults to all non-key columns); syntax mirrors `-f`. `--fill TEXT` supplies placeholders for missing combinations, while `--sorted` streams pre-sorted data. `tsvkit join` trims unused columns before indexing, and `-t/--threads` (default up to 8) balances throughput and resource usage. 
+Use `--add-header` to override emitted non-key header names with per-file/per-column templates: + +```bash +tsvkit join \ + -f 'subject_id;subject_id' \ + -F 'group,purity;sex,age' \ + --add-header '{base:}_group,{base:}_purity;{base:}_sex,{base:}_age' \ + examples/samples.tsv examples/subjects.tsv +``` + +Formatting rules: split files with `;`, columns with `,`, and keep counts aligned with `-F` for each file. Template tokens are shared with `cut -f '{...}'` template selectors. + +When using `-H` (no input header) together with `--add-header`, `join` emits a header row: +- join-key columns are named `index1`, `index2`, ..., `indexN` by default +- non-key columns use your `--add-header` templates. + +Use `--key-header` (alias `--index-name`) to rename join-key columns explicitly. Provide comma-separated names and match the number of join columns. + +Example: + +```bash +tsvkit join -H \ + -f '1;1' \ + --key-header 'sample_id' \ + --add-header 'patient_{base:#sample_}' \ + sample_A.tsv sample_B.tsv +``` + ### `mutate` Create derived columns or rewrite values using expressions. @@ -317,6 +414,25 @@ Apply in-place edits with the sed-style form: tsvkit mutate -e 's/$group/ctrl/control/' examples/samples.tsv ``` +Multiple expressions can be packed into one `-e` clause using `;`: + +```bash +tsvkit mutate -e 'v1=$7/$8;v2=$11/$12' data.tsv +``` + +Create new columns from regex replacement with: + +```bash +tsvkit mutate -e 'new=s/$2/aa[0-9]+/bb/' data.tsv +tsvkit mutate -e 'new2=${1/aa/bb}' data.tsv +``` + +String case helpers can be used directly in mutate expressions: + +```bash +tsvkit mutate -e 'v1=cap($2)' -e 'v2=upper($group)' data.tsv +``` + **Mutation building blocks** | Form | Meaning | Example | @@ -324,6 +440,8 @@ tsvkit mutate -e 's/$group/ctrl/control/' examples/samples.tsv | `name=EXPR` | Append a new column containing the evaluated expression. | `mean_signal=mean($sig1:$sig4)` | | `existing=EXPR` | Overwrite an existing column with the expression result. 
| `purity=round($purity,2)` (via custom helper script) | | `s/$selectors/pattern/replacement/` | Regex substitution on one or more columns (`$` optional). | `s/$group/ctrl/control/` | +| `new=s/$selector/pattern/replacement/` | Create a new column from one source column via regex replacement. | `new=s/$2/aa/bb/` | +| `new=${selector/pattern/replacement}` | Braced shorthand for assignment substitution. | `new=${1/aa/bb}` | **Row-wise aggregators shared by `filter` and `mutate`** @@ -426,6 +544,14 @@ Take specific rows (1-based indices or ranges, including open-ended forms like ` tsvkit slice -r 1,4:5 examples/samples.tsv ``` +### `head` +Print the first rows from TSV input (default `-n 10`). With one input (including stdin), output is plain TSV with no file banner; with multiple files, each block is prefixed by a `# <file>` banner line. + +```bash +tsvkit head -n 5 examples/samples.tsv examples/subjects.tsv +cat examples/samples.tsv | tsvkit head -n 3 +``` + ### `pretty` Render aligned, boxed output for quick inspection. @@ -436,6 +562,13 @@ tsvkit filter -e '$group == "case"' examples/samples.tsv | tsvkit pretty - `--round DIGITS` (or `-r`) rounds numeric cells to the requested precision. Tiny magnitudes automatically switch to scientific notation so columns stay legible even when values approach zero. +### `transpose` +Transpose a table (rows become columns). With headers, the header row is included in transposition; use `-H` for headerless input. + +```bash +tsvkit transpose examples/samples.tsv +``` + ### `excel` Inspect `.xlsx` workbooks, preview sheets, export ranges as TSV, or assemble new workbooks from TSV inputs. Unless `-H/--no-header` is supplied, the first row of each sheet is treated as the header row; use that flag when you need to preview or export raw rows. 
diff --git a/src/common.rs b/src/common.rs index 3af34f8..30048bf 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,7 +1,7 @@ use std::collections::HashSet; use std::fs::File; use std::io::{self, BufReader}; -use std::path::Path; +use std::path::{Path, PathBuf}; use anyhow::{Context, Result, anyhow, bail}; use csv::ReaderBuilder; @@ -30,18 +30,27 @@ pub enum ColumnSelector { FromEnd(usize), Name(String), Regex(String), + Template(String), Range(Option>, Option>), Special(SpecialColumn), } pub fn parse_selector_list(spec: &str) -> Result> { + parse_selector_list_impl(spec, false) +} + +pub fn parse_selector_list_with_templates(spec: &str) -> Result> { + parse_selector_list_impl(spec, true) +} + +fn parse_selector_list_impl(spec: &str, braces_as_templates: bool) -> Result> { if spec.trim().is_empty() { bail!("column specification must not be empty"); } tokenize_selector_spec(spec)? .into_iter() - .map(parse_selector_token) + .map(|token| parse_selector_token(token, braces_as_templates)) .collect() } @@ -136,6 +145,12 @@ fn resolve_selectors_with_options( resolve_selector_indices(headers, selector, no_header, allow_duplicates)?; indices.append(&mut resolved); } + ColumnSelector::Template(template) => { + bail!( + "template selector '{{{}}}' cannot be resolved as a positional index", + template + ); + } ColumnSelector::Range(start, end) => { if headers.is_empty() { bail!("column range cannot be resolved without any columns"); @@ -272,7 +287,235 @@ pub fn default_headers(len: usize) -> Vec { (1..=len).map(|i| format!("col{}", i)).collect() } -fn parse_selector_token(token: SelectorToken) -> Result { +#[derive(Debug, Clone)] +pub struct FileTemplateContext { + pub path: String, + pub base: String, + pub dir: String, +} + +impl FileTemplateContext { + pub fn from_path(path: &Path) -> Self { + if path == Path::new("-") { + return FileTemplateContext { + path: "-".to_string(), + base: "-".to_string(), + dir: ".".to_string(), + }; + } + let full = 
path.to_string_lossy().into_owned(); + let base = path + .file_name() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_else(|| full.clone()); + let dir = path + .parent() + .map(|p| p.to_string_lossy().into_owned()) + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| ".".to_string()); + FileTemplateContext { + path: full, + base, + dir, + } + } +} + +pub fn render_file_template(template: &str, ctx: &FileTemplateContext) -> Result { + let mut out = String::new(); + let chars = template.chars().collect::>(); + let mut i = 0usize; + while i < chars.len() { + if chars[i] == '{' { + let mut j = i + 1; + while j < chars.len() && chars[j] != '}' { + j += 1; + } + if j >= chars.len() { + bail!("unterminated '{{' in template '{}'", template); + } + let token = chars[i + 1..j].iter().collect::(); + out.push_str(&expand_file_token(token.trim(), ctx)?); + i = j + 1; + continue; + } + out.push(chars[i]); + i += 1; + } + Ok(out) +} + +fn expand_file_token(token: &str, ctx: &FileTemplateContext) -> Result { + let (core, case_mode) = if let Some((left, right)) = token.split_once('!') { + (left.trim(), Some(right.trim().to_ascii_lowercase())) + } else { + (token, None) + }; + + let (core, suffix) = if let Some((left, right)) = core.split_once('^') { + (left.trim(), Some(right.to_string())) + } else { + (core, None) + }; + + let (core, prefix) = if let Some((left, right)) = core.split_once('#') { + (left.trim(), Some(right.to_string())) + } else { + (core, None) + }; + + let mut value = match core { + "file" | "__file__" => ctx.path.clone(), + "base" | "__base__" => ctx.base.clone(), + "dir" | "__dir__" => ctx.dir.clone(), + _ => { + let source; + let mut actions = String::new(); + for ch in core.chars() { + if ch.is_ascii_alphabetic() { + actions.push(ch); + continue; + } + } + if core.starts_with("__base__") { + source = "base"; + actions = core["__base__".len()..].to_string(); + } else if core.starts_with("__dir__") { + source = "dir"; + actions = 
core["__dir__".len()..].to_string(); + } else if core.starts_with("__file__") { + source = "file"; + actions = core["__file__".len()..].to_string(); + } else if core.starts_with("base") { + source = "base"; + actions = core["base".len()..].to_string(); + } else if core.starts_with("dir") { + source = "dir"; + actions = core["dir".len()..].to_string(); + } else if core.starts_with("file") { + source = "file"; + actions = core["file".len()..].to_string(); + } else { + bail!("unsupported template token '{}'", token); + } + apply_template_actions(source, &actions, ctx)? + } + }; + + if let Some(sfx) = suffix { + if value.ends_with(&sfx) { + let trimmed = value.len().saturating_sub(sfx.len()); + value.truncate(trimmed); + } + } + + if let Some(pfx) = prefix { + if value.starts_with(&pfx) { + value = value[pfx.len()..].to_string(); + } + } + + if let Some(mode) = case_mode { + value = match mode.as_str() { + "upper" | "u" => value.to_uppercase(), + "lower" | "l" => value.to_lowercase(), + "cap" | "capitalize" | "c" => capitalize_first(&value), + _ => bail!("unsupported case mode '{}'", mode), + }; + } + + Ok(value) +} + +fn apply_template_actions( + source: &str, + actions: &str, + ctx: &FileTemplateContext, +) -> Result { + let mut value = match source { + "file" => ctx.path.clone(), + "base" => ctx.base.clone(), + "dir" => ctx.dir.clone(), + _ => bail!("unsupported template source '{}'", source), + }; + + for ch in actions.chars() { + match ch { + '%' => value = basename(&value), + '/' => value = basedir(&value), + ':' => value = strip_all_extensions(&value), + '.' 
=> value = strip_last_extension(&value), + _ if ch.is_whitespace() => {} + _ => bail!("unsupported template modifier '{}' in '{}'", ch, actions), + } + } + Ok(value) +} + +fn basename(input: &str) -> String { + Path::new(input) + .file_name() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_else(|| input.to_string()) +} + +fn basedir(input: &str) -> String { + PathBuf::from(input) + .parent() + .map(|p| p.to_string_lossy().into_owned()) + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| ".".to_string()) +} + +fn strip_last_extension(input: &str) -> String { + let path = Path::new(input); + if let Some(stem) = path.file_stem() { + if let Some(parent) = path.parent() { + if parent.as_os_str().is_empty() { + return stem.to_string_lossy().into_owned(); + } + return format!("{}/{}", parent.to_string_lossy(), stem.to_string_lossy()); + } + return stem.to_string_lossy().into_owned(); + } + input.to_string() +} + +fn strip_all_extensions(input: &str) -> String { + let path = Path::new(input); + let name = path + .file_name() + .map(|s| s.to_string_lossy().into_owned()) + .unwrap_or_else(|| input.to_string()); + let mut core = name.clone(); + while let Some((left, _)) = core.rsplit_once('.') { + if left.is_empty() { + break; + } + core = left.to_string(); + } + if let Some(parent) = path.parent() { + if parent.as_os_str().is_empty() { + return core; + } + return format!("{}/{}", parent.to_string_lossy(), core); + } + core +} + +fn capitalize_first(input: &str) -> String { + let mut chars = input.chars(); + match chars.next() { + None => String::new(), + Some(first) => { + let mut out = first.to_uppercase().to_string(); + out.push_str(chars.as_str()); + out + } + } +} + +fn parse_selector_token(token: SelectorToken, braces_as_templates: bool) -> Result { if token.text.is_empty() { return Err(anyhow!("empty column selector")); } @@ -289,20 +532,26 @@ fn parse_selector_token(token: SelectorToken) -> Result { let start_selector = if start_trim.is_empty() { None } else { 
- Some(Box::new(parse_simple_selector(start_trim)?)) + Some(Box::new(parse_simple_selector( + start_trim, + braces_as_templates, + )?)) }; let end_selector = if end_trim.is_empty() { None } else { - Some(Box::new(parse_simple_selector(end_trim)?)) + Some(Box::new(parse_simple_selector( + end_trim, + braces_as_templates, + )?)) }; return Ok(ColumnSelector::Range(start_selector, end_selector)); } - parse_simple_selector(&token.text) + parse_simple_selector(&token.text, braces_as_templates) } -fn parse_simple_selector(token: &str) -> Result { +fn parse_simple_selector(token: &str, braces_as_templates: bool) -> Result { if token.is_empty() { return Err(anyhow!("empty column selector")); } @@ -313,6 +562,9 @@ fn parse_simple_selector(token: &str) -> Result { return Ok(ColumnSelector::Name(literal)); } if let Some(literal) = parse_brace_literal(token)? { + if braces_as_templates { + return Ok(ColumnSelector::Template(literal)); + } return Ok(ColumnSelector::Name(literal)); } match token { @@ -663,6 +915,10 @@ fn resolve_selector_indices( } Ok(matches) } + ColumnSelector::Template(template) => bail!( + "template selector '{{{}}}' cannot be resolved as a positional index", + template + ), ColumnSelector::Range(_, _) => unreachable!("range selectors handled separately"), ColumnSelector::Special(special) => bail!( "special column '{}' not supported without column injection", @@ -715,6 +971,12 @@ fn resolve_selector_index( ColumnSelector::Regex(_) => { bail!("regex column selectors cannot be used in range endpoints") } + ColumnSelector::Template(template) => { + bail!( + "template selector '{{{}}}' cannot be used in range endpoints", + template + ) + } ColumnSelector::Special(special) => bail!( "special column '{}' not supported without column injection", special.default_header() @@ -727,9 +989,18 @@ fn resolve_selector_index( #[cfg(test)] mod tests { + use std::fs::{self, File}; + use std::io::{Read, Write}; + use std::path::Path; + use std::time::{SystemTime, UNIX_EPOCH}; + 
+ use flate2::{Compression, write::GzEncoder}; + use xz2::write::XzEncoder; + use super::{ - ColumnSelector, SpecialColumn, parse_selector_list, parse_single_selector, - resolve_selectors, resolve_selectors_allow_duplicates, + ColumnSelector, FileTemplateContext, SpecialColumn, parse_selector_list, + open_path_reader, parse_selector_list_with_templates, parse_single_selector, + render_file_template, resolve_selectors, resolve_selectors_allow_duplicates, }; #[test] @@ -831,6 +1102,13 @@ mod tests { assert!(matches!(selectors[1], ColumnSelector::Name(ref name) if name == "plain")); } + #[test] + fn parses_brace_templates_when_enabled() { + let selectors = parse_selector_list_with_templates("{base:},plain").unwrap(); + assert!(matches!(selectors[0], ColumnSelector::Template(ref name) if name == "base:")); + assert!(matches!(selectors[1], ColumnSelector::Name(ref name) if name == "plain")); + } + #[test] fn parses_negative_indices() { let headers = vec!["a".to_string(), "b".to_string(), "c".to_string()]; @@ -839,6 +1117,36 @@ mod tests { assert_eq!(indices, vec![2, 1]); } + #[test] + fn resolves_negative_open_range() { + let headers = vec![ + "a".to_string(), + "b".to_string(), + "c".to_string(), + "d".to_string(), + ]; + let selectors = parse_selector_list("-2:").unwrap(); + let indices = resolve_selectors(&headers, &selectors, false).unwrap(); + assert_eq!(indices, vec![2, 3]); + } + + #[test] + fn renders_file_template_variants() { + let ctx = FileTemplateContext::from_path(Path::new("/tmp/a.b.c.tsv")); + assert_eq!(render_file_template("{base}", &ctx).unwrap(), "a.b.c.tsv"); + assert_eq!(render_file_template("{base.}", &ctx).unwrap(), "a.b.c"); + assert_eq!(render_file_template("{base:}", &ctx).unwrap(), "a"); + assert_eq!(render_file_template("{file%:}", &ctx).unwrap(), "a"); + let ctx2 = FileTemplateContext::from_path(Path::new("/tmp/sample_A.tsv")); + assert_eq!(render_file_template("{base:#sample_}", &ctx2).unwrap(), "A"); + assert_eq!( + 
render_file_template("{file^.tsv}", &ctx).unwrap(), + "/tmp/a.b.c" + ); + assert_eq!(render_file_template("{__file__%:}", &ctx).unwrap(), "a"); + assert_eq!(render_file_template("{__base__.}", &ctx).unwrap(), "a.b.c"); + } + #[test] fn rejects_unterminated_backtick() { let err = parse_selector_list("`foo").unwrap_err(); @@ -881,4 +1189,52 @@ mod tests { let indices = resolve_selectors_allow_duplicates(&headers, &selectors, false).unwrap(); assert_eq!(indices, vec![0, 1]); } + + #[test] + fn open_path_reader_reads_gzip_files() { + let payload = b"a\tb\n1\t2\n"; + let path = unique_temp_path("common_reader_gz", "tsv.gz"); + + { + let file = File::create(&path).unwrap(); + let mut encoder = GzEncoder::new(file, Compression::default()); + encoder.write_all(payload).unwrap(); + encoder.finish().unwrap(); + } + + let mut reader = open_path_reader(&path).unwrap(); + let mut out = Vec::new(); + reader.read_to_end(&mut out).unwrap(); + assert_eq!(out, payload); + + fs::remove_file(path).unwrap(); + } + + #[test] + fn open_path_reader_reads_xz_files() { + let payload = b"a\tb\n1\t2\n"; + let path = unique_temp_path("common_reader_xz", "tsv.xz"); + + { + let file = File::create(&path).unwrap(); + let mut encoder = XzEncoder::new(file, 6); + encoder.write_all(payload).unwrap(); + encoder.finish().unwrap(); + } + + let mut reader = open_path_reader(&path).unwrap(); + let mut out = Vec::new(); + reader.read_to_end(&mut out).unwrap(); + assert_eq!(out, payload); + + fs::remove_file(path).unwrap(); + } + + fn unique_temp_path(prefix: &str, suffix: &str) -> std::path::PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos(); + std::env::temp_dir().join(format!("{prefix}_{nanos}.{suffix}")) + } } diff --git a/src/csv/mod.rs b/src/csv/mod.rs index deb8966..95af92c 100644 --- a/src/csv/mod.rs +++ b/src/csv/mod.rs @@ -9,7 +9,22 @@ use crate::common::{InputOptions, open_path_reader, should_skip_record}; #[derive(Args, Debug)] #[command( about = 
"Tools for working with comma-separated values", - long_about = "Convert comma-separated data to TSV while preserving headers and allowing custom delimiters." + long_about = "Convert delimited text (CSV by default) to clean TSV output. Supports stdin/files, custom delimiters, comment skipping, NA replacement for blanks, and forgiving parsing for messy quoted inputs.", + after_help = "Examples: + tsvkit csv data.csv > data.tsv + tsvkit csv --delim ';' data_semicolon.csv > data.tsv + tsvkit csv --na NA clinical.csv > clinical.tsv + tsvkit csv --lazy-quotes broken_quotes.csv > repaired.tsv + zcat data.csv.gz | tsvkit csv - > data.tsv + +When to use: + - Ingest CSV before running tsvkit cut/filter/summarize. + - Normalize messy vendor exports into a stable TSV pipeline. + +Option notes: + --delim expects exactly one character. + -H/--no-header treats the first row as data. + --lazy-quotes is useful when embedded quotes are not escaped correctly." )] pub struct CsvArgs { /// Input CSV file (use '-' for stdin; gz/xz supported) diff --git a/src/cut.rs b/src/cut.rs index 65a6c66..d1a4e33 100644 --- a/src/cut.rs +++ b/src/cut.rs @@ -5,27 +5,79 @@ use anyhow::{Context, Result, bail}; use clap::Args; use crate::common::{ - ColumnSelector, InputOptions, SpecialColumn, default_headers, parse_selector_list, - reader_for_path, resolve_selectors, resolve_selectors_allow_duplicates, should_skip_record, + ColumnSelector, FileTemplateContext, InputOptions, SpecialColumn, default_headers, + parse_selector_list_with_templates, reader_for_path, render_file_template, resolve_selectors, + resolve_selectors_allow_duplicates, should_skip_record, }; #[derive(Args, Debug)] #[command( about = "Select and reorder TSV columns", - long_about = "Pick columns by name or 1-based index. Combine comma-separated selectors with ranges (colA:colD or 2:6) and single fields in one spec. Use ~\"regex\" to match columns by pattern. 
Defaults to header-aware mode; add -H for headerless input.\n\nExamples:\n tsvkit cut -f id,sample3,sample1 examples/profiles.tsv\n tsvkit cut -f 'Purity,sample:FN,F1' examples/profiles.tsv\n tsvkit cut -H -f 3,1 data.tsv" + after_help = "Selector syntax: + name,index,range,regex mix: id,2,group:tech,~\"^IL\" + negative indices/ranges: -1,-2: (last column, second-last to end) + literal symbol names: `2:4` (avoid selector parsing) + +Injected selectors: + __file__ inject file path per row + __base__ inject file basename per row + {file} inject rendered template value (also {base}, {dir}, modifiers) + sample={base:!lower} inline header+template alias + +Injected header names: + --inject-col-names sample,file_base + (aliases: --file-col, --fc) + +Template tokens: + {file} {base} {dir} + aliases: __file__ == file, __base__ == base, __dir__ == dir + modifiers: : (strip all ext), . (strip last ext), % (basename), / (dirname) + trims: ^suffix (remove trailing suffix), #prefix (remove leading prefix) + case: !lower !upper !cap + +Shell quoting note: + Use single quotes when templates contain '!': '{base:!lower}' + In double quotes, escape '!': \"{base:\\!lower}\" + +Examples: + tsvkit cut -f 'sample_id,group,purity' examples/samples.tsv + tsvkit cut -f '1,group,~\"^IL\",-1' examples/cytokines.tsv + tsvkit cut -f '{file},{base:},1:2' examples/qc.tsv + tsvkit cut -f 'sample={base:#sample_!lower},1:' sample_A.tsv + tsvkit cut -f '__base__,1:' examples/qc*.tsv + tsvkit cut --inject-col-names sample -f '__base__,1:' examples/qc*.tsv + tsvkit cut --inject-col-names file_name,sample -f '{base:},sample={base:#sample_!upper},1:2' sample_A.tsv + tsvkit cut -H -f '3,1,-1' data.tsv + tsvkit cut -C ';' -E -I -f '1:3' dirty.tsv + tsvkit cut -D -f 'value,~\"^value$\"' duplicated_headers.tsv + +Practical tips: + - Use -f first, then pipe into filter/summarize/sort for analysis workflows. + - Use regex selectors (~\"...\") to keep evolving column groups (e.g. assays). 
+ - Use template/injected selectors to preserve provenance when concatenating files." )] pub struct CutArgs { /// Input TSV file(s) (use '-' for stdin; supports gz/xz) #[arg(value_name = "FILES", num_args = 0.., default_values = ["-"])] pub files: Vec, - /// Fields to select, using names, 1-based indices, ranges (`colA:colD`, `2:5`), regex (`~"^sample"`), or mixes. Comma-separated list. - #[arg(short = 'f', long = "fields", value_name = "COLS", required = true)] + /// Fields to select, using names, 1-based indices, ranges (`colA:colD`, `2:5`), regex (`~"^sample"`), templates (`{base:}`), or mixes. Comma-separated list. + #[arg( + short = 'f', + long = "fields", + value_name = "COLS", + required = true, + allow_hyphen_values = true + )] pub fields: String, - /// Rename the injected file column when using `__file__` or `__base__` - #[arg(long = "file-col", visible_alias = "fc", value_name = "NAME")] - pub file_col: Option, + /// Comma-separated header names for injected columns (template/special selectors in -f) + #[arg( + long = "inject-col-names", + visible_aliases = ["file-col", "fc"], + value_name = "NAMES" + )] + pub inject_col_names: Option, /// Treat the input as headerless (columns referenced by 1-based indices) #[arg(short = 'H', long = "no-header")] @@ -54,7 +106,7 @@ pub struct CutArgs { } pub fn run(args: CutArgs) -> Result<()> { - let selectors = parse_selector_list(&args.fields)?; + let selectors = parse_selector_list_with_templates(&args.fields)?; let input_opts = InputOptions::from_flags( &args.comment_char, args.ignore_empty_row, @@ -62,7 +114,7 @@ pub fn run(args: CutArgs) -> Result<()> { )?; let mut writer = BufWriter::new(io::stdout().lock()); - let file_column_config = FileColumnConfig::new(args.file_col.as_deref()); + let file_column_config = FileColumnConfig::parse(args.inject_col_names.as_deref())?; let mut header_emitted = false; for path in &args.files { @@ -155,6 +207,8 @@ fn process_header_file( let expected_width = headers.len(); if 
!*header_emitted { + file_column_config.validate_count(&columns)?; + let mut injected_idx = 0usize; let header_fields: Vec = columns .iter() .map(|column| match column { @@ -163,7 +217,20 @@ fn process_header_file( .map(|s| s.as_str()) .unwrap_or("") .to_string(), - CutColumn::Injected(special) => file_column_config.header_for(*special), + CutColumn::Injected(special) => { + let name = file_column_config + .header_for(injected_idx) + .unwrap_or_else(|| special.default_header().to_string()); + injected_idx += 1; + name + } + CutColumn::Template { header, .. } => { + let name = file_column_config + .header_for(injected_idx) + .unwrap_or_else(|| header.clone()); + injected_idx += 1; + name + } }) .collect(); if !header_fields.is_empty() { @@ -191,15 +258,14 @@ fn emit_record( let mut fields = Vec::with_capacity(columns.len()); for column in columns { match column { - CutColumn::Index(idx) => fields.push(record.get(*idx).unwrap_or("")), - CutColumn::Injected(special) => fields.push(file_info.value_for(*special)), + CutColumn::Index(idx) => fields.push(record.get(*idx).unwrap_or("").to_string()), + CutColumn::Injected(special) => fields.push(file_info.rendered_value_for(*special)?), + CutColumn::Template { template, .. } => { + fields.push(render_file_template(template, &file_info.context)?) 
+ } } } - if !fields.is_empty() { - writeln!(writer, "{}", fields.join("\t"))?; - } else { - writer.write_all(b"\n")?; - } + writeln!(writer, "{}", fields.join("\t"))?; Ok(()) } @@ -213,15 +279,26 @@ fn build_cut_columns( for selector in selectors { match selector { ColumnSelector::Special(special) => columns.push(CutColumn::Injected(*special)), + ColumnSelector::Template(template) => { + let raw = format!("{{{}}}", template); + columns.push(CutColumn::Template { + header: raw.clone(), + template: raw, + }); + } ColumnSelector::Range(start, end) => { - if start - .as_deref() - .map_or(false, |sel| matches!(sel, ColumnSelector::Special(_))) - || end - .as_deref() - .map_or(false, |sel| matches!(sel, ColumnSelector::Special(_))) - { - bail!("special columns cannot be used within a range selector"); + if start.as_deref().map_or(false, |sel| { + matches!( + sel, + ColumnSelector::Special(_) | ColumnSelector::Template(_) + ) + }) || end.as_deref().map_or(false, |sel| { + matches!( + sel, + ColumnSelector::Special(_) | ColumnSelector::Template(_) + ) + }) { + bail!("special/template columns cannot be used within a range selector"); } let indices = if allow_duplicates { resolve_selectors_allow_duplicates(headers, &[selector.clone()], no_header)? @@ -231,6 +308,12 @@ fn build_cut_columns( columns.extend(indices.into_iter().map(CutColumn::Index)); } _ => { + if let ColumnSelector::Name(name) = selector { + if let Some((header, template)) = parse_inline_template_selector(name) { + columns.push(CutColumn::Template { header, template }); + continue; + } + } let indices = if allow_duplicates { resolve_selectors_allow_duplicates(headers, &[selector.clone()], no_header)? 
} else { @@ -245,55 +328,109 @@ fn build_cut_columns( #[derive(Clone)] struct FileInfo { - path: String, - base: String, + context: FileTemplateContext, } impl FileInfo { fn from_path(path: &Path) -> Self { - if path == Path::new("-") { - return FileInfo { - path: "-".to_string(), - base: "-".to_string(), - }; - } - let path_str = path.to_string_lossy().into_owned(); - let base = path - .file_name() - .map(|s| s.to_string_lossy().into_owned()) - .unwrap_or_else(|| path_str.clone()); FileInfo { - path: path_str, - base, + context: FileTemplateContext::from_path(path), } } - fn value_for(&self, special: SpecialColumn) -> &str { - match special { - SpecialColumn::FilePath => self.path.as_str(), - SpecialColumn::FileBase => self.base.as_str(), - } + fn rendered_value_for(&self, special: SpecialColumn) -> Result { + Ok(match special { + SpecialColumn::FilePath => self.context.path.clone(), + SpecialColumn::FileBase => self.context.base.clone(), + }) } } struct FileColumnConfig<'a> { - rename: Option<&'a str>, + names: Vec<&'a str>, } impl<'a> FileColumnConfig<'a> { - fn new(rename: Option<&'a str>) -> Self { - FileColumnConfig { rename } + fn parse(spec: Option<&'a str>) -> Result { + let names = spec + .map(|s| { + s.split(',') + .map(str::trim) + .filter(|name| !name.is_empty()) + .collect::>() + }) + .unwrap_or_default(); + Ok(FileColumnConfig { names }) + } + + fn header_for(&self, injected_idx: usize) -> Option { + self.names.get(injected_idx).map(|s| s.to_string()) } - fn header_for(&self, special: SpecialColumn) -> String { - match self.rename { - Some(name) => name.to_string(), - None => special.default_header().to_string(), + fn validate_count(&self, columns: &[CutColumn]) -> Result<()> { + let injected_count = columns.iter().filter(|c| c.is_injected()).count(); + if self.names.len() > injected_count { + bail!( + "--inject-col-names provided {} name(s), but only {} injected column(s) are selected", + self.names.len(), + injected_count + ); } + Ok(()) } } enum 
CutColumn { Index(usize), Injected(SpecialColumn), + Template { header: String, template: String }, +} + +impl CutColumn { + fn is_injected(&self) -> bool { + matches!(self, CutColumn::Injected(_) | CutColumn::Template { .. }) + } +} + +fn parse_inline_template_selector(token: &str) -> Option<(String, String)> { + let (header, template) = token.split_once('=')?; + let header = header.trim(); + let template = template.trim(); + if header.is_empty() + || template.len() < 2 + || !template.starts_with('{') + || !template.ends_with('}') + { + return None; + } + Some((header.to_string(), template.to_string())) +} + +#[cfg(test)] +mod tests { + use crate::common::SpecialColumn; + + use super::{CutColumn, FileColumnConfig, parse_inline_template_selector}; + + #[test] + fn parses_inline_template_selector() { + let parsed = parse_inline_template_selector("sample={base:!lower}").unwrap(); + assert_eq!(parsed.0, "sample"); + assert_eq!(parsed.1, "{base:!lower}"); + } + + #[test] + fn parses_injected_column_names() { + let cfg = FileColumnConfig::parse(Some("a, b ,c")).unwrap(); + assert_eq!(cfg.header_for(0).as_deref(), Some("a")); + assert_eq!(cfg.header_for(1).as_deref(), Some("b")); + assert_eq!(cfg.header_for(2).as_deref(), Some("c")); + } + + #[test] + fn rejects_too_many_injected_names() { + let cfg = FileColumnConfig::parse(Some("a,b")).unwrap(); + let columns = vec![CutColumn::Injected(SpecialColumn::FileBase)]; + assert!(cfg.validate_count(&columns).is_err()); + } } diff --git a/src/excel/mod.rs b/src/excel/mod.rs index 70c463d..e93a18c 100644 --- a/src/excel/mod.rs +++ b/src/excel/mod.rs @@ -19,7 +19,34 @@ type CellValue = Data; #[derive(Args, Debug)] #[command( about = "Interact with Excel workbooks", - long_about = "Inspect, preview, export, or build Excel workbooks (xlsx).", + long_about = "Inspect, preview, export, or build Excel workbooks (.xlsx).\n\n`tsvkit excel` has 4 mutually exclusive modes:\n --sheets FILE list sheet metadata (name, size, inferred types)\n 
--preview FILE show first rows from selected sheets\n --dump FILE export selected sheet as TSV\n --load TSV ... create a workbook from TSV input(s), including `.tsv`, `.tsv.gz`, and `.tsv.xz`", + after_help = "Common workflows: + 1) Inspect workbook structure + tsvkit excel --sheets examples/bioinfo_example.xlsx + + 2) Preview one sheet with pretty rendering + tsvkit excel --preview book.xlsx -s Sheet1 -n 15 --pretty + + 3) Dump a subset of rows/columns as TSV + tsvkit excel --dump book.xlsx -s 2 -f 'A:D,score' -r '1:200' + + 4) Build a new workbook from multiple TSV files + tsvkit excel --load cohort.tsv --load qc.tsv -o report.xlsx + +Key option explanations: + -s/--sheet NAME|INDEX repeat to select sheets; default is all/first depending on mode + -f/--fields SPEC for dump mode, select columns by name/index/Excel letters + -r/--rows SPEC for dump mode, select row ranges (supports from-end selectors) + --values/--formulas choose evaluated values (default) or raw formulas + --dates MODE date rendering/writing mode: raw, excel, iso + --na STR blank replacement during dump; NA marker during load + --types MODE load mode type handling: infer or string + --max-rows-per-sheet N split large loads across multiple sheets safely + +Tips: + - Use --no-header when the first row is data, not column names. + - Pair `--dump` with other tsvkit commands for robust TSV pipelines." 
+, group = ArgGroup::new("mode") .args(["sheets", "preview", "dump", "load"]) .required(true), @@ -40,7 +67,7 @@ pub struct ExcelArgs { #[arg(long = "dump", value_name = "FILE", conflicts_with_all = ["sheets", "preview", "load"])] pub dump: Option, - /// TSV inputs to load into a new workbook (repeatable) + /// TSV inputs to load into a new workbook (repeatable; `.tsv`, `.tsv.gz`, `.tsv.xz` supported) #[arg(long = "load", value_name = "TSV", action = ArgAction::Append, conflicts_with_all = ["sheets", "preview", "dump"])] pub load: Vec, diff --git a/src/expression.rs b/src/expression.rs index 98662c5..221260e 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -69,6 +69,9 @@ pub enum FunctionName { Log2, Len, IsNa, + Upper, + Lower, + Cap, } impl FunctionName { @@ -83,8 +86,11 @@ impl FunctionName { "log2" => Ok(FunctionName::Log2), "len" => Ok(FunctionName::Len), "is_na" => Ok(FunctionName::IsNa), + "upper" => Ok(FunctionName::Upper), + "lower" => Ok(FunctionName::Lower), + "cap" => Ok(FunctionName::Cap), other => bail!( - "unsupported function '{}': try abs, sqrt, exp, exp2, ln, log, log10, log2, len, is_na", + "unsupported function '{}': try abs, sqrt, exp, exp2, ln, log, log10, log2, len, is_na, upper, lower, cap", other ), } @@ -497,6 +503,29 @@ where || text.eq_ignore_ascii_case("nan"); bool_eval(is_na) } + FunctionName::Upper => EvalValue { + text: Cow::Owned(inner_eval.text.to_uppercase()), + numeric: None, + }, + FunctionName::Lower => EvalValue { + text: Cow::Owned(inner_eval.text.to_lowercase()), + numeric: None, + }, + FunctionName::Cap => { + let mut chars = inner_eval.text.chars(); + let text = match chars.next() { + None => String::new(), + Some(first) => { + let mut out = first.to_uppercase().to_string(); + out.push_str(chars.as_str()); + out + } + }; + EvalValue { + text: Cow::Owned(text), + numeric: None, + } + } } } BoundValue::Aggregate(spec) => { @@ -1220,6 +1249,39 @@ mod tests { assert_eq!(result.numeric, Some(5.0)); } + #[test] + fn 
string_case_functions_transform_text() { + let headers = vec!["text".to_string()]; + let upper = bind_value_expression( + parse_value_expression("upper($1)").unwrap(), + &headers, + false, + ) + .unwrap(); + let lower = bind_value_expression( + parse_value_expression("lower($1)").unwrap(), + &headers, + false, + ) + .unwrap(); + let cap = + bind_value_expression(parse_value_expression("cap($1)").unwrap(), &headers, false) + .unwrap(); + let row = vec!["hELLo".to_string()]; + assert_eq!(eval_value(&upper, &row).text.as_ref(), "HELLO"); + assert_eq!(eval_value(&lower, &row).text.as_ref(), "hello"); + assert_eq!(eval_value(&cap, &row).text.as_ref(), "HELLo"); + } + + #[test] + fn cap_function_works_in_filter_comparisons() { + let expr = parse_expression("cap($1) == \"HELLO\"").unwrap(); + let headers = vec!["greet".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let row = csv::StringRecord::from(vec!["hELLO"]); + assert!(evaluate(&bound, &row)); + } + #[test] fn membership_operator_accepts_literal_list() { let expr = parse_expression("$group in [\"case\",\"control\"]").unwrap(); diff --git a/src/filter.rs b/src/filter.rs index dfd6783..60cd1bd 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -15,7 +15,22 @@ use crate::expression::{bind_expression, evaluate, parse_expression}; Examples: tsvkit filter -e '$sample2>=5 & $sample3!=9' examples/profiles.tsv tsvkit filter -e '$kingdom ~ "^Bact"' examples/abundance.tsv - tsvkit filter -e 'log2($coverage) > 10' reads.tsv"# + tsvkit filter -e 'log2($coverage) > 10' reads.tsv"#, + after_help = "More expression patterns: + Numeric range: + tsvkit filter -e '$score >= 0.8 & $score <= 0.95' data.tsv + Membership: + tsvkit filter -e '$group in [\"case\",\"control\"]' data.tsv + Negation: + tsvkit filter -e '!($status == \"failed\")' data.tsv + Row-wise aggregate filter: + tsvkit filter -e 'mean($sample1:$sample5) > 10' data.tsv + +Tips: + - Always quote expressions so your shell does not expand 
`$col`. + - Prefer double quotes for string literals inside expressions. + - In -H mode, use `$1`, `$2`, ... selectors. + - `filter` emits headers only when at least one row matches." )] pub struct FilterArgs { /// Input TSV file (use '-' for stdin; compressed files supported) diff --git a/src/head.rs b/src/head.rs new file mode 100644 index 0000000..d36f362 --- /dev/null +++ b/src/head.rs @@ -0,0 +1,120 @@ +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use clap::Args; + +use crate::common::{InputOptions, inconsistent_width_error, should_skip_record}; + +#[derive(Args, Debug)] +#[command( + about = "Print first rows from TSV input", + long_about = "Preview the top portion of one or more TSV files. In header mode (default), the header row is always emitted before data rows. When multiple files are provided, each section is prefixed by '# '.\n\nUseful for quick sanity checks in pipelines before running heavier commands.", + after_help = "Examples: + tsvkit head examples/profiles.tsv + tsvkit head -n 25 examples/abundance.tsv + tsvkit head -n 5 sample_a.tsv sample_b.tsv + tsvkit head -H -n 20 raw_no_header.tsv + zcat large.tsv.gz | tsvkit head -n 10 - + +Notes: + -n/--lines counts only data rows (header is additional when present). + -H/--no-header disables header handling and prints raw first rows. 
+ Pair with `tsvkit pretty` for easier terminal reading: + tsvkit head -n 15 data.tsv | tsvkit pretty" +)] +pub struct HeadArgs { + /// Input TSV file(s) (use '-' for stdin; gz/xz supported) + #[arg(value_name = "FILES", default_value = "-", num_args = 0..)] + pub files: Vec, + + /// Number of data rows to print from the top (default 10) + #[arg(short = 'n', long = "lines", default_value_t = 10)] + pub lines: usize, + + /// Treat input as headerless (no header row is emitted) + #[arg(short = 'H', long = "no-header")] + pub no_header: bool, + + /// Lines starting with this comment character are skipped + #[arg(short = 'C', long = "comment-char", default_value = "#")] + pub comment_char: String, + + /// Ignore rows where every field is empty/whitespace + #[arg(short = 'E', long = "ignore-empty-row")] + pub ignore_empty_row: bool, + + /// Ignore rows whose column count differs from the header/first row + #[arg(short = 'I', long = "ignore-illegal-row")] + pub ignore_illegal_row: bool, +} + +pub fn run(args: HeadArgs) -> Result<()> { + let input_opts = InputOptions::from_flags( + &args.comment_char, + args.ignore_empty_row, + args.ignore_illegal_row, + )?; + + let mut writer = BufWriter::new(io::stdout().lock()); + let show_file_banner = args.files.len() > 1; + + for (idx, file) in args.files.iter().enumerate() { + if idx > 0 { + writeln!(writer)?; + } + if show_file_banner { + writeln!(writer, "# {}", file.display())?; + } + + let mut reader = crate::common::reader_for_path(file, args.no_header, &input_opts)?; + let source_name = format!("\"{}\"", file.display()); + let mut reference_width = None; + let mut header_rows = 0usize; + if !args.no_header { + let header = reader + .headers() + .with_context(|| format!("failed reading header from {}", source_name))? 
+ .iter() + .map(|s| s.to_string()) + .collect::>(); + reference_width = Some(header.len()); + header_rows = 1; + writeln!(writer, "{}", header.join("\t"))?; + } + let mut row_number = 0usize; + let mut emitted_rows = 0usize; + + for record in reader.records() { + if emitted_rows >= args.lines { + break; + } + let record = record.with_context(|| format!("failed reading from {}", source_name))?; + row_number += 1; + if let Some(width) = reference_width { + if record.len() != width { + if input_opts.ignore_illegal { + continue; + } + return Err(inconsistent_width_error( + &source_name, + row_number + header_rows, + width, + record.len(), + )); + } + } + if should_skip_record(&record, &input_opts, reference_width) { + continue; + } + if reference_width.is_none() { + reference_width = Some(record.len()); + } + writeln!(writer, "{}", record.iter().collect::>().join("\t"))?; + emitted_rows += 1; + } + } + + writer.flush()?; + Ok(()) +} diff --git a/src/info.rs b/src/info.rs index b1ec02f..0c0c695 100644 --- a/src/info.rs +++ b/src/info.rs @@ -10,7 +10,25 @@ use crate::common::{InputOptions, reader_for_path, should_skip_record}; #[derive(Args, Debug)] #[command( about = "Inspect TSV dimensions, column types, and value previews", - long_about = r#"Report the table shape and column details for a TSV file (or stdin). The output starts with #shape(rows, cols) followed by a TSV summary listing each column's index, optional name, inferred type (num/date/str), and the first N observed values (default 3). Respects shared options like -H/--no-header, -C/--comment-char, -E/--ignore-empty-row, and -I/--ignore-illegal-row."# + long_about = r#"Report table shape and per-column profile details for a TSV file (or stdin). 
Output starts with #shape(rows, cols), followed by a summary table containing each column's index, optional name, inferred type (num/date/str), and the first N observed values (default 3)."#, + after_help = "Examples: + tsvkit info examples/profiles.tsv + tsvkit info -n 5 examples/abundance.tsv + tsvkit info -H raw_no_header.tsv + zcat large.tsv.gz | tsvkit info - + +How to read output: + #shape(r, c) -> number of data rows and final detected columns + type=num -> all non-empty observed values looked numeric + type=date -> all non-empty observed values matched YYYY-MM-DD + type=str -> mixed/other values + firstN=[...] -> first observed values in that column (for quick QA) + +Practical workflow: + Use `info` first, then feed column names into: + tsvkit cut -f ... + tsvkit filter -e ... + tsvkit summarize -s ..." )] pub struct InfoArgs { /// Input TSV file (use '-' for stdin; gz/xz supported) diff --git a/src/join.rs b/src/join.rs index 100c8c8..92f0208 100644 --- a/src/join.rs +++ b/src/join.rs @@ -8,15 +8,29 @@ use num_cpus; use rayon::{ThreadPoolBuilder, prelude::*}; use crate::common::{ - ColumnSelector, InputOptions, default_headers, inconsistent_width_error, - parse_multi_selector_spec, parse_selector_list, reader_for_path, resolve_selectors, - should_skip_record, + ColumnSelector, FileTemplateContext, InputOptions, default_headers, inconsistent_width_error, + parse_multi_selector_spec, parse_selector_list, reader_for_path, render_file_template, + resolve_selectors, should_skip_record, }; #[derive(Args, Debug)] #[command( about = "Join multiple TSV files on shared key columns", - long_about = "Join two or more TSV files on one or more key columns. Provide selectors with -f/--fields (comma-separated list; use semicolons to give per-file specs). Each file must contribute the same number of key columns. Use -F/--select to control which non-key columns are emitted per file (wrap multi-file specs in quotes). Keys default to an inner join; adjust with -k/--keep. 
Control parallel input loading with -t/--threads (defaults to min(8, available CPUs)). When inputs are pre-sorted by the key, add --sorted to stream without buffering.\n\nExamples:\n tsvkit join -f id examples/metadata.tsv examples/abundance.tsv\n tsvkit join -f 'sample_id,taxon;id,taxon_id' file1.tsv file2.tsv\n tsvkit join -f subject_id;subject_id -F 'sample_id,group;age,sex' examples/samples.tsv examples/subjects.tsv\n tsvkit join -f id -k 0 examples/metadata.tsv examples/abundance.tsv" + long_about = "Join two or more TSV files on one or more key columns. Provide selectors with -f/--fields (comma-separated list; use semicolons to give per-file specs). Each file must contribute the same number of key columns. Use -F/--select to control which non-key columns are emitted per file (wrap multi-file specs in quotes). Keys default to an inner join; adjust with -k/--keep. Control parallel input loading with -t/--threads (defaults to min(8, available CPUs)). When inputs are pre-sorted by the key, add --sorted to stream without buffering.\n\nExamples:\n tsvkit join -f id examples/metadata.tsv examples/abundance.tsv\n tsvkit join -f 'sample_id,taxon;id,taxon_id' file1.tsv file2.tsv\n tsvkit join -f subject_id;subject_id -F 'sample_id,group;age,sex' examples/samples.tsv examples/subjects.tsv\n tsvkit join -f id -k 0 examples/metadata.tsv examples/abundance.tsv", + after_help = "Join mode guide: + default (inner): keep keys present in every file + -k 0 : full outer join (keep union of keys) + -k 1,3 : keep keys present in file1 or file3 (plus standard matches) + +Field spec guide: + -f 'id;id' -> join file1.id with file2.id + -f 'a,b;x,y' -> 2-column key join + -F 'name,group;count' -> choose emitted non-key columns per file + --add-header 'meta_{base},abund_{base}' -> custom output header templates + +Performance tips: + Use --sorted when all inputs are already sorted by join keys. + Use -t to tune parallel loading for very large datasets." 
)] pub struct JoinArgs { /// Input TSV files to join (use '-' to read from stdin; `.tsv`, `.tsv.gz`, `.tsv.xz` all supported) @@ -67,6 +81,14 @@ pub struct JoinArgs { /// Fill value to use when a joined file lacks data for a given key (defaults to empty string) #[arg(long = "fill", value_name = "TEXT")] pub fill: Option, + + /// Override emitted non-key headers. Use comma per included column and ';' per file. Templates support {file}, {base}, {dir}, {base:}, {base.}, {file%}, {file/}, {file^suffix}. + #[arg(long = "add-header", value_name = "SPEC")] + pub add_header: Option, + + /// Override emitted join-key header names (comma-separated). Must match number of join columns. Alias: --index-name. + #[arg(long = "key-header", alias = "index-name", value_name = "NAMES")] + pub key_header: Option, } #[derive(Debug, Clone)] @@ -312,9 +334,11 @@ fn execute_join(args: &JoinArgs, input_opts: &InputOptions, fill_value: &str) -> &select_specs, &keep, args.no_header, - !args.no_header, + !args.no_header || args.add_header.is_some() || args.key_header.is_some(), input_opts, fill_value, + args.add_header.as_deref(), + args.key_header.as_deref(), ); } @@ -344,7 +368,15 @@ fn execute_join(args: &JoinArgs, input_opts: &InputOptions, fill_value: &str) -> }) .collect::>>()?; - output_joined(tables, &keep, !args.no_header) + output_joined( + tables, + &keep, + !args.no_header || args.add_header.is_some() || args.key_header.is_some(), + args.no_header, + &args.files, + args.add_header.as_deref(), + args.key_header.as_deref(), + ) } fn load_table( @@ -695,40 +727,25 @@ fn parse_keep_option(spec: Option<&str>, file_count: usize) -> Result, keep: &KeepStrategy, has_header: bool) -> Result<()> { +fn output_joined( + tables: Vec, + keep: &KeepStrategy, + emit_header: bool, + no_header: bool, + files: &[PathBuf], + add_header_spec: Option<&str>, + key_header_spec: Option<&str>, +) -> Result<()> { let mut writer = BufWriter::new(io::stdout().lock()); - if has_header { - let mut seen: HashMap 
= HashMap::new(); - let mut header_fields = Vec::new(); - - if let Some(first_table) = tables.first() { - for &idx in &first_table.join_indices { - let original = first_table.headers.get(idx).cloned().unwrap_or_default(); - let entry = seen.entry(original.clone()).or_insert(0); - if *entry == 0 { - *entry = 1; - header_fields.push(original); - } else { - *entry += 1; - header_fields.push(format!("{}#{}", original, *entry)); - } - } - } - - for (table_idx, table) in tables.iter().enumerate() { - for &col_idx in &table.include_indices { - let original = table.headers.get(col_idx).cloned().unwrap_or_default(); - let entry = seen.entry(original.clone()).or_insert(0); - if *entry == 0 { - *entry = 1; - header_fields.push(original); - } else { - *entry += 1; - header_fields.push(format!("{}#{}", original, table_idx + 1)); - } - } - } + if emit_header { + let header_fields = build_join_headers( + tables.as_slice(), + files, + add_header_spec, + key_header_spec, + no_header, + )?; if !header_fields.is_empty() { writeln!(writer, "{}", header_fields.join(" "))?; } @@ -824,9 +841,11 @@ fn stream_join( select_specs: &[Option>], keep: &KeepStrategy, no_header: bool, - has_header: bool, + emit_header: bool, input_opts: &InputOptions, fill_value: &str, + add_header_spec: Option<&str>, + key_header_spec: Option<&str>, ) -> Result<()> { let mut tables = Vec::with_capacity(files.len()); for (idx, path) in files.iter().enumerate() { @@ -853,8 +872,15 @@ fn stream_join( let mut writer = BufWriter::new(io::stdout().lock()); - if has_header { - write_stream_header(&tables, &mut writer)?; + if emit_header { + write_stream_header( + &tables, + files, + add_header_spec, + key_header_spec, + no_header, + &mut writer, + )?; } loop { @@ -931,14 +957,45 @@ fn gather_row_sets<'a>( fn write_stream_header( tables: &[StreamTable], + files: &[PathBuf], + add_header_spec: Option<&str>, + key_header_spec: Option<&str>, + no_header: bool, writer: &mut BufWriter>, ) -> Result<()> { + let header_fields 
= + build_join_headers_for_stream(tables, files, add_header_spec, key_header_spec, no_header)?; + if !header_fields.is_empty() { + writeln!(writer, "{}", header_fields.join("\t"))?; + } + Ok(()) +} + +fn build_join_headers_for_stream( + tables: &[StreamTable], + files: &[PathBuf], + add_header_spec: Option<&str>, + key_header_spec: Option<&str>, + no_header: bool, +) -> Result> { let mut seen: HashMap = HashMap::new(); let mut header_fields = Vec::new(); if let Some(first_table) = tables.first() { - for &idx in &first_table.join_indices { - let original = first_table.headers.get(idx).cloned().unwrap_or_default(); + let key_headers = + resolve_key_headers_spec(key_header_spec, first_table.join_indices.len())?; + for (pos, &idx) in first_table.join_indices.iter().enumerate() { + let original = key_headers + .as_ref() + .and_then(|vals| vals.get(pos)) + .cloned() + .unwrap_or_else(|| { + if no_header { + format!("index{}", pos + 1) + } else { + first_table.headers.get(idx).cloned().unwrap_or_default() + } + }); let entry = seen.entry(original.clone()).or_insert(0); if *entry == 0 { *entry = 1; @@ -951,8 +1008,18 @@ fn write_stream_header( } for (table_idx, table) in tables.iter().enumerate() { - for &col_idx in &table.include_indices { - let original = table.headers.get(col_idx).cloned().unwrap_or_default(); + let custom_headers = resolve_add_headers_for_file( + add_header_spec, + files, + table_idx, + table.include_indices.len(), + )?; + for (pos, &col_idx) in table.include_indices.iter().enumerate() { + let original = custom_headers + .as_ref() + .and_then(|vals| vals.get(pos)) + .cloned() + .unwrap_or_else(|| table.headers.get(col_idx).cloned().unwrap_or_default()); let entry = seen.entry(original.clone()).or_insert(0); if *entry == 0 { *entry = 1; @@ -964,11 +1031,154 @@ fn write_stream_header( } } - if !header_fields.is_empty() { - writeln!(writer, "{}", header_fields.join("\t"))?; + Ok(header_fields) +} + +fn build_join_headers( + tables: &[Table], + files: 
&[PathBuf], + add_header_spec: Option<&str>, + key_header_spec: Option<&str>, + no_header: bool, +) -> Result> { + let mut seen: HashMap = HashMap::new(); + let mut header_fields = Vec::new(); + + if let Some(first_table) = tables.first() { + let key_headers = + resolve_key_headers_spec(key_header_spec, first_table.join_indices.len())?; + for (pos, &idx) in first_table.join_indices.iter().enumerate() { + let original = key_headers + .as_ref() + .and_then(|vals| vals.get(pos)) + .cloned() + .unwrap_or_else(|| { + if no_header { + format!("index{}", pos + 1) + } else { + first_table.headers.get(idx).cloned().unwrap_or_default() + } + }); + let entry = seen.entry(original.clone()).or_insert(0); + if *entry == 0 { + *entry = 1; + header_fields.push(original); + } else { + *entry += 1; + header_fields.push(format!("{}#{}", original, *entry)); + } + } } - Ok(()) + for (table_idx, table) in tables.iter().enumerate() { + let custom_headers = resolve_add_headers_for_file( + add_header_spec, + files, + table_idx, + table.include_indices.len(), + )?; + for (pos, &col_idx) in table.include_indices.iter().enumerate() { + let original = custom_headers + .as_ref() + .and_then(|vals| vals.get(pos)) + .cloned() + .unwrap_or_else(|| table.headers.get(col_idx).cloned().unwrap_or_default()); + let entry = seen.entry(original.clone()).or_insert(0); + if *entry == 0 { + *entry = 1; + header_fields.push(original); + } else { + *entry += 1; + header_fields.push(format!("{}#{}", original, table_idx + 1)); + } + } + } + + Ok(header_fields) +} + +fn resolve_key_headers_spec(spec: Option<&str>, join_width: usize) -> Result>> { + let Some(raw) = spec.map(|s| s.trim()).filter(|s| !s.is_empty()) else { + return Ok(None); + }; + let headers: Vec = raw + .split(',') + .map(|item| item.trim().to_string()) + .filter(|item| !item.is_empty()) + .collect(); + if headers.is_empty() { + bail!("--key-header specification must not be empty"); + } + if headers.len() != join_width { + bail!( + "--key-header 
defines {} names, but join uses {} key columns", + headers.len(), + join_width + ); + } + Ok(Some(headers)) +} + +fn resolve_add_headers_for_file( + add_header_spec: Option<&str>, + files: &[PathBuf], + file_idx: usize, + include_count: usize, +) -> Result>> { + let Some(raw) = add_header_spec.map(|s| s.trim()).filter(|s| !s.is_empty()) else { + return Ok(None); + }; + let groups = parse_add_header_groups(raw, files.len())?; + let group = groups + .get(file_idx) + .with_context(|| format!("missing --add-header group for file {}", file_idx + 1))?; + if group.len() != include_count { + bail!( + "--add-header group {} defines {} headers, but file {} includes {} non-key columns", + file_idx + 1, + group.len(), + file_idx + 1, + include_count + ); + } + let context = FileTemplateContext::from_path( + files + .get(file_idx) + .with_context(|| format!("missing path for file {}", file_idx + 1))?, + ); + let rendered = group + .iter() + .map(|item| render_file_template(item, &context)) + .collect::>>()?; + Ok(Some(rendered)) +} + +fn parse_add_header_groups(raw: &str, file_count: usize) -> Result>> { + let parts: Vec> = raw + .split(';') + .map(|group| { + group + .split(',') + .map(|item| item.trim().to_string()) + .filter(|item| !item.is_empty()) + .collect::>() + }) + .filter(|group| !group.is_empty()) + .collect(); + if parts.is_empty() { + bail!("--add-header specification must not be empty"); + } + if parts.len() == 1 && file_count > 1 { + return Ok(vec![parts[0].clone(); file_count]); + } + if parts.len() != file_count { + bail!( + "--add-header expects {} file groups, got {}", + file_count, + parts.len() + ); + } + Ok(parts) } fn write_stream_combinations( @@ -1048,3 +1258,74 @@ fn write_combinations( } } } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn add_header_with_no_header_uses_index_prefix_for_join_keys() { + let table1 = Table { + headers: vec!["col1".to_string(), "col2".to_string()], + join_indices: vec![0], + include_indices: vec![1], + 
rows: Vec::new(), + key_to_rows: HashMap::new(), + key_order: Vec::new(), + empty_row: vec![], + }; + let table2 = Table { + headers: vec!["col1".to_string(), "col2".to_string()], + join_indices: vec![0], + include_indices: vec![1], + rows: Vec::new(), + key_to_rows: HashMap::new(), + key_order: Vec::new(), + empty_row: vec![], + }; + let files = vec![ + PathBuf::from("/tmp/sample_A.tsv"), + PathBuf::from("/tmp/sample_B.tsv"), + ]; + let headers = build_join_headers( + &[table1, table2], + &files, + Some("patient_{base:#sample_};patient_{base:#sample_}"), + None, + true, + ) + .unwrap(); + assert_eq!(headers[0], "index1"); + assert_eq!(headers[1], "patient_A"); + assert_eq!(headers[2], "patient_B"); + } + + #[test] + fn key_header_overrides_join_key_names() { + let table1 = Table { + headers: vec!["col1".to_string(), "col2".to_string()], + join_indices: vec![0, 1], + include_indices: vec![], + rows: Vec::new(), + key_to_rows: HashMap::new(), + key_order: Vec::new(), + empty_row: vec![], + }; + let table2 = Table { + headers: vec!["col1".to_string(), "col2".to_string()], + join_indices: vec![0, 1], + include_indices: vec![], + rows: Vec::new(), + key_to_rows: HashMap::new(), + key_order: Vec::new(), + empty_row: vec![], + }; + let files = vec![ + PathBuf::from("/tmp/sample_A.tsv"), + PathBuf::from("/tmp/sample_B.tsv"), + ]; + let headers = + build_join_headers(&[table1, table2], &files, None, Some("id,sample"), true).unwrap(); + assert_eq!(headers, vec!["id".to_string(), "sample".to_string()]); + } +} diff --git a/src/main.rs b/src/main.rs index 5160e9b..cbfcd61 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,6 +10,7 @@ mod cut; mod excel; mod expression; mod filter; +mod head; mod info; mod join; mod melt; @@ -19,13 +20,14 @@ mod pretty; mod slice; mod sort; mod summarize; +mod transpose; #[derive(Parser)] #[command( name = "tsvkit", version, about = "High-level TSV toolkit: join, filter, reshape, summarize.", - long_about = "tsvkit is a Swiss-army knife for 
tab-separated data. It ships focused subcommands for joins, filtering, column selection, statistics, reshaping, and pretty-printing. Every subcommand reads from files or standard input and respects headers by default.", + long_about = "tsvkit is a Swiss-army knife for tab-separated data. It ships focused subcommands for joins, filtering, column selection, statistics, reshaping, spreadsheet interop, and pretty-printing.\n\nQuick start workflow:\n 1) Inspect columns and value types\n tsvkit info data.tsv\n 2) Keep only columns you need\n tsvkit cut -f 'sample_id,group,score' data.tsv\n 3) Filter rows with an expression\n tsvkit filter -e '$score >= 80 & $group == \"case\"' data.tsv\n 4) Sort results and preview\n tsvkit sort -k score:nr data.tsv | tsvkit head -n 20 | tsvkit pretty\n\nMost commands are header-aware by default and share these safety flags:\n -H/--no-header treat input as headerless\n -C/--comment-char skip comment-prefixed lines (default '#')\n -E/--ignore-empty-row skip fully empty rows\n -I/--ignore-illegal-row skip inconsistent-width rows\n\nFile inputs are auto-detected as plain `.tsv`, `.tsv.gz`, or `.tsv.xz` whenever a subcommand reads TSV data.\n\nGet detailed usage for any subcommand:\n tsvkit --help\n\nExamples:\n tsvkit join --help\n tsvkit summarize --help\n tsvkit excel --help", author = "tsvkit" )] struct Cli { @@ -35,31 +37,35 @@ struct Cli { #[derive(Subcommand)] enum Commands { - /// Join multiple TSV files on matching keys + /// Join multiple TSV files on matching keys (inner/left/right/full) Join(join::JoinArgs), - /// Summarize columns after grouping rows + /// Summarize numeric/text columns with grouped statistics Summarize(summarize::SummarizeArgs), - /// Select and reorder columns + /// Select, reorder, and inject columns (including file metadata) Cut(cut::CutArgs), - /// Pretty-print the table with aligned columns + /// Pretty-print TSV as an aligned terminal table Pretty(pretty::PrettyArgs), - /// Filter rows using expressions 
on column values + /// Filter rows using boolean expressions on column values Filter(filter::FilterArgs), - /// Pivot long data into wide format + /// Pivot long/tidy data into wide format Pivot(pivot::PivotArgs), - /// Melt wide data into long format + /// Melt wide tables into long/tidy format Melt(melt::MeltArgs), - /// Sort rows by one or more key columns + /// Sort rows by one or more text/numeric key columns Sort(sort::SortArgs), - /// Add derived columns or rewrite existing ones + /// Add derived columns or rewrite values with expressions/substitutions Mutate(mutate::MutateArgs), - /// Slice rows by 1-based index + /// Print the first N rows (header-aware) + Head(head::HeadArgs), + /// Transpose rows and columns + Transpose(transpose::TransposeArgs), + /// Slice rows by 1-based index/ranges (supports from-end selectors) Slice(slice::SliceArgs), - /// Excel-focused helpers (inspect, preview, convert, load) + /// Excel helpers: inspect sheets, preview, dump TSV, or load TSV into xlsx Excel(excel::ExcelArgs), - /// CSV utilities (convert to TSV) + /// CSV utilities (convert CSV/TSV-like delimited text into TSV) Csv(csv::CsvArgs), - /// Inspect TSV schema, column types, and previews + /// Inspect table shape, schema hints, and sample values Info(info::InfoArgs), } @@ -76,6 +82,8 @@ fn main() -> Result<()> { Commands::Melt(args) => melt::run(args), Commands::Sort(args) => sort::run(args), Commands::Mutate(args) => mutate::run(args), + Commands::Head(args) => head::run(args), + Commands::Transpose(args) => transpose::run(args), Commands::Slice(args) => slice::run(args), Commands::Excel(args) => excel::run(args, &raw_args), Commands::Csv(args) => csv::run(args), diff --git a/src/melt.rs b/src/melt.rs index 1421025..3986a64 100644 --- a/src/melt.rs +++ b/src/melt.rs @@ -13,7 +13,22 @@ use crate::common::{ #[derive(Args, Debug)] #[command( about = "Melt wide TSV tables into long form", - long_about = "Convert wide TSV tables into a tidy long format. 
Use -i/--id to keep identifier columns, optionally -v/--value-cols to target specific value columns, and rename the generated columns with --variable/--value. Defaults to header-aware mode; add -H for headerless files.\n\nExample:\n tsvkit melt -i id examples/profiles.tsv" + long_about = "Convert wide TSV tables into a tidy long format. Use -i/--id to keep identifier columns, optionally -v/--value-cols to target specific value columns, and rename the generated columns with --variable/--value. Defaults to header-aware mode; add -H for headerless files.\n\nExample:\n tsvkit melt -i id examples/profiles.tsv", + after_help = "Use cases: + Keep subject metadata while melting assay columns: + tsvkit melt -i 'sample_id,group' -v 'IL6:IL10' cytokines.tsv + Melt all non-id columns: + tsvkit melt -i id profiles.tsv + Headerless matrix: + tsvkit melt -H -i 1 -v 2: raw.tsv + +Column naming: + --variable sets the output column storing former header names. + --value sets the output column storing cell values. 
+ +Tip: + `melt` is commonly paired with `pivot`: + tsvkit melt -i id wide.tsv | tsvkit pivot -i id -c variable -v value" )] pub struct MeltArgs { /// Input TSV file (use '-' for stdin; gz/xz supported) diff --git a/src/mutate.rs b/src/mutate.rs index 3900bb7..bdc7c5d 100644 --- a/src/mutate.rs +++ b/src/mutate.rs @@ -21,7 +21,19 @@ Examples: tsvkit mutate -e "coverage_sum=sum($1,$3:$5)" examples/profiles.tsv tsvkit mutate -e "log_counts=mean($count1:$count5)" counts.tsv tsvkit mutate -e "title_clean=sub($title,\"\\s+\", \"_\")" titles.tsv - tsvkit mutate -e 's/$col1:$col3/NA/0/' data.tsv"# + tsvkit mutate -e 's/$col1:$col3/NA/0/' data.tsv"#, + after_help = "Mutation patterns: + Add new columns (evaluated left-to-right): + tsvkit mutate -e 'total=$a+$b' -e 'ratio=$a/$total' data.tsv + Recode text in place: + tsvkit mutate -e 's/$group/^ctrl$/control/' data.tsv + Combine assignment + substitution: + tsvkit mutate -e 'score_z=($score-mean($score))/sd($score)' -e 's/$status/NA/unknown/' data.tsv + +Important notes: + - Always prefix column references with `$`. + - New columns created in earlier -e expressions can be used in later ones. + - Use single quotes around -e arguments to protect `$...` from shell expansion." )] pub struct MutateArgs { /// Input TSV file (use '-' for stdin; gz/xz supported) @@ -194,7 +206,7 @@ fn parse_operations( ) -> Result> { let mut ops = Vec::new(); let mut current_headers = headers.to_vec(); - for expr in exprs { + for expr in flatten_expr_clauses(exprs)? 
{ let trimmed = expr.trim(); if trimmed.is_empty() { bail!("mutation expression must not be empty"); @@ -310,6 +322,25 @@ fn find_assignment(expr: &str) -> Option { fn parse_function(value: &str, headers: &[String], no_header: bool) -> Result<FunctionSpec> { let trimmed = value.trim(); + if trimmed.starts_with("s/") { + let sub = parse_substitution_spec(trimmed, headers, no_header)?; + return Ok(FunctionSpec::SubNew { + column: sub.column, + pattern: sub.pattern, + replacement: sub.replacement, + }); + } + + if trimmed.starts_with("${") && trimmed.ends_with('}') { + let inner = &trimmed[2..trimmed.len() - 1]; + let sub = parse_braced_substitution_spec(inner, headers, no_header)?; + return Ok(FunctionSpec::SubNew { + column: sub.column, + pattern: sub.pattern, + replacement: sub.replacement, + }); + } + if let Some(rest) = trimmed.strip_prefix("sub(") { let inner = rest .strip_suffix(')') @@ -390,6 +421,61 @@ fn parse_substitution_expression( }) } +struct SubstitutionSpec { + column: usize, + pattern: Regex, + replacement: String, +} + +fn parse_substitution_spec( + expr: &str, + headers: &[String], + no_header: bool, +) -> Result<SubstitutionSpec> { + let content = expr.trim_start_matches("s/"); + let content = content + .strip_suffix('/') + .with_context(|| "substitution expression must end with '/'")?; + let (selector_part, pattern_part, replacement_part) = split_substitution_components(content) + .with_context( + || "substitution expression must use s/selectors/pattern/replacement/ syntax", + )?; + let selectors = parse_selector_list(&normalize_selector_spec(selector_part.trim()))?; + if selectors.len() != 1 { + bail!("assignment substitution requires exactly one target column"); + } + let indices = resolve_selectors(headers, &selectors, no_header)?; + let regex_pattern = unescape_substitution_component(pattern_part); + let regex = Regex::new(&regex_pattern).with_context(|| "invalid regex in substitution")?; + Ok(SubstitutionSpec { + column: indices[0], + pattern: regex, + replacement:
unescape_substitution_component(replacement_part), + }) +} + +fn parse_braced_substitution_spec( + expr: &str, + headers: &[String], + no_header: bool, +) -> Result<SubstitutionSpec> { + let (selector_part, pattern_part, replacement_part) = split_substitution_components(expr) + .with_context(|| "braced substitution must use ${selector/pattern/replacement} syntax")?; + let selector_text = selector_part.trim().trim_start_matches('$'); + let selectors = parse_selector_list(selector_text)?; + if selectors.len() != 1 { + bail!("braced substitution requires exactly one target column"); + } + let indices = resolve_selectors(headers, &selectors, no_header)?; + let regex = Regex::new(&unescape_substitution_component(pattern_part)) + .with_context(|| "invalid regex in braced substitution")?; + Ok(SubstitutionSpec { + column: indices[0], + pattern: regex, + replacement: unescape_substitution_component(replacement_part), + }) +} + fn normalize_selector_spec(spec: &str) -> String { spec.replace('$', "") } @@ -473,6 +559,44 @@ fn split_args(input: &str) -> Vec { args } +fn flatten_expr_clauses(exprs: &[String]) -> Result<Vec<String>> { + let mut out = Vec::new(); + for expr in exprs { + let mut in_single = false; + let mut in_double = false; + let mut escaped = false; + let mut current = String::new(); + for ch in expr.chars() { + match ch { + '\\' if !escaped => { + escaped = true; + current.push(ch); + continue; + } + '\'' if !escaped && !in_double => in_single = !in_single, + '"' if !escaped && !in_single => in_double = !in_double, + ';' if !in_single && !in_double => { + if !current.trim().is_empty() { + out.push(current.trim().to_string()); + } + current.clear(); + continue; + } + _ => {} + } + escaped = false; + current.push(ch); + } + if in_single || in_double { + bail!("unterminated quote in -e expression '{}'", expr); + } + if !current.trim().is_empty() { + out.push(current.trim().to_string()); + } + } + Ok(out) +} + fn parse_string_literal(value: &str) -> Result { let trimmed = value.trim(); if
trimmed.len() >= 2 && trimmed.starts_with('"') && trimmed.ends_with('"') { @@ -679,4 +803,32 @@ mod tests { let literal = parse_string_literal("\"line\\nfeed\\tend\"").unwrap(); assert_eq!(literal, "line\nfeed\tend"); } + + #[test] + fn supports_multiple_assignments_in_single_e_clause() { + let headers = vec!["x".to_string(), "y".to_string()]; + let ops = parse_operations(&["v1=$x;v2=$y".to_string()], &headers, false).unwrap(); + let mut row = vec!["6".to_string(), "3".to_string()]; + process_row(&mut row, &ops).unwrap(); + assert_eq!(row[2], "6"); + assert_eq!(row[3], "3"); + } + + #[test] + fn assignment_substitution_supports_s_syntax_and_braced_syntax() { + let headers = vec!["c1".to_string()]; + let ops = parse_operations( + &[ + "new=s/$c1/aa[0-9]+/bb/".to_string(), + "new2=${1/aa/bb}".to_string(), + ], + &headers, + false, + ) + .unwrap(); + let mut row = vec!["aa12".to_string()]; + process_row(&mut row, &ops).unwrap(); + assert_eq!(row[1], "bb"); + assert_eq!(row[2], "bb12"); + } } diff --git a/src/pivot.rs b/src/pivot.rs index 58d074c..58a967c 100644 --- a/src/pivot.rs +++ b/src/pivot.rs @@ -13,7 +13,19 @@ use crate::common::{ #[derive(Args, Debug)] #[command( about = "Pivot long-form TSV data into wide tables", - long_about = "Convert long (tidy) TSV data into a wide table by promoting row values to columns. Specify which columns remain as identifiers (-i/--index), which column provides the new headers (-c/--column), and which supplies cell values (-v/--value). Use --fill for missing combinations.\n\nExample:\n tsvkit melt -i id examples/profiles.tsv | tsvkit pivot -i id -c variable -v value" + long_about = "Convert long (tidy) TSV data into a wide table by promoting row values to columns. Specify which columns remain as identifiers (-i/--index), which column provides the new headers (-c/--column), and which supplies cell values (-v/--value). 
Use --fill for missing combinations.\n\nExample:\n tsvkit melt -i id examples/profiles.tsv | tsvkit pivot -i id -c variable -v value", + after_help = "Pivot recipe: + -i/--index row identity columns (can be multiple) + -c/--column values in this column become new output headers + -v/--value values in this column fill pivoted cells + +Examples: + tsvkit pivot -i sample_id -c analyte -v signal long.tsv + tsvkit pivot -i 'subject,visit' -c metric -v value --fill 0 long.tsv + tsvkit pivot -H -i 1 -c 2 -v 3 raw_long.tsv + +Round-trip with melt: + tsvkit melt -i id wide.tsv | tsvkit pivot -i id -c variable -v value" )] pub struct PivotArgs { /// Input TSV file (use '-' for stdin; gz/xz supported) diff --git a/src/pretty.rs b/src/pretty.rs index 914c6c6..826607c 100644 --- a/src/pretty.rs +++ b/src/pretty.rs @@ -10,10 +10,19 @@ use crate::common::{InputOptions, inconsistent_width_error, reader_for_path, sho #[derive(Args, Debug)] #[command( about = "Render TSV data in a pretty table", - long_about = "Format TSV rows into an aligned, boxed table for quick inspection. Reads from files or stdin, keeps headers by default, and supports width control.\n\nExample:\n tsvkit pretty examples/profiles.tsv" + long_about = "Format TSV rows into an aligned, boxed table for quick inspection in the terminal. Reads from files or stdin, including `.tsv`, `.tsv.gz`, and `.tsv.xz` inputs, and supports table width limits to avoid wrapping explosions on very wide datasets.", + after_help = "Examples: + tsvkit pretty examples/profiles.tsv + tsvkit cut -f 'sample_id,group,purity' examples/samples.tsv | tsvkit pretty + tsvkit filter -e '$score > 80' scores.tsv | tsvkit head -n 20 | tsvkit pretty + +Tips: + Use -n to preview only the first rows for huge files. + Use -w to cap rendered width (truncates long cells visually). + Great for interactive exploration before exporting raw TSV downstream." 
)] pub struct PrettyArgs { - /// Input TSV file (use '-' for stdin) + /// Input TSV file (use '-' for stdin; `.tsv`, `.tsv.gz`, `.tsv.xz` supported) #[arg(value_name = "FILE", default_value = "-")] pub file: PathBuf, diff --git a/src/slice.rs b/src/slice.rs index a019747..3818eb0 100644 --- a/src/slice.rs +++ b/src/slice.rs @@ -13,7 +13,20 @@ use crate::common::{InputOptions, reader_for_path, should_skip_record}; Examples: tsvkit slice -r 1,10:20 examples/profiles.tsv - tsvkit slice -H -r 5:10 raw.tsv"# + tsvkit slice -H -r 5:10 raw.tsv"#, + after_help = "Row selector patterns: + 7 -> single row + 10:20 -> inclusive range + :50 -> from first row through 50 + 100: -> from row 100 to end + -1 -> last row + -10: -> last 10 rows + 1,5,9:12 -> mixed selectors + +Tips: + Header mode (default): header is printed once before selected rows. + No-header mode (-H): emits only selected data rows. + From-end selectors require buffering to know total row count." )] pub struct SliceArgs { /// Input TSV file (use '-' for stdin; gz/xz supported) diff --git a/src/sort.rs b/src/sort.rs index aa88263..9ea2b18 100644 --- a/src/sort.rs +++ b/src/sort.rs @@ -14,7 +14,21 @@ use crate::common::{ #[derive(Args, Debug)] #[command( about = "Sort TSV rows by column keys", - long_about = "Sort TSV rows by one or more keys. Provide -k/--key with column selectors (names or 1-based indices) and optional modifiers: :n (numeric asc), :nr (numeric desc), :r (reverse text). Repeat -k for additional sort levels. Defaults to header-aware mode; add -H for headerless files.\n\nExamples:\n tsvkit sort -k count:nr examples/abundance.tsv\n tsvkit sort -k $1:nr -k $2:r examples/profiles.tsv" + long_about = "Sort TSV rows by one or more keys. Provide -k/--key with column selectors (names or 1-based indices) and optional modifiers: :n (numeric asc), :nr (numeric desc), :r (reverse text). Repeat -k for additional sort levels. 
Defaults to header-aware mode; add -H for headerless files.\n\nExamples:\n tsvkit sort -k count:nr examples/abundance.tsv\n tsvkit sort -k $1:nr -k $2:r examples/profiles.tsv", + after_help = "Key spec quick reference: + -k col text ascending + -k col:r text descending + -k col:n numeric ascending + -k col:nr numeric descending + +Multi-key examples: + tsvkit sort -k group -k score:nr data.tsv + tsvkit sort -k date -k sample_id data.tsv + tsvkit sort -H -k 3:n -k 1 raw.tsv + +Tips: + Stable sort (default) preserves input order for equal keys. + Use --unstable for speed when equal-key order does not matter." )] pub struct SortArgs { /// Input TSV file (use '-' for stdin; gz/xz supported) diff --git a/src/summarize.rs b/src/summarize.rs index 022b141..bc95dd3 100644 --- a/src/summarize.rs +++ b/src/summarize.rs @@ -17,7 +17,24 @@ use crate::common::{ #[derive(Args, Debug)] #[command( about = "Grouped statistics over TSV columns", - long_about = "Group rows with -g/--group and compute statistics for selected columns via -s/--stat. Each --stat accepts COLUMN=ops, where COLUMN can be names, indices, ranges, or mixes, and ops include sum, mean, median, quantiles (q1, q50, q0.9), var, sd, mode, distinct, and more. Headers are used by default; add -H for headerless input.\n\nExamples:\n tsvkit summarize -s 'sample1:sample3=mean' examples/profiles.tsv\n tsvkit summarize -g group -s 'sample1=mean,sd' -s 'sample2:sample3=sum' examples/profiles.tsv\n tsvkit summarize -s 'sample1=q1,q3,var' examples/profiles.tsv" + long_about = "Group rows with -g/--group and compute statistics for selected columns via -s/--stat. Each --stat accepts COLUMN=ops, where COLUMN can be names, indices, ranges, or mixes, and ops include sum, mean, median, quantiles (q1, q50, q0.9), var, sd, mode, distinct, and more. 
Headers are used by default; add -H for headerless input.\n\nExamples:\n tsvkit summarize -s 'sample1:sample3=mean' examples/profiles.tsv\n tsvkit summarize -g group -s 'sample1=mean,sd' -s 'sample2:sample3=sum' examples/profiles.tsv\n tsvkit summarize -s 'sample1=q1,q3,var' examples/profiles.tsv", + after_help = "Stats guide: + Basic numeric ops: sum, mean, median, min, max, sd, var + Count-like ops: count, distinct(countunique), unique, collapse + Robust/advanced: trimmean, iqr, mode, antimode, entropy, argmin, argmax + Quantiles: q1, q3, q50, q0.9, p95, etc. + +Examples by use case: + One-row table summary: + tsvkit summarize -s 'value=sum,mean,sd' data.tsv + Per-group KPIs: + tsvkit summarize -g cohort -s 'score=mean,sd,q1,q3' data.tsv + Multiple targets with regex: + tsvkit summarize -s '~\"^IL\"=mean,max' cytokines.tsv + +Tips: + Repeat -s for readability on complex summaries. + Use -D when regex/name selectors intentionally match duplicate headers." )] pub struct SummarizeArgs { /// Input TSV file (use '-' for stdin; compressed files are detected automatically) diff --git a/src/transpose.rs b/src/transpose.rs new file mode 100644 index 0000000..f182eba --- /dev/null +++ b/src/transpose.rs @@ -0,0 +1,103 @@ +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use clap::Args; + +use crate::common::{InputOptions, inconsistent_width_error, reader_for_path, should_skip_record}; + +#[derive(Args, Debug)] +#[command( + about = "Transpose rows and columns", + long_about = "Swap table axes so rows become columns and columns become rows. This is handy when samples are rows but downstream tooling expects samples as columns (or vice versa). 
By default, headers are included as the first row before transposition.", + after_help = "Examples: + tsvkit transpose examples/profiles.tsv + tsvkit transpose -H matrix_no_header.tsv + tsvkit cut -f 'sample1:sample3' examples/profiles.tsv | tsvkit transpose + +Behavior notes: + Header mode (default): header participates in transpose as row 1. + No-header mode (-H): all rows are treated as data only. + Ragged rows are rejected unless -I/--ignore-illegal-row is enabled." +)] +pub struct TransposeArgs { + /// Input TSV file (use '-' for stdin; gz/xz supported) + #[arg(value_name = "FILE", default_value = "-")] + pub file: PathBuf, + + /// Treat input as headerless + #[arg(short = 'H', long = "no-header")] + pub no_header: bool, + + /// Lines starting with this comment character are skipped + #[arg(short = 'C', long = "comment-char", default_value = "#")] + pub comment_char: String, + + /// Ignore rows where every field is empty/whitespace + #[arg(short = 'E', long = "ignore-empty-row")] + pub ignore_empty_row: bool, + + /// Ignore rows whose column count differs from the header/first row + #[arg(short = 'I', long = "ignore-illegal-row")] + pub ignore_illegal_row: bool, +} + +pub fn run(args: TransposeArgs) -> Result<()> { + let input_opts = InputOptions::from_flags( + &args.comment_char, + args.ignore_empty_row, + args.ignore_illegal_row, + )?; + let mut reader = reader_for_path(&args.file, args.no_header, &input_opts)?; + let source_name = format!("\"{}\"", args.file.display()); + + let mut matrix = Vec::<Vec<String>>::new(); + if !args.no_header { + let header = reader + .headers() + .with_context(|| format!("failed reading header from {}", source_name))?
+ .iter() + .map(|s| s.to_string()) + .collect::<Vec<String>>(); + matrix.push(header); + } + + let mut expected_width = matrix.first().map(|r| r.len()); + let mut row_number = 0usize; + for record in reader.records() { + let record = record.with_context(|| format!("failed reading from {}", source_name))?; + row_number += 1; + if should_skip_record(&record, &input_opts, expected_width) { + continue; + } + if let Some(width) = expected_width { + if record.len() != width { + if input_opts.ignore_illegal { + continue; + } + return Err(inconsistent_width_error( + &source_name, + row_number, + width, + record.len(), + )); + } + } else { + expected_width = Some(record.len()); + } + matrix.push(record.iter().map(|s| s.to_string()).collect()); + } + + let cols = matrix.iter().map(|r| r.len()).max().unwrap_or(0); + let rows = matrix.len(); + let mut writer = BufWriter::new(io::stdout().lock()); + for c in 0..cols { + let mut out = Vec::with_capacity(rows); + for r in 0..rows { + out.push(matrix[r].get(c).cloned().unwrap_or_default()); + } + writeln!(writer, "{}", out.join("\t"))?; + } + writer.flush()?; + Ok(()) +}