diff --git a/Cargo.lock b/Cargo.lock index ca20a1e..fb1215b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,7 +603,7 @@ dependencies = [ [[package]] name = "tsvkit" -version = "0.8.7" +version = "0.9.0" dependencies = [ "anyhow", "calamine", diff --git a/Cargo.toml b/Cargo.toml index f6a008b..ad869c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tsvkit" -version = "0.8.8" +version = "0.9.2" edition = "2024" [dependencies] diff --git a/README.md b/README.md index e43a4e3..7bde0bf 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,57 @@ # tsvkit -`tsvkit` is a fast, ergonomic toolkit for working with TSV tables. Written in Rust, it brings familiar data-wrangling verbs (join, cut, filter, mutate, summarize, reshape, slice, pretty-print) to the command line with consistent column selection, rich expressions, and streaming-friendly performance. `tsvkit` is inspired by tools such as `csvtk`, `csvkit`, `datamash`, `awk`, `xsv`, and `mlr`. Many of its options are designed to be compatible with `csvtk` (https://github.com/shenwei356/csvtk), making it easier for existing users to adopt. - -Compared with existing tools, `tsvkit` offers a more powerful and flexible way to select rows and columns. It combines versatile in- and output column selectors with an expression engine for statistics, filtering, and data transformation. This makes it possible, for example, to generate a gene expression matrix from `samtools idxstats` or `featureCounts` outputs of multiple samples in a single command: - -`tsvkit join -f -F `, - -to compute different summary statistics by groups across multiple columns, each with one or more stats functions: - -`tsvkit summarize -g group -s 'purity=mean,sd' -s 'dna_ug:contamination_pct=max,q3' examples/samples.tsv`. - -In addition, it natively supports previewing and processing multi-sheet Excel files, making it easier to work with complex datasets across both TSV and spreadsheet formats. 
- -All commands accept input from files or `-` for stdin and transparently read `.tsv`, `.tsv.gz`, and `.tsv.xz` archives. +`tsvkit` is a fast, ergonomic toolkit for working with TSV tables. Written in Rust, it brings familiar data-wrangling verbs (join, cut, filter, mutate, summarize, reshape, slice, pretty-print) to the command line with consistent column selection, rich expressions, and streaming-friendly performance. The CLI is inspired by projects such as `csvtk`, `csvkit`, `datamash`, `awk`, `xsv`, and `mlr`, and many options are intentionally compatible with `csvtk` so existing users can transition quickly. + +## Table of Contents +- [Overview](#overview) + - [Key features](#key-features) +- [Installation](#installation) +- [Sample data](#sample-data) +- [Quick start pipeline](#quick-start-pipeline) +- [Command overview](#command-overview) +- [Core concepts](#core-concepts) + - [Column selectors](#column-selectors) + - [Streaming and file handling](#streaming-and-file-handling) + - [Expression language essentials](#expression-language-essentials) +- [Command reference](#command-reference) + - [`info`](#info) + - [`cut`](#cut) + - [`filter`](#filter) + - [`join`](#join) + - [`mutate`](#mutate) + - [`summarize`](#summarize) + - [`sort`](#sort) + - [`melt`](#melt) + - [`pivot`](#pivot) + - [`slice`](#slice) + - [`pretty`](#pretty) + - [`excel`](#excel) + - [`csv`](#csv) +- [Additional tips](#additional-tips) + +## Overview +`tsvkit` combines versatile column selection with an expression engine for statistics, filtering, and data transformation. This makes it straightforward to generate matrices from `samtools idxstats` or `featureCounts`, compute multi-column summaries, or pipe TSV/Excel data through complex workflows without leaving the shell. Multi-sheet Excel workbooks are supported alongside `.tsv`, `.tsv.gz`, and `.tsv.xz` files. 
+ +### Key features +- Stream-friendly processing; every command reads from files or standard input and writes to standard output. +- Column selectors that accept names, 1-based indices, ranges, and multi-file specifications. +- Expression language with arithmetic, comparisons, logical operators, regex matching, and numeric helper functions. +- Aggregations for grouped summaries (`summarize`) and row-wise calculations (`mutate`). +- Excel tooling to inspect, preview, export, and assemble `.xlsx` workbooks. ## Installation - ```bash cargo build --release # binary available at target/release/tsvkit ``` - Run any command with `--help` to see detailed usage and concrete examples: - ```bash tsvkit --help tsvkit join --help ``` -## Sample Data - -The repository ships curated example tables under `examples/` that are used throughout this guide. +## Sample data +Curated example tables live under `examples/` and power the walkthroughs below. | File | Description | | ---- | ----------- | @@ -45,12 +66,7 @@ The repository ships curated example tables under `examples/` that are used thro | `subjects.tsv` | Subject demographics linked to samples via `subject_id`. | | `bioinfo_example.xlsx` | Two-sheet workbook (`Samples`, `Cytokines`) built from the TSVs above for the Excel tooling. | - - -You can download the repository, inspect the files directly, or adapt them to your own pipelines. - -## Quick Start Pipeline - +## Quick start pipeline Join sample and subject metadata, derive a total cytokine score, filter high-purity case samples, and pretty-print the result: ```bash @@ -65,7 +81,7 @@ cat examples/cytokines.tsv \ > Tip: wrap every `-e` expression in single quotes so your shell keeps `$column` selectors intact. Inside an expression, always prefix column references with `$` (e.g. `$total`, `$1`). 
_Output_ -``` +```text +-----------+-----+-----+------+------+-----------+------------+-------+-----------+--------+ | sample_id | IL6 | TNF | IFNG | IL10 | log_total | subject_id | group | timepoint | purity | +-----------+-----+-----+------+------+-----------+------------+-------+-----------+--------+ @@ -73,12 +89,10 @@ _Output_ | S03 | 4.9 | 3.6 | 7.4 | 2.6 | 4.209453 | P001 | case | week4 | 0.96 | +-----------+-----+-----+------+------+-----------+------------+-------+-----------+--------+ ``` - The same pipeline works if the cytokine table is compressed (`examples/cytokines.tsv.gz`). -## Command Overview - -The table below lists every `tsvkit` subcommand and a one-line purpose summary; each item links to the detailed section later in this guide. +## Command overview +The list below provides a one-line description of every `tsvkit` subcommand. Each item links to the detailed section later in this guide. - [`info`](#info) — inspect table shape, inferred column types, and sample values. - [`cut`](#cut) — select or reorder columns via names, indices, or ranges. @@ -94,60 +108,74 @@ The table below lists every `tsvkit` subcommand and a one-line purpose summary; - [`excel`](#excel) — inspect, preview, export, or build `.xlsx` workbooks. - [`csv`](#csv) — convert delimited text to TSV with custom separators. -The following sections cover shared concepts first, then dive into each command with practical examples. - -## Core Concepts - +## Core concepts These conventions appear across the toolkit; understanding them once makes each subcommand predictable. ### Column selectors +Selectors are reused in `cut`, `filter`, `join`, `mutate`, `summarize`, and others. 
-- **Names**: `sample_id,purity` -- **1-based indices**: `1,4,9` -- **Ranges**: `IL6:IL10`, `2:5`, open-ended forms like `:IL10` (from the first column) or `IL6:` (through the last column) -- **Whole-table**: `:` selects every column in order -- **Mixed lists**: `sample_id,3:5,tech` -- **Clustered sequences**: combine selectors such as `sample_id,quality:` or `:purity,tech` -- **Multi-file specs**: separate selectors for each input with semicolons, e.g. `sample_id;subject_id` for `join -f`. - -Anywhere you access column *values* inside an expression, prefix the selector with `$` (`$purity`, `$1`, `$IL6:$IL10`). - -### Expression language - -- Quote each `-e` argument with single quotes so the shell leaves `$column` references untouched. -- Use double quotes inside expressions for string literals (`"case"`). -- Arithmetic operators: `+ - * /`; comparisons: `== != < <= > >=`. -- Logical combinators: `&` and `|` (or `and`/`or`); negation: `!`/`not`. -- Regex operators: `~` (match) and `!~` (does not match). Patterns follow Rust's `regex` crate syntax. -- Numeric helpers: `abs`, `sqrt`, `exp`, `exp2`, `ln`, `log`/`log10`, `log2`. -- String helpers in `mutate`: `sub(column, pattern, replacement)` and the sed-inspired `s/cols/pattern/replacement/` syntax. +| Pattern | Meaning | Example | +| ------- | ------- | ------- | +| `name` | Column by header name. | `sample_id,purity` | +| `index` | 1-based column index. | `1,4,9` | +| `-index` | Column counted from the end (1 = last). | `-1,-2` | +| `start:end` | Inclusive range by name or index. Supports open ends. | `IL6:IL10`, `2:5`, `:IL10`, `IL6:` | +| `:` | Select every column in order. | `-f ':'` | +| `mixed` | Combine names, indices, and ranges. | `sample_id,3:5,tech` | +| `multi-file` | Separate selectors for each input with semicolons (primarily `join`). | `sample_id;subject_id` | +| `range in expressions` | Prefixed with `$` to access a slice of values. 
| `$IL6:$IL10` | -### Aggregations +> Wrap selectors in backticks or braces to treat punctuation literally. For example, ``-f '`IL6:IL10`,`total,reads`'`` or `-f '{IL6:IL10},{total,reads}'` selects columns named `IL6:IL10` and `total,reads` instead of expanding a range or splitting on the comma. -Aggregators support descriptive statistics in `summarize` and row-wise calculations in `mutate`: +Negative indices are also valid inside ranges: `:-2` selects every column except the final two, while `-3:` keeps the last three columns. -- Totals and averages: `sum`, `mean`/`avg` -- Spread: `sd`/`std` -- Medians: `median`/`med` -- Quantiles: `q1`, `q2`, `q3`, `q0.25`, `p95`, etc. (`q` = fraction, `p` = percentile) - -### CLI conventions +Anywhere you access column *values* inside an expression, prefix the selector with `$` (`$purity`, `$1`, `$IL6:$IL10`). +### Streaming and file handling - Every command accepts files or `-` (stdin) and auto-detects `.tsv`, `.tsv.gz`, and `.tsv.xz` inputs. - Add `-H/--no-header` when your data lacks a header row; selectors fall back to 1-based indices. -- Commands are stream-friendly—pipe them freely to build larger workflows. - Use `-C/--comment-char`, `-E/--ignore-empty-row`, and `-I/--ignore-illegal-row` on any subcommand to control how input lines are filtered before processing. - `tsvkit join` parallelizes input loading; control the worker count with `-t/--threads` (defaults to the lesser of 8 and the available CPU cores). - `--fill TEXT` lets join, melt, pivot (and others) swap empty cells for a custom placeholder. ---- +### Expression language essentials +The same expression language powers `filter -e`, `mutate -e name=EXPR`, and regex substitutions. Wrap expressions in single quotes to protect `$columns` from the shell. + +**Operators and comparisons** + +| Symbol / keyword | Description | Works on | +| ---------------- | ----------- | -------- | +| `+ - * / ^` | Arithmetic operators (`^` is exponentiation, right-associative). 
| Numbers | +| `== != < <= > >=` | Comparisons. | Numbers or strings | +| `&` / `and` | Logical AND. | Booleans | +| `|` / `or` | Logical OR. | Booleans | +| `!` / `not` | Logical negation. | Booleans | +| `~` | Regex match. Right-hand side can be literal text or a `$range`. | Strings | +| `!~` | Regex does *not* match. | Strings | -## Command Reference +> Reference columns whose names contain operators or punctuation with `${column-name}` inside expressions (e.g. `${dna-} - $rna_ug`). This prevents the parser from treating the characters as arithmetic. +**Numeric helper functions** + +| Function | Description | +| -------- | ----------- | +| `abs(expr)` | Absolute value | +| `sqrt(expr)` | Square root | +| `exp(expr)` / `exp2(expr)` | Exponential (`e^x`) / base-2 exponential | +| `ln(expr)` | Natural logarithm | +| `log(expr)` / `log10(expr)` | Base-10 logarithm | +| `log2(expr)` | Base-2 logarithm | + +Functions accept column references (`abs($purity - 1)`), constants, or subexpressions. Empty or non-numeric values yield blanks. + +**Row-wise aggregation helpers** + +Available within `mutate` expressions via functions such as `sum($col1:$col5)`; see the [Mutate](#mutate) section for the full list. + +## Command reference Each subsection highlights the core options, shows realistic invocations, and calls out relevant selectors or expressions. ### `info` - Get a quick, structured summary of any TSV: the overall shape plus one row per column with inferred types and sample values. The preview column defaults to the first three rows, but you can raise or lower it with `-n` (e.g. `-n 5`). Combine with `-H` when the input has no header row so the summary omits the name column. ```bash @@ -155,7 +183,7 @@ tsvkit info examples/samples.tsv ``` _Output_ -``` +```text #shape(6, 9) index name type first3 1 sample_id str [S01, S02, S03] @@ -170,7 +198,6 @@ index name type first3 ``` ### `cut` - Select or reorder columns by name, index, or range. 
```bash @@ -184,24 +211,36 @@ tsvkit cut -f 'sample_id,IL6:IL10' examples/cytokines.tsv ``` ### `filter` - -Filter rows with boolean logic, arithmetic, and regexes. +Filter rows with boolean logic, arithmetic, column ranges, and regexes. ```bash tsvkit filter -e '$group == "case" & $purity >= 0.94' examples/samples.tsv ``` -Regex operators `~` / `!~` make pattern filters concise and now work across column ranges: - -```bash -tsvkit filter -e '$tech ~ "sRNA"' examples/samples.tsv -tsvkit filter -e '$gene:$notes ~ "kinase"' data.tsv # match either column -tsvkit filter -e '$expr:, $status ~ "fail"' qc.tsv # mixed open range + column list -tsvkit filter -e '~ "control"' results.tsv # search all columns -``` +**Expression building blocks for `filter`** + +| Building block | Examples | Notes | +| -------------- | -------- | ----- | +| Column values | `$purity`, `$1`, `$IL6:$IL10` | Ranges produce a list; use in regex matches or aggregate helpers. | +| Literals | `1.25`, `"case"` | Strings use double quotes; escape inner quotes with `\"`. | +| Arithmetic | `($rna_ug - $dna_ug) / $rna_ug` | Standard precedence applies (parentheses for clarity). | +| Comparisons | `$purity >= 0.9`, `$group != "control"` | Works on numeric or string data. | +| Logical | `($purity >= 0.9) & ($group == "case")` | `&`, `|`, and `!` (or `and`, `or`, `not`). | +| Numeric functions | `log2($total)`, `sqrt($reads)` | See [Expression language essentials](#expression-language-essentials). | +| Row-wise aggregators | `sum($dna_ug:$rna_ug)`, `mode($1,$3)`, `countunique($gene:)` | Same catalog as [`summarize`](#summarize): totals, quantiles (`q*` / `p*`), variance/SD, products, entropy, argmin/argmax, membership stats. Works with ranges, lists, and open selectors. | +| Regex match | `$tech ~ "sRNA"`, `$notes !~ "(?i)fail"` | Patterns follow Rust `regex` syntax. `(?i)` enables case-insensitive matching. 
| +| Regex across ranges | `$gene:$notes ~ "kinase"`, `~ "control"` | When the left-hand side is omitted, `~` scans all columns. | + +**Regex usage at a glance** + +| Pattern | Description | +| ------- | ----------- | +| `$col ~ "^ABC"` | Keep rows where the column starts with `ABC`. | +| `$col !~ "xyz$"` | Exclude rows where the column ends with `xyz`. | +| `$A:$C ~ "kinase"` | Match if *any* column in the range contains `kinase`. | +| `~ "(?i)na"` | Match if any column (entire row) contains `na`, case-insensitive. | ### `join` - Merge tables on shared keys. Provide selectors with `-f/--fields`; when all inputs use the same key you can specify it once. ```bash @@ -211,7 +250,6 @@ tsvkit join -f subject_id examples/samples.tsv examples/subjects.tsv Control join type with `-k` (`-k 0` = full outer). Use `-F/--select` to specify output columns (defaults to all non-key columns); syntax mirrors `-f`. `--fill TEXT` supplies placeholders for missing combinations, while `--sorted` streams pre-sorted data. `tsvkit join` trims unused columns before indexing, and `-t/--threads` (default up to 8) balances throughput and resource usage. ### `mutate` - Create derived columns or rewrite values using expressions. ```bash @@ -228,9 +266,29 @@ Apply in-place edits with the sed-style form: tsvkit mutate -e 's/$group/ctrl/control/' examples/samples.tsv ``` -### `summarize` +**Mutation building blocks** + +| Form | Meaning | Example | +| ---- | ------- | ------- | +| `name=EXPR` | Append a new column containing the evaluated expression. | `mean_signal=mean($sig1:$sig4)` | +| `existing=EXPR` | Overwrite an existing column with the expression result. | `purity=round($purity,2)` (via custom helper script) | +| `s/$selectors/pattern/replacement/` | Regex substitution on one or more columns (`$` optional). 
| `s/$group/ctrl/control/` | + +**Row-wise aggregators shared by `filter` and `mutate`** + +| Category | Functions (aliases) | Description | +| -------- | ------------------- | ----------- | +| Totals & centers | `sum`, `mean`/`avg`, `median`/`med`, `trimmean`, `iqr` | Numeric summaries that ignore blanks; `iqr` computes `q3 - q1`. | +| Dispersion | `sd`/`std`/`stddev`, `var`/`variance`, `entropy` | Spread metrics and Shannon entropy of the value distribution. | +| Extremes & positions | `min`, `max`, `absmin`, `absmax`, `argmin`, `argmax` | `arg*` return the 1-based position among numeric entries. | +| Membership & counts | `count`, `first`, `last`, `rand`/`random`, `unique`, `collapse`, `countunique`/`distinct`, `mode`, `antimode` | Operate on the original strings (including duplicates and blanks). | +| Products | `prod`/`product` | Multiply all numeric inputs (skips blanks and NaNs). | +| Quantiles | `q*` (`q1`, `q0.9`, `q_0_25`), `p*` (`p95`, `p99.5`) | Fractions `0–1` and percents `0–100`; underscores may replace dots. | -Group rows and compute descriptive statistics. +Aggregators accept any range, list, or open-ended selector (`sum($1,$3:)`). Non-numeric cells are skipped for numeric summaries. Results are appended as new columns unless you assign them back to an existing name. + +### `summarize` +Group rows and compute descriptive statistics. Without `-g/--group`, the entire table is treated as a single group. ```bash tsvkit summarize \ @@ -240,8 +298,50 @@ tsvkit summarize \ examples/samples.tsv ``` -### `sort` +**Aggregators supported by `summarize`** + +_Counts & membership_ + +| Aggregator (aliases) | Description | Output type | +| -------------------- | ----------- | ----------- | +| `count` | Number of rows in the group (ignores blanks). | Numeric | +| `first` | First non-empty value encountered. | Original type | +| `last` | Last non-empty value encountered. | Original type | +| `rand` / `random` | Random value from the group. 
| Original type | +| `unique` | Comma-separated list of distinct values in encounter order. | String | +| `collapse` | Concatenate every value (comma-separated, includes duplicates). | String | +| `countunique` / `distinct` | Count of distinct values. | Numeric | +| `mode` | Most frequent value (ties resolved by first occurrence). | Original type | +| `antimode` | Least frequent value (ties resolved by first occurrence). | Original type | +| `entropy` | Shannon entropy calculated from value frequencies. | Numeric | + +_Numeric summaries_ + +| Aggregator (aliases) | Description | +| -------------------- | ----------- | +| `sum` | Sum of numeric values. | +| `mean` / `avg` | Arithmetic mean. | +| `median` / `med` | Median (50th percentile). | +| `trimmean` | Mean of values after trimming 25% from each tail. | +| `iqr` | Interquartile range (`q3 - q1`). | +| `sd` / `std` / `stddev` | Sample standard deviation. | +| `var` / `variance` | Sample variance. | +| `min` / `max` | Minimum / maximum value. | +| `absmin` / `absmax` | Value with the smallest / largest absolute magnitude (returned with original sign). | +| `prod` / `product` | Product of numeric values. | +| `argmin` / `argmax` | 1-based row index within the group where the min/max occurs. | + +_Quantiles_ + +| Pattern | Description | +| ------- | ----------- | +| `q1`, `q2`, `q3`, `q4` | Quartiles (`q2` equals the median). | +| `q0`, `q0.25`, `q0.75`, `q0.9` | Fractional quantiles between 0 and 1 (underscores allowed instead of dots). | +| `p0`, `p25`, `p95`, `p99.5` | Percentiles between 0 and 100. | + +Quantile aggregators accept any `q*` (fraction) or `p*` (percent) token. Values may include decimals (`q0.05`, `p99.5`) or integers. Non-numeric cells are ignored for numeric summaries and quantiles. `absmin`, `absmax`, `mode`, `antimode`, and `entropy` inspect the original string values, so they work even without numeric conversion. +### `sort` Sort rows by one or more keys. 
Modifiers: `:n` (numeric), `:nr` (numeric descending), `:r` (reverse text). ```bash @@ -249,7 +349,6 @@ tsvkit sort -k purity:nr -k contamination_pct examples/samples.tsv ``` ### `melt` - Convert wide tables into tidy long form. Add `--fill TEXT` to substitute blanks with a chosen value. ```bash @@ -257,7 +356,6 @@ tsvkit melt -i sample_id -v IL6:IL10 examples/cytokines.tsv ``` ### `pivot` - Promote long-form values to columns. `-c/--column` also accepts the short alias `-f`, and `--fill TEXT` sets a default value for missing combinations. ```bash @@ -265,59 +363,53 @@ tsvkit pivot -i gene -c sample_id -v expression examples/expression.tsv ``` ### `slice` - -Take specific rows (1-based indices or ranges, including open-ended forms like `:10`, `10:`, or even `:` for everything). +Take specific rows (1-based indices or ranges, including open-ended forms like `:10`, `10:`, or even `:` for everything). Negative indices count from the end, so `:-2` emits every row except the final two and `-3:` keeps the last three rows. ```bash tsvkit slice -r 1,4:5 examples/samples.tsv ``` ### `pretty` - Render aligned, boxed output for quick inspection. ```bash tsvkit filter -e '$group == "case"' examples/samples.tsv | tsvkit pretty ``` -### `excel` +- `--round DIGITS` (or `-r`) rounds numeric cells to the requested precision. Tiny magnitudes automatically switch to scientific + notation so columns stay legible even when values approach zero. +### `excel` Inspect `.xlsx` workbooks, preview sheets, export ranges as TSV, or assemble new workbooks from TSV inputs. Unless `-H/--no-header` is supplied, the first row of each sheet is treated as the header row; use that flag when you need to preview or export raw rows. 
- **List sheets** (`--sheets`) with row/column counts and inferred column types: - ```bash tsvkit excel --sheets examples/bioinfo_example.xlsx ``` - _Output_ - ``` - 1 Samples rows=21 cols=4 types=[string,string,mixed,mixed] - 2 Variants rows=21 cols=4 types=[string,mixed,mixed,mixed] - 3 Expression rows=21 cols=3 types=[string,mixed,mixed] - 4 Pathways rows=11 cols=3 types=[string,string,mixed] - 5 QC rows=21 cols=4 types=[string,mixed,mixed,mixed] - 6 ClinMetadata rows=21 cols=4 types=[string,mixed,string,string] - 7 Taxonomy rows=21 cols=4 types=[string,mixed,mixed,mixed] - 8 Coverage rows=11 cols=2 types=[string,mixed] - 9 Proteomics rows=21 cols=3 types=[string,mixed,mixed] - 10 Metabolites rows=21 cols=2 types=[string,mixed] + ```text + 1 Samples rows=21 cols=4 types=[string,string,mixed,mixed] + 2 Variants rows=21 cols=4 types=[string,mixed,mixed,mixed] + 3 Expression rows=21 cols=3 types=[string,mixed,mixed] + 4 Pathways rows=11 cols=3 types=[string,string,mixed] + 5 QC rows=21 cols=4 types=[string,mixed,mixed,mixed] + 6 ClinMetadata rows=21 cols=4 types=[string,mixed,string,string] + 7 Taxonomy rows=21 cols=4 types=[string,mixed,mixed,mixed] + 8 Coverage rows=11 cols=2 types=[string,mixed] + 9 Proteomics rows=21 cols=3 types=[string,mixed,mixed] + 10 Metabolites rows=21 cols=2 types=[string,mixed] ``` - **Preview** (`--preview`) the first rows of every sheet (header + N rows by default). Use `-s` to focus on one sheet, `-n` to change the window, `--formulas` to show Excel formulas instead of values, `--dates raw|excel|iso` to control date rendering (`iso` is the default), and `--pretty` to render the preview with aligned borders. 
Add `-H/--no-header` when the sheet lacks a header row so the preview shows raw rows only: - ```bash tsvkit excel --preview reports.xlsx -n 5 -s Summary tsvkit excel --preview reports.xlsx --formulas --dates raw tsvkit excel --preview reports.xlsx --pretty - ``` - - ```bash tsvkit excel --preview examples/bioinfo_example.xlsx -n 3 --pretty ``` - _Output_ (Only the first three sheets are shown below, the actual output has ten) - ``` + _Output_ (Only the first three sheets are shown below; the actual output has ten.) + ```text #1 Samples +----------+-------+--------+--------+ | SampleID | Group | Purity | DNA_ug | @@ -326,7 +418,7 @@ Inspect `.xlsx` workbooks, preview sheets, export ranges as TSV, or assemble new | S002 | Case | 0.837 | 8.59 | | S003 | Case | 0.703 | 1.15 | +----------+-------+--------+--------+ - + #2 Variants +----------+------+--------+------+ | SampleID | SNPs | Indels | CNVs | @@ -335,7 +427,7 @@ Inspect `.xlsx` workbooks, preview sheets, export ranges as TSV, or assemble new | S002 | 3191 | 241 | 19 | | S003 | 2463 | 210 | 8 | +----------+------+--------+------+ - + #3 Expression +-------+-----------+--------------+ | Gene | Expr_Case | Expr_Control | @@ -346,16 +438,13 @@ Inspect `.xlsx` workbooks, preview sheets, export ranges as TSV, or assemble new +-------+-----------+--------------+ ... ``` - - **Dump** (`--dump`) a sheet (or subset) to TSV. Columns accept names, indices, or Excel letters/ranges (e.g. `A:C,Expr`, `:C`, `C:`). Rows accept 1-based indices or inclusive ranges (`1,10:20,:25,100:`). `--na` replaces blanks, `--escape-*` makes TSV-safe output, and the same `--values/--formulas` + `--dates` controls apply. 
Use `-H/--no-header` when the sheet lacks a header row so column names fall back to indices: - ```bash tsvkit excel --dump examples/bioinfo_example.xlsx -s Samples -f 'SampleID,Group,Purity' -r 2:3 | tsvkit pretty ``` - _Output_ - ``` + ```text +----------+-------+--------+ | SampleID | Group | Purity | +----------+-------+--------+ @@ -365,16 +454,13 @@ Inspect `.xlsx` workbooks, preview sheets, export ranges as TSV, or assemble new ``` - **Load** (`--load`) one or more TSV files into a new workbook. Each `TSV` can be followed by `-s SHEETNAME`. Use `-H` when the TSV lacks headers, `--fields` to supply header names in that case, `--types infer|string` to control numeric inference, `--dates excel|iso|raw` to influence how date strings are written, `--na` to treat specific tokens as blanks, and `--max-rows-per-sheet` to split very tall sheets (defaults to Excel's 1,048,576 rows): - ```bash tsvkit excel --load examples/samples.tsv -s Samples --load examples/cytokines.tsv -s Cytokines -o examples/testout.xlsx ``` -Only `.xlsx` files are supported at the moment. Sheets created via `--load` are renamed `Name (2)`, `Name (3)`, … when row limits force splits. -When `-s` is omitted the sheet falls back to its 1-based position (`1`, `2`, …) in the load order, so a mixture of named and unnamed inputs still yields deterministic sheet names. +Only `.xlsx` files are supported at the moment. Sheets created via `--load` are renamed `Name (2)`, `Name (3)`, … when row limits force splits. When `-s` is omitted the sheet falls back to its 1-based position (`1`, `2`, …) in the load order, so a mixture of named and unnamed inputs still yields deterministic sheet names. ### `csv` - Convert delimited text into TSV. Use `--delim` to specify the input delimiter (default `,`) and `-H` when the source has no header row. The converter also mirrors common TSV-reader switches: - `-C/--comment-char` skips comment lines. 
@@ -390,12 +476,11 @@ tsvkit csv examples/data.csv > examples/data.tsv tsvkit csv examples/semicolon.csv --delim ';' -H > tmp.tsv ``` -## Additional Tips - +## Additional tips - `tsvkit` automatically detects `.tsv`, `.tsv.gz`, and `.tsv.xz`. Pipe from `curl`/`zcat` for other formats. - Numeric functions treat empty cells as missing; regex syntax follows Rust's `regex` crate. - For massive joins, pre-sort inputs and use `join --sorted` to keep memory usage flat. +- Combine `mutate`, `filter`, and `summarize` to build complete pipelines directly on the command line. ## Contributing - -Issues and pull requests are welcome. If you extend the toolkit, please add regression tests (`cargo test`) and update the documentation. For large feature ideas (new subcommands, storage backends), start a discussion first. +Issues and pull requests are welcome! diff --git a/src/aggregate.rs b/src/aggregate.rs new file mode 100644 index 0000000..d84ddee --- /dev/null +++ b/src/aggregate.rs @@ -0,0 +1,535 @@ +use std::collections::{HashSet, hash_map::DefaultHasher}; +use std::hash::{Hash, Hasher}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use anyhow::{Context, Result, bail}; +use indexmap::{IndexMap, IndexSet}; + +#[derive(Debug, Clone, PartialEq)] +pub enum AggregateKind { + Sum, + Mean, + Median, + TrimMean, + Iqr, + First, + Last, + Count, + Rand, + Unique, + Collapse, + CountUnique, + Sd, + Var, + Min, + Max, + AbsMin, + AbsMax, + Mode, + AntiMode, + Prod, + Entropy, + ArgMin, + ArgMax, + Quantile { fraction: f64 }, +} + +#[derive(Debug, Clone)] +pub struct AggregateValue { + pub text: String, + pub numeric: Option, +} + +impl AggregateValue { + pub fn empty() -> Self { + AggregateValue { + text: String::new(), + numeric: None, + } + } + + pub fn from_number(value: f64) -> Self { + if value.is_finite() { + AggregateValue { + text: format_number(value), + numeric: Some(value), + } + } else { + AggregateValue::empty() + } + } + + pub fn from_text(text: String) -> Self { + let 
numeric = parse_float(&text); + AggregateValue { text, numeric } + } +} + +pub fn try_parse_aggregate_kind(name: &str) -> Result> { + let lower = name.to_ascii_lowercase(); + if let Some(fraction) = try_parse_quantile(&lower)? { + return Ok(Some(AggregateKind::Quantile { fraction })); + } + + let kind = match lower.as_str() { + "sum" => Some(AggregateKind::Sum), + "mean" | "avg" => Some(AggregateKind::Mean), + "median" | "med" => Some(AggregateKind::Median), + "trimmean" => Some(AggregateKind::TrimMean), + "iqr" => Some(AggregateKind::Iqr), + "first" => Some(AggregateKind::First), + "last" => Some(AggregateKind::Last), + "count" => Some(AggregateKind::Count), + "rand" | "random" => Some(AggregateKind::Rand), + "unique" => Some(AggregateKind::Unique), + "collapse" => Some(AggregateKind::Collapse), + "countunique" | "distinct" => Some(AggregateKind::CountUnique), + "sd" | "std" | "stddev" => Some(AggregateKind::Sd), + "var" | "variance" => Some(AggregateKind::Var), + "min" => Some(AggregateKind::Min), + "max" => Some(AggregateKind::Max), + "absmin" => Some(AggregateKind::AbsMin), + "absmax" => Some(AggregateKind::AbsMax), + "mode" => Some(AggregateKind::Mode), + "antimode" => Some(AggregateKind::AntiMode), + "prod" | "product" => Some(AggregateKind::Prod), + "entropy" => Some(AggregateKind::Entropy), + "argmin" => Some(AggregateKind::ArgMin), + "argmax" => Some(AggregateKind::ArgMax), + _ => None, + }; + Ok(kind) +} + +pub fn parse_quantile_fraction(token: &str) -> Result { + let rest = token.trim_start_matches('q'); + if rest.is_empty() { + bail!("quantile function must specify a value (e.g. 
q1 or q0.25)"); + } + if rest.chars().all(|c| c.is_ascii_digit()) { + let int_val: u32 = rest.parse().with_context(|| "invalid quantile index")?; + if int_val <= 4 { + return Ok(int_val as f64 / 4.0); + } + if int_val <= 100 { + return Ok(int_val as f64 / 100.0); + } + bail!("quantile integer must be between 1 and 100"); + } + let cleaned = rest.replace('_', "."); + let value: f64 = cleaned + .parse() + .with_context(|| "invalid quantile fraction")?; + if (0.0..=1.0).contains(&value) { + Ok(value) + } else if (1.0..=100.0).contains(&value) { + Ok(value / 100.0) + } else { + bail!("quantile fraction must lie between 0 and 1 (or 0-100 for percentages)"); + } +} + +pub fn parse_percent_fraction(token: &str) -> Result { + let rest = token.trim_start_matches('p'); + if rest.is_empty() { + bail!("percentile function must specify a value (e.g. p95)"); + } + let cleaned = rest.replace('_', "."); + let value: f64 = cleaned + .parse() + .with_context(|| "invalid percentile value")?; + if !(0.0..=100.0).contains(&value) { + bail!("percentile value must be between 0 and 100"); + } + Ok(value / 100.0) +} + +fn try_parse_quantile(token: &str) -> Result<Option<f64>> { + let trimmed = token.trim(); + // Only `q<number>`/`p<number>` are quantiles; names like `prod` must yield Ok(None), not a parse error. + let rest = trimmed.strip_prefix(['q', 'p']).unwrap_or(""); + if !rest.starts_with(|c: char| c.is_ascii_digit() || c == '.') { + return Ok(None); + } + match trimmed.chars().next() { + Some('q') => parse_quantile_fraction(trimmed).map(Some), + _ => parse_percent_fraction(trimmed).map(Some), + } +} + +pub fn evaluate_row_aggregate(kind: &AggregateKind, values: &[&str]) -> AggregateValue { + match kind { + AggregateKind::Sum => numeric_result(values, |nums| Some(nums.iter().sum::())), + AggregateKind::Mean => numeric_result(values, |nums| { + if nums.is_empty() { + None + } else { + Some(nums.iter().sum::() / nums.len() as f64) + } + }), + AggregateKind::Median => numeric_result(values, |nums| median(nums)), + AggregateKind::TrimMean => numeric_result(values, |nums| trimmed_mean(nums)), + AggregateKind::Iqr => numeric_result(values, |nums|
interquartile_range(nums)), + AggregateKind::Sd => numeric_result(values, |nums| stddev(nums)), + AggregateKind::Var => numeric_result(values, |nums| variance(nums)), + AggregateKind::Min => numeric_result(values, |nums| nums.iter().cloned().reduce(f64::min)), + AggregateKind::Max => numeric_result(values, |nums| nums.iter().cloned().reduce(f64::max)), + AggregateKind::AbsMin => numeric_result(values, |nums| abs_min(nums)), + AggregateKind::AbsMax => numeric_result(values, |nums| abs_max(nums)), + AggregateKind::Prod => numeric_result(values, |nums| { + if nums.is_empty() { + None + } else { + let mut product = 1.0; + for num in nums { + product *= num; + } + Some(product) + } + }), + AggregateKind::Quantile { fraction } => { + numeric_result(values, |nums| quantile(nums, *fraction)) + } + AggregateKind::First => values.first().map_or_else(AggregateValue::empty, |v| { + AggregateValue::from_text((*v).to_string()) + }), + AggregateKind::Last => values.last().map_or_else(AggregateValue::empty, |v| { + AggregateValue::from_text((*v).to_string()) + }), + AggregateKind::Count => AggregateValue::from_number(values.len() as f64), + AggregateKind::Rand => random_choice(values) + .map(AggregateValue::from_text) + .unwrap_or_else(AggregateValue::empty), + AggregateKind::Unique => { + let mut seen = IndexSet::new(); + for value in values { + seen.insert((*value).to_string()); + } + let text = seen.into_iter().collect::>().join(","); + AggregateValue::from_text(text) + } + AggregateKind::Collapse => { + if values.is_empty() { + AggregateValue::empty() + } else { + AggregateValue::from_text(values.join(",")) + } + } + AggregateKind::CountUnique => { + let mut distinct = HashSet::new(); + for value in values { + distinct.insert((*value).to_string()); + } + AggregateValue::from_number(distinct.len() as f64) + } + AggregateKind::Mode => { + let counts = build_counts(values); + AggregateValue::from_text(mode_value(&counts)) + } + AggregateKind::AntiMode => { + let counts = 
build_counts(values); + AggregateValue::from_text(antimode_value(&counts)) + } + AggregateKind::Entropy => { + let counts = build_counts(values); + let total = values.len(); + entropy(&counts, total) + .map(AggregateValue::from_number) + .unwrap_or_else(AggregateValue::empty) + } + AggregateKind::ArgMin => { + let mut best: Option<(f64, usize)> = None; + let mut position = 0usize; + for value in values { + if let Some(num) = parse_float(value) { + if !num.is_finite() { + continue; + } + position += 1; + match best { + Some((current, _)) if num >= current => {} + _ => best = Some((num, position)), + } + } + } + best.map(|(_, pos)| AggregateValue::from_number(pos as f64)) + .unwrap_or_else(AggregateValue::empty) + } + AggregateKind::ArgMax => { + let mut best: Option<(f64, usize)> = None; + let mut position = 0usize; + for value in values { + if let Some(num) = parse_float(value) { + if !num.is_finite() { + continue; + } + position += 1; + match best { + Some((current, _)) if num <= current => {} + _ => best = Some((num, position)), + } + } + } + best.map(|(_, pos)| AggregateValue::from_number(pos as f64)) + .unwrap_or_else(AggregateValue::empty) + } + } +} + +fn numeric_result(values: &[&str], compute: F) -> AggregateValue +where + F: Fn(&[f64]) -> Option, +{ + let numbers = collect_numeric(values); + compute(&numbers) + .map(AggregateValue::from_number) + .unwrap_or_else(AggregateValue::empty) +} + +fn collect_numeric(values: &[&str]) -> Vec { + values + .iter() + .filter_map(|value| parse_float(value)) + .filter(|num| num.is_finite()) + .collect() +} + +fn median(values: &[f64]) -> Option { + if values.is_empty() { + return None; + } + let mut sorted = values.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let mid = sorted.len() / 2; + if sorted.len() % 2 == 0 { + Some((sorted[mid - 1] + sorted[mid]) / 2.0) + } else { + Some(sorted[mid]) + } +} + +fn trimmed_mean(values: &[f64]) -> Option { + if values.is_empty() { + 
return None; + } + let mut sorted = values.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let n = sorted.len(); + let trim = ((n as f64) * 0.1).floor() as usize; + if trim == 0 || trim * 2 >= n { + let sum: f64 = sorted.iter().sum(); + return Some(sum / n as f64); + } + let slice = &sorted[trim..(n - trim)]; + if slice.is_empty() { + return None; + } + let sum: f64 = slice.iter().sum(); + Some(sum / slice.len() as f64) +} + +fn interquartile_range(values: &[f64]) -> Option { + if values.is_empty() { + return None; + } + let q1 = quantile(values, 0.25)?; + let q3 = quantile(values, 0.75)?; + Some(q3 - q1) +} + +fn stddev(values: &[f64]) -> Option { + variance(values).map(|var| var.sqrt()) +} + +fn variance(values: &[f64]) -> Option { + if values.is_empty() { + return None; + } + let mean = values.iter().sum::() / values.len() as f64; + let var = values + .iter() + .map(|v| { + let diff = v - mean; + diff * diff + }) + .sum::() + / values.len() as f64; + Some(var.max(0.0)) +} + +fn abs_min(values: &[f64]) -> Option { + let mut best: Option<(f64, f64)> = None; + for &value in values { + let abs_val = value.abs(); + match best { + Some((_, current_abs)) if abs_val >= current_abs => {} + _ => best = Some((value, abs_val)), + } + } + best.map(|(value, _)| value) +} + +fn abs_max(values: &[f64]) -> Option { + let mut best: Option<(f64, f64)> = None; + for &value in values { + let abs_val = value.abs(); + match best { + Some((_, current_abs)) if abs_val <= current_abs => {} + _ => best = Some((value, abs_val)), + } + } + best.map(|(value, _)| value) +} + +fn quantile(values: &[f64], fraction: f64) -> Option { + if values.is_empty() { + return None; + } + let mut sorted = values.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); + let position = fraction.clamp(0.0, 1.0) * (sorted.len() - 1) as f64; + let lower = position.floor() as usize; + let upper = position.ceil() as usize; + if lower == 
upper { + Some(sorted[lower]) + } else { + let weight = position - lower as f64; + let lower_value = sorted[lower]; + let upper_value = sorted[upper]; + Some(lower_value + (upper_value - lower_value) * weight) + } +} + +fn random_choice(values: &[&str]) -> Option { + if values.is_empty() { + return None; + } + let mut hasher = DefaultHasher::new(); + values.len().hash(&mut hasher); + for (idx, value) in values.iter().enumerate() { + idx.hash(&mut hasher); + value.hash(&mut hasher); + } + if let Ok(duration) = SystemTime::now().duration_since(UNIX_EPOCH) { + duration.as_nanos().hash(&mut hasher); + } + let index = (hasher.finish() as usize) % values.len(); + values.get(index).map(|v| (*v).to_string()) +} + +fn build_counts(values: &[&str]) -> IndexMap { + let mut counts = IndexMap::new(); + for value in values { + let entry = counts.entry((*value).to_string()).or_insert(0); + *entry += 1; + } + counts +} + +fn mode_value(counts: &IndexMap) -> String { + let mut best_value = String::new(); + let mut best_count = 0usize; + for (value, count) in counts { + if *count > best_count { + best_count = *count; + best_value = value.clone(); + } + } + best_value +} + +fn antimode_value(counts: &IndexMap) -> String { + if counts.is_empty() { + return String::new(); + } + let mut best_value = String::new(); + let mut best_count = usize::MAX; + for (value, count) in counts { + if *count < best_count { + best_count = *count; + best_value = value.clone(); + } + } + best_value +} + +fn entropy(counts: &IndexMap, total: usize) -> Option { + if total == 0 || counts.is_empty() { + return None; + } + let total = total as f64; + let mut entropy = 0.0; + for &count in counts.values() { + if count == 0 { + continue; + } + let probability = count as f64 / total; + entropy -= probability * probability.log2(); + } + Some(entropy.max(0.0)) +} + +pub fn format_number(value: f64) -> String { + if value.fract() == 0.0 { + format!("{:.0}", value) + } else { + format!("{:.6}", value) + } +} + +pub 
fn parse_float(text: &str) -> Option { + let trimmed = text.trim(); + if trimmed.is_empty() { + return None; + } + trimmed.parse::().ok() +} + +#[cfg(test)] +mod tests { + use super::{AggregateKind, evaluate_row_aggregate, try_parse_aggregate_kind}; + + #[test] + fn parses_quantile_aliases() { + let q1 = try_parse_aggregate_kind("q1").unwrap().unwrap(); + match q1 { + AggregateKind::Quantile { fraction } => { + assert!((fraction - 0.25).abs() < f64::EPSILON) + } + _ => panic!("expected quantile"), + } + let p95 = try_parse_aggregate_kind("p95").unwrap().unwrap(); + match p95 { + AggregateKind::Quantile { fraction } => { + assert!((fraction - 0.95).abs() < f64::EPSILON) + } + _ => panic!("expected quantile"), + } + } + + #[test] + fn aggregate_sum_over_values() { + let values = vec!["1", "2", "3"]; + let result = evaluate_row_aggregate(&AggregateKind::Sum, &values); + assert_eq!(result.text, "6"); + } + + #[test] + fn aggregate_mode_prefers_first_encounter() { + let values = vec!["A", "B", "A", "C", "B", "A"]; + let result = evaluate_row_aggregate(&AggregateKind::Mode, &values); + assert_eq!(result.text, "A"); + } + + #[test] + fn aggregate_entropy_handles_empty() { + let empty: Vec<&str> = Vec::new(); + let result = evaluate_row_aggregate(&AggregateKind::Entropy, &empty); + assert!(result.text.is_empty()); + } + + #[test] + fn try_parse_non_aggregate_returns_none() { + assert!(try_parse_aggregate_kind("abs").unwrap().is_none()); + } +} diff --git a/src/common.rs b/src/common.rs index b9a4f63..06bc5c1 100644 --- a/src/common.rs +++ b/src/common.rs @@ -10,6 +10,7 @@ use xz2::read::XzDecoder; #[derive(Debug, Clone)] pub enum ColumnSelector { Index(usize), + FromEnd(usize), Name(String), Range(Option>, Option>), } @@ -19,8 +20,9 @@ pub fn parse_selector_list(spec: &str) -> Result> { bail!("column specification must not be empty"); } - spec.split(',') - .map(|token| parse_selector_token(token.trim())) + tokenize_selector_spec(spec)? 
+ .into_iter() + .map(parse_selector_token) .collect() } @@ -84,7 +86,7 @@ pub fn resolve_selectors( let mut indices = Vec::with_capacity(selectors.len()); for selector in selectors { match selector { - ColumnSelector::Index(_) | ColumnSelector::Name(_) => { + ColumnSelector::Index(_) | ColumnSelector::FromEnd(_) | ColumnSelector::Name(_) => { let index = resolve_selector_index(headers, selector, no_header)?; indices.push(index); } @@ -209,14 +211,17 @@ pub fn default_headers(len: usize) -> Vec { (1..=len).map(|i| format!("col{}", i)).collect() } -fn parse_selector_token(token: &str) -> Result { - if token.is_empty() { +fn parse_selector_token(token: SelectorToken) -> Result { + if token.text.is_empty() { return Err(anyhow!("empty column selector")); } - if let Some((start, end, extra)) = split_range_token(token) { + if let Some((start, end, extra)) = split_range_token(&token.text) { if extra { - bail!("invalid column range '{}': too many ':' characters", token); + bail!( + "invalid column range '{}': too many ':' characters", + token.text + ); } let start_trim = start.trim(); let end_trim = end.trim(); @@ -233,31 +238,247 @@ fn parse_selector_token(token: &str) -> Result { return Ok(ColumnSelector::Range(start_selector, end_selector)); } - parse_simple_selector(token) + parse_simple_selector(&token.text) } fn parse_simple_selector(token: &str) -> Result { if token.is_empty() { return Err(anyhow!("empty column selector")); } + if let Some(literal) = parse_backtick_literal(token)? { + return Ok(ColumnSelector::Name(literal)); + } + if let Some(literal) = parse_brace_literal(token)? 
{ + return Ok(ColumnSelector::Name(literal)); + } + if let Some(stripped) = token.strip_prefix('-') { + if stripped.is_empty() { + bail!("column selector '-' must include an index"); + } + let offset: usize = stripped + .parse() + .with_context(|| format!("invalid trailing index '{}'", token))?; + if offset == 0 { + bail!("column selector '-0' is not allowed"); + } + return Ok(ColumnSelector::FromEnd(offset)); + } if let Ok(idx) = token.parse::() { if idx == 0 { bail!("column indices use 1-based positions"); } - Ok(ColumnSelector::Index(idx - 1)) - } else { - Ok(ColumnSelector::Name(token.to_string())) + return Ok(ColumnSelector::Index(idx - 1)); + } + Ok(ColumnSelector::Name(token.to_string())) +} + +fn parse_backtick_literal(token: &str) -> Result<Option<String>> { + let trimmed = token.trim(); + if !trimmed.starts_with('`') { + return Ok(None); + } + let bytes = trimmed.as_bytes(); + let mut value = Vec::new(); // raw UTF-8 bytes; decoded once at the closing backtick so non-ASCII names survive + let mut idx = 1; + while idx < bytes.len() { + let b = bytes[idx]; + if b == b'\\' { + idx += 1; + if idx >= bytes.len() { + bail!("unterminated escape sequence in backtick-quoted column selector"); + } + value.push(bytes[idx]); + idx += 1; + continue; + } + if b == b'`' { + idx += 1; + if idx != bytes.len() { + bail!("unexpected characters after closing backtick in column selector"); + } + return Ok(Some(String::from_utf8(value)?)); // only ASCII bytes were dropped, so this stays valid UTF-8 + } + value.push(b); + idx += 1; + } + bail!("unterminated backtick-quoted column selector"); +} + +fn parse_brace_literal(token: &str) -> Result<Option<String>> { + let trimmed = token.trim(); + if !trimmed.starts_with('{') { + return Ok(None); + } + let bytes = trimmed.as_bytes(); + let mut value = Vec::new(); // raw UTF-8 bytes; decoded once at the closing brace so non-ASCII names survive + let mut idx = 1; + while idx < bytes.len() { + let b = bytes[idx]; + if b == b'\\' { + idx += 1; + if idx >= bytes.len() { + bail!("unterminated escape sequence in '{{' column selector"); + } + value.push(bytes[idx]); + idx += 1; + continue; + } + if b == b'}' { + idx += 1; + if idx != bytes.len() {
+ bail!("unexpected characters after closing '}}' in column selector"); + } + return Ok(Some(String::from_utf8(value)?)); // only ASCII bytes were dropped, so this stays valid UTF-8 + } + value.push(b); + idx += 1; } + bail!("unterminated '{{' in column selector"); } fn split_range_token(token: &str) -> Option<(&str, &str, bool)> { - let mut parts = token.split(':'); - let start = parts.next()?; - let end = parts.next()?; - let extra = parts.next().is_some(); + let mut in_backtick = false; + let mut in_braces = false; + let mut escaped = false; + let mut colon_index = None; + let mut extra = false; + let bytes = token.as_bytes(); + let mut idx = 0; + while idx < bytes.len() { + let b = bytes[idx]; + if in_backtick { + if escaped { + escaped = false; + } else if b == b'\\' { + escaped = true; + } else if b == b'`' { + in_backtick = false; + } + idx += 1; + continue; + } + if in_braces { + if escaped { + escaped = false; + } else if b == b'\\' { + escaped = true; + } else if b == b'}' { + in_braces = false; + } + idx += 1; + continue; + } + match b { + b'`' => { + in_backtick = true; + } + b'{' => { + in_braces = true; + } + b':' => { + if colon_index.is_none() { + colon_index = Some(idx); + } else { + extra = true; + } + } + _ => {} + } + idx += 1; + } + let colon = colon_index?; + let start = &token[..colon]; + let end = &token[colon + 1..]; Some((start, end, extra)) } +#[derive(Debug)] +struct SelectorToken { + text: String, +} + +fn tokenize_selector_spec(spec: &str) -> Result> { + let mut tokens = Vec::new(); + let mut current = String::new(); + let mut chars = spec.chars().peekable(); + let mut in_backtick = false; + let mut in_braces = false; + let mut escaped = false; + + while let Some(ch) = chars.next() { + if in_backtick { + current.push(ch); + if escaped { + escaped = false; + continue; + } + if ch == '\\' { + escaped = true; + } else if ch == '`' { + in_backtick = false; + } + continue; + } + if in_braces { + current.push(ch); + if escaped { + escaped = false; + continue; + } + if ch == '\\' { + escaped = true; + } else if ch == '}' { + in_braces = false;
+ } + continue; + } + match ch { + ',' => { + let trimmed = current.trim(); + if trimmed.is_empty() { + bail!("column specification must not be empty"); + } + tokens.push(SelectorToken { + text: trimmed.to_string(), + }); + current.clear(); + } + '`' => { + current.push(ch); + in_backtick = true; + escaped = false; + } + '{' => { + current.push(ch); + in_braces = true; + escaped = false; + } + c if c.is_whitespace() && current.is_empty() => { + continue; + } + other => current.push(other), + } + } + + if in_backtick { + bail!("unterminated backtick-quoted column selector"); + } + if in_braces { + bail!("unterminated '{{' in column selector"); + } + if !current.is_empty() { + let trimmed = current.trim(); + if trimmed.is_empty() { + bail!("column specification must not be empty"); + } + tokens.push(SelectorToken { + text: trimmed.to_string(), + }); + } + + Ok(tokens) +} + fn resolve_selector_index( headers: &[String], selector: &ColumnSelector, @@ -275,6 +496,20 @@ fn resolve_selector_index( } Ok(index) } + ColumnSelector::FromEnd(offset) => { + let offset = *offset; + if offset == 0 { + bail!("column selector '-0' is not allowed"); + } + if offset > headers.len() { + bail!( + "column selector '-{}' out of range ({} columns)", + offset, + headers.len() + ); + } + Ok(headers.len() - offset) + } ColumnSelector::Name(name) => { if no_header { bail!("column names cannot be used when input lacks a header row"); @@ -293,7 +528,7 @@ fn resolve_selector_index( #[cfg(test)] mod tests { - use super::{parse_selector_list, parse_single_selector, resolve_selectors}; + use super::{ColumnSelector, parse_selector_list, parse_single_selector, resolve_selectors}; #[test] fn resolves_name_range() { @@ -351,4 +586,60 @@ mod tests { let selectors = parse_selector_list("col1:col3").unwrap(); assert!(resolve_selectors(&headers, &selectors, true).is_err()); } + + #[test] + fn parses_backtick_literals() { + let selectors = parse_selector_list("`a:b`,`c,d`,plain").unwrap(); + 
assert!(matches!(selectors[0], ColumnSelector::Name(ref name) if name == "a:b")); + assert!(matches!(selectors[1], ColumnSelector::Name(ref name) if name == "c,d")); + assert!(matches!(selectors[2], ColumnSelector::Name(ref name) if name == "plain")); + } + + #[test] + fn quoted_tokens_ignore_range_syntax() { + let selectors = parse_selector_list("`2:4`").unwrap(); + assert!(matches!(selectors[0], ColumnSelector::Name(ref name) if name == "2:4")); + } + + #[test] + fn quoted_names_allow_ranges() { + let selectors = parse_selector_list("`dna_ug`:`rna_ug`").unwrap(); + assert!(matches!( + selectors[0], + ColumnSelector::Range(Some(ref start), Some(ref end)) + if matches!(**start, ColumnSelector::Name(ref name) if name == "dna_ug") + && matches!(**end, ColumnSelector::Name(ref name) if name == "rna_ug") + )); + } + + #[test] + fn quoted_names_allow_open_range() { + let selectors = parse_selector_list("`dna_ug`:").unwrap(); + assert!(matches!( + selectors[0], + ColumnSelector::Range(Some(ref start), None) + if matches!(**start, ColumnSelector::Name(ref name) if name == "dna_ug") + )); + } + + #[test] + fn parses_brace_literals() { + let selectors = parse_selector_list("{a:b},plain").unwrap(); + assert!(matches!(selectors[0], ColumnSelector::Name(ref name) if name == "a:b")); + assert!(matches!(selectors[1], ColumnSelector::Name(ref name) if name == "plain")); + } + + #[test] + fn parses_negative_indices() { + let headers = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + let selectors = parse_selector_list("-1,-2").unwrap(); + let indices = resolve_selectors(&headers, &selectors, false).unwrap(); + assert_eq!(indices, vec![2, 1]); + } + + #[test] + fn rejects_unterminated_backtick() { + let err = parse_selector_list("`foo").unwrap_err(); + assert!(err.to_string().contains("unterminated backtick")); + } } diff --git a/src/excel/mod.rs b/src/excel/mod.rs index e017627..70c463d 100644 --- a/src/excel/mod.rs +++ b/src/excel/mod.rs @@ -151,9 +151,26 @@ struct 
ColumnToken { #[derive(Debug, Clone)] struct RowFilter { + ranges: Vec, +} + +#[derive(Debug, Clone)] +struct ResolvedRowFilter { ranges: Vec<(Option, Option)>, } +#[derive(Debug, Clone)] +struct RowRangeSpec { + start: Option, + end: Option, +} + +#[derive(Debug, Clone, Copy)] +enum RowBound { + Absolute(usize), + FromEnd(usize), +} + #[derive(Debug, Clone)] struct LoadInputSpec { path: PathBuf, @@ -454,25 +471,25 @@ fn parse_row_filter(spec: &str) -> Result { continue; } if let Some((start, end)) = token.split_once(':') { - let start_idx = if start.trim().is_empty() { - None - } else { - Some(parse_positive_index(start.trim())?) - }; - let end_idx = if end.trim().is_empty() { - None - } else { - Some(parse_positive_index(end.trim())?) - }; - if let (Some(s), Some(e)) = (start_idx, end_idx) { + let start_bound = parse_optional_row_bound(start)?; + let end_bound = parse_optional_row_bound(end)?; + if let (Some(RowBound::Absolute(s)), Some(RowBound::Absolute(e))) = + (start_bound, end_bound) + { if s > e { bail!("row range start {} exceeds end {}", s, e); } } - ranges.push((start_idx, end_idx)); + ranges.push(RowRangeSpec { + start: start_bound, + end: end_bound, + }); } else { - let value = parse_positive_index(token)?; - ranges.push((Some(value), Some(value))); + let bound = parse_row_bound(token)?; + ranges.push(RowRangeSpec { + start: Some(bound), + end: Some(bound), + }); } } if ranges.is_empty() { @@ -491,6 +508,70 @@ fn parse_positive_index(token: &str) -> Result { Ok(value) } +fn parse_row_bound(text: &str) -> Result { + if let Some(stripped) = text.trim().strip_prefix('-') { + if stripped.is_empty() { + bail!("row selector '-' must include an index"); + } + let value: usize = stripped + .parse() + .with_context(|| format!("invalid 1-based index '{}'", text))?; + if value == 0 { + bail!("row selector '-0' is not allowed"); + } + Ok(RowBound::FromEnd(value)) + } else { + parse_positive_index(text.trim()).map(RowBound::Absolute) + } +} + +fn 
parse_optional_row_bound(text: &str) -> Result> { + let trimmed = text.trim(); + if trimmed.is_empty() { + return Ok(None); + } + parse_row_bound(trimmed).map(Some) +} + +impl RowFilter { + fn resolve(&self, total_rows: usize) -> Result { + let mut ranges = Vec::with_capacity(self.ranges.len()); + for spec in &self.ranges { + let start = match spec.start { + Some(RowBound::Absolute(value)) => Some(value), + Some(RowBound::FromEnd(offset)) => Some(resolve_row_bound(offset, total_rows)?), + None => None, + }; + let end = match spec.end { + Some(RowBound::Absolute(value)) => Some(value), + Some(RowBound::FromEnd(offset)) => Some(resolve_row_bound(offset, total_rows)?), + None => None, + }; + if let (Some(s), Some(e)) = (start, end) { + if s > e { + bail!("row range start {} exceeds end {}", s, e); + } + } + ranges.push((start, end)); + } + Ok(ResolvedRowFilter { ranges }) + } +} + +fn resolve_row_bound(offset: usize, total_rows: usize) -> Result { + if offset == 0 { + bail!("row selector '-0' is not allowed"); + } + if offset > total_rows { + bail!( + "row selector '-{}' exceeds total row count {}", + offset, + total_rows + ); + } + Ok(total_rows - offset + 1) +} + fn resolve_columns( tokens: &[ColumnToken], headers: &[String], @@ -770,13 +851,16 @@ fn emit_table( return Ok(()); } + let resolved_filter = row_filter + .map(|filter| filter.resolve(data_total_rows)) + .transpose()?; let mut data_row_number = 0usize; let start_row = if has_header { 1 } else { 0 }; for row_idx in start_row..height { let absolute_row = start.0 as usize + row_idx; data_row_number += 1; - if let Some(filter) = row_filter { + if let Some(filter) = resolved_filter.as_ref() { if !row_filter_contains(filter, data_row_number, data_total_rows) { continue; } @@ -803,7 +887,7 @@ fn emit_table( Ok(()) } -fn row_filter_contains(filter: &RowFilter, row: usize, total_rows: usize) -> bool { +fn row_filter_contains(filter: &ResolvedRowFilter, row: usize, total_rows: usize) -> bool { for (start, end) in 
&filter.ranges { let start_idx = start.unwrap_or(1); let end_idx = end.unwrap_or(total_rows); diff --git a/src/expression.rs b/src/expression.rs index e1df788..b4e6279 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -4,7 +4,10 @@ use std::sync::Arc; use anyhow::{Context, Result, bail}; use regex::Regex; -use crate::common::{ColumnSelector, parse_selector_list, resolve_selectors}; +use crate::aggregate::{AggregateKind, evaluate_row_aggregate, try_parse_aggregate_kind}; +use crate::common::{ + ColumnSelector, parse_selector_list, parse_single_selector, resolve_selectors, +}; #[derive(Debug, Clone)] pub enum Expr { @@ -24,6 +27,13 @@ pub enum ValueExpr { Unary(UnaryOp, Box), Binary(BinaryOp, Box, Box), Function(FunctionName, Box), + Aggregate(AggregateSpecExpr), +} + +#[derive(Debug, Clone)] +pub struct AggregateSpecExpr { + pub kind: AggregateKind, + pub selectors: Vec, } #[derive(Debug, Clone, Copy)] @@ -37,6 +47,7 @@ pub enum BinaryOp { Sub, Mul, Div, + Pow, } #[derive(Debug, Clone, Copy)] @@ -97,6 +108,7 @@ enum Token { Minus, Star, Slash, + Caret, } pub fn parse_expression(input: &str) -> Result { @@ -199,6 +211,13 @@ pub enum BoundValue { Unary(UnaryOp, Box), Binary(BinaryOp, Box, Box), Function(FunctionName, Box), + Aggregate(BoundAggregate), +} + +#[derive(Debug, Clone)] +pub struct BoundAggregate { + pub kind: AggregateKind, + pub columns: Vec, } #[derive(Debug)] @@ -307,6 +326,14 @@ where numeric_eval(a / b) } } + BinaryOp::Pow => { + let value = a.powf(b); + if value.is_finite() { + numeric_eval(value) + } else { + empty_eval() + } + } }, _ => empty_eval(), } @@ -337,6 +364,18 @@ where empty_eval() } } + BoundValue::Aggregate(spec) => { + let values = spec + .columns + .iter() + .map(|&idx| row.get(idx).unwrap_or("")) + .collect::>(); + let result = evaluate_row_aggregate(&spec.kind, &values); + EvalValue { + text: Cow::Owned(result.text), + numeric: result.numeric, + } + } } } @@ -444,6 +483,20 @@ fn bind_value(value: ValueExpr, headers: 
&[String], no_header: bool) -> Result { + let mut indices = Vec::new(); + for selector in spec.selectors { + let mut resolved = resolve_selectors(headers, &[selector], no_header)?; + indices.append(&mut resolved); + } + if indices.is_empty() { + bail!("aggregate selector resolved to no columns"); + } + Ok(BoundValue::Aggregate(BoundAggregate { + kind: spec.kind, + columns: indices, + })) + } ValueExpr::String(text) => Ok(BoundValue::String(text)), ValueExpr::Number(number) => Ok(BoundValue::Number(number)), ValueExpr::Unary(op, inner) => { @@ -582,6 +635,52 @@ mod tests { ]; assert!(evaluate(&bound, &row)); } + + #[test] + fn subtraction_without_spaces_between_columns() { + let expr = parse_expression("($dna_ug-$rna_ug)>10").unwrap(); + let headers = vec!["dna_ug".to_string(), "rna_ug".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let row = vec!["25.0".to_string(), "12.0".to_string()]; + assert!(evaluate(&bound, &row)); + } + + #[test] + fn subtraction_with_space_after_minus() { + let expr = parse_expression("($dna_ug- $rna_ug)>10").unwrap(); + let headers = vec!["dna_ug".to_string(), "rna_ug".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let row = vec!["22.0".to_string(), "5.0".to_string()]; + assert!(evaluate(&bound, &row)); + } + + #[test] + fn exponentiation_operator_supported() { + let expr = parse_value_expression("$dna_ug^2").unwrap(); + let headers = vec!["dna_ug".to_string()]; + let bound = bind_value_expression(expr, &headers, false).unwrap(); + let row = vec!["3".to_string()]; + let eval = eval_value(&bound, &row); + assert_eq!(eval.numeric, Some(9.0)); + } + + #[test] + fn exponentiation_is_right_associative() { + let expr = parse_value_expression("2^3^2").unwrap(); + let bound = bind_value_expression(expr, &[], false).unwrap(); + let row: Vec = Vec::new(); + let eval = eval_value(&bound, &row); + assert_eq!(eval.numeric, Some(512.0)); + } + + #[test] + fn 
braces_allow_literal_column_names() { + let expr = parse_expression("${dna-}").unwrap(); + let headers = vec!["dna-".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let row = vec!["value".to_string()]; + assert!(evaluate(&bound, &row)); + } } impl<'a> Lexer<'a> { @@ -681,6 +780,10 @@ impl<'a> Lexer<'a> { self.pos += 1; Ok(Some(Token::Slash)) } + b'^' => { + self.pos += 1; + Ok(Some(Token::Caret)) + } c if c.is_ascii_digit() || c == b'.' => self.lex_number(), c if c.is_ascii_alphabetic() || c == b'_' => { let ident = self.lex_identifier(); @@ -692,6 +795,28 @@ impl<'a> Lexer<'a> { fn lex_column(&mut self) -> Result> { self.pos += 1; + if self.match_char(b'{') { + self.pos += 1; + let mut value = String::new(); + let mut escaped = false; + while self.pos < self.chars.len() { + let c = self.chars[self.pos]; + self.pos += 1; + if escaped { + value.push(c as char); + escaped = false; + continue; + } + match c { + b'\\' => escaped = true, + b'}' => { + return Ok(Some(Token::Column(ColumnSelector::Name(value)))); + } + other => value.push(other as char), + } + } + bail!("unterminated '{{' in column selector"); + } let start = self.pos; let mut is_numeric = true; let mut has_range_syntax = false; @@ -722,6 +847,19 @@ impl<'a> Lexer<'a> { if is_numeric && self.pos > start { break; } + let mut should_break = false; + if self.pos > start { + if let Some(next) = self.peek_non_whitespace(1) { + should_break = matches!( + next, + b'$' | b'(' | b')' | b'+' | b'-' | b'*' | b'/' | b'^' | b'"' + ) || next.is_ascii_digit() + || next == b'.'; + } + } + if should_break { + break; + } is_numeric = false; self.pos += 1; continue; @@ -742,14 +880,9 @@ impl<'a> Lexer<'a> { } return Ok(Some(Token::Columns(selectors))); } - if let Ok(idx) = text.parse::() { - if idx == 0 { - bail!("column indices use 1-based positions"); - } - Ok(Some(Token::Column(ColumnSelector::Index(idx - 1)))) - } else { - Ok(Some(Token::Column(ColumnSelector::Name(text.to_string())))) - } 
+ let selector = parse_single_selector(text) + .with_context(|| format!("invalid column selector '{}'", text))?; + Ok(Some(Token::Column(selector))) } fn lex_string(&mut self) -> Result> { @@ -829,6 +962,18 @@ impl<'a> Lexer<'a> { self.chars.get(self.pos + offset).copied() } + fn peek_non_whitespace(&self, offset: usize) -> Option { + let mut idx = self.pos + offset; + while idx < self.chars.len() { + let c = self.chars[idx]; + if !c.is_ascii_whitespace() { + return Some(c); + } + idx += 1; + } + None + } + fn skip_whitespace(&mut self) { while self.pos < self.chars.len() && self.chars[self.pos].is_ascii_whitespace() { self.pos += 1; @@ -883,11 +1028,28 @@ impl Parser { ) { self.parse_regex_without_left() } else if self.match_token(TokenKind::LParen) { + let saved_pos = self.pos; self.pos += 1; let expr = self.parse_expr()?; if !self.consume_token(TokenKind::RParen) { bail!("missing closing ')' in expression"); } + if matches!(expr, Expr::Value(_)) { + if let Some(next) = self.peek_token() { + if matches!( + next, + Token::Compare(_) + | Token::Plus + | Token::Minus + | Token::Star + | Token::Slash + | Token::Caret + ) { + self.pos = saved_pos; + return self.parse_comparison(); + } + } + } Ok(expr) } else { self.parse_comparison() @@ -960,15 +1122,15 @@ impl Parser { } fn parse_term(&mut self) -> Result { - let mut expr = self.parse_factor()?; + let mut expr = self.parse_power()?; loop { if self.match_token(TokenKind::Star) { self.pos += 1; - let rhs = self.parse_factor()?; + let rhs = self.parse_power()?; expr = ValueExpr::Binary(BinaryOp::Mul, Box::new(expr), Box::new(rhs)); } else if self.match_token(TokenKind::Slash) { self.pos += 1; - let rhs = self.parse_factor()?; + let rhs = self.parse_power()?; expr = ValueExpr::Binary(BinaryOp::Div, Box::new(expr), Box::new(rhs)); } else { break; @@ -977,6 +1139,16 @@ impl Parser { Ok(expr) } + fn parse_power(&mut self) -> Result { + let mut expr = self.parse_factor()?; + if self.match_token(TokenKind::Caret) { + self.pos 
+= 1; + let rhs = self.parse_power()?; + expr = ValueExpr::Binary(BinaryOp::Pow, Box::new(expr), Box::new(rhs)); + } + Ok(expr) + } + fn parse_factor(&mut self) -> Result { if self.match_token(TokenKind::Minus) { self.pos += 1; @@ -1029,6 +1201,20 @@ impl Parser { if !self.consume_token(TokenKind::RParen) { bail!("missing ')' after function call"); } + if let Some(kind) = try_parse_aggregate_kind(&name)? { + let selectors = match argument { + ValueExpr::Column(selector) => vec![selector], + ValueExpr::Columns(list) => list, + other => { + bail!( + "function '{}' expects column selectors, got {:?}", + name, + other + ) + } + }; + return Ok(ValueExpr::Aggregate(AggregateSpecExpr { kind, selectors })); + } let func = FunctionName::from_ident(&name)?; Ok(ValueExpr::Function(func, Box::new(argument))) } else { @@ -1067,6 +1253,7 @@ impl Parser { (TokenKind::Minus, Some(Token::Minus)) => true, (TokenKind::Star, Some(Token::Star)) => true, (TokenKind::Slash, Some(Token::Slash)) => true, + (TokenKind::Caret, Some(Token::Caret)) => true, _ => false, } } @@ -1108,4 +1295,5 @@ enum TokenKind { Minus, Star, Slash, + Caret, } diff --git a/src/filter.rs b/src/filter.rs index 12ccaff..8e5cdd3 100644 --- a/src/filter.rs +++ b/src/filter.rs @@ -10,7 +10,7 @@ use crate::expression::{bind_expression, evaluate, parse_expression}; #[derive(Args, Debug)] #[command( about = "Filter TSV rows using boolean expressions", - long_about = r#"Filter rows using expressions with column references ($name or $index), comparisons, logical operators, arithmetic, regex (~ and !~), and numeric functions (abs, sqrt, exp, exp2, ln, log, log10, log2). Wrap each -e argument in single quotes so the shell preserves $column selectors; inside the expression, use double quotes around string literals. Defaults to header-aware mode; add -H for headerless input. 
+ long_about = r#"Filter rows using expressions with column references ($name or $index), comparisons, logical operators, arithmetic, regex (~ and !~), numeric functions (abs, sqrt, exp, exp2, ln, log, log10, log2), and row-wise aggregators (sum/mean/median/trimmean/iqr, sd/var, min/max/absmin/absmax, mode/antimode, count/unique/collapse, prod, entropy, argmin/argmax, quantiles via q*/p*). Wrap each -e argument in single quotes so the shell preserves $column selectors; inside the expression, use double quotes around string literals. Defaults to header-aware mode; add -H for headerless input. Examples: tsvkit filter -e '$sample2>=5 & $sample3!=9' examples/profiles.tsv @@ -22,7 +22,7 @@ pub struct FilterArgs { #[arg(value_name = "FILE", default_value = "-")] pub file: PathBuf, - /// Filter expression (e.g. `$purity>=0.9 & log2($dna_ug)>4`); supports `$col`/`$1` selectors, comparisons, arithmetic, regex (~ / !~), and functions (abs, sqrt, exp, exp2, ln, log, log10, log2) + /// Filter expression (e.g. `$purity>=0.9 & sum($dna_ug:$rna_ug)>6`); supports `$col`/`$1` selectors, comparisons, arithmetic, regex (~ / !~), numeric functions, and summarize-style aggregators (sum, mean, sd, var, min/max, mode, unique, q*/p*, etc.) 
#[arg(short = 'e', long = "expr", value_name = "EXPR", required = true)] pub expr: String, @@ -167,4 +167,22 @@ mod tests { let record = StringRecord::from(vec!["10"]); assert!(evaluate(&bound, &record)); } + + #[test] + fn sum_aggregator_supported() { + let expr = parse_expression("sum($1:$3) > 6").unwrap(); + let headers = vec!["a".to_string(), "b".to_string(), "c".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let record = StringRecord::from(vec!["2", "3", "4"]); + assert!(evaluate(&bound, &record)); + } + + #[test] + fn parentheses_with_addition_parse() { + let expr = parse_expression("($1 + $2) > 6").unwrap(); + let headers = vec!["dna".to_string(), "rna".to_string()]; + let bound = bind_expression(expr, &headers, false).unwrap(); + let record = StringRecord::from(vec!["4", "3"]); + assert!(evaluate(&bound, &record)); + } } diff --git a/src/main.rs b/src/main.rs index 31870f7..5160e9b 100644 --- a/src/main.rs +++ b/src/main.rs @@ -3,6 +3,7 @@ use clap::{Parser, Subcommand}; use std::env; use std::io; +mod aggregate; mod common; mod csv; mod cut; diff --git a/src/mutate.rs b/src/mutate.rs index 9320ab0..4a3bc2e 100644 --- a/src/mutate.rs +++ b/src/mutate.rs @@ -5,6 +5,7 @@ use anyhow::{Context, Result, bail}; use clap::Args; use regex::Regex; +use crate::aggregate::{AggregateKind, evaluate_row_aggregate, try_parse_aggregate_kind}; use crate::common::{ InputOptions, default_headers, parse_selector_list, reader_for_path, resolve_selectors, should_skip_record, @@ -14,7 +15,7 @@ use crate::expression::{BoundValue, bind_value_expression, eval_value, parse_val #[derive(Args, Debug)] #[command( about = "Create or transform TSV columns", - long_about = r#"Add derived columns or rewrite existing ones. Use -e/--expr to specify operations. 
Assignments can be arbitrary expressions (`name=EXPR`) using the filter expression language (column selectors, arithmetic, abs/sqrt/exp/ln/log/log2/log10/exp2, regex matches, etc.), row-wise aggregates (sum, mean, median, sd, q1–q4, q0.25, p95, etc.), or string helpers like sub(). Always prefix column references with `$`, including columns created earlier in the same invocation (e.g. `log_total=log2($total)`), and wrap each -e argument in single quotes so the shell leaves `$` selectors alone. You can also run in-place substitutions with sed-style syntax s/selectors/pattern/replacement/. + long_about = r#"Add derived columns or rewrite existing ones. Use -e/--expr to specify operations. Assignments can be arbitrary expressions (`name=EXPR`) using the filter expression language (column selectors, arithmetic, abs/sqrt/exp/ln/log/log2/log10/exp2, regex matches, etc.), row-wise aggregates (sum, mean, median, trimmean, iqr, sd/var, min/max/absmin/absmax, mode/antimode, count/unique/collapse/rand, prod, entropy, argmin/argmax, quantiles via q*/p*), or string helpers like sub(). Always prefix column references with `$`, including columns created earlier in the same invocation (e.g. `log_total=log2($total)`), and wrap each -e argument in single quotes so the shell leaves `$` selectors alone. You can also run in-place substitutions with sed-style syntax s/selectors/pattern/replacement/. 
Examples: tsvkit mutate -e "coverage_sum=sum($1,$3:$5)" examples/profiles.tsv @@ -167,23 +168,12 @@ fn evaluate_function(func: &FunctionSpec, row: &[String]) -> Result { Ok(evaluated.text.into_owned()) } FunctionSpec::Aggregate { kind, columns } => { - let values = collect_numeric(row, columns); - if values.is_empty() { - return Ok(String::new()); - } - let result = match kind { - AggregateKind::Sum => values.iter().sum::(), - AggregateKind::Mean => values.iter().sum::() / values.len() as f64, - AggregateKind::Sd => stddev(&values).unwrap_or(f64::NAN), - AggregateKind::Quantile { fraction } => { - quantile(&values, *fraction).unwrap_or(f64::NAN) - } - }; - if result.is_finite() { - Ok(format_number(result)) - } else { - Ok(String::new()) - } + let values = columns + .iter() + .map(|&idx| row.get(idx).map(|s| s.as_str()).unwrap_or("")) + .collect::>(); + let result = evaluate_row_aggregate(kind, &values); + Ok(result.text) } FunctionSpec::SubNew { column, @@ -197,14 +187,6 @@ fn evaluate_function(func: &FunctionSpec, row: &[String]) -> Result { } } -fn collect_numeric(row: &[String], indices: &[usize]) -> Vec { - indices - .iter() - .filter_map(|&idx| row.get(idx)) - .filter_map(|value| parse_float(value)) - .collect() -} - fn parse_operations( exprs: &[String], headers: &[String], @@ -354,7 +336,7 @@ fn parse_function(value: &str, headers: &[String], no_header: bool) -> Result Result { } } -fn parse_aggregate_kind(name: &str) -> Result { - let lower = name.to_ascii_lowercase(); - match lower.as_str() { - "sum" => Ok(AggregateKind::Sum), - "mean" | "avg" => Ok(AggregateKind::Mean), - "median" | "med" => Ok(AggregateKind::Quantile { fraction: 0.5 }), - "sd" | "std" | "stddev" => Ok(AggregateKind::Sd), - _ if lower.starts_with('q') => { - let fraction = parse_quantile_fraction(&lower)?; - Ok(AggregateKind::Quantile { fraction }) - } - _ if lower.starts_with('p') => { - let fraction = parse_percent_fraction(&lower)?; - Ok(AggregateKind::Quantile { fraction }) - } - 
other => bail!( - "unsupported function '{}': try sum, mean, median, sd, q*, or p*", - other - ), - } -} - -fn parse_quantile_fraction(token: &str) -> Result { - let rest = &token[1..]; - if rest.is_empty() { - bail!("quantile function must specify a value (e.g. q1 or q0.25)"); - } - if rest.chars().all(|c| c.is_ascii_digit()) { - let int_val: u32 = rest.parse().with_context(|| "invalid quantile index")?; - if int_val <= 4 { - return Ok(int_val as f64 / 4.0); - } else if int_val <= 100 { - return Ok(int_val as f64 / 100.0); - } else { - bail!("quantile integer must be between 1 and 100"); - } - } - let fractional = rest.replace('_', "."); - let value: f64 = fractional - .parse() - .with_context(|| "invalid quantile fraction")?; - if !(0.0..=1.0).contains(&value) { - bail!("quantile fraction must lie between 0 and 1"); - } - Ok(value) -} - -fn parse_percent_fraction(token: &str) -> Result { - let rest = &token[1..]; - if rest.is_empty() { - bail!("percentile function must specify a value (e.g. 
p95)"); - } - let value: f64 = rest - .replace('_', ".") - .parse() - .with_context(|| "invalid percentile value")?; - if !(0.0..=100.0).contains(&value) { - bail!("percentile value must be between 0 and 100"); - } - Ok(value / 100.0) -} - -fn stddev(values: &[f64]) -> Option { - if values.is_empty() { - return None; - } - let mean = values.iter().sum::() / values.len() as f64; - let variance = values - .iter() - .map(|v| { - let diff = v - mean; - diff * diff - }) - .sum::() - / values.len() as f64; - Some(variance.max(0.0).sqrt()) -} - -fn quantile(values: &[f64], fraction: f64) -> Option { - if values.is_empty() { - return None; - } - if values.len() == 1 { - return Some(values[0]); - } - let mut sorted = values.to_vec(); - sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); - let position = fraction.clamp(0.0, 1.0) * (sorted.len() - 1) as f64; - let lower = position.floor() as usize; - let upper = position.ceil() as usize; - if lower == upper { - Some(sorted[lower]) - } else { - let weight = position - lower as f64; - let lower_value = sorted[lower]; - let upper_value = sorted[upper]; - Some(lower_value + (upper_value - lower_value) * weight) - } -} - -fn format_number(value: f64) -> String { - if value.fract() == 0.0 { - format!("{:.0}", value) - } else { - format!("{:.6}", value) - } -} - -fn parse_float(text: &str) -> Option { - let trimmed = text.trim(); - if trimmed.is_empty() { - return None; - } - trimmed.parse::().ok() -} - #[derive(Debug)] enum MutateOp { Create { @@ -612,30 +478,10 @@ enum FunctionSpec { }, } -#[derive(Debug)] -enum AggregateKind { - Sum, - Mean, - Sd, - Quantile { fraction: f64 }, -} - #[cfg(test)] mod tests { use super::*; - #[test] - fn parse_quantile_aliases() { - assert!((parse_quantile_fraction("q1").unwrap() - 0.25).abs() < f64::EPSILON); - assert!((parse_quantile_fraction("q2").unwrap() - 0.5).abs() < f64::EPSILON); - assert!((parse_quantile_fraction("q75").unwrap() - 0.75).abs() < f64::EPSILON); - } 
- - #[test] - fn parse_percent_aliases() { - assert!((parse_percent_fraction("p95").unwrap() - 0.95).abs() < f64::EPSILON); - } - #[test] fn split_args_handles_quotes() { let args = split_args("$col1, \"a,b\", \"c\""); @@ -692,4 +538,35 @@ mod tests { let log_total: f64 = row[7].parse().unwrap(); assert!((log_total - 5f64.log2()).abs() < 1e-6); } + + #[test] + fn advanced_row_aggregators_work_across_ranges() { + let headers = vec![ + "c1".to_string(), + "c2".to_string(), + "c3".to_string(), + "c4".to_string(), + ]; + let ops = parse_operations( + &vec![ + "sum_open=sum($c1:)".to_string(), + "count_all=count($c1:$c4)".to_string(), + "mode_val=mode($c1:$c4)".to_string(), + ], + &headers, + false, + ) + .unwrap(); + let mut row = vec![ + "1".to_string(), + "2".to_string(), + "2".to_string(), + "".to_string(), + ]; + process_row(&mut row, &ops).unwrap(); + assert_eq!(row.len(), 7); + assert_eq!(row[4], "5"); + assert_eq!(row[5], "4"); + assert_eq!(row[6], "2"); + } } diff --git a/src/pretty.rs b/src/pretty.rs index 7d0001e..6dbe6de 100644 --- a/src/pretty.rs +++ b/src/pretty.rs @@ -4,6 +4,7 @@ use std::path::PathBuf; use anyhow::{Context, Result, bail}; use clap::Args; +use crate::aggregate::parse_float; use crate::common::{InputOptions, reader_for_path, should_skip_record}; #[derive(Args, Debug)] @@ -36,6 +37,10 @@ pub struct PrettyArgs { /// Ignore rows whose column count differs from the header/first row #[arg(short = 'I', long = "ignore-illegal-row")] pub ignore_illegal_row: bool, + + /// Round floating-point values to N decimal places (enables scientific notation for tiny magnitudes) + #[arg(short = 'r', long = "round", value_name = "DIGITS")] + pub round: Option, } pub fn run(args: PrettyArgs) -> Result<()> { @@ -78,12 +83,94 @@ pub fn run(args: PrettyArgs) -> Result<()> { if reference_width.is_none() { reference_width = Some(record.len()); } - rows.push(record.iter().map(|s| s.to_string()).collect::>()); + let rounded = record + .iter() + .map(|s| 
maybe_round_value(s, args.round)) + .collect::>(); + rows.push(rounded); } render_table(header, rows) } +fn maybe_round_value(value: &str, digits: Option) -> String { + let Some(places) = digits else { + return value.to_string(); + }; + let Some(number) = parse_float(value) else { + return value.to_string(); + }; + if !number.is_finite() { + return value.to_string(); + } + let exponent = (places as i32).max(4); + let threshold = 10f64.powi(-exponent); + if number != 0.0 && number.abs() < threshold { + format_with_scientific(number, places) + } else { + format_with_precision(number, places) + } +} + +fn format_with_precision(number: f64, places: u32) -> String { + let precision = places as usize; + let formatted = format!("{number:.precision$}"); + trim_trailing_zeros(formatted) +} + +fn format_with_scientific(number: f64, places: u32) -> String { + let precision = places as usize; + let formatted = format!("{number:.precision$e}"); + trim_trailing_zeros(formatted) +} + +fn trim_trailing_zeros(value: String) -> String { + if let Some(idx) = value.find(['e', 'E']) { + let (mantissa, exponent) = value.split_at(idx); + let cleaned = trim_decimal_suffix(mantissa.to_string()); + return cleaned + exponent; + } + trim_decimal_suffix(value) +} + +fn trim_decimal_suffix(mut value: String) -> String { + if value.contains('.') { + while value.ends_with('0') { + value.pop(); + } + if value.ends_with('.') { + value.pop(); + } + if value.is_empty() { + return "0".to_string(); + } + } + value +} + +#[cfg(test)] +mod tests { + use super::maybe_round_value; + + #[test] + fn rounds_numeric_values_to_requested_precision() { + let result = maybe_round_value("3.14159", Some(2)); + assert_eq!(result, "3.14"); + } + + #[test] + fn formats_very_small_numbers_in_scientific_notation() { + let result = maybe_round_value("0.0000001234", Some(3)); + assert_eq!(result, "1.234e-7"); + } + + #[test] + fn leaves_text_values_unchanged() { + let value = maybe_round_value("NA", Some(2)); + 
assert_eq!(value, "NA"); + } +} + pub fn render_table(header: Option>, rows: Vec>) -> Result<()> { let mut writer = BufWriter::new(io::stdout().lock()); diff --git a/src/slice.rs b/src/slice.rs index 8632bee..a019747 100644 --- a/src/slice.rs +++ b/src/slice.rs @@ -47,8 +47,8 @@ pub struct SliceArgs { } pub fn run(args: SliceArgs) -> Result<()> { - let ranges = parse_row_spec(&args.rows)?; - if ranges.is_empty() { + let selection = parse_row_spec(&args.rows)?; + if selection.is_empty() { bail!("row specification must select at least one row"); } @@ -60,41 +60,117 @@ pub fn run(args: SliceArgs) -> Result<()> { let mut reader = reader_for_path(&args.file, args.no_header, &input_opts)?; let mut writer = BufWriter::new(io::stdout().lock()); - let mut data_row_idx = 0usize; - - if args.no_header { - for record in reader.records() { - let record = record.with_context(|| format!("failed reading from {:?}", args.file))?; - if should_skip_record(&record, &input_opts, None) { - continue; + if !selection.requires_from_end() { + let ranges = selection.resolve_absolute()?; + if ranges.is_empty() { + bail!("row specification must select at least one row"); + } + if args.no_header { + let mut expected_width: Option = None; + let mut data_row_idx = 0usize; + for record in reader.records() { + let record = + record.with_context(|| format!("failed reading from {:?}", args.file))?; + if should_skip_record(&record, &input_opts, expected_width) { + continue; + } + if expected_width.is_none() { + expected_width = Some(record.len()); + } + data_row_idx += 1; + if row_selected(data_row_idx, &ranges) { + writer.write_all(record.iter().collect::>().join("\t").as_bytes())?; + writer.write_all(b"\n")?; + } } - data_row_idx += 1; - if row_selected(data_row_idx, &ranges) { - writer.write_all(record.iter().collect::>().join("\t").as_bytes())?; + } else { + let headers = reader + .headers() + .with_context(|| format!("failed reading header from {:?}", args.file))? 
+ .iter() + .map(|s| s.to_string()) + .collect::>(); + if !headers.is_empty() { + writer.write_all(headers.join("\t").as_bytes())?; writer.write_all(b"\n")?; } + let mut data_row_idx = 0usize; + for record in reader.records() { + let record = + record.with_context(|| format!("failed reading from {:?}", args.file))?; + if should_skip_record(&record, &input_opts, Some(headers.len())) { + continue; + } + data_row_idx += 1; + if row_selected(data_row_idx, &ranges) { + writer.write_all(record.iter().collect::>().join("\t").as_bytes())?; + writer.write_all(b"\n")?; + } + } } } else { - let headers = reader - .headers() - .with_context(|| format!("failed reading header from {:?}", args.file))? - .iter() - .map(|s| s.to_string()) - .collect::>(); - if !headers.is_empty() { - writer.write_all(headers.join("\t").as_bytes())?; - writer.write_all(b"\n")?; - } - for record in reader.records() { - let record = record.with_context(|| format!("failed reading from {:?}", args.file))?; - if should_skip_record(&record, &input_opts, Some(headers.len())) { - continue; + if args.no_header { + let mut expected_width: Option = None; + let mut data_rows = Vec::new(); + for record in reader.records() { + let record = + record.with_context(|| format!("failed reading from {:?}", args.file))?; + if should_skip_record(&record, &input_opts, expected_width) { + continue; + } + if expected_width.is_none() { + expected_width = Some(record.len()); + } + data_rows.push(record); + } + if data_rows.is_empty() { + writer.flush()?; + return Ok(()); } - data_row_idx += 1; - if row_selected(data_row_idx, &ranges) { - writer.write_all(record.iter().collect::>().join("\t").as_bytes())?; + let ranges = selection.resolve_with_total(data_rows.len())?; + if ranges.is_empty() { + bail!("row specification must select at least one row"); + } + for (idx, record) in data_rows.iter().enumerate() { + if row_selected(idx + 1, &ranges) { + writer.write_all(record.iter().collect::>().join("\t").as_bytes())?; + 
writer.write_all(b"\n")?; + } + } + } else { + let headers = reader + .headers() + .with_context(|| format!("failed reading header from {:?}", args.file))? + .iter() + .map(|s| s.to_string()) + .collect::>(); + if !headers.is_empty() { + writer.write_all(headers.join("\t").as_bytes())?; writer.write_all(b"\n")?; } + let mut data_rows = Vec::new(); + for record in reader.records() { + let record = + record.with_context(|| format!("failed reading from {:?}", args.file))?; + if should_skip_record(&record, &input_opts, Some(headers.len())) { + continue; + } + data_rows.push(record); + } + if data_rows.is_empty() { + writer.flush()?; + return Ok(()); + } + let ranges = selection.resolve_with_total(data_rows.len())?; + if ranges.is_empty() { + bail!("row specification must select at least one row"); + } + for (idx, record) in data_rows.iter().enumerate() { + if row_selected(idx + 1, &ranges) { + writer.write_all(record.iter().collect::>().join("\t").as_bytes())?; + writer.write_all(b"\n")?; + } + } } } @@ -102,33 +178,66 @@ pub fn run(args: SliceArgs) -> Result<()> { Ok(()) } -fn parse_row_spec(spec: &str) -> Result> { - let mut ranges = Vec::new(); +fn parse_row_spec(spec: &str) -> Result { + let mut specs = Vec::new(); for token in spec.split(',') { let trimmed = token.trim(); if trimmed.is_empty() { continue; } if let Some((start, end)) = trimmed.split_once(':') { - let start_idx = parse_optional_index(start)?; - let end_idx = parse_optional_index(end)?; - if let (Some(s), Some(e)) = (start_idx, end_idx) { + let start_ep = parse_optional_row_endpoint(start)?; + let end_ep = parse_optional_row_endpoint(end)?; + if let (Some(RowEndpoint::Absolute(s)), Some(RowEndpoint::Absolute(e))) = + (start_ep, end_ep) + { if s > e { bail!("row range start {} is greater than end {}", s, e); } } - ranges.push(RowRange { - start: start_idx, - end: end_idx, + specs.push(RowRangeSpec { + start: start_ep, + end: end_ep, }); } else { - let idx = parse_positive_index(trimmed)?; - 
ranges.push(RowRange { - start: Some(idx), - end: Some(idx), + let endpoint = parse_row_endpoint(trimmed)?; + specs.push(RowRangeSpec { + start: Some(endpoint), + end: Some(endpoint), }); } } + Ok(RowSelection { specs }) +} + +fn convert_row_specs(selection: &RowSelection, total_rows: Option) -> Result> { + let mut ranges = Vec::with_capacity(selection.specs.len()); + for spec in &selection.specs { + let start = match spec.start { + Some(RowEndpoint::Absolute(idx)) => Some(idx), + Some(RowEndpoint::FromEnd(offset)) => { + let total = total_rows + .with_context(|| "row selector requires total row count to resolve")?; + Some(resolve_from_end(offset, total)?) + } + None => None, + }; + let end = match spec.end { + Some(RowEndpoint::Absolute(idx)) => Some(idx), + Some(RowEndpoint::FromEnd(offset)) => { + let total = total_rows + .with_context(|| "row selector requires total row count to resolve")?; + Some(resolve_from_end(offset, total)?) + } + None => None, + }; + if let (Some(s), Some(e)) = (start, end) { + if s > e { + bail!("row range start {} is greater than end {}", s, e); + } + } + ranges.push(RowRange { start, end }); + } ranges.sort_by_key(|r| r.start.unwrap_or(1)); merge_ranges(ranges) } @@ -170,22 +279,49 @@ fn row_selected(row_idx: usize, ranges: &[RowRange]) -> bool { false } -fn parse_positive_index(text: &str) -> Result { - let value: usize = text - .parse() - .with_context(|| format!("invalid row index '{}'", text))?; - if value == 0 { - bail!("row indices are 1-based"); +fn parse_row_endpoint(text: &str) -> Result { + if let Some(stripped) = text.strip_prefix('-') { + if stripped.is_empty() { + bail!("row selector '-' must include an index"); + } + let value: usize = stripped + .parse() + .with_context(|| format!("invalid row index '{}'", text))?; + if value == 0 { + bail!("row selector '-0' is not allowed"); + } + Ok(RowEndpoint::FromEnd(value)) + } else { + let value: usize = text + .parse() + .with_context(|| format!("invalid row index '{}'", 
text))?; + if value == 0 { + bail!("row indices are 1-based"); + } + Ok(RowEndpoint::Absolute(value)) } - Ok(value) } -fn parse_optional_index(text: &str) -> Result> { +fn parse_optional_row_endpoint(text: &str) -> Result> { let trimmed = text.trim(); if trimmed.is_empty() { return Ok(None); } - parse_positive_index(trimmed).map(Some) + parse_row_endpoint(trimmed).map(Some) +} + +fn resolve_from_end(offset: usize, total_rows: usize) -> Result { + if offset == 0 { + bail!("row selector '-0' is not allowed"); + } + if offset > total_rows { + bail!( + "row selector '-{}' exceeds total row count {}", + offset, + total_rows + ); + } + Ok(total_rows - offset + 1) } fn ranges_touch_or_overlap(a: &RowRange, b: &RowRange) -> bool { @@ -216,6 +352,44 @@ fn merge_pair(mut a: RowRange, b: RowRange) -> RowRange { a } +#[derive(Clone, Debug)] +struct RowSelection { + specs: Vec, +} + +impl RowSelection { + fn is_empty(&self) -> bool { + self.specs.is_empty() + } + + fn requires_from_end(&self) -> bool { + self.specs.iter().any(|spec| { + matches!(spec.start, Some(RowEndpoint::FromEnd(_))) + || matches!(spec.end, Some(RowEndpoint::FromEnd(_))) + }) + } + + fn resolve_absolute(&self) -> Result> { + convert_row_specs(self, None) + } + + fn resolve_with_total(&self, total_rows: usize) -> Result> { + convert_row_specs(self, Some(total_rows)) + } +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +struct RowRangeSpec { + start: Option, + end: Option, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +enum RowEndpoint { + Absolute(usize), + FromEnd(usize), +} + #[derive(Copy, Clone, Debug, PartialEq, Eq)] struct RowRange { start: Option, @@ -228,7 +402,8 @@ mod tests { #[test] fn parse_single_rows() { - let ranges = parse_row_spec("1,5,10").unwrap(); + let selection = parse_row_spec("1,5,10").unwrap(); + let ranges = selection.resolve_absolute().unwrap(); assert_eq!( ranges, vec![ @@ -250,7 +425,8 @@ mod tests { #[test] fn parse_ranges() { - let ranges = 
parse_row_spec("1:3,10:12").unwrap(); + let selection = parse_row_spec("1:3,10:12").unwrap(); + let ranges = selection.resolve_absolute().unwrap(); assert_eq!( ranges, vec![ @@ -268,7 +444,8 @@ mod tests { #[test] fn merge_overlapping_ranges() { - let ranges = parse_row_spec("1:3,2:5,10").unwrap(); + let selection = parse_row_spec("1:3,2:5,10").unwrap(); + let ranges = selection.resolve_absolute().unwrap(); assert_eq!( ranges, vec![ @@ -286,7 +463,8 @@ mod tests { #[test] fn parse_open_start_range() { - let ranges = parse_row_spec(":5").unwrap(); + let selection = parse_row_spec(":5").unwrap(); + let ranges = selection.resolve_absolute().unwrap(); assert_eq!( ranges, vec![RowRange { @@ -298,7 +476,8 @@ mod tests { #[test] fn parse_open_end_range() { - let ranges = parse_row_spec("10:").unwrap(); + let selection = parse_row_spec("10:").unwrap(); + let ranges = selection.resolve_absolute().unwrap(); assert_eq!( ranges, vec![RowRange { @@ -310,7 +489,8 @@ mod tests { #[test] fn parse_full_range() { - let ranges = parse_row_spec(":").unwrap(); + let selection = parse_row_spec(":").unwrap(); + let ranges = selection.resolve_absolute().unwrap(); assert_eq!( ranges, vec![RowRange { @@ -324,4 +504,24 @@ mod tests { fn reject_zero_index() { assert!(parse_row_spec("0").is_err()); } + + #[test] + fn parse_from_end_range() { + let selection = parse_row_spec("-3:").unwrap(); + assert!(selection.requires_from_end()); + let ranges = selection.resolve_with_total(5).unwrap(); + assert_eq!( + ranges, + vec![RowRange { + start: Some(3), + end: None + }] + ); + } + + #[test] + fn from_end_range_errors_when_too_large() { + let selection = parse_row_spec(":-2").unwrap(); + assert!(selection.resolve_with_total(1).is_err()); + } }