diff --git a/docs/cli.md b/docs/cli.md index d10ee9e..c9c5f74 100644 --- a/docs/cli.md +++ b/docs/cli.md @@ -269,3 +269,65 @@ name,reads1,reads2,project,organism,cell_type,source,source__annotation $ flowbio samples batch-template --sample-type rna_seq --json [{"name": "name", "kind": "reserved", "required": true, "options": null, "description": "..."}, ...] ``` + +### `samples upload-batch` + +Upload many samples from a filled-in CSV sample sheet, applying one sample type +to every row. + +``` +flowbio samples upload-batch --sheet PATH --sample-type TYPE + [--skip-invalid] [--stop-on-error] +``` + +Run `flowbio samples upload-batch --help` for the full option list. The sheet is +the CSV produced by `samples batch-template` (see that command for the schema); +it must be a `.csv` — an `.xlsx` or `.tsv` is a usage error directing you to +export to CSV. Reads paths in the sheet are resolved relative to the **sheet +file's own directory** (absolute paths are used as-is), and empty cells are +omitted rather than sent as empty values. The sample type is sent as-is and +validated server-side. + +**Validation is up front.** Every row is validated *before any upload*, and all +problems on a row are reported together — a missing `name`/`reads1`, a reads file +that is not on disk, a name containing spaces, a value outside a closed-option +attribute's allowed values, metadata required for the chosen type that is +missing, or a `__annotation` companion set without its value or on an +attribute that does not permit annotations. + +- By default, **any** invalid row aborts the whole run: every row's errors are + reported (with its 1-based row number and name), nothing is uploaded, exit `2`. +- `--skip-invalid` skips the invalid rows (reporting why) and uploads the rest. + +Valid rows upload **sequentially in sheet order**. By default a row that fails to +upload is recorded and the run continues; `--stop-on-error` aborts on the first +failing row, reporting the rows already uploaded. + +**Output** — human: each row's outcome on stderr, then a final counts summary on +stdout. `--json`: a single document on stdout with `uploaded`, `failed`, and +`skipped` lists plus a `counts` summary: + +```json +{ + "uploaded": [{"row_number": 1, "name": "s1", "sample_id": "samp_1"}], + "failed": [{"row_number": 2, "name": "s2", "message": "..."}], + "skipped": [{"row_number": 3, "name": "s3", "reasons": ["..."]}], + "counts": {"uploaded": 1, "failed": 1, "skipped": 1} +} +``` + +**Exit codes** — `0` every row uploaded; `2` a pre-flight validation failure +(without `--skip-invalid`) or a non-CSV sheet; `1` any upload failure; `3` +authentication failure; otherwise the standard mapping above. + +**Example** + +```bash +$ flowbio samples upload-batch --sheet ./samples.csv --sample-type rna_seq +Row 1 (liver_r1): uploaded samp_1 +Row 2 (liver_r2): uploaded samp_2 +Uploaded 2, failed 0, skipped 0. + +$ flowbio samples upload-batch --sheet ./samples.csv --sample-type rna_seq --json +{"uploaded": [{"row_number": 1, "name": "liver_r1", "sample_id": "samp_1"}], "failed": [], "skipped": [], "counts": {"uploaded": 1, "failed": 0, "skipped": 0}} +``` diff --git a/flowbio/cli/_samples.py b/flowbio/cli/_samples.py index f9ebc41..abd1896 100644 --- a/flowbio/cli/_samples.py +++ b/flowbio/cli/_samples.py @@ -17,8 +17,15 @@ from flowbio.cli._exit_codes import CliUsageError, ExitCode from flowbio.cli._files import existing_file from flowbio.cli._output import Output, format_issue +from flowbio.cli._sheet import ( + ANNOTATION_SUFFIX, + SheetRow, + parse_sheet, + validate_row, +) from flowbio.cli._types import JsonValue from flowbio.v2.client import Client +from flowbio.v2.exceptions import FlowApiError from flowbio.v2.samples import MetadataAttribute, SampleTypeId @@ -60,6 +67,15 @@ def register( "--json) for use with 'samples upload-batch'." ), )) + _configure_upload_batch(verbs.add_parser( + "upload-batch", + parents=[global_parent], + help="Upload many samples from a CSV sample sheet.", + description=( + "Validate every row of a CSV sample sheet up front, then upload the " + "valid rows sequentially, reporting each row's outcome." + ), + )) def _configure_upload(upload: argparse.ArgumentParser) -> None: @@ -187,6 +203,36 @@ def _configure_batch_template(batch_template: argparse.ArgumentParser) -> None: ) +def _configure_upload_batch(upload_batch: argparse.ArgumentParser) -> None: + upload_batch.set_defaults( + command_parser=upload_batch, handler=_upload_batch_command, + ) + upload_batch.add_argument( + "--sheet", + required=True, + metavar="PATH", + type=Path, + help="CSV sample sheet (the filled-in `samples batch-template` output).", + ) + upload_batch.add_argument( + "--sample-type", + required=True, + metavar="TYPE", + type=SampleTypeId, + help="Sample type applied to every row (sent as-is; validated server-side).", + ) + upload_batch.add_argument( + "--skip-invalid", + action="store_true", + help="Skip invalid rows (reporting why) instead of aborting the batch.", + ) + upload_batch.add_argument( + "--stop-on-error", + action="store_true", + help="Abort on the first row that fails to upload.", + ) + + def _upload_command(args: argparse.Namespace, client: Client, output: Output) -> ExitCode: """Upload a single sample and report its identifier. @@ -362,7 +408,7 @@ def _template_columns( if attribute.allow_annotation: columns.append( _TemplateColumn( - name=f"{attribute.identifier}__annotation", + name=f"{attribute.identifier}{ANNOTATION_SUFFIX}", kind="annotation", required=False, options=None, @@ -381,6 +427,137 @@ def _required_summary(columns: list[_TemplateColumn]) -> str: ) +@dataclass(frozen=True) +class _BatchResult: + """The outcome of an ``upload-batch`` run, rendered to text or JSON.""" + + uploaded: list[dict[str, JsonValue]] + failed: list[dict[str, JsonValue]] + skipped: list[dict[str, JsonValue]] + + @property + def counts(self) -> dict[str, int]: + return { + "uploaded": len(self.uploaded), + "failed": len(self.failed), + "skipped": len(self.skipped), + } + + @property + def document(self) -> dict[str, JsonValue]: + return { + "uploaded": self.uploaded, + "failed": self.failed, + "skipped": self.skipped, + "counts": self.counts, + } + + @property + def summary(self) -> str: + counts = self.counts + return ( + f"Uploaded {counts['uploaded']}, failed {counts['failed']}, " + f"skipped {counts['skipped']}." + ) + + @property + def exit_code(self) -> ExitCode: + return ExitCode.RUNTIME if self.failed else ExitCode.SUCCESS + + +def _upload_batch_command( + args: argparse.Namespace, client: Client, output: Output, +) -> ExitCode: + """Validate a sample sheet up front, then upload the valid rows. + + :param args: Parsed command-line arguments. + :param client: The authenticated Flow client. + :param output: The result/error renderer. + :returns: :attr:`ExitCode.SUCCESS` when every row uploaded, + :attr:`ExitCode.USAGE` on a pre-flight validation failure without + ``--skip-invalid``, or :attr:`ExitCode.RUNTIME` if any upload failed. + """ + sheet = parse_sheet(args.sheet) + attributes = client.samples.get_metadata_attributes() + classified = [ + (row, validate_row(row, attributes, args.sample_type)) + for row in sheet.rows + ] + invalid = [(row, reasons) for row, reasons in classified if reasons] + valid = [row for row, reasons in classified if not reasons] + + if invalid and not args.skip_invalid: + output.emit_error( + "Sample sheet has invalid rows; nothing was uploaded.", + details=[_invalid_line(row, reasons) for row, reasons in invalid], + ) + return ExitCode.USAGE + + for row, reasons in invalid: + output.emit_advisory(f"Skipped {_invalid_line(row, reasons)}") + skipped = [ + {"row_number": row.row_number, "name": row.name, "reasons": reasons} + for row, reasons in invalid + ] + result = _upload_rows(valid, args, client, output, skipped) + output.emit_result(result.summary, result.document) + return result.exit_code + + +def _upload_rows( + rows: list[SheetRow], + args: argparse.Namespace, + client: Client, + output: Output, + skipped: list[dict[str, JsonValue]], +) -> _BatchResult: + uploaded: list[dict[str, JsonValue]] = [] + failed: list[dict[str, JsonValue]] = [] + for row in rows: + try: + sample = client.samples.upload_sample( + name=row.name, + sample_type=args.sample_type, + data=_row_reads(row), + metadata=row.metadata or None, + project_id=row.project, + organism_id=row.organism, + ) + except FlowApiError as error: + failed.append({ + "row_number": row.row_number, + "name": row.name, + "message": error.message, + }) + output.emit_advisory( + f"Row {row.row_number} ({row.name}): upload failed — {error.message}", + ) + if args.stop_on_error: + break + continue + uploaded.append({ + "row_number": row.row_number, + "name": row.name, + "sample_id": sample.id, + }) + output.emit_advisory( + f"Row {row.row_number} ({row.name}): uploaded {sample.id}", + ) + return _BatchResult(uploaded=uploaded, failed=failed, skipped=skipped) + + +def _row_reads(row: SheetRow) -> dict[str, Path]: + return { + label: path + for label, path in (("reads1", row.reads1), ("reads2", row.reads2)) + if path is not None + } + + +def _invalid_line(row: SheetRow, reasons: list[str]) -> str: + return f"Row {row.row_number} ({row.name}): {'; '.join(reasons)}" + + def _merge_metadata( pairs: list[str] | None, json_text: str | None, ) -> dict[str, str]: diff --git a/flowbio/cli/_sheet.py b/flowbio/cli/_sheet.py new file mode 100644 index 0000000..e956ccb --- /dev/null +++ b/flowbio/cli/_sheet.py @@ -0,0 +1,183 @@ +"""CSV sample-sheet parsing and pre-flight validation for ``upload-batch``. + +A sample sheet is a CSV the user fills in from ``samples batch-template``. Parsing +splits the reserved columns (``name``, ``reads1``, ``reads2``, ``project``, +``organism``) from the metadata-identifier columns, resolves reads paths relative +to the sheet's own directory, and drops empty cells. Validation then collects +*every* problem on each row up front (FR-028) so a batch never half-uploads +before surfacing a fixable mistake. +""" +from __future__ import annotations + +import csv +from dataclasses import dataclass +from pathlib import Path + +from flowbio.cli._exit_codes import CliUsageError +from flowbio.cli._files import existing_file +from flowbio.v2.samples import MetadataAttribute, SampleTypeId + +RESERVED_COLUMNS = ("name", "reads1", "reads2", "project", "organism") +ANNOTATION_SUFFIX = "__annotation" +"""Suffix that marks a metadata column as the free-text annotation companion of +````. Shared with ``batch-template`` so the sheet round-trips.""" + + +@dataclass(frozen=True) +class SheetRow: + """One data row of a sample sheet, with reads paths already resolved. + + ``name``/``reads1`` may be absent (empty cell) — that is reported by + :func:`validate_row` rather than rejected here, so all errors surface + together. + """ + + row_number: int + name: str + reads1: Path | None + reads2: Path | None + project: str | None + organism: str | None + metadata: dict[str, str] + + +@dataclass(frozen=True) +class SampleSheet: + """A parsed sample sheet.""" + + path: Path + metadata_columns: list[str] + rows: list[SheetRow] + + +def parse_sheet(path: Path) -> SampleSheet: + """Parse a CSV sample sheet into a :class:`SampleSheet`. + + :param path: The sample-sheet file. Must be a ``.csv`` — an ``.xlsx`` or + ``.tsv`` is a usage error directing the user to export to CSV. + :returns: The parsed sheet with reserved/metadata columns separated, reads + paths resolved relative to the sheet's directory, and empty cells dropped. + :raises CliUsageError: If the file is not a readable ``.csv``. + """ + if path.suffix.lower() != ".csv": + raise CliUsageError( + f"Sample sheet must be a .csv file: {path}. " + f"Export your spreadsheet to CSV first.", + ) + existing_file(path) + # utf-8-sig transparently strips a leading BOM, which spreadsheet tools + # (notably Excel's "CSV UTF-8" export) prepend — otherwise the first header + # parses as "name" and every row reports a missing name. + with path.open(newline="", encoding="utf-8-sig") as handle: + reader = csv.DictReader(handle) + headers = reader.fieldnames or [] + metadata_columns = [ + header for header in headers if header not in RESERVED_COLUMNS + ] + rows = [ + _build_row(record, row_number, path.parent, metadata_columns) + for row_number, record in enumerate(reader, start=1) + ] + return SampleSheet(path=path, metadata_columns=metadata_columns, rows=rows) + + +def validate_row( + row: SheetRow, + attributes: list[MetadataAttribute], + sample_type: SampleTypeId, +) -> list[str]: + """Return every validation problem on ``row`` (empty when the row is valid). + + :param row: The parsed row to validate. + :param attributes: The server's metadata attributes, deciding required and + closed-option columns. + :param sample_type: The sample type applied to the whole batch; an attribute + required for it must be present. + :returns: One human-readable message per problem, collected so the caller can + report them all at once. + """ + by_identifier = {attribute.identifier: attribute for attribute in attributes} + errors: list[str] = [] + if not row.name: + errors.append("missing required value: name") + elif " " in row.name: + errors.append(f"name must not contain spaces: '{row.name}'") + if row.reads1 is None: + errors.append("missing required value: reads1") + for label, reads in (("reads1", row.reads1), ("reads2", row.reads2)): + if reads is not None and not reads.is_file(): + errors.append(f"{label} file not found: {reads}") + errors.extend(_metadata_errors(row, by_identifier, attributes, sample_type)) + return errors + + +def _build_row( + record: dict[str, str], + row_number: int, + base_dir: Path, + metadata_columns: list[str], +) -> SheetRow: + def cell(column: str) -> str | None: + value = (record.get(column) or "").strip() + return value or None + + metadata = { + column: value + for column in metadata_columns + if (value := (record.get(column) or "").strip()) + } + return SheetRow( + row_number=row_number, + name=cell("name") or "", + reads1=_resolve(cell("reads1"), base_dir), + reads2=_resolve(cell("reads2"), base_dir), + project=cell("project"), + organism=cell("organism"), + metadata=metadata, + ) + + +def _resolve(value: str | None, base_dir: Path) -> Path | None: + if value is None: + return None + path = Path(value) + return path if path.is_absolute() else base_dir / path + + +def _metadata_errors( + row: SheetRow, + by_identifier: dict[str, MetadataAttribute], + attributes: list[MetadataAttribute], + sample_type: SampleTypeId, +) -> list[str]: + errors: list[str] = [] + for attribute in attributes: + required = ( + attribute.required or sample_type in attribute.required_for_sample_types + ) + if required and not row.metadata.get(attribute.identifier): + errors.append(f"missing required metadata: {attribute.identifier}") + for key, value in row.metadata.items(): + if key.endswith(ANNOTATION_SUFFIX): + continue + attribute = by_identifier.get(key) + if ( + attribute is not None + and attribute.options is not None + and value not in attribute.options + ): + errors.append( + f"value '{value}' for {key} is not one of: " + f"{', '.join(attribute.options)}", + ) + for key in row.metadata: + if not key.endswith(ANNOTATION_SUFFIX): + continue + base = key[: -len(ANNOTATION_SUFFIX)] + if not row.metadata.get(base): + errors.append(f"{key} set without a value for {base}") + continue + attribute = by_identifier.get(base) + if attribute is not None and not attribute.allow_annotation: + errors.append(f"{base} does not allow an annotation") + return errors diff --git a/specs/001-flowbio-cli/tasks.md b/specs/001-flowbio-cli/tasks.md index b063e7f..2e5c90d 100644 --- a/specs/001-flowbio-cli/tasks.md +++ b/specs/001-flowbio-cli/tasks.md @@ -160,14 +160,14 @@ Single-project layout (per plan.md): CLI under `flowbio/cli/` (every module `_`- ### Tests for User Story 4 (MANDATORY — write first) ⚠️ -- [ ] T030 [P] [US4] Write failing tests in `tests/unit/cli/test_sheet.py` for `_sheet` parsing and pre-flight validation: reserved vs metadata columns; reads paths resolved relative to the sheet directory (absolute used as-is); empty cells omitted; all per-row errors collected (missing required value, missing reads file on disk, space in name, value outside an attribute's options, required-for-type metadata missing, annotation set without its value / on a non-annotation attribute); non-CSV (`.xlsx`/`.tsv`) rejected with an "export to CSV" message (FR-020…FR-023, FR-028) -- [ ] T032 [P] [US4] Write failing tests in `tests/unit/cli/test_samples.py` covering US4 scenarios 1–7: uniform sample type applied to all rows (exit 0); relative-path resolution; any invalid row (default) reports all errors with 1-based row number + name and uploads nothing → exit 2; `--skip-invalid` skips & reports invalid rows and uploads the rest; sequential upload in row order, default continues past upload failures → exit 1, `--stop-on-error` aborts on the first failing row reporting already-uploaded rows; `--json` document with `uploaded`/`failed`/`skipped` + `counts`; non-CSV sheet → exit 2 (contracts/samples-upload-batch.md) +- [X] T030 [P] [US4] Write failing tests in `tests/unit/cli/test_sheet.py` for `_sheet` parsing and pre-flight validation: reserved vs metadata columns; reads paths resolved relative to the sheet directory (absolute used as-is); empty cells omitted; all per-row errors collected (missing required value, missing reads file on disk, space in name, value outside an attribute's options, required-for-type metadata missing, annotation set without its value / on a non-annotation attribute); non-CSV (`.xlsx`/`.tsv`) rejected with an "export to CSV" message (FR-020…FR-023, FR-028) +- [X] T032 [P] [US4] Write failing tests in `tests/unit/cli/test_samples.py` covering US4 scenarios 1–7: uniform sample type applied to all rows (exit 0); relative-path resolution; any invalid row (default) reports all errors with 1-based row number + name and uploads nothing → exit 2; `--skip-invalid` skips & reports invalid rows and uploads the rest; sequential upload in row order, default continues past upload failures → exit 1, `--stop-on-error` aborts on the first failing row reporting already-uploaded rows; `--json` document with `uploaded`/`failed`/`skipped` + `counts`; non-CSV sheet → exit 2 (contracts/samples-upload-batch.md) ### Implementation for User Story 4 -- [ ] T031 [US4] Implement `flowbio/cli/_sheet.py`: `SampleSheet`/`SheetRow` CSV parsing, relative-path resolution, empty-cell omission, the FR-028 per-row pre-flight validation collecting all errors (using `MetadataAttribute.allow_annotation` from T026), and the non-CSV → USAGE rejection (depends on T030, T026) -- [ ] T033 [US4] Implement the `upload-batch` handler and `BatchResult` in `flowbio/cli/_samples.py` (parse + validate all rows before any upload; `--skip-invalid`; sequential upload reusing the T018 single-sample path; default-continue vs `--stop-on-error`; exit code: all uploaded→0, pre-flight invalid without `--skip-invalid`→2, any upload failure→1) and register the subcommand via the `register()` in `flowbio/cli/_samples.py` (wired into `flowbio/cli/_parser.py`) with `help=`/description text on every argument (FR-003, SC-008) (FR-027…FR-032) -- [ ] T034 [US4] Document `upload-batch` (validation behaviour, `--skip-invalid`/`--stop-on-error`, `--json` shape, worked example) in `docs/cli.md` (FR-041, FR-042) +- [X] T031 [US4] Implement `flowbio/cli/_sheet.py`: `SampleSheet`/`SheetRow` CSV parsing, relative-path resolution, empty-cell omission, the FR-028 per-row pre-flight validation collecting all errors (using `MetadataAttribute.allow_annotation` from T026), and the non-CSV → USAGE rejection (depends on T030, T026) +- [X] T033 [US4] Implement the `upload-batch` handler and `BatchResult` in `flowbio/cli/_samples.py` (parse + validate all rows before any upload; `--skip-invalid`; sequential upload reusing the T018 single-sample path; default-continue vs `--stop-on-error`; exit code: all uploaded→0, pre-flight invalid without `--skip-invalid`→2, any upload failure→1) and register the subcommand via the `register()` in `flowbio/cli/_samples.py` (wired into `flowbio/cli/_parser.py`) with `help=`/description text on every argument (FR-003, SC-008) (FR-027…FR-032) +- [X] T034 [US4] Document `upload-batch` (validation behaviour, `--skip-invalid`/`--stop-on-error`, `--json` shape, worked example) in `docs/cli.md` (FR-041, FR-042) **Checkpoint**: Batch upload works independently; every row's outcome is reported. diff --git a/tests/unit/cli/test_samples.py b/tests/unit/cli/test_samples.py index f14c80b..5c461c1 100644 --- a/tests/unit/cli/test_samples.py +++ b/tests/unit/cli/test_samples.py @@ -1,3 +1,4 @@ +import csv import json from http import HTTPStatus from pathlib import Path @@ -660,3 +661,265 @@ def test_missing_sample_type_is_usage_error(self, run_cli) -> None: result = run_cli("samples", "batch-template", "--token", TOKEN) assert result.exit_code == 2 + + +BATCH_HEADERS = [ + "name", "reads1", "reads2", "project", "organism", + "cell_type", "source", "source__annotation", +] + + +def _write_batch_sheet(directory: Path, *records: dict[str, str]) -> Path: + path = directory / "sheet.csv" + with path.open("w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=BATCH_HEADERS) + writer.writeheader() + writer.writerows(records) + return path + + +class TestSamplesUploadBatch: + + @respx.mock + def test_uploads_all_rows_with_uniform_sample_type( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = respx.post(SAMPLE_UPLOAD_URL) + upload.side_effect = [ + httpx.Response(HTTPStatus.OK, json={"sample_id": "samp_1", "data_id": "d1"}), + httpx.Response(HTTPStatus.OK, json={"sample_id": "samp_2", "data_id": "d2"}), + ] + _reads(tmp_path, "r1.fq.gz") + _reads(tmp_path, "r2.fq.gz") + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "r2.fq.gz", "cell_type": "Fibroblast"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", "--no-progress", + ) + + assert result.exit_code == 0 + assert upload.call_count == 2 + for call in upload.calls: + fields, _ = parse_multipart(call.request) + assert fields["sample_type"] == "rna_seq" + + @respx.mock + def test_reads_resolved_relative_to_sheet_directory( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + _mock_single_upload() + sheet_dir = tmp_path / "run" + sheet_dir.mkdir() + _reads(sheet_dir, "r1.fq.gz") + sheet = _write_batch_sheet( + sheet_dir, {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", "--no-progress", + ) + + assert result.exit_code == 0 + + @respx.mock + def test_invalid_row_aborts_and_uploads_nothing( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = _mock_single_upload() + _reads(tmp_path, "r1.fq.gz") + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "missing.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", "--no-progress", + ) + + assert result.exit_code == 2 + assert upload.call_count == 0 + assert "Row 2" in result.stderr + assert "s2" in result.stderr + + @respx.mock + def test_invalid_row_abort_emits_json_error_on_stderr_only( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = _mock_single_upload() + _reads(tmp_path, "r1.fq.gz") + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "missing.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", + "--no-progress", "--json", + ) + + assert result.exit_code == 2 + assert upload.call_count == 0 + assert result.stdout == "" + assert any( + "s2" in detail for detail in json.loads(result.stderr)["errors"] + ) + + @respx.mock + def test_skip_invalid_uploads_valid_rows( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = _mock_single_upload("samp_ok") + _reads(tmp_path, "r1.fq.gz") + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "missing.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", + "--skip-invalid", "--no-progress", + ) + + assert result.exit_code == 0 + assert upload.call_count == 1 + assert "s2" in result.stderr + + @respx.mock + def test_default_continues_past_upload_failure( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = respx.post(SAMPLE_UPLOAD_URL) + upload.side_effect = [ + httpx.Response(HTTPStatus.BAD_REQUEST, json={"error": "bad"}), + httpx.Response(HTTPStatus.OK, json={"sample_id": "samp_2", "data_id": "d2"}), + ] + _reads(tmp_path, "r1.fq.gz") + _reads(tmp_path, "r2.fq.gz") + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "r2.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", "--no-progress", + ) + + assert result.exit_code == 1 + assert upload.call_count == 2 + + @respx.mock + def test_stop_on_error_aborts_after_first_failure( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = respx.post(SAMPLE_UPLOAD_URL) + upload.side_effect = [ + httpx.Response(HTTPStatus.OK, json={"sample_id": "samp_1", "data_id": "d1"}), + httpx.Response(HTTPStatus.BAD_REQUEST, json={"error": "bad"}), + httpx.Response(HTTPStatus.OK, json={"sample_id": "samp_3", "data_id": "d3"}), + ] + for name in ("r1.fq.gz", "r2.fq.gz", "r3.fq.gz"): + _reads(tmp_path, name) + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "r2.fq.gz", "cell_type": "Neuron"}, + {"name": "s3", "reads1": "r3.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", + "--stop-on-error", "--no-progress", "--json", + ) + + assert result.exit_code == 1 + assert upload.call_count == 2 + assert [row["name"] for row in json.loads(result.stdout)["uploaded"]] == ["s1"] + + @respx.mock + def test_json_document_reports_outcomes_and_counts( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = respx.post(SAMPLE_UPLOAD_URL) + upload.side_effect = [ + httpx.Response(HTTPStatus.OK, json={"sample_id": "samp_1", "data_id": "d1"}), + httpx.Response(HTTPStatus.BAD_REQUEST, json={"error": "bad"}), + ] + _reads(tmp_path, "r1.fq.gz") + _reads(tmp_path, "r2.fq.gz") + sheet = _write_batch_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "Neuron"}, + {"name": "s2", "reads1": "r2.fq.gz", "cell_type": "Neuron"}, + {"name": "s3", "reads1": "missing.fq.gz", "cell_type": "Neuron"}, + ) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "rna_seq", + "--skip-invalid", "--no-progress", "--json", + ) + + assert result.exit_code == 1 + document = json.loads(result.stdout) + assert result.stdout.count("\n") == 1 + assert document["counts"] == {"uploaded": 1, "failed": 1, "skipped": 1} + assert document["uploaded"][0]["sample_id"] == "samp_1" + assert document["failed"][0]["name"] == "s2" + assert document["skipped"][0]["name"] == "s3" + + @respx.mock + def test_sample_type_is_not_pre_validated_against_server( + self, run_cli, tmp_path: Path, + ) -> None: + _mock_metadata() + upload = _mock_single_upload("samp_unknown") + _reads(tmp_path, "r1.fq.gz") + sheet = _write_batch_sheet(tmp_path, {"name": "s1", "reads1": "r1.fq.gz"}) + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(sheet), "--sample-type", "absent_from_types", + "--no-progress", + ) + + assert result.exit_code == 0 + assert upload.call_count == 1 + + @respx.mock + def test_non_csv_sheet_is_usage_error(self, run_cli, tmp_path: Path) -> None: + _mock_metadata() + upload = _mock_single_upload() + xlsx = tmp_path / "sheet.xlsx" + xlsx.write_bytes(b"PK\x03\x04") + + result = run_cli( + "--token", TOKEN, "samples", "upload-batch", + "--sheet", str(xlsx), "--sample-type", "rna_seq", "--no-progress", + ) + + assert result.exit_code == 2 + assert upload.call_count == 0 + assert "CSV" in result.stderr diff --git a/tests/unit/cli/test_sheet.py b/tests/unit/cli/test_sheet.py new file mode 100644 index 0000000..bb3d6c4 --- /dev/null +++ b/tests/unit/cli/test_sheet.py @@ -0,0 +1,229 @@ +import csv +from pathlib import Path + +import pytest + +from flowbio.cli._exit_codes import CliUsageError +from flowbio.cli._sheet import parse_sheet, validate_row +from flowbio.v2.samples import MetadataAttribute, SampleTypeId + +SAMPLE_TYPE = SampleTypeId("rna_seq") +HEADERS = [ + "name", "reads1", "reads2", "project", "organism", + "cell_type", "source", "source__annotation", +] + + +def _attributes() -> list[MetadataAttribute]: + return [ + MetadataAttribute( + identifier="cell_type", + name="Cell Type", + description="The cell type", + required=False, + required_for_sample_types=[SAMPLE_TYPE], + options=["Neuron", "Fibroblast"], + allow_annotation=False, + ), + MetadataAttribute( + identifier="source", + name="Source", + description="Sample source", + required=False, + required_for_sample_types=[], + options=None, + allow_annotation=True, + ), + ] + + +def _reads(directory: Path, name: str) -> Path: + path = directory / name + path.write_bytes(b"ATCG") + return path + + +def _write_sheet( + directory: Path, *records: dict[str, str], headers: list[str] = HEADERS, +) -> Path: + path = directory / "sheet.csv" + with path.open("w", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=headers) + writer.writeheader() + writer.writerows(records) + return path + + +class TestParseSheet: + + def test_separates_reserved_and_metadata_columns(self, tmp_path: Path) -> None: + sheet = parse_sheet( + _write_sheet(tmp_path, {"name": "s1", "reads1": "r1.fq.gz"}), + ) + + assert sheet.metadata_columns == ["cell_type", "source", "source__annotation"] + + def test_reads_paths_resolved_relative_to_sheet_directory( + self, tmp_path: Path, + ) -> None: + sheet = parse_sheet( + _write_sheet(tmp_path, {"name": "s1", "reads1": "r1.fq.gz"}), + ) + + assert sheet.rows[0].reads1 == tmp_path / "r1.fq.gz" + + def test_absolute_reads_path_used_as_is(self, tmp_path: Path) -> None: + absolute = tmp_path / "elsewhere" / "r1.fq.gz" + + sheet = parse_sheet( + _write_sheet(tmp_path, {"name": "s1", "reads1": str(absolute)}), + ) + + assert sheet.rows[0].reads1 == absolute + + def test_empty_cells_omitted_from_metadata(self, tmp_path: Path) -> None: + sheet = parse_sheet(_write_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz", "cell_type": "", "source": "blood"}, + )) + + assert sheet.rows[0].metadata == {"source": "blood"} + + def test_utf8_bom_is_stripped_from_first_header(self, tmp_path: Path) -> None: + path = tmp_path / "sheet.csv" + with path.open("w", newline="", encoding="utf-8-sig") as handle: + writer = csv.DictWriter(handle, fieldnames=HEADERS) + writer.writeheader() + writer.writerow({"name": "s1", "reads1": "r1.fq.gz"}) + + sheet = parse_sheet(path) + + assert sheet.metadata_columns == ["cell_type", "source", "source__annotation"] + assert sheet.rows[0].name == "s1" + + def test_row_numbers_are_one_based(self, tmp_path: Path) -> None: + sheet = parse_sheet(_write_sheet( + tmp_path, + {"name": "s1", "reads1": "r1.fq.gz"}, + {"name": "s2", "reads1": "r2.fq.gz"}, + )) + + assert [row.row_number for row in sheet.rows] == [1, 2] + + def test_non_csv_xlsx_rejected_with_export_message(self, tmp_path: Path) -> None: + xlsx = tmp_path / "sheet.xlsx" + xlsx.write_bytes(b"PK") + + with pytest.raises(CliUsageError, match="CSV"): + parse_sheet(xlsx) + + def test_tsv_sheet_rejected(self, tmp_path: Path) -> None: + tsv = tmp_path / "sheet.tsv" + tsv.write_text("name\treads1\n") + + with pytest.raises(CliUsageError, match="CSV"): + parse_sheet(tsv) + + +class TestValidateRow: + + def _row(self, directory: Path, **overrides: str): + _reads(directory, "r1.fq.gz") + record = {"name": "s1", "reads1": "r1.fq.gz"} + record.update(overrides) + sheet = parse_sheet(_write_sheet(directory, record)) + return sheet.rows[0] + + def test_valid_row_has_no_errors(self, tmp_path: Path) -> None: + row = self._row(tmp_path, cell_type="Neuron") + + assert validate_row(row, _attributes(), SAMPLE_TYPE) == [] + + def test_missing_name_reports_error(self, tmp_path: Path) -> None: + row = self._row(tmp_path, name="", cell_type="Neuron") + + errors = validate_row(row, _attributes(), SAMPLE_TYPE) + + assert any("name" in error for error in errors) + + def test_missing_reads1_reports_error(self, tmp_path: Path) -> None: + row = self._row(tmp_path, reads1="", cell_type="Neuron") + + errors = validate_row(row, _attributes(), SAMPLE_TYPE) + + assert any("reads1" in error for error in errors) + + def test_missing_reads_file_on_disk_reports_error(self, tmp_path: Path) -> None: + sheet = parse_sheet(_write_sheet( + tmp_path, + {"name": "s1", "reads1": "absent.fq.gz", "cell_type": "Neuron"}, + )) + + errors = validate_row(sheet.rows[0], _attributes(), SAMPLE_TYPE) + + assert any("absent.fq.gz" in error for error in errors) + + def test_name_with_space_reports_error(self, tmp_path: Path) -> None: + row = self._row(tmp_path, name="bad name", cell_type="Neuron") + + errors = validate_row(row, _attributes(), SAMPLE_TYPE) + + assert any("space" in error.lower() for error in errors) + + def test_value_outside_options_reports_error(self, tmp_path: Path) -> None: + row = self._row(tmp_path, cell_type="Alien") + + errors = validate_row(row, _attributes(), SAMPLE_TYPE) + + assert any("Alien" in error for error in errors) + + def test_required_for_type_metadata_missing_reports_error( + self, tmp_path: Path, + ) -> None: + row = self._row(tmp_path) + + errors = validate_row(row, _attributes(), SAMPLE_TYPE) + + assert any("cell_type" in error for error in errors) + + def test_annotation_set_without_its_value_reports_error( + self, tmp_path: Path, + ) -> None: + row = self._row( + tmp_path, cell_type="Neuron", **{"source__annotation": "qPCR"}, + ) + + errors = validate_row(row, _attributes(), SAMPLE_TYPE) + + assert any("source__annotation" in error for error in errors) + + def test_annotation_on_non_annotation_attribute_reports_error( + self, tmp_path: Path, + ) -> None: + headers = ["name", "reads1", "cell_type", "cell_type__annotation"] + _reads(tmp_path, "r1.fq.gz") + sheet = parse_sheet(_write_sheet( + tmp_path, + { + "name": "s1", "reads1": "r1.fq.gz", + "cell_type": "Neuron", "cell_type__annotation": "note", + }, + headers=headers, + )) + + errors = validate_row(sheet.rows[0], _attributes(), SAMPLE_TYPE) + + assert any( + "annotation" in error.lower() and "cell_type" in error + for error in errors + ) + + def test_all_per_row_errors_collected(self, tmp_path: Path) -> None: + sheet = parse_sheet(_write_sheet( + tmp_path, + {"name": "bad name", "reads1": "absent.fq.gz", "cell_type": "Alien"}, + )) + + errors = validate_row(sheet.rows[0], _attributes(), SAMPLE_TYPE) + + assert len(errors) >= 3