harvardinformatics · adamfreedman · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -0,0 +1,127 @@
+# CI entry point. The default pytest job runs on every push and pull request;
+# the slower, network-dependent validation jobs only run on a manual dispatch
+# that switches on the matching boolean input below.
+name: tests
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+    inputs:
+      # Each input gates exactly one job through that job's `if:` condition.
+      run_conda_validation:
+        description: "Create workflow conda envs and run package import checks"
+        required: false
+        default: false
+        type: boolean
+      run_container_validation:
+        description: "Pull workflow containers"
+        required: false
+        default: false
+        type: boolean
+      run_doubletfinder_install:
+        description: "Run the networked DoubletFinder GitHub install rule"
+        required: false
+        default: false
+        type: boolean
+
+# Least-privilege token: the steps in this workflow only check out the
+# repository, so the GITHUB_TOKEN is limited to read access instead of
+# inheriting the repository default.
+permissions:
+  contents: read
+
+jobs:
+  # Default suite: has no `if:` guard, so it runs on every trigger above.
+  pytest:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Login shell (-l) so the shell profile that activates the micromamba
+        # environment is sourced; -e aborts the step on the first failing
+        # command.
+        shell: bash -el {0}
+    env:
+      # NOTE(review): presumably read by the tests or by Snakemake to pick the
+      # conda frontend — confirm which consumer honors this variable.
+      SNAKEMAKE_CONDA_FRONTEND: mamba
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Builds the test-runner environment from the top-level environment.yml.
+      - name: Create test environment
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          environment-file: environment.yml
+          cache-environment: true
+          cache-downloads: true
+
+      # The key changes whenever any workflow/envs/*.yml file changes;
+      # restore-keys falls back to the newest cache saved for this OS.
+      - name: Cache Snakemake rule environments
+        uses: actions/cache@v4
+        with:
+          path: .snakemake/conda
+          key: snakemake-conda-${{ runner.os }}-${{ hashFiles('workflow/envs/*.yml') }}
+          restore-keys: |
+            snakemake-conda-${{ runner.os }}-
+
+      - name: Run tests
+        run: pytest tests -q
+
+  # Manual-only job: runs when the workflow is dispatched with
+  # run_conda_validation switched on.
+  conda-validation:
+    if: github.event_name == 'workflow_dispatch' && inputs.run_conda_validation
+    runs-on: ubuntu-latest
+    strategy:
+      # Let every environment report its own result instead of cancelling the
+      # remaining matrix jobs after the first failure.
+      fail-fast: false
+      matrix:
+        # One job per environment file; the value is handed to pytest through
+        # --conda-env-name in the final step.
+        env_name:
+          - cellbender.yml
+          - doubletfinder.yml
+          - emptydrops.yml
+          - posthocfilter.yml
+          - scdblfinder.yml
+          - soupx.yml
+          - tenx2seuratrds.yml
+    defaults:
+      run:
+        # Login shell (-l) so the shell profile that activates the micromamba
+        # environment is sourced; -e aborts the step on the first failing
+        # command.
+        shell: bash -el {0}
+    env:
+      # NOTE(review): presumably read by the tests or by Snakemake to pick the
+      # conda frontend — confirm which consumer honors this variable.
+      SNAKEMAKE_CONDA_FRONTEND: mamba
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Builds the test-runner environment from the top-level environment.yml.
+      - name: Create test environment
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          environment-file: environment.yml
+          cache-environment: true
+          cache-downloads: true
+
+      - name: Run conda validation for ${{ matrix.env_name }}
+        run: pytest tests/test_conda_container_validation.py --run-conda-validation --conda-env-name "${{ matrix.env_name }}" -q
+
+  # Manual-only job: runs when the workflow is dispatched with
+  # run_container_validation switched on.
+  container-validation:
+    if: github.event_name == 'workflow_dispatch' && inputs.run_container_validation
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Login shell (-l) so the shell profile that activates the micromamba
+        # environment is sourced; -e aborts the step on the first failing
+        # command.
+        shell: bash -el {0}
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Builds the test-runner environment from the top-level environment.yml.
+      - name: Create test environment
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          environment-file: environment.yml
+          cache-environment: true
+          cache-downloads: true
+
+      - name: Run container validation
+        run: pytest tests/test_conda_container_validation.py --run-container-validation -q
+
+  # Manual-only job: runs when the workflow is dispatched with
+  # run_doubletfinder_install switched on.
+  doubletfinder-install:
+    if: github.event_name == 'workflow_dispatch' && inputs.run_doubletfinder_install
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        # Login shell (-l) so the shell profile that activates the micromamba
+        # environment is sourced; -e aborts the step on the first failing
+        # command.
+        shell: bash -el {0}
+    env:
+      # NOTE(review): presumably read by the tests or by Snakemake to pick the
+      # conda frontend — confirm which consumer honors this variable.
+      SNAKEMAKE_CONDA_FRONTEND: mamba
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+
+      # Builds the test-runner environment from the top-level environment.yml.
+      - name: Create test environment
+        uses: mamba-org/setup-micromamba@v2
+        with:
+          environment-file: environment.yml
+          cache-environment: true
+          cache-downloads: true
+
+      - name: Run DoubletFinder install validation
+        run: pytest tests/test_conda_container_validation.py --run-doubletfinder-install -q
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,21 @@
+# Python bytecode and tool caches.
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.cache/
+# Default file written by R's PDF graphics device.
+Rplots.pdf
+# Run logs and Snakemake working state.
+logs/
+.snakemake/
+# Editor, notebook and scratch state.
+.jupyter/
+.vscode/
+.ipython/
+scratch/
+# NOTE(review): these globs match ANY path ending in "err" or "out" (no dot
+# is required). Presumably they target scheduler stderr/stdout logs; confirm
+# that narrower patterns such as *.err / *.out are not what was intended.
+*err
+*out
+# Workflow outputs. The bare results/ pattern matches a directory of that
+# name at any depth, including the one under tests/reference_outputs/.
+results/
+testdata/results/
+# Keep golden test outputs trackable even though results/ is ignored.
+!tests/reference_outputs/
+!tests/reference_outputs/**
+!tests/reference_outputs/testdata/
+!tests/reference_outputs/testdata/results/
+!tests/reference_outputs/testdata/results/**
diff --git a/config/config.yaml b/config/config.yaml
@@ -1,6 +1,7 @@
 conda-channel-priority: "strict"
 
 sampleTable: "samplesheet.tsv"
+workflow_seed: 12345
 
 emptydrop_removal_methods: ["tenx","emptydrops"]
 ambient_decon_methods: ["soupx","cellbender_fromraw"]

diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,13 @@
+# Test-runner environment: provides pytest and Snakemake. This is the file the
+# CI workflow passes to setup-micromamba as its environment-file.
+name: scrnaseq-preprocessing-tests
+channels:
+  - conda-forge
+  - bioconda
+# NOTE(review): channel_priority is not one of the standard conda
+# environment-file sections (name, channels, dependencies, variables, prefix).
+# Confirm that the tool consuming this file actually honors it.
+channel_priority: strict
+dependencies:
+  - python=3.12
+  - pytest
+  - pyyaml
+  # Pinned to an exact release, unlike the other dependencies.
+  - snakemake=9.16.3
+  - conda
+  - mamba
+  # NOTE(review): presumably supplies h5diff for the H5 comparison described
+  # in tests/README.md — confirm.
+  - hdf5
diff --git a/testdata/filtered_feature_bc_matrix/barcodes.tsv.gz b/testdata/filtered_feature_bc_matrix/barcodes.tsv.gz
diff --git a/testdata/filtered_feature_bc_matrix/features.tsv.gz b/testdata/filtered_feature_bc_matrix/features.tsv.gz
diff --git a/testdata/filtered_feature_bc_matrix/matrix.mtx.gz b/testdata/filtered_feature_bc_matrix/matrix.mtx.gz
diff --git a/testdata/raw_feature_bc_matrix.h5 b/testdata/raw_feature_bc_matrix.h5
diff --git a/testdata/raw_feature_bc_matrix/barcodes.tsv.gz b/testdata/raw_feature_bc_matrix/barcodes.tsv.gz
diff --git a/testdata/raw_feature_bc_matrix/features.tsv.gz b/testdata/raw_feature_bc_matrix/features.tsv.gz
diff --git a/testdata/raw_feature_bc_matrix/matrix.mtx.gz b/testdata/raw_feature_bc_matrix/matrix.mtx.gz
diff --git a/testdata/samplesheet_test.tsv b/testdata/samplesheet_test.tsv
@@ -0,0 +1,2 @@
+sampleid	tenx_datadir
+test	testdata
diff --git a/tests/README.md b/tests/README.md
@@ -0,0 +1,64 @@
+# Tests
+
+These tests are intended to run from the repository root in any environment where `snakemake` and `pytest` are available on `PATH`. How that environment is created or activated is site-specific; for example, an HPC may require loading a module before `conda activate` is available.
+
+Example:
+
+```bash
+conda env create -f environment.yml
+conda activate scrnaseq-preprocessing-tests
+pytest tests
+```
+
+`python -m pytest tests` is also fine when `python` resolves to the same environment that provides `pytest` and `snakemake`. If it does not, the active shell is likely resolving commands from different environments. Confirm with:
+
+```bash
+which python
+which pytest
+which snakemake
+```
+
+The dry-run DAG test uses the small CellRanger-style input data in `testdata/` and the test-specific sample sheet `testdata/samplesheet_test.tsv`. The tests override `sampleTable` on the Snakemake command line, so they do not use the normal workflow `samplesheet.tsv`. They do not require `--conda-prefix`, a cluster profile, or any site-specific paths. Sample-sheet validation tests cover malformed headers, duplicate sample IDs, missing or incomplete 10x data paths, absolute `tenx_datadir` values, and multi-sample DAG expansion. Config validation tests cover required keys, allowed method values, threshold types/ranges, invalid-config failures, and dry-run DAG variants driven by method-list config changes. The default suite also runs `snakemake --lint` against the test-data configuration, which catches workflow-structure problems such as missing `log:` directives, helper functions embedded in rule files, long `run:` blocks that should live in scripts, path-composition warnings, missing rule-level conda/container declarations, and shell commands that directly interpolate global workflow variables instead of passing values through `params`.
+
+GitHub Actions runs the default test suite via `.github/workflows/tests.yml`, using the same top-level `environment.yml` test runner environment and caching Snakemake-created rule environments under `.snakemake/conda`.
+
+The default test suite also includes a focused local rule-execution smoke test. It runs the real `tenx2seuratrds`, `find_markers`, and `combine_markers` rule chain against `testdata/`, using Snakemake's `--use-conda` support and writing outputs under pytest's temporary directory. This catches broken R package imports, script argument drift, invalid Seurat object creation, and marker CSV schema changes without submitting to SLURM.
+
+The R output validator also checks that the Seurat object has at least 100 features and 100 cells; metadata rows match the cell count; barcode row names are present, unique, and nonempty; `orig.ident`, `nCount_RNA`, `nFeature_RNA`, `percent.mt`, and `seurat_clusters` metadata columns exist; RNA count and feature-count metadata values are finite and positive; mitochondrial percentages are finite and within `[0, 100]`; at least two clusters are present; PCA and UMAP reductions exist; the marker table is nonempty and has the expected columns; marker gene symbols are present and nonempty; marker numeric columns are finite; marker p-value and percent columns are within `[0, 1]`; marker clusters are present in the Seurat metadata; markers are reported for at least two clusters; and the marker `workflow` column matches the expected test workflow label. The test runner environment is defined in the repository-level `environment.yml`; the rule-specific R environment is still created by Snakemake from `workflow/envs/tenx2seuratrds.yml`. A separate lightweight checkpoint-expansion test uses a fake `Rscript` to materialize the `marker_manifest` checkpoint, verify that dynamic `find_markers` jobs are generated for each cluster id, and confirm that `combine_markers` receives the expected marker chunks.
+
+
+## Optional conda and container validation
+
+The default suite validates that workflow conda environment files are well formed, that rule-level conda references resolve to existing files, and that container declarations are recognizable. Expensive network-dependent validation is opt-in:
+
+```bash
+pytest tests --run-conda-validation
+pytest tests --run-container-validation
+pytest tests --run-doubletfinder-install
+```
+
+`--run-conda-validation` creates each `workflow/envs/*.yml` environment in a pytest temporary directory and checks key R/Python package imports. To validate only one environment, pass `--conda-env-name ENV_FILE`, for example `--conda-env-name soupx.yml`; GitHub Actions uses this selector to run conda validation as one matrix job per env. `--run-container-validation` pulls the CellBender container with Docker, Apptainer, or Singularity. `--run-doubletfinder-install` runs the networked Snakemake `install_doubletfinder` rule and confirms that `DoubletFinder` can be imported from the created rule environment.
+
+
+## Optional full workflow run
+
+The default tests build and inspect the DAG and run a focused local R-rule smoke test. To submit the full test-data workflow and verify all declared outputs against the reference snapshot, opt in explicitly:
+
+```bash
+pytest tests --run-workflow
+```
+
+The full-run test calls `tests/run_test_workflow.sh`, which uses `testdata/samplesheet_test.tsv` and overrides the workflow output directory with `resultsDir=testdata/results`. The manifest in `tests/test_sample_rule_output_files.txt` is therefore written with paths under `testdata/results/`.
+
+For testing, omit `--snakemake-conda-prefix` so Snakemake uses its default `.snakemake/conda` location under the repository root. The runner assumes that the current environment already provides `snakemake` on `PATH`.
+
+
+## Reference outputs
+
+The full workflow test compares regenerated files in `testdata/results/` against reference files under `tests/reference_outputs/`. The compared file list is in `tests/test_reference_output_files.txt`. Seurat `.rds` files are compared at the metadata-table level, marker CSVs are compared by columns and `(cluster, genesymbol)` rows with numeric tolerance, emptyDrops matrix files are compared after gzip decompression, CellBender H5 outputs are compared with `h5diff`, and remaining durable outputs are compared byte-for-byte.
+
+To refresh the reference snapshot after intentionally changing workflow behavior, first run the full test workflow so `testdata/results/` contains the desired outputs, then run:
+
+```bash
+python tests/update_reference_outputs.py
+```
diff --git a/tests/compare_seurat_metadata.R b/tests/compare_seurat_metadata.R
@@ -0,0 +1,97 @@
+# Compare the @meta.data tables of Seurat objects stored as RDS files.
+#
+# Usage:
+#   Rscript compare_seurat_metadata.R CURRENT.rds REFERENCE.rds [CURRENT2.rds REFERENCE2.rds ...]
+#
+# Arguments are consumed pairwise, so an empty or odd-length argument list is
+# rejected up front.
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) == 0 || length(args) %% 2 != 0) {
+  stop("Provide current/reference RDS path pairs", call. = FALSE)
+}
+
+# Load quietly so package startup banners do not clutter the script output.
+suppressPackageStartupMessages(library("Seurat"))
+
+# Tolerance for numeric metadata columns, overridable through the environment
+# (default 1e-8). It is applied relative to the larger magnitude of the two
+# values, with a floor of 1.
+# suppressWarnings(): an unparsable value is reported by the explicit stop()
+# below instead of by a separate "NAs introduced by coercion" warning.
+numeric_tolerance <- suppressWarnings(
+  as.numeric(Sys.getenv("SEURAT_METADATA_NUMERIC_TOLERANCE", "1e-8"))
+)
+# A negative tolerance would make every comparison fail and an infinite one
+# would make every comparison pass, so both are rejected as misconfiguration.
+if (length(numeric_tolerance) != 1 ||
+    !is.finite(numeric_tolerance) ||
+    numeric_tolerance < 0) {
+  stop(
+    "SEURAT_METADATA_NUMERIC_TOLERANCE must be a non-negative finite number",
+    call. = FALSE
+  )
+}
+
+# Abort with an error whose message is prefixed by the offending file path.
+# call. = FALSE keeps the calling expression out of the printed message.
+fail <- function(path, message) {
+  prefixed <- sprintf("%s: %s", path, message)
+  stop(prefixed, call. = FALSE)
+}
+
+# Compare one metadata column between the current and the reference object.
+#
+# Arguments:
+#   current_path      Path of the current RDS file; only used to prefix errors.
+#   column            Column name, used in error messages.
+#   current_values    Column values taken from the current object.
+#   reference_values  Column values taken from the reference object.
+#
+# Returns invisible(TRUE) when the columns agree; otherwise calls fail(),
+# which stops the script.
+#
+# Rules:
+#   * NA positions must be identical in both columns.
+#   * When both columns are numeric, infinite entries must be exactly equal
+#     and finite entries must agree within numeric_tolerance, applied relative
+#     to max(|current|, |reference|, 1).
+#   * Everything else (including a numeric column paired with a non-numeric
+#     one) is compared through its character representation.
+compare_metadata_column <- function(current_path, column, current_values, reference_values) {
+  current_na <- is.na(current_values)
+  reference_na <- is.na(reference_values)
+  if (!identical(current_na, reference_na)) {
+    fail(current_path, sprintf("metadata column %s has different NA positions", column))
+  }
+
+  # is.numeric() is TRUE for integer and double vectors alike, so one check
+  # covers both storage types.
+  if (is.numeric(current_values) && is.numeric(reference_values)) {
+    current_numeric <- as.numeric(current_values)
+    reference_numeric <- as.numeric(reference_values)
+    present <- !current_na & !reference_na
+
+    # The tolerance test cannot judge infinite values: Inf - Inf is NaN, and
+    # an infinite scale makes `diff > tolerance * scale` FALSE or NA. Require
+    # exact equality for them instead. NA/NaN are already excluded by
+    # `present`, so the comparison below never yields NA.
+    infinite <- present &
+      (is.infinite(current_numeric) | is.infinite(reference_numeric))
+    if (any(current_numeric[infinite] != reference_numeric[infinite])) {
+      fail(
+        current_path,
+        sprintf("metadata column %s has different infinite values", column)
+      )
+    }
+
+    comparable <- present & !infinite
+    if (any(comparable)) {
+      diff <- abs(current_numeric[comparable] - reference_numeric[comparable])
+      # Floor of 1 turns the relative tolerance into an absolute one for
+      # values with magnitude below 1.
+      scale <- pmax(abs(current_numeric[comparable]), abs(reference_numeric[comparable]), 1)
+      bad <- diff > numeric_tolerance * scale
+      if (any(bad)) {
+        fail(
+          current_path,
+          sprintf(
+            "metadata column %s differs numerically; max abs diff %.12g",
+            column,
+            max(diff)
+          )
+        )
+      }
+    }
+    return(invisible(TRUE))
+  }
+
+  current_character <- as.character(current_values)
+  reference_character <- as.character(reference_values)
+  comparable <- !current_na & !reference_na
+  if (!identical(current_character[comparable], reference_character[comparable])) {
+    fail(current_path, sprintf("metadata column %s differs", column))
+  }
+
+  invisible(TRUE)
+}
+
+# Load one current/reference pair of RDS files and check that their
+# @meta.data tables agree: same column names in the same order, same row
+# names, and then every column through compare_metadata_column().
+# Returns invisible(TRUE); any mismatch stops the script through fail().
+compare_metadata_pair <- function(current_path, reference_path) {
+  # Both files are read (current first) before either metadata slot is used.
+  loaded <- lapply(c(current_path, reference_path), readRDS)
+  cur_meta <- loaded[[1]]@meta.data
+  ref_meta <- loaded[[2]]@meta.data
+
+  cur_cols <- colnames(cur_meta)
+  ref_cols <- colnames(ref_meta)
+  if (!identical(cur_cols, ref_cols)) {
+    detail <- paste0(
+      "metadata columns differ\ncurrent: ",
+      paste(cur_cols, collapse = ","),
+      "\nreference: ",
+      paste(ref_cols, collapse = ",")
+    )
+    fail(current_path, detail)
+  }
+
+  same_rows <- identical(rownames(cur_meta), rownames(ref_meta))
+  if (!same_rows) {
+    fail(current_path, "metadata barcodes differ")
+  }
+
+  for (column in cur_cols) {
+    compare_metadata_column(current_path, column, cur_meta[[column]], ref_meta[[column]])
+  }
+
+  invisible(TRUE)
+}
+
+# Arguments alternate current, reference, current, reference, ... so pair k
+# sits at positions 2k - 1 and 2k. The first mismatch stops the script, so the
+# summary line is only printed when every pair matched.
+pair_count <- length(args) %/% 2L
+for (pair in seq_len(pair_count)) {
+  compare_metadata_pair(args[[2L * pair - 1L]], args[[2L * pair]])
+}
+
+cat(sprintf("Compared Seurat metadata for %d object(s)\n", pair_count))
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,36 @@
+def pytest_addoption(parser):
+    parser.addoption(
+        "--run-workflow",
+        action="store_true",
+        default=False,
+        help="run the full Snakemake workflow on the test data",
+    )
+    parser.addoption(
+        "--snakemake-conda-prefix",
+        default=None,
+        help="optional value to pass to Snakemake --conda-prefix for --run-workflow",
+    )
+    parser.addoption(
+        "--run-conda-validation",
+        action="store_true",
+        default=False,
+        help="create workflow conda environments and validate key package imports",
+    )
+    parser.addoption(
+        "--conda-env-name",
+        default=None,
+        help="optional workflow/envs/*.yml filename to validate with --run-conda-validation",
+    )
+    parser.addoption(
+        "--run-container-validation",
+        action="store_true",
+        default=False,
+        help="pull workflow containers with docker, apptainer, or singularity",
+    )
+    parser.addoption(
+        "--run-doubletfinder-install",
+        action="store_true",
+        default=False,
+        help="execute the DoubletFinder GitHub install rule on test outputs",
+    )
+
diff --git a/tests/reference_outputs/testdata/results/cellbender/cellbender_test.h5 b/tests/reference_outputs/testdata/results/cellbender/cellbender_test.h5
diff --git a/tests/reference_outputs/testdata/results/cellbender/cellbender_test_filtered.h5 b/tests/reference_outputs/testdata/results/cellbender/cellbender_test_filtered.h5
diff --git a/.../reference_outputs/testdata/results/cellbender_fromraw/seurat_cellbender_fromraw_test.rds b/.../reference_outputs/testdata/results/cellbender_fromraw/seurat_cellbender_fromraw_test.rds