From 81f2285c98feb8422ae67327ed682de1f73c6632 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 23:35:20 +0000 Subject: [PATCH 01/59] feat: cli --- .github/workflows/test-spras.yml | 7 ++++++- MANIFEST.in | 3 +++ pyproject.toml | 3 +++ spras/__main__.py | 3 +++ spras/cli.py | 19 +++++++++++++++++++ 5 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 MANIFEST.in create mode 100644 spras/__main__.py create mode 100644 spras/cli.py diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index 48089ab10..b9782daaf 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -47,6 +47,11 @@ jobs: - name: Install spras in conda env shell: bash --login {0} run: pip install . + - name: Get pipx + shell: bash --login {0} + run: pip install pipx + - shell: bash --login {0} + run: pipx install . # Log conda environment contents - name: Log conda environment shell: bash --login {0} @@ -63,7 +68,7 @@ jobs: run: pytest -vs - name: Run Snakemake workflow shell: bash --login {0} - run: snakemake --cores 2 --configfile config/config.yaml --show-failed-logs + run: spras --cores 2 --configfile config/config.yaml --show-failed-logs # Run pre-commit checks on source files pre-commit: diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 000000000..5d455adfd --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include README.md +include LICENSE +include Snakefile diff --git a/pyproject.toml b/pyproject.toml index 22acca360..fc20aee66 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,9 @@ dev = [ "Homepage" = "https://github.com/Reed-CompBio/spras" "Issues" = "https://github.com/Reed-CompBio/spras/issues" +[project.entry-points."pipx.run"] +spras = "spras.cli:run" + [build-system] requires = ["setuptools>=64.0"] build-backend = "setuptools.build_meta" diff --git a/spras/__main__.py b/spras/__main__.py new file mode 100644 index 000000000..16b442b7e --- /dev/null +++ b/spras/__main__.py @@ -0,0 +1,3 @@ +if __name__ == "__main__": + from spras.cli import run + run() diff --git a/spras/cli.py b/spras/cli.py new file mode 100644 index 000000000..45c643669 --- /dev/null +++ b/spras/cli.py @@ -0,0 +1,19 @@ +import itertools +import os +from pathlib import Path +import subprocess +import sys + +# https://stackoverflow.com/a/5137509/7589775 +# The file we want, visjs.html, is also included in MANIFEST.in +dir_path = os.path.dirname(os.path.realpath(__file__)) +snakefile_path = Path(dir_path, "..", "Snakefile") + +def run(): + subprocess.run(list(itertools.chain( + ["snakemake", "-s", snakefile_path], + sys.argv[1:] + ))) + +if __name__ == '__main__': + run() From f39de85f981324a94d9860d9cdd8152cfa88dbc6 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 23:36:26 +0000 Subject: [PATCH 02/59] style: fmt --- spras/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/cli.py b/spras/cli.py index 45c643669..d37d3dd7e 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -1,8 +1,8 @@ import itertools import os -from pathlib import Path import subprocess import sys +from pathlib import Path # https://stackoverflow.com/a/5137509/7589775 # The file we want, visjs.html, is also included in MANIFEST.in From ec9fc43e2f7dcdcc7892196af8a34db11c4904ec Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 23:49:38 +0000 Subject: [PATCH 03/59] chore: add scripts spras --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index fc20aee66..bc2b660f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,9 @@ dev = [ [project.entry-points."pipx.run"] spras = "spras.cli:run" +[project.scripts] +spras = "spras.cli:run" + [build-system] requires = ["setuptools>=64.0"] build-backend = "setuptools.build_meta" From e4aebf6aaf16e58e8ec2e74701f5246f51d67414 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Tue, 15 Jul 2025 19:36:40 -0700 Subject: [PATCH 04/59] docs: update visjs -> Snakefile --- spras/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/cli.py b/spras/cli.py index d37d3dd7e..d5602f8c0 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -5,7 +5,7 @@ from pathlib import Path # https://stackoverflow.com/a/5137509/7589775 -# The file we want, visjs.html, is also included in MANIFEST.in +# The file we want, Snakefile, is also included in MANIFEST.in dir_path = os.path.dirname(os.path.realpath(__file__)) snakefile_path = Path(dir_path, "..", "Snakefile") From 04523e095cb7f63b160ed97bc798be550664fb36 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 24 Aug 2025 04:50:13 -0700 Subject: [PATCH 05/59] ci: fix duplicate comments --- .github/workflows/test-spras.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index e731cd648..7fda8475c 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -52,9 +52,7 @@ jobs: run: pip install pipx - shell: bash --login {0} run: pipx install . - # Log conda environment contents - - name: Log conda environment - # Log conda environment contents + - name: Log conda environment contents shell: bash --login {0} run: conda list - name: Install Apptainer From 146443e60029eb817b7437091ad3f6a58da3bfdc Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 28 Aug 2025 00:17:26 +0000 Subject: [PATCH 06/59] feat(cli): use run subcommand --- spras/cli.py | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/spras/cli.py b/spras/cli.py index d5602f8c0..642795d16 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -1,3 +1,4 @@ +import argparse import itertools import os import subprocess @@ -9,11 +10,41 @@ dir_path = os.path.dirname(os.path.realpath(__file__)) snakefile_path = Path(dir_path, "..", "Snakefile") +# Removes the very awkwardly phrased "{subcommand1, subcommand2}" from the subcommand help +# from https://stackoverflow.com/a/13429281/7589775 +class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter): + def _format_action(self, action): + parts = super(argparse.RawDescriptionHelpFormatter, self)._format_action(action) + if action.nargs == argparse.PARSER: + parts = "\n".join(parts.split("\n")[1:]) + return parts + +def get_parser(): + parser = argparse.ArgumentParser( + prog='SPRAS', + description='The wrapping tool for SPRAS (signaling pathway reconstruction analysis streamliner)', + epilog='SPRAS is in alpha. Report issues or suggest features on GitHub: https://github.com/Reed-CompBio/spras', + formatter_class=SubcommandHelpFormatter) + + subparsers = parser.add_subparsers(title='subcommands', + help='subcommand help', + dest='subcommand') + subparsers = subparsers.add_parser('run', help='Run the SPRAS Snakemake workflow') + + return parser + def run(): - subprocess.run(list(itertools.chain( - ["snakemake", "-s", snakefile_path], - sys.argv[1:] - ))) + parser = get_parser() + args = parser.parse_args() + + if args.subcommand == "run": + subprocess.run(list(itertools.chain( + ["snakemake", "-s", snakefile_path], + sys.argv[1:] + ))) + return + + parser.print_help() if __name__ == '__main__': run() From d4b6ad3ccb57769869c2b66d50e9f860e5291ef9 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 28 Aug 2025 00:17:34 +0000 Subject: [PATCH 07/59] style: fmt --- spras/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spras/cli.py b/spras/cli.py index 642795d16..fc16f54f8 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -25,7 +25,7 @@ def get_parser(): description='The wrapping tool for SPRAS (signaling pathway reconstruction analysis streamliner)', epilog='SPRAS is in alpha. Report issues or suggest features on GitHub: https://github.com/Reed-CompBio/spras', formatter_class=SubcommandHelpFormatter) - + subparsers = parser.add_subparsers(title='subcommands', help='subcommand help', dest='subcommand') @@ -43,7 +43,7 @@ def run(): sys.argv[1:] ))) return - + parser.print_help() if __name__ == '__main__': From ed672be32eda58f8007ff310dfd371f6dd05dede Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 28 Aug 2025 00:26:42 +0000 Subject: [PATCH 08/59] fix: update cli usage across files --- .github/workflows/test-spras.yml | 2 +- README.md | 2 +- docs/contributing/index.rst | 2 +- docs/usage.rst | 4 ++-- spras/cli.py | 9 ++++++--- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index 77b96a8f8..93184f132 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -95,7 +95,7 @@ jobs: # We enable high parallelization (cores 4) to test our way out of the experienced # race conditions from #268 and #279 # We also enforce strict DAG evaluation to catch DAG problems before they appear as user errors. (#359) - run: spras --cores 4 --configfile config/config.yaml --show-failed-logs --strict-dag-evaluation cyclic-graph --strict-dag-evaluation functions --strict-dag-evaluation periodic-wildcards + run: spras run --cores 4 --configfile config/config.yaml --show-failed-logs --strict-dag-evaluation cyclic-graph --strict-dag-evaluation functions --strict-dag-evaluation periodic-wildcards # Run pre-commit checks on source files pre-commit: diff --git a/README.md b/README.md index 026b20398..5db214f6e 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ After installing Docker, start Docker before running SPRAS. Once you have activated the conda environment and started Docker, you can run SPRAS with the example Snakemake workflow. From the root directory of the `spras` repository, run the command ``` -snakemake --cores 1 --configfile config/config.yaml +spras run --cores 1 --configfile config/config.yaml ``` This will run the SPRAS workflow with the example config file (`config/config.yaml`) and input files. Output files will be written to the `output` directory. diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index ae1947b38..c4528710f 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -296,7 +296,7 @@ through SPRAS with .. code:: bash - snakemake --cores 1 --configfile config/config.yaml + spras run --cores 1 --configfile config/config.yaml Make sure to run the command inside the ``spras`` conda environment. diff --git a/docs/usage.rst b/docs/usage.rst index 161b39675..5ede42923 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -2,11 +2,11 @@ Using SPRAS =========== SPRAS is run through `Snakemake `_, which comes -with the SPRAS conda environment. +with both the SPRAS conda environment and as a dependency of SPRAS. To run SPRAS, run the following command inside the ``spras`` directory, specifying a ``config.yaml`` and the number of cores to run SPRAS with: .. code-block:: bash - snakemake --cores 1 --configfile config.yaml + spras run --cores 1 --configfile config.yaml diff --git a/spras/cli.py b/spras/cli.py index fc16f54f8..677e56851 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -29,18 +29,21 @@ def get_parser(): subparsers = parser.add_subparsers(title='subcommands', help='subcommand help', dest='subcommand') - subparsers = subparsers.add_parser('run', help='Run the SPRAS Snakemake workflow') + subparsers = subparsers.add_parser('run', + help='Run the SPRAS Snakemake workflow', + # We let snakemake handle help + add_help=False) return parser def run(): parser = get_parser() - args = parser.parse_args() + (args, unknown_args) = parser.parse_known_args() if args.subcommand == "run": subprocess.run(list(itertools.chain( ["snakemake", "-s", snakefile_path], - sys.argv[1:] + unknown_args ))) return From f9db06051296ee97680ac47f318ebf3710a32019 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 28 Aug 2025 03:59:24 +0000 Subject: [PATCH 09/59] chore: resolve snakefile path --- spras/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spras/cli.py b/spras/cli.py index 677e56851..fb58d650f 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -8,7 +8,8 @@ # https://stackoverflow.com/a/5137509/7589775 # The file we want, Snakefile, is also included in MANIFEST.in dir_path = os.path.dirname(os.path.realpath(__file__)) -snakefile_path = Path(dir_path, "..", "Snakefile") +# we resolve to simplify the path name in errors +snakefile_path = Path(dir_path, "..", "Snakefile").resolve() # Removes the very awkwardly phrased "{subcommand1, subcommand2}" from the subcommand help # from https://stackoverflow.com/a/13429281/7589775 From 5d19632fcd25104412d6dd4e5f7ca51c3f30e6fe Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Wed, 15 Oct 2025 04:39:46 +0000 Subject: [PATCH 10/59] style: fmt --- spras/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spras/cli.py b/spras/cli.py index fb58d650f..eaef47492 100644 --- a/spras/cli.py +++ b/spras/cli.py @@ -2,7 +2,6 @@ import itertools import os import subprocess -import sys from pathlib import Path # https://stackoverflow.com/a/5137509/7589775 From f81a33ec3f537ff04777b075cd73d6330845d74a Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 12 Jan 2026 09:41:39 +0000 Subject: [PATCH 11/59] feat: timeout --- Snakefile | 4 +- config/config.yaml | 1 + environment.yml | 1 + pyproject.toml | 1 + spras/allpairs.py | 5 +- spras/btb.py | 5 +- spras/config/algorithms.py | 3 +- spras/config/config.py | 11 ++++- spras/containers.py | 93 ++++++++++++++++++++++++++++++++------ spras/domino.py | 5 +- spras/meo.py | 6 +-- spras/mincostflow.py | 5 +- spras/omicsintegrator1.py | 3 +- spras/omicsintegrator2.py | 3 +- spras/pathlinker.py | 5 +- spras/prm.py | 23 ++++++---- spras/responsenet.py | 5 +- spras/runner.py | 15 ++++-- spras/rwr.py | 5 +- spras/strwr.py | 5 +- 20 files changed, 154 insertions(+), 50 deletions(-) diff --git a/Snakefile b/Snakefile index cf075b0fa..b9b076e3e 100644 --- a/Snakefile +++ b/Snakefile @@ -274,10 +274,12 @@ rule reconstruct: params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() # Declare the input files as a dictionary. inputs = dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True)) + # Get the timeout from the config + timeout = _config.config.algorithm_timeouts[wildcards.algorithm] # Remove the _spras_run_name parameter added for keeping track of the run name for parameters.yml if '_spras_run_name' in params: params.pop('_spras_run_name') - runner.run(wildcards.algorithm, inputs, output.pathway_file, params, container_settings) + runner.run(wildcards.algorithm, inputs, output.pathway_file, timeout, params, container_settings) # Original pathway reconstruction output to universal output # Use PRRunner as a wrapper to call the algorithm-specific parse_output diff --git a/config/config.yaml b/config/config.yaml index f2899fb9a..40aea251a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -102,6 +102,7 @@ algorithms: - name: "allpairs" include: true + timeout: 1d - name: "domino" include: true diff --git a/environment.yml b/environment.yml index e5fc75b0c..c65643fa1 100644 --- a/environment.yml +++ b/environment.yml @@ -15,6 +15,7 @@ dependencies: - scikit-learn=1.7.0 - seaborn=0.13.2 - spython=0.3.14 + - pytimeparse=1.1.8 # conda-specific for dsub - python-dateutil=2.9.0 diff --git a/pyproject.toml b/pyproject.toml index bfc602c6d..38074f771 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,7 @@ dependencies = [ "scikit-learn==1.7.0", "seaborn==0.13.2", "spython==0.3.14", + "pytimeparse==1.1.8", # toolchain deps "pip==25.3", diff --git a/spras/allpairs.py b/spras/allpairs.py index 21fca6ee4..500e864f4 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -74,7 +74,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() AllPairs.validate_required_run_args(inputs) @@ -111,7 +111,8 @@ def run(inputs, output_file, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/btb.py b/spras/btb.py index d2f18debe..e19ed5d19 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -72,7 +72,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() BowTieBuilder.validate_required_run_args(inputs) @@ -130,7 +130,8 @@ def run(inputs, output_file, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) # Output is already written to raw-pathway.txt file diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 552fbc4e0..d918f5333 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -5,7 +5,7 @@ """ import ast import copy -from typing import Annotated, Any, Callable, Literal, Union, cast, get_args +from typing import Annotated, Any, Callable, Literal, Union, cast, get_args, Optional import numpy as np from pydantic import ( @@ -167,6 +167,7 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod return create_model( f'{name}Model', name=Literal[name], + timeout=(Optional[str], None), include=bool, # For algorithms that have a default parameter config, we allow arbitrarily running an algorithm # if no runs are specified. For example, the following config diff --git a/spras/config/config.py b/spras/config/config.py index e180183cc..9c8bdc8a0 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -16,8 +16,9 @@ import itertools as it import os import warnings -from typing import Any +from typing import Any, Optional +from pytimeparse import parse import numpy as np import yaml @@ -73,6 +74,8 @@ def __init__(self, raw_config: dict[str, Any]): self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length) # The list of algorithms to run in the workflow. Each is a dict with 'name' as an expected key. self.algorithms = None + # Dictionary of algorithms to their respective timeout in seconds + self.algorithm_timeouts: dict[str, Optional[int]] = dict() # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() @@ -156,6 +159,12 @@ def process_algorithms(self, raw_config: RawConfig): # Do not parse the rest of the parameters for this algorithm if it is not included continue + if alg.timeout: + # Coerce to an `int` if an int isn't possible. + timeout = parse(alg.timeout, granularity='seconds') + if not timeout: raise RuntimeError(f"Algorithm {alg} has unparsable timeout string {alg.timeout}.") + self.algorithm_timeouts[alg.name] = int(timeout) + runs: dict[str, Any] = alg.runs # Each set of runs should be 1 level down in the config file diff --git a/spras/containers.py b/spras/containers.py index 124b97411..c4b41a529 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -1,6 +1,7 @@ import os import platform import re +import requests import subprocess import textwrap from pathlib import Path, PurePath, PurePosixPath @@ -166,6 +167,17 @@ def streams_contain(self, needle: str): def __str__(self): return self.message +class TimeoutError(RuntimeError): + """Raises when a function times out.""" + timeout: int + message: str + + def __init__(self, timeout: int, *args): + self.timeout = timeout + self.message = f"Timed out after {timeout}s." + + super(TimeoutError, self).__init__(timeout, *args) + def env_to_items(environment: dict[str, str]) -> Iterator[str]: """ Turns an environment variable dictionary to KEY=VALUE pairs. @@ -176,7 +188,17 @@ def env_to_items(environment: dict[str, str]) -> Iterator[str]: # TODO consider a better default environment variable # Follow docker-py's naming conventions (https://docker-py.readthedocs.io/en/stable/containers.html) # Technically the argument is an image, not a container, but we use container here. -def run_container(container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, out_dir: str | os.PathLike, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None, network_disabled = False): +def run_container( + container_suffix: str, + command: List[str], + volumes: List[Tuple[PurePath, PurePath]], + working_dir: str, + out_dir: str | os.PathLike, + container_settings: ProcessedContainerSettings, + timeout: Optional[int], + environment: Optional[dict[str, str]] = None, + network_disabled = False +): """ Runs a command in the container using Singularity or Docker @param container_suffix: name of the DockerHub container without the 'docker://' prefix @@ -185,6 +207,7 @@ def run_container(container_suffix: str, command: List[str], volumes: List[Tuple @param working_dir: the working directory in the container @param container_settings: the settings to use to run the container @param out_dir: output directory for the rule's artifacts. Only passed into run_container_singularity for the purpose of profiling. + @param timeout: the timeout (in seconds), throwing a TimeoutException if the timeout is reached. @param environment: environment variables to set in the container @param network_disabled: Disables the network on the container. Only works for docker for now. This acts as a 'runtime assertion' that a container works w/o networking. @return: output from Singularity execute or Docker run @@ -193,7 +216,7 @@ def run_container(container_suffix: str, command: List[str], volumes: List[Tuple container = container_settings.prefix + "/" + container_suffix if normalized_framework == 'docker': - return run_container_docker(container, command, volumes, working_dir, environment, network_disabled) + return run_container_docker(container, command, volumes, working_dir, environment, timeout, network_disabled) elif normalized_framework == 'singularity' or normalized_framework == "apptainer": return run_container_singularity(container, command, volumes, working_dir, out_dir, container_settings, environment) elif normalized_framework == 'dsub': @@ -201,7 +224,17 @@ def run_container(container_suffix: str, command: List[str], volumes: List[Tuple else: raise ValueError(f'{container_settings.framework} is not a recognized container framework. Choose "docker", "dsub", "apptainer", or "singularity".') -def run_container_and_log(name: str, container_suffix: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, out_dir: str | os.PathLike, container_settings: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None, network_disabled=False): +def run_container_and_log( + name: str, + container_suffix: str, + command: List[str], + volumes: List[Tuple[PurePath, PurePath]], + working_dir: str, out_dir: str | os.PathLike, + container_settings: ProcessedContainerSettings, + timeout: Optional[int], + environment: Optional[dict[str, str]] = None, + network_disabled=False +): """ Runs a command in the container using Singularity or Docker with associated pretty printed messages. @param name: the display name of the running container for logging purposes @@ -210,6 +243,7 @@ def run_container_and_log(name: str, container_suffix: str, command: List[str], @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container @param container_settings: the container settings to use + @param timeout: the timeout (in seconds), throwing a TimeoutException if the timeout is reached. @param environment: environment variables to set in the container @param network_disabled: Disables the network on the container. Only works for docker for now. This acts as a 'runtime assertion' that a container works w/o networking. @return: output from Singularity execute or Docker run @@ -219,7 +253,17 @@ def run_container_and_log(name: str, container_suffix: str, command: List[str], print('Running {} on container framework "{}" on env {} with command: {}'.format(name, container_settings.framework, list(env_to_items(environment)), ' '.join(command)), flush=True) try: - out = run_container(container_suffix=container_suffix, command=command, volumes=volumes, working_dir=working_dir, out_dir=out_dir, container_settings=container_settings, environment=environment, network_disabled=network_disabled) + out = run_container( + container_suffix=container_suffix, + command=command, + volumes=volumes, + working_dir=working_dir, + out_dir=out_dir, + container_settings=container_settings, + timeout=timeout, + environment=environment, + network_disabled=network_disabled + ) if out is not None: if isinstance(out, list): out = ''.join(out) @@ -250,7 +294,15 @@ def run_container_and_log(name: str, container_suffix: str, command: List[str], raise ContainerError(message, err.exit_status, stdout, stderr) from None # TODO any issue with creating a new client each time inside this function? -def run_container_docker(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, environment: Optional[dict[str, str]] = None, network_disabled=False): +def run_container_docker( + container: str, + command: List[str], + volumes: List[Tuple[PurePath, PurePath]], + working_dir: str, + environment: Optional[dict[str, str]] = None, + timeout: Optional[int] = None, + network_disabled=False +): """ Runs a command in the container using Docker. Attempts to automatically correct file owner and group for new files created by the container, setting them to the @@ -261,6 +313,8 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple @param volumes: a list of volumes to mount where each item is a (source, destination) tuple @param working_dir: the working directory in the container @param environment: environment variables to set in the container + @param timeout: the timeout (in seconds), throwing a TimeoutException if the timeout is reached. + @param network_disabled: if enabled, disables the underlying network: useful when containers don't fetch any online resources. @return: output from Docker run, or will error if the container errored. """ @@ -290,13 +344,22 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple bind_paths = [f'{prepare_path_docker(src)}:{dest}' for src, dest in volumes] - out = client.containers.run(container, - command, - stderr=True, - volumes=bind_paths, - working_dir=working_dir, - network_disabled=network_disabled, - environment=environment).decode('utf-8') + container_obj = client.containers.create( + container, + command, + volumes=bind_paths, + working_dir=working_dir, + network_disabled=network_disabled, + environment=environment + ) + + if timeout: + try: + container_obj.wait(timeout=timeout) + except requests.exceptions.ReadTimeout: + raise TimeoutError(timeout) + + out = container_obj.attach(stderr=True).decode('utf-8') # TODO does this cleanup need to still run even if there was an error in the above run command? # On Unix, files written by the above Docker run command will be owned by root and cannot be modified @@ -345,7 +408,7 @@ def run_container_docker(container: str, command: List[str], volumes: List[Tuple return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, out_dir: str, config: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): +def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, out_dir: str | os.PathLike, config: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): """ Runs a command in the container using Singularity. Only available on Linux. @@ -427,7 +490,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ for bind in bind_paths: singularity_cmd.extend(["--bind", bind]) singularity_cmd.extend(singularity_options) - singularity_cmd.append(image_to_run) + singularity_cmd.append(str(image_to_run)) singularity_cmd.extend(command) my_cgroup = create_peer_cgroup() @@ -438,7 +501,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ proc = subprocess.run(cmd, capture_output=True, text=True, stderr=subprocess.STDOUT) print("Reading memory and CPU stats from cgroup") - create_apptainer_container_stats(my_cgroup, out_dir) + create_apptainer_container_stats(my_cgroup, str(out_dir)) result = proc.stdout else: diff --git a/spras/domino.py b/spras/domino.py index 316044432..1faaf9d6b 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -83,7 +83,7 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = DominoParams() DOMINO.validate_required_run_args(inputs) @@ -121,7 +121,8 @@ def run(inputs, output_file, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) except ContainerError as err: # Occurs when DOMINO gets passed some empty dataframe from network_file. # This counts as an empty input, so we return an empty output. diff --git a/spras/meo.py b/spras/meo.py index 5d4630f43..b7be4db93 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -143,10 +143,9 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['edges'], sep='\t', index=False, columns=['Interactor1', 'EdgeType', 'Interactor2', 'Weight'], header=False) - # TODO add parameter validation # TODO document required arguments @staticmethod - def run(inputs, output_file=None, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -203,7 +202,8 @@ def run(inputs, output_file=None, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) properties_file_local.unlink(missing_ok=True) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index dad1d706c..cbb373d20 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -76,7 +76,7 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = MinCostFlowParams() MinCostFlow.validate_required_run_args(inputs) @@ -127,7 +127,8 @@ def run(inputs, output_file, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) # Check the output of the container out_dir_content = sorted(out_dir.glob('*.sif')) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 3e5cbf1b7..7a053ec90 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -158,7 +158,7 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values @staticmethod - def run(inputs, output_file, args, container_settings=None): + def run(inputs, output_file, timeout, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() OmicsIntegrator1.validate_required_run_args(inputs, ["dummy_nodes"]) @@ -231,6 +231,7 @@ def run(inputs, output_file, args, container_settings=None): work_dir, out_dir, container_settings, + timeout, {'TMPDIR': mapped_out_dir}) conf_file_local.unlink(missing_ok=True) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 38624d3ab..8994dd1fd 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -108,7 +108,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = OmicsIntegrator2Params() OmicsIntegrator2.validate_required_run_args(inputs) @@ -160,6 +160,7 @@ def run(inputs, output_file, args=None, container_settings=None): work_dir, out_dir, container_settings, + timeout, network_disabled=True) # TODO do we want to retain other output files? diff --git a/spras/pathlinker.py b/spras/pathlinker.py index c534f2944..eb91c49c6 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -75,7 +75,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = PathLinkerParams() PathLinker.validate_required_run_args(inputs) @@ -115,7 +115,8 @@ def run(inputs, output_file, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) # Rename the primary output file to match the desired output filename # Currently PathLinker only writes one output file so we do not need to delete others diff --git a/spras/prm.py b/spras/prm.py index 636859cf2..2f79c376a 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -59,9 +59,10 @@ def get_params_generic(cls) -> type[T]: # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. @classmethod - def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings): + def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, timeout: Optional[int], args: dict[str, Any], container_settings: ProcessedContainerSettings): """ - This is similar to PRA.run, but it does pydantic logic internally to re-validate argument parameters. + This is similar to PRA.run, but `args` is a dictionary and not a pydantic structure. + However, this method still re-validates `args` against the associated pydantic PRM argument model. """ T_class = cls.get_params_generic() @@ -75,17 +76,23 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o # (Pydantic already provides nice error messages, so we don't need to worry about catching this.) T_parsed = T_class.model_validate(args) - return cls.run(inputs, output_file, T_parsed, container_settings) + return cls.run(inputs, output_file, timeout, T_parsed, container_settings) @staticmethod @abstractmethod - def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, timeout: Optional[int], args: T, container_settings: ProcessedContainerSettings): """ - Runs an algorithm with the specified inputs, algorithm params (T), - the designated output_file, and the desired container_settings. - - See the algorithm-specific `generate_inputs` and `parse_output` + Runs an algorithm. + @param inputs: specified inputs + @param output_file: designated reconstructed pathway output + @param timeout: timeout in seconds to run the container with + @param args: (T) typed algorithm params + @param container_settings: what settings should be associated with the individual container. + + See the algorithm-specific `PRM.generate_inputs` and `PRM.parse_output` for information about the input and output format. + + Also see `PRM.run_typeless` for the non-pydantic version of this method (where `args` is a dict). """ raise NotImplementedError diff --git a/spras/responsenet.py b/spras/responsenet.py index 4989de482..565f02da0 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -68,7 +68,7 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=None, container_settings=None): + def run(inputs, output_file, timeout, args=None, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() ResponseNet.validate_required_run_args(inputs) if not args: args = ResponseNetParams() @@ -117,7 +117,8 @@ def run(inputs, output_file, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) # Rename the primary output file to match the desired output filename out_file_suffixed.rename(output_file) diff --git a/spras/runner.py b/spras/runner.py index d138d8e33..b5e689103 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,8 +1,10 @@ -from typing import Any +from os import PathLike +from typing import Any, Optional # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder +from spras.config.container_schema import ProcessedContainerSettings from spras.dataset import Dataset from spras.domino import DOMINO from spras.meo import MEO @@ -35,14 +37,21 @@ def get_algorithm(algorithm: str) -> type[PRM]: except KeyError as exc: raise NotImplementedError(f'{algorithm} is not currently supported.') from exc -def run(algorithm: str, inputs, output_file, args, container_settings): +def run( + algorithm: str, + inputs: dict[str, str | PathLike], + output_file: str | PathLike, + timeout: Optional[int], + args: dict[str, Any], + container_settings: ProcessedContainerSettings +): """ A generic interface to the algorithm-specific run functions """ algorithm_runner = get_algorithm(algorithm) # We can't use config.config here else we would get a cyclic dependency. # Since args is a dict here, we use the 'run_typeless' utility PRM function. - algorithm_runner.run_typeless(inputs, output_file, args, container_settings) + algorithm_runner.run_typeless(inputs, output_file, timeout, args, container_settings) def get_required_inputs(algorithm: str): diff --git a/spras/rwr.py b/spras/rwr.py index e6f54d674..e359767f3 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -56,7 +56,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_settings=None): + def run(inputs, output_file, timeout, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() RWR.validate_required_run_args(inputs) @@ -103,7 +103,8 @@ def run(inputs, output_file, args, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) # Rename the primary output file to match the desired output filename output_edges = Path(out_dir, 'output.txt') diff --git a/spras/strwr.py b/spras/strwr.py index 42928e4cd..b25046a69 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -58,7 +58,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_settings=None): + def run(inputs, output_file, timeout, args, container_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() ST_RWR.validate_required_run_args(inputs) @@ -110,7 +110,8 @@ def run(inputs, output_file, args, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) # Rename the primary output file to match the desired output filename output_edges = Path(out_dir, 'output.txt') From 0342b5cbb8844222d3d79b80d69c4fbdbf2f8e61 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 12 Jan 2026 11:28:17 -0800 Subject: [PATCH 12/59] feat: snakemake err checkpoint --- Snakefile | 44 ++++++++++++++++++++++++++++++++---------- spras/config/config.py | 4 ++++ spras/containers.py | 3 +++ 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/Snakefile b/Snakefile index b9b076e3e..960ccf378 100644 --- a/Snakefile +++ b/Snakefile @@ -1,7 +1,10 @@ import os from spras import runner import shutil +import json import yaml +from pathlib import Path +from spras.containers import TimeoutError from spras.dataset import Dataset from spras.evaluation import Evaluation from spras.analysis import ml, summary, cytoscape @@ -261,31 +264,52 @@ def collect_prepared_input(wildcards): return prepared_inputs # Run the pathway reconstruction algorithm -rule reconstruct: +checkpoint reconstruct: input: collect_prepared_input # Each reconstruct call should be in a separate output subdirectory that is unique for the parameter combination so # that multiple instances of the container can run simultaneously without overwriting the output files # Overwriting files can happen because the pathway reconstruction algorithms often generate output files with the # same name regardless of the inputs or parameters, and these aren't renamed until after the container command # terminates - output: pathway_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']) + output: + pathway_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']) + log: + resource_info = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'resource-log.json']) + params: + # Get the timeout from the config and use it as an input. + # TODO: This has unexpected behavior when this rule succeeds but the timeout extends, + # making this rule run again. + timeout = lambda wildcards: _config.config.algorithm_timeouts[wildcards.algorithm] run: # Create a copy so that the updates are not written to the parameters logfile - params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() + algorithm_params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() # Declare the input files as a dictionary. inputs = dict(zip(runner.get_required_inputs(wildcards.algorithm), *{input}, strict=True)) - # Get the timeout from the config - timeout = _config.config.algorithm_timeouts[wildcards.algorithm] # Remove the _spras_run_name parameter added for keeping track of the run name for parameters.yml - if '_spras_run_name' in params: - params.pop('_spras_run_name') - runner.run(wildcards.algorithm, inputs, output.pathway_file, timeout, params, container_settings) + if '_spras_run_name' in algorithm_params: + algorithm_params.pop('_spras_run_name') + try: + runner.run(wildcards.algorithm, inputs, output.pathway_file, params.timeout, algorithm_params, container_settings) + Path(log.resource_info).write_text(json.dumps({"status": "success"})) + except TimeoutError as err: + # We don't raise the error here (and use `--keep-going` to avoid re-running this rule [or others!] unnecessarily.) + Path(log.resource_info).write_text(json.dumps({"status": "error", "type": "timeout", "duration": params.timeout})) + # and we touch pathway_file still: Snakemake doesn't have optional files, so + # we'll filter the ones that didn't time out in collect_successful_reconstructions. + Path(output.pathway_file).touch() + +def collect_successful_reconstructions(wildcards): + reconstruct_checkpoint = checkpoints.reconstruct.get(**wildcards) + resource_info = json.loads(Path(reconstruct_checkpoint.log.resource_info).read_bytes()) + if resource_info["status"] == "success": + return [reconstruct_checkpoint.output.pathway_file] + return [] # Original pathway reconstruction output to universal output # Use PRRunner as a wrapper to call the algorithm-specific parse_output rule parse_output: - input: - raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']), + input: + raw_file = collect_successful_reconstructions, dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt']) run: diff --git a/spras/config/config.py b/spras/config/config.py index 9c8bdc8a0..63e8e2d6f 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -164,6 +164,10 @@ def process_algorithms(self, raw_config: RawConfig): timeout = parse(alg.timeout, granularity='seconds') if not timeout: raise RuntimeError(f"Algorithm {alg} has unparsable timeout string {alg.timeout}.") self.algorithm_timeouts[alg.name] = int(timeout) + else: + # As per the type signature, we still want to say explicitly that this algorithm's timeout + # is uninhabited. + self.algorithm_timeouts[alg.name] = None runs: dict[str, Any] = alg.runs diff --git a/spras/containers.py b/spras/containers.py index c4b41a529..a72bf3a61 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -178,6 +178,9 @@ def __init__(self, timeout: int, *args): super(TimeoutError, self).__init__(timeout, *args) + def __str__(self): + return self.message + def env_to_items(environment: dict[str, str]) -> Iterator[str]: """ Turns an environment variable dictionary to KEY=VALUE pairs. From 841d24240eee1cc10cb3d4a9238b1dcf8095970a Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 12 Jan 2026 11:51:48 -0800 Subject: [PATCH 13/59] fix: use timeout correctly --- Snakefile | 14 +++++++------- spras/containers.py | 15 ++++++++------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/Snakefile b/Snakefile index 960ccf378..758d46dfb 100644 --- a/Snakefile +++ b/Snakefile @@ -272,8 +272,8 @@ checkpoint reconstruct: # same name regardless of the inputs or parameters, and these aren't renamed until after the container command # terminates output: - pathway_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']) - log: + pathway_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']), + # Despite this being a 'log' file, we don't use the log directive as this rule doesn't actually throw errors. resource_info = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'resource-log.json']) params: # Get the timeout from the config and use it as an input. @@ -290,20 +290,20 @@ checkpoint reconstruct: algorithm_params.pop('_spras_run_name') try: runner.run(wildcards.algorithm, inputs, output.pathway_file, params.timeout, algorithm_params, container_settings) - Path(log.resource_info).write_text(json.dumps({"status": "success"})) + Path(output.resource_info).write_text(json.dumps({"status": "success"})) except TimeoutError as err: # We don't raise the error here (and use `--keep-going` to avoid re-running this rule [or others!] unnecessarily.) - Path(log.resource_info).write_text(json.dumps({"status": "error", "type": "timeout", "duration": params.timeout})) + Path(output.resource_info).write_text(json.dumps({"status": "error", "type": "timeout", "duration": params.timeout})) # and we touch pathway_file still: Snakemake doesn't have optional files, so # we'll filter the ones that didn't time out in collect_successful_reconstructions. Path(output.pathway_file).touch() def collect_successful_reconstructions(wildcards): reconstruct_checkpoint = checkpoints.reconstruct.get(**wildcards) - resource_info = json.loads(Path(reconstruct_checkpoint.log.resource_info).read_bytes()) + resource_info = json.loads(Path(reconstruct_checkpoint.output.resource_info).read_bytes()) if resource_info["status"] == "success": - return [reconstruct_checkpoint.output.pathway_file] - return [] + return reconstruct_checkpoint.output.pathway_file + return None # Original pathway reconstruction output to universal output # Use PRRunner as a wrapper to call the algorithm-specific parse_output diff --git a/spras/containers.py b/spras/containers.py index a72bf3a61..63fb71701 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -347,20 +347,21 @@ def run_container_docker( bind_paths = [f'{prepare_path_docker(src)}:{dest}' for src, dest in volumes] - container_obj = client.containers.create( + container_obj = client.containers.run( container, command, volumes=bind_paths, working_dir=working_dir, network_disabled=network_disabled, - environment=environment + environment=environment, + detach=True ) - if timeout: - try: - container_obj.wait(timeout=timeout) - except requests.exceptions.ReadTimeout: - raise TimeoutError(timeout) + try: + container_obj.wait(timeout=timeout) + except requests.exceptions.ReadTimeout: + if timeout: raise TimeoutError(timeout) + else: raise RuntimeError("Timeout error but no timeout specified. Please file an issue with this error and stacktrace at https://github.com/Reed-CompBio/spras/issues/new.") out = container_obj.attach(stderr=True).decode('utf-8') From 75fd7f1cf15674b737a66d77c2c42e5d7c57a676 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 13 Jan 2026 04:39:32 -0800 Subject: [PATCH 14/59] fix: filter files w/ errors --- Snakefile | 52 ++++++++++++++++++++++++++++----------------- spras/containers.py | 2 ++ 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/Snakefile b/Snakefile index 758d46dfb..afb2ad994 100644 --- a/Snakefile +++ b/Snakefile @@ -263,6 +263,21 @@ def collect_prepared_input(wildcards): return prepared_inputs +def mark_error(file, **details): + """Marks a file as an error with associated details.""" + Path(file).write_text(json.dumps({"status": "error", **details})) + +def is_error(file): + """Checks if a file was produced by mark_error.""" + try: + return json.loads(Path(file).read_bytes())["status"] == "error" + except ValueError: + return False + +def filter_successful(files): + """Convenient function for filtering iterators by whether or not their items are error files.""" + return [file for file in files if not is_error(file)] + # Run the pathway reconstruction algorithm checkpoint reconstruct: input: collect_prepared_input @@ -295,24 +310,23 @@ checkpoint reconstruct: # We don't raise the error here (and use `--keep-going` to avoid re-running this rule [or others!] unnecessarily.) Path(output.resource_info).write_text(json.dumps({"status": "error", "type": "timeout", "duration": params.timeout})) # and we touch pathway_file still: Snakemake doesn't have optional files, so - # we'll filter the ones that didn't time out in collect_successful_reconstructions. + # we'll filter the ones that didn't time out by passing around empty files. Path(output.pathway_file).touch() -def collect_successful_reconstructions(wildcards): - reconstruct_checkpoint = checkpoints.reconstruct.get(**wildcards) - resource_info = json.loads(Path(reconstruct_checkpoint.output.resource_info).read_bytes()) - if resource_info["status"] == "success": - return reconstruct_checkpoint.output.pathway_file - return None - # Original pathway reconstruction output to universal output # Use PRRunner as a wrapper to call the algorithm-specific parse_output rule parse_output: input: - raw_file = collect_successful_reconstructions, + raw_file = rules.reconstruct.output.pathway_file, + resource_info = rules.reconstruct.output.resource_info, dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt']) run: + resource_info = json.loads(Path(input.resource_info).read_bytes()) + if resource_info["status"] != "success": + mark_error(output.standardized_file) + return + params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() params['dataset'] = input.dataset_file runner.parse_output(wildcards.algorithm, input.raw_file, output.standardized_file, params) @@ -334,7 +348,7 @@ rule viz_cytoscape: output: session = SEP.join([out_dir, '{dataset}-cytoscape.cys']) run: - cytoscape.run_cytoscape(input.pathways, output.session, container_settings) + cytoscape.run_cytoscape(filter_successful(input.pathways), output.session, container_settings) # Write a single summary table for all pathways for each dataset @@ -347,7 +361,7 @@ rule summary_table: run: # Load the node table from the pickled dataset file node_table = Dataset.from_file(input.dataset_file).node_table - summary_df = summary.summarize_networks(input.pathways, node_table, algorithm_params, algorithms_with_params) + summary_df = summary.summarize_networks(filter_successful(input.pathways), node_table, algorithm_params, algorithms_with_params) summary_df.to_csv(output.summary_table, sep='\t', index=False) # Cluster the output pathways for each dataset @@ -363,7 +377,7 @@ rule ml_analysis: hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-horizontal.png']), hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', 'hac-clusters-horizontal.txt']), run: - summary_df = ml.summarize_networks(input.pathways) + summary_df = ml.summarize_networks(filter_successful(input.pathways)) ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) @@ -377,7 +391,7 @@ rule jaccard_similarity: jaccard_similarity_matrix = SEP.join([out_dir, '{dataset}-ml', 'jaccard-matrix.txt']), jaccard_similarity_heatmap = SEP.join([out_dir, '{dataset}-ml', 'jaccard-heatmap.png']) run: - summary_df = ml.summarize_networks(input.pathways) + summary_df = ml.summarize_networks(filter_successful(input.pathways)) ml.jaccard_similarity_eval(summary_df, output.jaccard_similarity_matrix, output.jaccard_similarity_heatmap) @@ -388,7 +402,7 @@ rule ensemble: output: ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', 'ensemble-pathway.txt']) run: - summary_df = ml.summarize_networks(input.pathways) + summary_df = ml.summarize_networks(filter_successful(input.pathways)) ml.ensemble_network(summary_df, output.ensemble_network_file) # Returns all pathways for a specific algorithm @@ -410,7 +424,7 @@ rule ml_analysis_aggregate_algo: hac_image_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-horizontal.png']), hac_clusters_horizontal = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-hac-clusters-horizontal.txt']), run: - summary_df = ml.summarize_networks(input.pathways) + summary_df = ml.summarize_networks(filter_successful(input.pathways)) ml.hac_vertical(summary_df, output.hac_image_vertical, output.hac_clusters_vertical, **hac_params) ml.hac_horizontal(summary_df, output.hac_image_horizontal, output.hac_clusters_horizontal, **hac_params) ml.pca(summary_df, output.pca_image, output.pca_variance, output.pca_coordinates, **pca_params) @@ -422,7 +436,7 @@ rule ensemble_per_algo: output: ensemble_network_file = SEP.join([out_dir,'{dataset}-ml', '{algorithm}-ensemble-pathway.txt']) run: - summary_df = ml.summarize_networks(input.pathways) + summary_df = ml.summarize_networks(filter_successful(input.pathways)) ml.ensemble_network(summary_df, output.ensemble_network_file) # Calculated Jaccard similarity between output pathways for each dataset per algorithm @@ -433,7 +447,7 @@ rule jaccard_similarity_per_algo: jaccard_similarity_matrix = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-jaccard-matrix.txt']), jaccard_similarity_heatmap = SEP.join([out_dir, '{dataset}-ml', '{algorithm}-jaccard-heatmap.png']) run: - summary_df = ml.summarize_networks(input.pathways) + summary_df = ml.summarize_networks(filter_successful(input.pathways)) ml.jaccard_similarity_eval(summary_df, output.jaccard_similarity_matrix, output.jaccard_similarity_heatmap) # Return the gold standard pickle file for a specific gold standard @@ -464,7 +478,7 @@ rule evaluation_pr_per_pathways: node_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-per-pathway-nodes.png']), run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table - pr_df = Evaluation.node_precision_and_recall(input.pathways, node_table) + pr_df = Evaluation.node_precision_and_recall(filter_successful(input.pathways), node_table) Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png) # Returns all pathways for a specific algorithm and dataset @@ -483,7 +497,7 @@ rule evaluation_per_algo_pr_per_pathways: node_pr_png = SEP.join([out_dir, '{dataset_gold_standard_pair}-eval', 'pr-per-pathway-for-{algorithm}-nodes.png']), run: node_table = Evaluation.from_file(input.node_gold_standard_file).node_table - pr_df = Evaluation.node_precision_and_recall(input.pathways, node_table) + pr_df = Evaluation.node_precision_and_recall(filter_successful(input.pathways), node_table) Evaluation.precision_and_recall_per_pathway(pr_df, output.node_pr_file, output.node_pr_png, include_aggregate_algo_eval) # Return pathway summary file per dataset diff --git a/spras/containers.py b/spras/containers.py index 63fb71701..57f2fc29a 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -360,6 +360,8 @@ def run_container_docker( try: container_obj.wait(timeout=timeout) except requests.exceptions.ReadTimeout: + container_obj.stop() + client.close() if timeout: raise TimeoutError(timeout) else: raise RuntimeError("Timeout error but no timeout specified. Please file an issue with this error and stacktrace at https://github.com/Reed-CompBio/spras/issues/new.") From 7abd7093f4fd008407d8ae2d3e93bfec4d615cf1 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 13 Jan 2026 07:22:14 -0800 Subject: [PATCH 15/59] fix: correct timeout order --- Snakefile | 2 +- spras/allpairs.py | 2 +- spras/btb.py | 2 +- spras/config/algorithms.py | 2 +- spras/config/config.py | 2 +- spras/containers.py | 10 +++++----- spras/domino.py | 5 +++-- spras/meo.py | 2 +- spras/mincostflow.py | 2 +- spras/omicsintegrator1.py | 2 +- spras/omicsintegrator2.py | 2 +- spras/pathlinker.py | 2 +- spras/prm.py | 8 ++++---- spras/responsenet.py | 2 +- spras/runner.py | 6 +++--- spras/rwr.py | 2 +- spras/strwr.py | 2 +- 17 files changed, 28 insertions(+), 27 deletions(-) diff --git a/Snakefile b/Snakefile index afb2ad994..5e0330a04 100644 --- a/Snakefile +++ b/Snakefile @@ -304,7 +304,7 @@ checkpoint reconstruct: if '_spras_run_name' in algorithm_params: algorithm_params.pop('_spras_run_name') try: - runner.run(wildcards.algorithm, inputs, output.pathway_file, params.timeout, algorithm_params, container_settings) + runner.run(wildcards.algorithm, inputs, output.pathway_file, algorithm_params, container_settings, params.timeout) Path(output.resource_info).write_text(json.dumps({"status": "success"})) except TimeoutError as err: # We don't raise the error here (and use `--keep-going` to avoid re-running this rule [or others!] unnecessarily.) diff --git a/spras/allpairs.py b/spras/allpairs.py index 500e864f4..d015acfdb 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -74,7 +74,7 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() AllPairs.validate_required_run_args(inputs) diff --git a/spras/btb.py b/spras/btb.py index e19ed5d19..8c8358878 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -72,7 +72,7 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() BowTieBuilder.validate_required_run_args(inputs) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index d918f5333..5abb4dbf6 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -5,7 +5,7 @@ """ import ast import copy -from typing import Annotated, Any, Callable, Literal, Union, cast, get_args, Optional +from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args import numpy as np from pydantic import ( diff --git a/spras/config/config.py b/spras/config/config.py index 63e8e2d6f..819fd74ad 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -18,9 +18,9 @@ import warnings from typing import Any, Optional -from pytimeparse import parse import numpy as np import yaml +from pytimeparse import parse from spras.config.container_schema import ProcessedContainerSettings from spras.config.schema import RawConfig diff --git a/spras/containers.py b/spras/containers.py index 57f2fc29a..2f1995646 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -1,7 +1,6 @@ import os import platform import re -import requests import subprocess import textwrap from pathlib import Path, PurePath, PurePosixPath @@ -9,6 +8,7 @@ import docker import docker.errors +import requests from spras.config.container_schema import ProcessedContainerSettings from spras.logging import indent @@ -262,7 +262,7 @@ def run_container_and_log( volumes=volumes, working_dir=working_dir, out_dir=out_dir, - container_settings=container_settings, + container_settings=container_settings, timeout=timeout, environment=environment, network_disabled=network_disabled @@ -359,11 +359,11 @@ def run_container_docker( try: container_obj.wait(timeout=timeout) - except requests.exceptions.ReadTimeout: + except requests.exceptions.ReadTimeout as err: container_obj.stop() client.close() - if timeout: raise TimeoutError(timeout) - else: raise RuntimeError("Timeout error but no timeout specified. Please file an issue with this error and stacktrace at https://github.com/Reed-CompBio/spras/issues/new.") + if timeout: raise TimeoutError(timeout) from err + else: raise RuntimeError("Timeout error but no timeout specified. Please file an issue with this error and stacktrace at https://github.com/Reed-CompBio/spras/issues/new.") from None out = container_obj.attach(stderr=True).decode('utf-8') diff --git a/spras/domino.py b/spras/domino.py index 1faaf9d6b..eda2934f7 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -83,7 +83,7 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = DominoParams() DOMINO.validate_required_run_args(inputs) @@ -156,7 +156,8 @@ def run(inputs, output_file, timeout, args=None, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) except ContainerError as err: # Occurs when DOMINO gets passed some empty dataframe from network_file. # This counts as an empty input, so we return an empty output. diff --git a/spras/meo.py b/spras/meo.py index b7be4db93..94e90c928 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -145,7 +145,7 @@ def generate_inputs(data, filename_map): # TODO document required arguments @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. diff --git a/spras/mincostflow.py b/spras/mincostflow.py index cbb373d20..2a8a8d585 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -76,7 +76,7 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = MinCostFlowParams() MinCostFlow.validate_required_run_args(inputs) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 7a053ec90..0d913ec10 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -158,7 +158,7 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values @staticmethod - def run(inputs, output_file, timeout, args, container_settings=None): + def run(inputs, output_file, args, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() OmicsIntegrator1.validate_required_run_args(inputs, ["dummy_nodes"]) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 8994dd1fd..25b4d5f2e 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -108,7 +108,7 @@ def generate_inputs(data: Dataset, filename_map): # TODO add reasonable default values @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = OmicsIntegrator2Params() OmicsIntegrator2.validate_required_run_args(inputs) diff --git a/spras/pathlinker.py b/spras/pathlinker.py index eb91c49c6..2f41dcc30 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -75,7 +75,7 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = PathLinkerParams() PathLinker.validate_required_run_args(inputs) diff --git a/spras/prm.py b/spras/prm.py index 2f79c376a..663587f9d 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -59,7 +59,7 @@ def get_params_generic(cls) -> type[T]: # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. @classmethod - def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, timeout: Optional[int], args: dict[str, Any], container_settings: ProcessedContainerSettings): + def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings, timeout: Optional[int]): """ This is similar to PRA.run, but `args` is a dictionary and not a pydantic structure. However, this method still re-validates `args` against the associated pydantic PRM argument model. @@ -76,18 +76,18 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o # (Pydantic already provides nice error messages, so we don't need to worry about catching this.) T_parsed = T_class.model_validate(args) - return cls.run(inputs, output_file, timeout, T_parsed, container_settings) + return cls.run(inputs, output_file, T_parsed, container_settings, timeout) @staticmethod @abstractmethod - def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, timeout: Optional[int], args: T, container_settings: ProcessedContainerSettings): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings, timeout: Optional[int]): """ Runs an algorithm. @param inputs: specified inputs @param output_file: designated reconstructed pathway output - @param timeout: timeout in seconds to run the container with @param args: (T) typed algorithm params @param container_settings: what settings should be associated with the individual container. + @param timeout: timeout in seconds to run the container with See the algorithm-specific `PRM.generate_inputs` and `PRM.parse_output` for information about the input and output format. diff --git a/spras/responsenet.py b/spras/responsenet.py index 565f02da0..0c813df8c 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -68,7 +68,7 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, timeout, args=None, container_settings=None): + def run(inputs, output_file, args=None, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() ResponseNet.validate_required_run_args(inputs) if not args: args = ResponseNetParams() diff --git a/spras/runner.py b/spras/runner.py index b5e689103..a9f251ba5 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -41,9 +41,9 @@ def run( algorithm: str, inputs: dict[str, str | PathLike], output_file: str | PathLike, - timeout: Optional[int], args: dict[str, Any], - container_settings: ProcessedContainerSettings + container_settings: ProcessedContainerSettings, + timeout: Optional[int] ): """ A generic interface to the algorithm-specific run functions @@ -51,7 +51,7 @@ def run( algorithm_runner = get_algorithm(algorithm) # We can't use config.config here else we would get a cyclic dependency. # Since args is a dict here, we use the 'run_typeless' utility PRM function. - algorithm_runner.run_typeless(inputs, output_file, timeout, args, container_settings) + algorithm_runner.run_typeless(inputs, output_file, args, container_settings, timeout) def get_required_inputs(algorithm: str): diff --git a/spras/rwr.py b/spras/rwr.py index e359767f3..5f4958854 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -56,7 +56,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, timeout, args, container_settings=None): + def run(inputs, output_file, args, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() RWR.validate_required_run_args(inputs) diff --git a/spras/strwr.py b/spras/strwr.py index b25046a69..8058c0cdd 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -58,7 +58,7 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, timeout, args, container_settings=None): + def run(inputs, output_file, args, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() ST_RWR.validate_required_run_args(inputs) From e07c96148c0246b6449b41e22a0d733fa9ff7b13 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 13 Jan 2026 15:34:36 +0000 Subject: [PATCH 16/59] fix(cytoscape): specify optional timeout --- spras/analysis/cytoscape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spras/analysis/cytoscape.py b/spras/analysis/cytoscape.py index e84899506..6eadfaddc 100644 --- a/spras/analysis/cytoscape.py +++ b/spras/analysis/cytoscape.py @@ -58,5 +58,6 @@ def run_cytoscape(pathways: List[Union[str, PurePath]], output_file: str, contai # (https://github.com/Reed-CompBio/spras/pull/390/files#r2485100875) None, container_settings, + None, env) rmtree(cytoscape_output_dir) From d5b7e18b74fe609dd444b95b1cfc05f099d65bf6 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 13 Jan 2026 16:03:31 +0000 Subject: [PATCH 17/59] chore(Snakefile): decheckpointify reconstruct --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 5e0330a04..6fbb01041 100644 --- a/Snakefile +++ b/Snakefile @@ -279,7 +279,7 @@ def filter_successful(files): return [file for file in files if not is_error(file)] # Run the pathway reconstruction algorithm -checkpoint reconstruct: +rule reconstruct: input: collect_prepared_input # Each reconstruct call should be in a separate output subdirectory that is unique for the parameter combination so # that multiple instances of the container can run simultaneously without overwriting the output files From 111e53fda43bcad5d97bc1f215e2ab51f2e85252 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 13 Jan 2026 17:48:22 +0000 Subject: [PATCH 18/59] perf(Snakefile): make is_error check not consume the entire file --- Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 6fbb01041..f0c745493 100644 --- a/Snakefile +++ b/Snakefile @@ -270,7 +270,8 @@ def mark_error(file, **details): def is_error(file): """Checks if a file was produced by mark_error.""" try: - return json.loads(Path(file).read_bytes())["status"] == "error" + with open(file, 'r') as f: + json.load(f)["status"] == "error" except ValueError: return False From cc46eeda1ca3bec86b0706be8e6b42b552edf386 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 23 Apr 2026 19:31:36 +0000 Subject: [PATCH 19/59] docs: timeout --- docs/index.rst | 1 + docs/timeout.rst | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) create mode 100644 docs/timeout.rst diff --git a/docs/index.rst b/docs/index.rst index 68c0f3df6..686c53b5e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -37,6 +37,7 @@ methods (PRMs) to omics data. output htcondor + timeout .. toctree:: :maxdepth: 1 diff --git a/docs/timeout.rst b/docs/timeout.rst new file mode 100644 index 000000000..02abd1c24 --- /dev/null +++ b/docs/timeout.rst @@ -0,0 +1,18 @@ +Timeouts +======== + +SPRAS allows for per-algorithm timeouts, specified under the global +configuration file. For example, to give the AllPairs algorithm a 1 day +timeout: + +.. code:: yaml + + - name: "allpairs" + include: true + timeout: 1d + +The timeout string parsing is delegated to +`pytimeparse `__, which allows +for more complicated timeout strings, such as ``3d2h32m``. + +**NOTE**: This feature only works with docker at the time of writing. From 83c5ed0ae02690afb6d9dfd2308dc853ea909a47 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 23 Apr 2026 19:34:42 +0000 Subject: [PATCH 20/59] docs: clarification on container_obj --- spras/containers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spras/containers.py b/spras/containers.py index 2f1995646..2d3b01826 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -347,6 +347,9 @@ def run_container_docker( bind_paths = [f'{prepare_path_docker(src)}:{dest}' for src, dest in volumes] + # We detach the container, allowing dockerpy to return a + # `Container` object for our further use. This is currently only + # to set docker-based container timeouts. container_obj = client.containers.run( container, command, From 71c1976739a9fd0dfa91133c1afbef46b89f4f20 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 23 Apr 2026 19:57:37 +0000 Subject: [PATCH 21/59] docs: remove the strange comment This perplexes me but from my tests we do not need --keep-going. I do not know my original intent here --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 27c1c30d0..6064f889e 100644 --- a/Snakefile +++ b/Snakefile @@ -308,7 +308,7 @@ rule reconstruct: runner.run(wildcards.algorithm, inputs, output.pathway_file, algorithm_params, container_settings, params.timeout) Path(output.resource_info).write_text(json.dumps({"status": "success"})) except TimeoutError as err: - # We don't raise the error here (and use `--keep-going` to avoid re-running this rule [or others!] unnecessarily.) + # We don't raise the error here (analogous to `--keep-going`, except we avoid unnecessarily re-running this rule.) Path(output.resource_info).write_text(json.dumps({"status": "error", "type": "timeout", "duration": params.timeout})) # and we touch pathway_file still: Snakemake doesn't have optional files, so # we'll filter the ones that didn't time out by passing around empty files. From 6e60afe784d620bcf839c9f9f02aefd03c8cc00e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 23 Apr 2026 13:21:13 -0700 Subject: [PATCH 22/59] refactor: use mark_error and is_error more often Co-Authored-By: Neha Talluri <78840540+ntalluri@users.noreply.github.com> --- Snakefile | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index 6064f889e..a2964e9a2 100644 --- a/Snakefile +++ b/Snakefile @@ -309,9 +309,10 @@ rule reconstruct: Path(output.resource_info).write_text(json.dumps({"status": "success"})) except TimeoutError as err: # We don't raise the error here (analogous to `--keep-going`, except we avoid unnecessarily re-running this rule.) - Path(output.resource_info).write_text(json.dumps({"status": "error", "type": "timeout", "duration": params.timeout})) - # and we touch pathway_file still: Snakemake doesn't have optional files, so - # we'll filter the ones that didn't time out by passing around empty files. + mark_error(output.resource_info, type="timeout", duration=params.timeout) + # and we touch pathway_file still: Snakemake doesn't have optional files, so we output a 'resource info' file, + # which contains the status (success/failure) of specific Snakemake jobs. + # We filter for the successful files (such as ones that didn't time out) with the `filter_successful` fucntion. Path(output.pathway_file).touch() # Original pathway reconstruction output to universal output @@ -319,12 +320,13 @@ rule reconstruct: rule parse_output: input: raw_file = rules.reconstruct.output.pathway_file, + + # We propagate up the resource_info error if it exists. resource_info = rules.reconstruct.output.resource_info, dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt']) run: - resource_info = json.loads(Path(input.resource_info).read_bytes()) - if resource_info["status"] != "success": + if is_error(input.resource_info): mark_error(output.standardized_file) return From 699ddcaccee071a20535330c85e36a72204b620c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 23 Apr 2026 22:37:14 +0000 Subject: [PATCH 23/59] style: fmt --- Snakefile | 2 +- docs/timeout.rst | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/Snakefile b/Snakefile index 65f633fbe..0d2a6e123 100644 --- a/Snakefile +++ b/Snakefile @@ -314,7 +314,7 @@ rule reconstruct: mark_error(output.resource_info, type="timeout", duration=params.timeout) # and we touch pathway_file still: Snakemake doesn't have optional files, so we output a 'resource info' file, # which contains the status (success/failure) of specific Snakemake jobs. - # We filter for the successful files (such as ones that didn't time out) with the `filter_successful` fucntion. + # We filter for the successful files (such as ones that didn't time out) with the `filter_successful` function. Path(output.pathway_file).touch() # Original pathway reconstruction output to universal output diff --git a/docs/timeout.rst b/docs/timeout.rst index 02abd1c24..a05e26244 100644 --- a/docs/timeout.rst +++ b/docs/timeout.rst @@ -1,5 +1,6 @@ -Timeouts -======== +########## + Timeouts +########## SPRAS allows for per-algorithm timeouts, specified under the global configuration file. For example, to give the AllPairs algorithm a 1 day @@ -11,8 +12,8 @@ timeout: include: true timeout: 1d -The timeout string parsing is delegated to -`pytimeparse `__, which allows -for more complicated timeout strings, such as ``3d2h32m``. +The timeout string parsing is delegated to `pytimeparse +`__, which allows for more +complicated timeout strings, such as ``3d2h32m``. **NOTE**: This feature only works with docker at the time of writing. From 4e3c28f6cd08846fa48a603b3f7c7006b6db5db5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Sat, 25 Apr 2026 02:18:11 +0000 Subject: [PATCH 24/59] docs: on errors --- docs/design/errors.rst | 20 ++++++++++++++++++++ docs/index.rst | 7 +++++++ 2 files changed, 27 insertions(+) create mode 100644 docs/design/errors.rst diff --git a/docs/design/errors.rst b/docs/design/errors.rst new file mode 100644 index 000000000..5137d61c6 --- /dev/null +++ b/docs/design/errors.rst @@ -0,0 +1,20 @@ +Errors +====== + +By default, whenever SPRAS runs into a container error (i.e. an internal +algorithm error), the full workflow will fail. However, there are +certain designated errors where we don't want this to be the case (at +the moment, these designated errors are only container timeouts, but +this may be extended to heuristics in the future). + +Due to the following design constraints: + +- Snakemake does not have support for such errors (the closest being + ``--keep-going``, which unnecessarily runs failed runs) +- SPRAS occasionally outputs empty files as genuine output +- We need to log errors that happen for user knowledge + +we opt to use a ``resource-info.json`` file, which keeps track of the +success/failure status at certain failable parts of the workflow. This +file contains whether or not this part of the workflow succeeded, and if +it failed, how it failed. diff --git a/docs/index.rst b/docs/index.rst index 95bcf9d5c..6966692e5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -63,6 +63,13 @@ reconstruction methods (PRMs) to omics data. contributing/patching contributing/design + +.. toctree:: + :maxdepth: 1 + :caption: Internal Designs + + design/errors + .. toctree:: :maxdepth: 1 :caption: Tutorials From a322f4db0951ff81e31d4aebde7bb472d0847063 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Sat, 25 Apr 2026 02:42:40 +0000 Subject: [PATCH 25/59] fix: tests and such --- Snakefile | 4 +--- docs/design/errors.rst | 13 +++++++------ docs/index.rst | 1 - spras/diamond.py | 5 +++-- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/Snakefile b/Snakefile index 0d2a6e123..61894287d 100644 --- a/Snakefile +++ b/Snakefile @@ -321,11 +321,9 @@ rule reconstruct: # Use PRRunner as a wrapper to call the algorithm-specific parse_output rule parse_output: input: - raw_file = rules.reconstruct.output.pathway_file, - # We propagate up the resource_info error if it exists. resource_info = rules.reconstruct.output.resource_info, - raw_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']), + raw_file = rules.reconstruct.output.pathway_file, dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt']) run: diff --git a/docs/design/errors.rst b/docs/design/errors.rst index 5137d61c6..7ce212ee8 100644 --- a/docs/design/errors.rst +++ b/docs/design/errors.rst @@ -1,5 +1,6 @@ -Errors -====== +######## + Errors +######## By default, whenever SPRAS runs into a container error (i.e. an internal algorithm error), the full workflow will fail. However, there are @@ -9,10 +10,10 @@ this may be extended to heuristics in the future). Due to the following design constraints: -- Snakemake does not have support for such errors (the closest being - ``--keep-going``, which unnecessarily runs failed runs) -- SPRAS occasionally outputs empty files as genuine output -- We need to log errors that happen for user knowledge +- Snakemake does not have support for such errors (the closest being + ``--keep-going``, which unnecessarily runs failed runs) +- SPRAS occasionally outputs empty files as genuine output +- We need to log errors that happen for user knowledge we opt to use a ``resource-info.json`` file, which keeps track of the success/failure status at certain failable parts of the workflow. This diff --git a/docs/index.rst b/docs/index.rst index 6966692e5..42fc92f82 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -63,7 +63,6 @@ reconstruction methods (PRMs) to omics data. contributing/patching contributing/design - .. toctree:: :maxdepth: 1 :caption: Internal Designs diff --git a/spras/diamond.py b/spras/diamond.py index e8d5ed7d3..6b48575f9 100644 --- a/spras/diamond.py +++ b/spras/diamond.py @@ -63,7 +63,7 @@ def generate_inputs(data, filename_map): edges_df.to_csv(filename_map["network"], columns=["Interactor1", "Interactor2"], index=False, header=None, sep=',') @staticmethod - def run(inputs, output_file, args, container_settings=None): + def run(inputs, output_file, args, container_settings=None, timeout=None): if not container_settings: container_settings = ProcessedContainerSettings() DIAMOnD.validate_required_run_args(inputs) @@ -100,7 +100,8 @@ def run(inputs, output_file, args, container_settings=None): volumes, work_dir, out_dir, - container_settings) + container_settings, + timeout) except ContainerError as err: if err.streams_contain("KeyError: 'nix'"): raise RuntimeError(f"{err.stderr}\n" + \ From 53e96a750d0eac15aaee7ebd444a733f2f8b0947 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sat, 25 Apr 2026 13:12:57 -0700 Subject: [PATCH 26/59] style: fmt --- docs/usage.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/usage.rst b/docs/usage.rst index 6b4b2d053..e4932df63 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -3,7 +3,8 @@ ############# SPRAS is run through `Snakemake `_, -which comes with the SPRAS conda environment and as a dependency of SPRAS. +which comes with the SPRAS conda environment and as a dependency of +SPRAS. To run SPRAS, run the following command inside the ``spras`` directory, specifying a ``config.yaml`` and the number of cores to run SPRAS with: From 641608f63e69a9cd55109f767c66aeaec57a642b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Sat, 25 Apr 2026 22:10:16 +0000 Subject: [PATCH 27/59] feat: singularity timeouts --- spras/containers.py | 62 ++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/spras/containers.py b/spras/containers.py index 2d3b01826..583b87653 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -417,7 +417,16 @@ def run_container_docker( return out -def run_container_singularity(container: str, command: List[str], volumes: List[Tuple[PurePath, PurePath]], working_dir: str, out_dir: str | os.PathLike, config: ProcessedContainerSettings, environment: Optional[dict[str, str]] = None): +def run_container_singularity( + container: str, + command: List[str], + volumes: List[Tuple[PurePath, PurePath]], + working_dir: str, + out_dir: str | os.PathLike, + config: ProcessedContainerSettings, + environment: Optional[dict[str, str]] = None, + timeout: Optional[int] = None, +): """ Runs a command in the container using Singularity. Only available on Linux. @@ -427,6 +436,7 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ @param working_dir: the working directory in the container @param out_dir: output directory for the rule's artifacts -- used here to store profiling data @param environment: environment variable to set in the container + @param timeout: the timeout (in seconds), throwing a TimeoutException if the timeout is reached. @return: output from Singularity execute """ @@ -489,37 +499,43 @@ def run_container_singularity(container: str, command: List[str], volumes: List[ # If not using the expanded sandbox image, we still need to prepend the docker:// prefix # so apptainer knows to pull and convert the image format from docker to apptainer. image_to_run = expanded_image if expanded_image else "docker://" + container + # We won't end up using the spython client if profiling or timeout is enabled because + # we need to run everything manually to set up the cgroup and add the timeout command as a prefix. + # Build the apptainer run command, which gets passed to the cgroup wrapper script + cmd = [ + "apptainer", "exec" + ] + for bind in bind_paths: + cmd.extend(["--bind", bind]) + cmd.extend(singularity_options) + cmd.append(str(image_to_run)) + cmd.extend(command) + + my_cgroup: Optional[str] = None if config.enable_profiling: - # We won't end up using the spython client if profiling is enabled because - # we need to run everything manually to set up the cgroup - # Build the apptainer run command, which gets passed to the cgroup wrapper script - singularity_cmd = [ - "apptainer", "exec" - ] - for bind in bind_paths: - singularity_cmd.extend(["--bind", bind]) - singularity_cmd.extend(singularity_options) - singularity_cmd.append(str(image_to_run)) - singularity_cmd.extend(command) - my_cgroup = create_peer_cgroup() # The wrapper script is packaged with spras, and should be located in the same directory # as `containers.py`. wrapper = os.path.join(os.path.dirname(__file__), "cgroup_wrapper.sh") - cmd = [wrapper, my_cgroup] + singularity_cmd - proc = subprocess.run(cmd, capture_output=True, text=True, stderr=subprocess.STDOUT) + cmd = [wrapper, my_cgroup] + cmd + if timeout is not None: + cmd = ["timeout", f"{timeout}s"] + cmd + proc = subprocess.run(cmd, capture_output=True, text=True, stderr=subprocess.STDOUT) + + # As per unix `timeout`, this is the status if the command times out and --preserve-status is not initially specified + # (where the latter above holds). + if proc.returncode == 124: + if timeout is not None: + raise TimeoutError(timeout) + else: + raise RuntimeError("Timeout return code occurred, yet `timeout` wasn't specified. " + \ + "Please file an issue with this error and stacktrace at https://github.com/Reed-CompBio/spras/issues/new.") + if my_cgroup is not None: print("Reading memory and CPU stats from cgroup") create_apptainer_container_stats(my_cgroup, str(out_dir)) - result = proc.stdout - else: - result = Client.execute( - image=image_to_run, - command=command, - options=singularity_options, - bind=bind_paths - ) + result = proc.stdout return result From c5ff6ad606a80ab5fbe19ec9d21403868191358c Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Sat, 25 Apr 2026 22:13:02 +0000 Subject: [PATCH 28/59] docs: mention works with apptainer --- docs/timeout.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/timeout.rst b/docs/timeout.rst index a05e26244..a394ca6f6 100644 --- a/docs/timeout.rst +++ b/docs/timeout.rst @@ -16,4 +16,5 @@ The timeout string parsing is delegated to `pytimeparse `__, which allows for more complicated timeout strings, such as ``3d2h32m``. -**NOTE**: This feature only works with docker at the time of writing. +**NOTE**: This feature only works with docker and apptainer/singularity +at the time of writing. From 208eb4a9874e0ce68be8cb3c7204243a1fb2404e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Sat, 25 Apr 2026 23:29:04 +0000 Subject: [PATCH 29/59] fix: don't use capture_output and stderr in the same command --- spras/containers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spras/containers.py b/spras/containers.py index 583b87653..336e06f5a 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -499,6 +499,7 @@ def run_container_singularity( # If not using the expanded sandbox image, we still need to prepend the docker:// prefix # so apptainer knows to pull and convert the image format from docker to apptainer. image_to_run = expanded_image if expanded_image else "docker://" + container + # We won't end up using the spython client if profiling or timeout is enabled because # we need to run everything manually to set up the cgroup and add the timeout command as a prefix. # Build the apptainer run command, which gets passed to the cgroup wrapper script @@ -520,7 +521,7 @@ def run_container_singularity( cmd = [wrapper, my_cgroup] + cmd if timeout is not None: cmd = ["timeout", f"{timeout}s"] + cmd - proc = subprocess.run(cmd, capture_output=True, text=True, stderr=subprocess.STDOUT) + proc = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # As per unix `timeout`, this is the status if the command times out and --preserve-status is not initially specified # (where the latter above holds). From 7a0c4f39bc9ba4aa8e54a03a817e91fa9197214b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Sat, 25 Apr 2026 23:52:39 +0000 Subject: [PATCH 30/59] fix: use correct variable reference for reconstruct --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 61894287d..c81d48870 100644 --- a/Snakefile +++ b/Snakefile @@ -307,7 +307,7 @@ rule reconstruct: if '_spras_run_name' in algorithm_params: algorithm_params.pop('_spras_run_name') try: - runner.run(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), inputs, output.pathway_file, params, container_settings, params.timeout) + runner.run(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), inputs, output.pathway_file, algorithm_params, container_settings, params.timeout) Path(output.resource_info).write_text(json.dumps({"status": "success"})) except TimeoutError as err: # We don't raise the error here (analogous to `--keep-going`, except we avoid unnecessarily re-running this rule.) From 4160db6723695e10890e99abe2cbb006ae28551a Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 29 Apr 2026 04:31:46 +0000 Subject: [PATCH 31/59] docs: move snakemake -> spras run --- docs/tutorial/beginner.rst | 6 +++--- docs/tutorial/intermediate.rst | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/tutorial/beginner.rst b/docs/tutorial/beginner.rst index a0846666f..a50c43c90 100644 --- a/docs/tutorial/beginner.rst +++ b/docs/tutorial/beginner.rst @@ -285,7 +285,7 @@ From the root directory, run the command below from the command line: .. code:: bash - snakemake --cores 1 --configfile config/beginner.yaml + spras run --cores 1 --configfile config/beginner.yaml This command starts the workflow manager that automates all steps defined by SPRAS. It tells Snakemake to use one CPU core and to load @@ -430,7 +430,7 @@ After saving the changes, rerun with: .. code:: bash - snakemake --cores 1 --configfile config/beginner.yaml + spras run --cores 1 --configfile config/beginner.yaml What happens when you run this command -------------------------------------- @@ -600,7 +600,7 @@ After saving the changes, rerun with: .. code:: bash - snakemake --cores 1 --configfile config/beginner.yaml + spras run --cores 1 --configfile config/beginner.yaml What happens when you run this command -------------------------------------- diff --git a/docs/tutorial/intermediate.rst b/docs/tutorial/intermediate.rst index 1055b8799..21c1e84ad 100644 --- a/docs/tutorial/intermediate.rst +++ b/docs/tutorial/intermediate.rst @@ -542,7 +542,7 @@ From the root directory, run the command below from the command line: .. code:: bash - snakemake --cores 4 --configfile config/intermediate.yaml + spras run --cores 4 --configfile config/intermediate.yaml What happens when you run this command -------------------------------------- @@ -836,7 +836,7 @@ After saving the changes in the configuration file, rerun with: .. code:: bash - snakemake --cores 4 --configfile config/intermediate.yaml + spras run --cores 4 --configfile config/intermediate.yaml What happens when you run this command -------------------------------------- From c8fc32ae41ccbfd213a7168cddb2e0900c64b2d7 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Wed, 29 Apr 2026 05:52:32 +0000 Subject: [PATCH 32/59] chore: update dependencies --- environment.yml | 32 ++++++++++++++++---------------- pyproject.toml | 26 +++++++++++++------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/environment.yml b/environment.yml index 4361834ae..9c28badd2 100644 --- a/environment.yml +++ b/environment.yml @@ -3,16 +3,16 @@ channels: - conda-forge dependencies: - adjusttext=1.3.0 - - bioconda::snakemake-minimal=9.6.2 + - bioconda::snakemake-minimal=9.19.0 # Conda refers to pypi/docker as docker-py. - docker-py=7.1.0 - - matplotlib=3.10.3 - - networkx=3.5 - - pandas=2.3.0 - - pydantic=2.11.7 - - numpy=2.3.1 - - requests=2.32.4 - - scikit-learn=1.7.0 + - matplotlib=3.10.9 + - networkx=3.6.1 + - pandas=3.0.2 + - pydantic=2.13.3 + - numpy=2.4.4 + - requests=2.33.1 + - scikit-learn=1.8.0 - seaborn=0.13.2 - spython=0.3.14 @@ -24,16 +24,16 @@ dependencies: - tabulate=0.9.0 # toolchain deps - - pip=25.3 + - pip=26.1 # This should be the same as requires-python minus the >=. - - python=3.11 + - python=3.13 # development dependencies - - pre-commit=4.2.0 - - pytest=8.4.1 - - pytest-split=0.10.0 - - sphinx=7.4.7 - - sphinx-rtd-theme=2.0.0 + - pre-commit=4.6.0 + - pytest=9.0.3 + - pytest-split=0.11.0 + - sphinx=9.1.0 + - sphinx-rtd-theme=3.1.0 - pip: - - dsub==0.4.13 + - dsub==0.5.2 diff --git a/pyproject.toml b/pyproject.toml index bfc602c6d..46166ca83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,29 +18,29 @@ classifiers = [ ] requires-python = ">=3.11" dependencies = [ - "adjusttext==0.7.3", - "snakemake==9.6.2", + "adjusttext==1.3.0", + "snakemake==9.19.0", "docker==7.1.0", - "matplotlib==3.10.3", - "networkx==3.5", - "pandas==2.3.0", - "pydantic==2.11.7", - "numpy==2.3.1", - "requests==2.32.4", - "scikit-learn==1.7.0", + "matplotlib==3.10.9", + "networkx==3.6.1", + "pandas==3.0.2", + "pydantic==2.13.3", + "numpy==2.4.4", + "requests==2.33.1", + "scikit-learn==1.8.0", "seaborn==0.13.2", "spython==0.3.14", # toolchain deps - "pip==25.3", + "pip==26.1", ] [project.optional-dependencies] dev = [ # Only required for development - "pre-commit==4.2.0", - "pytest==8.4.1", - "pytest-split==0.10.0", + "pre-commit==4.6.0", + "pytest==9.0.3", + "pytest-split==0.11.0", ] [project.urls] From 7b3e383853676c0c7471b8962f34010cb280de9e Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 30 Apr 2026 21:24:31 +0000 Subject: [PATCH 33/59] fix: bump python to 3.13 --- .pre-commit-config.yaml | 2 +- .readthedocs.yaml | 2 +- environment.yml | 10 +++++----- pyproject.toml | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fe010814e..3b0581af9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks default_language_version: # Match this to the version specified in environment.yml - python: python3.11 + python: python3.13 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 # Use the ref you want to point at diff --git a/.readthedocs.yaml b/.readthedocs.yaml index cd6d38839..612c1d92f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,7 +8,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: "3.11" + python: "3.13" # Build documentation in the "docs/" directory with Sphinx sphinx: diff --git a/environment.yml b/environment.yml index 9c28badd2..197fe2146 100644 --- a/environment.yml +++ b/environment.yml @@ -18,13 +18,13 @@ dependencies: # conda-specific for dsub - python-dateutil=2.9.0 - - pytz=2025.2 - - pyyaml=6.0.2 - - tenacity=9.1.2 - - tabulate=0.9.0 + - pytz=2026.1 + - pyyaml=6.0.3 + - tenacity=9.1.4 + - tabulate=0.10.0 # toolchain deps - - pip=26.1 + - pip=26.0.1 # This should be the same as requires-python minus the >=. - python=3.13 diff --git a/pyproject.toml b/pyproject.toml index 46166ca83..45f9b572a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ classifiers = [ "Programming Language :: Python :: 3", "Topic :: Scientific/Engineering :: Bio-Informatics", ] -requires-python = ">=3.11" +requires-python = ">=3.13" dependencies = [ "adjusttext==1.3.0", "snakemake==9.19.0", @@ -32,7 +32,7 @@ dependencies = [ "spython==0.3.14", # toolchain deps - "pip==26.1", + "pip==26.0.1", ] [project.optional-dependencies] @@ -52,7 +52,7 @@ requires = ["setuptools>=64.0"] build-backend = "setuptools.build_meta" [tool.ruff] -target-version = "py311" +target-version = "py313" # Autofix errors when possible fix = true # Select categories or specific rules from https://beta.ruff.rs/docs/rules/ From 641df788aee2d954f631b8e91bc7bc3b05c87c5b Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 30 Apr 2026 21:26:35 +0000 Subject: [PATCH 34/59] fix: bump SPRAS image to Python 3.13 --- docker-wrappers/SPRAS/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker-wrappers/SPRAS/Dockerfile b/docker-wrappers/SPRAS/Dockerfile index 3d69943cc..4bb0d660f 100644 --- a/docker-wrappers/SPRAS/Dockerfile +++ b/docker-wrappers/SPRAS/Dockerfile @@ -4,7 +4,7 @@ FROM almalinux:9 RUN dnf update -y && \ dnf install -y epel-release && \ dnf install -y gcc gcc-c++ \ - python3.11 python3.11-pip python3.11-devel \ + python3.13 python3.13-pip python3.13-devel \ docker apptainer && \ dnf clean all @@ -13,4 +13,4 @@ RUN chmod -R 777 /spras WORKDIR /spras # Install spras into the container -RUN pip3.11 install . +RUN pip3.13 install . From fe68153146ee4db4e90daee23048a4344523491b Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Sun, 3 May 2026 20:00:42 -0700 Subject: [PATCH 35/59] refactor: move errors to be pydantic, add duration cmt Co-Authored-By: Neha Talluri <78840540+ntalluri@users.noreply.github.com> --- Snakefile | 27 +++++---------- docs/design/errors.rst | 30 ++++++++++++++--- docs/timeout.rst | 14 ++++---- spras/containers.py | 3 +- spras/errors.py | 75 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 119 insertions(+), 30 deletions(-) create mode 100644 spras/errors.py diff --git a/Snakefile b/Snakefile index c81d48870..9575bcacd 100644 --- a/Snakefile +++ b/Snakefile @@ -10,6 +10,7 @@ from spras.evaluation import Evaluation from spras.analysis import ml, summary, cytoscape from spras.config.revision import detach_spras_revision import spras.config.config as _config +from spras.errors import mark_error, mark_success, is_error, TimeoutArtifactError # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -265,18 +266,6 @@ def collect_prepared_input(wildcards): return prepared_inputs -def mark_error(file, **details): - """Marks a file as an error with associated details.""" - Path(file).write_text(json.dumps({"status": "error", **details})) - -def is_error(file): - """Checks if a file was produced by mark_error.""" - try: - with open(file, 'r') as f: - json.load(f)["status"] == "error" - except ValueError: - return False - def filter_successful(files): """Convenient function for filtering iterators by whether or not their items are error files.""" return [file for file in files if not is_error(file)] @@ -292,7 +281,7 @@ rule reconstruct: output: pathway_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'raw-pathway.txt']), # Despite this being a 'log' file, we don't use the log directive as this rule doesn't actually throw errors. - resource_info = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'resource-log.json']) + artifact_info = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'artifact-log.json']) params: # Get the timeout from the config and use it as an input. # TODO: This has unexpected behavior when this rule succeeds but the timeout extends, @@ -308,11 +297,11 @@ rule reconstruct: algorithm_params.pop('_spras_run_name') try: runner.run(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), inputs, output.pathway_file, algorithm_params, container_settings, params.timeout) - Path(output.resource_info).write_text(json.dumps({"status": "success"})) + mark_success(output.artifact_info) except TimeoutError as err: # We don't raise the error here (analogous to `--keep-going`, except we avoid unnecessarily re-running this rule.) - mark_error(output.resource_info, type="timeout", duration=params.timeout) - # and we touch pathway_file still: Snakemake doesn't have optional files, so we output a 'resource info' file, + mark_error(output.artifact_info, TimeoutArtifactError(duration=params.timeout)) + # and we touch pathway_file still: Snakemake doesn't have optional files, so we output a 'artifact info' file, # which contains the status (success/failure) of specific Snakemake jobs. # We filter for the successful files (such as ones that didn't time out) with the `filter_successful` function. Path(output.pathway_file).touch() @@ -321,13 +310,13 @@ rule reconstruct: # Use PRRunner as a wrapper to call the algorithm-specific parse_output rule parse_output: input: - # We propagate up the resource_info error if it exists. - resource_info = rules.reconstruct.output.resource_info, + # We propagate up the artifact_info error if it exists. + artifact_info = rules.reconstruct.output.artifact_info, raw_file = rules.reconstruct.output.pathway_file, dataset_file = SEP.join([out_dir, 'dataset-{dataset}-merged.pickle']) output: standardized_file = SEP.join([out_dir, '{dataset}-{algorithm}-{params}', 'pathway.txt']) run: - if is_error(input.resource_info): + if is_error(input.artifact_info): mark_error(output.standardized_file) return diff --git a/docs/design/errors.rst b/docs/design/errors.rst index 7ce212ee8..009065f0d 100644 --- a/docs/design/errors.rst +++ b/docs/design/errors.rst @@ -4,9 +4,7 @@ By default, whenever SPRAS runs into a container error (i.e. an internal algorithm error), the full workflow will fail. However, there are -certain designated errors where we don't want this to be the case (at -the moment, these designated errors are only container timeouts, but -this may be extended to heuristics in the future). +certain designated errors where we don't want this to be the case. Due to the following design constraints: @@ -15,7 +13,31 @@ Due to the following design constraints: - SPRAS occasionally outputs empty files as genuine output - We need to log errors that happen for user knowledge -we opt to use a ``resource-info.json`` file, which keeps track of the +we opt to use an ``artifact-info.json`` file, which keeps track of the success/failure status at certain failable parts of the workflow. This file contains whether or not this part of the workflow succeeded, and if it failed, how it failed. + +The ``artifact-info.json`` stores a JSON object, containing: + +- The key ``status``, which is either the value ``success`` or + ``error``, depending on what happened in the workflow. + +- If ``status`` is ``error``, we store associated error details in the + ``details`` key, which contains an object: + + - The ``details`` object varies per error in the form of a tagged + union: they are differentiated by the ``type`` key. We describe + each error down below. + +************* + Error Types +************* + +With the above schema, we detail the ``details`` key below. + +Timeout +======= + +Timeout has ``type: timeout``, with a single key ``duration``, which +describes the ``timeout`` value associated to that run. diff --git a/docs/timeout.rst b/docs/timeout.rst index a394ca6f6..e7a85a1eb 100644 --- a/docs/timeout.rst +++ b/docs/timeout.rst @@ -2,9 +2,8 @@ Timeouts ########## -SPRAS allows for per-algorithm timeouts, specified under the global -configuration file. For example, to give the AllPairs algorithm a 1 day -timeout: +The SPRAS global configuration can take optional per-algorithm timeouts. +For example, to give the AllPairs algorithm a 1 day timeout: .. code:: yaml @@ -13,8 +12,11 @@ timeout: timeout: 1d The timeout string parsing is delegated to `pytimeparse -`__, which allows for more -complicated timeout strings, such as ``3d2h32m``. +`__ (examples linked here). This +allows for more complicated timeout strings, such as ``3d2h32m``. + +If ``timeout`` is not specified, the algorithm will never be interrupted +due to running too long. **NOTE**: This feature only works with docker and apptainer/singularity -at the time of writing. +at the time of writing, not dsub. diff --git a/spras/containers.py b/spras/containers.py index 336e06f5a..8c3f26b67 100644 --- a/spras/containers.py +++ b/spras/containers.py @@ -1,3 +1,4 @@ +import datetime import os import platform import re @@ -174,7 +175,7 @@ class TimeoutError(RuntimeError): def __init__(self, timeout: int, *args): self.timeout = timeout - self.message = f"Timed out after {timeout}s." + self.message = f"Timed out after {datetime.timedelta(seconds=timeout)}." super(TimeoutError, self).__init__(timeout, *args) diff --git a/spras/errors.py b/spras/errors.py new file mode 100644 index 000000000..40d686927 --- /dev/null +++ b/spras/errors.py @@ -0,0 +1,75 @@ +""" +These are errors for the SPRAS workflow: we describe these as 'artifact' information, +as Snakemake produces artifacts, and the error/success status of these artifacts is associated with +a separate file, named `artifact-info.json`. Note that an `artifact-info.json` file is attached to a +part of the workflow run, which may produce multiple artifacts, and not a single artifact. + +This file makes some heavy use of pydantic discriminated unions and type adapters, +both of which happen to be described in the unions page: +https://pydantic.dev/docs/validation/latest/concepts/unions/#discriminated-unions +""" + +import json +from pathlib import Path +from typing import Annotated, Literal, Union + +from pydantic import BaseModel, Field, TypeAdapter + +from spras.util import LoosePathLike + +class TimeoutArtifactError(BaseModel): + # We can't use the key `type` without some extra pydantic aliasing. + error_type: Literal['timeout'] = 'timeout' + duration: int + +# NOTE: when we have several errors, replace this with Union[TimeoutError, , ...] +"""All possible distinguished errors.""" +ArtifactErrorOptions = TimeoutArtifactError + +class ArtifactError(BaseModel): + """ + One of the two variants of artifact information describing errors. See `ArtifactSuccess` for the other option. + + This variant is returned when we have a failing point in the workflow, + with `details` delegating to `ArtifactErrorOptions` for more information about the error. + """ + details: ArtifactErrorOptions = Field(discriminator="error_type") + status: Literal['error'] = 'error' + +class ArtifactSuccess(BaseModel): + """ + One of the two variants of artifact information describing successes. See `ArtifactError` for the other option. + + This variant only says that this part of the workflow succeeded. + """ + status: Literal['success'] = 'success' + +"""Describes what happened to a [potentially collection of] artifacts at a point in the workflow.""" +ArtifactInfo = Annotated[Union[ArtifactError, ArtifactSuccess], Field(discriminator="status")] +ArtifactInfoAdapter = TypeAdapter[ArtifactInfo](ArtifactInfo) + +# Collection of Snakemake utilities. + +def artifact_info_to_str(artifact_info: ArtifactInfo) -> str: + """Converts some `ArtifactInfo` into a string.""" + return json.dumps(ArtifactInfoAdapter.validate_python(artifact_info).model_dump(mode='json')) + +def artifact_info_from_file(file: LoosePathLike) -> ArtifactInfo: + """Converts a file into ArtifactInfo.""" + with open(file, 'r') as f: + return ArtifactInfoAdapter.validate_json(json.load(f)) + +def mark_error(file: LoosePathLike, artifact_error: ArtifactErrorOptions): + """Marks an artifact information file as an error with associated details.""" + Path(file).write_text(artifact_info_to_str(ArtifactError(details=artifact_error))) + +def mark_success(file: LoosePathLike): + """Marks an artifact information file as successful""" + Path(file).write_text(artifact_info_to_str(ArtifactSuccess())) + +def is_error(file: LoosePathLike): + """Checks if a file was produced by mark_error.""" + try: + return artifact_info_from_file(file).status == "error" + except ValueError: + return False From 071d4bd11652fd2fd4a12e0ba4298b06361fd925 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 03:04:25 +0000 Subject: [PATCH 36/59] style: fmt --- spras/errors.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spras/errors.py b/spras/errors.py index 40d686927..5b94ccf54 100644 --- a/spras/errors.py +++ b/spras/errors.py @@ -17,6 +17,7 @@ from spras.util import LoosePathLike + class TimeoutArtifactError(BaseModel): # We can't use the key `type` without some extra pydantic aliasing. error_type: Literal['timeout'] = 'timeout' From e42c3f0d54205e697a806c5698a84c669f34872b Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 03:17:46 +0000 Subject: [PATCH 37/59] fix: regenerate fordevs --- docs/design/errors.rst | 2 +- docs/fordevs/spras.config.rst | 36 +++++++++++++++++++++++++++++++++++ docs/fordevs/spras.rst | 27 ++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) diff --git a/docs/design/errors.rst b/docs/design/errors.rst index 009065f0d..4d436601f 100644 --- a/docs/design/errors.rst +++ b/docs/design/errors.rst @@ -39,5 +39,5 @@ With the above schema, we detail the ``details`` key below. Timeout ======= -Timeout has ``type: timeout``, with a single key ``duration``, which +Timeout has ``type: "timeout"``, with a single key ``duration``, which describes the ``timeout`` value associated to that run. diff --git a/docs/fordevs/spras.config.rst b/docs/fordevs/spras.config.rst index f7161447e..a0f6f2e9a 100644 --- a/docs/fordevs/spras.config.rst +++ b/docs/fordevs/spras.config.rst @@ -6,6 +6,15 @@ Submodules ************ +******************************** + spras.config.algorithms module +******************************** + +.. automodule:: spras.config.algorithms + :members: + :undoc-members: + :show-inheritance: + **************************** spras.config.config module **************************** @@ -15,6 +24,33 @@ :undoc-members: :show-inheritance: +************************************** + spras.config.container_schema module +************************************** + +.. automodule:: spras.config.container_schema + :members: + :undoc-members: + :show-inheritance: + +***************************** + spras.config.dataset module +***************************** + +.. automodule:: spras.config.dataset + :members: + :undoc-members: + :show-inheritance: + +****************************** + spras.config.revision module +****************************** + +.. automodule:: spras.config.revision + :members: + :undoc-members: + :show-inheritance: + **************************** spras.config.schema module **************************** diff --git a/docs/fordevs/spras.rst b/docs/fordevs/spras.rst index 8bd5060fe..d63f52e9c 100644 --- a/docs/fordevs/spras.rst +++ b/docs/fordevs/spras.rst @@ -52,6 +52,15 @@ :undoc-members: :show-inheritance: +********************** + spras.diamond module +********************** + +.. automodule:: spras.diamond + :members: + :undoc-members: + :show-inheritance: + ********************* spras.domino module ********************* @@ -61,6 +70,15 @@ :undoc-members: :show-inheritance: +********************* + spras.errors module +********************* + +.. automodule:: spras.errors + :members: + :undoc-members: + :show-inheritance: + ************************* spras.evaluation module ************************* @@ -142,6 +160,15 @@ :undoc-members: :show-inheritance: +************************ + spras.profiling module +************************ + +.. automodule:: spras.profiling + :members: + :undoc-members: + :show-inheritance: + ************************** spras.responsenet module ************************** From 28cfec0fe260b902aefee78661929c2f4f40e8b2 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 09:17:00 +0000 Subject: [PATCH 38/59] feat: `RunSettings` --- Snakefile | 2 +- config/config.yaml | 55 ++++++++++++--------- config/egfr.yaml | 84 ++++++++++++++++++-------------- docs/timeout.rst | 41 ++++++++++++++-- docs/tutorial/beginner.rst | 14 +++--- spras/config/algorithms.py | 42 +++++++++++----- spras/config/config.py | 29 ++++++----- test/analysis/input/egfr.yaml | 10 ++-- test/analysis/input/example.yaml | 15 +++--- 9 files changed, 190 insertions(+), 102 deletions(-) diff --git a/Snakefile b/Snakefile index 9575bcacd..a901f64bb 100644 --- a/Snakefile +++ b/Snakefile @@ -286,7 +286,7 @@ rule reconstruct: # Get the timeout from the config and use it as an input. # TODO: This has unexpected behavior when this rule succeeds but the timeout extends, # making this rule run again. - timeout = lambda wildcards: _config.config.algorithm_timeouts[wildcards.algorithm] + timeout = lambda wildcards: _config.config.algorithm_param_timeouts[wildcards.params.split("-")[1]] run: # Create a copy so that the updates are not written to the parameters logfile algorithm_params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() diff --git a/config/config.yaml b/config/config.yaml index 9c5ddbaea..b0e1df94e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -73,41 +73,47 @@ algorithms: include: true runs: run1: - k: range(100,201,100) + params: + k: range(100,201,100) - name: "omicsintegrator1" include: true runs: run1: - b: [5, 6] - w: np.linspace(0,5,2) - d: 10 - dummy_mode: "file" # Or "terminals", "all", "others" + params: + b: [5, 6] + w: np.linspace(0,5,2) + d: 10 + dummy_mode: "file" # Or "terminals", "all", "others" - name: "omicsintegrator2" include: true runs: run1: - b: 4 - g: 0 + params: + b: 4 + g: 0 run2: - b: 2 - g: 3 + params: + b: 2 + g: 3 - name: "meo" include: true runs: run1: - max_path_length: 3 - local_search: true - rand_restarts: 10 + params: + max_path_length: 3 + local_search: true + rand_restarts: 10 - name: "mincostflow" include: true runs: run1: - flow: 1 - capacity: 1 + params: + flow: 1 + capacity: 1 - name: "allpairs" include: true @@ -117,22 +123,25 @@ algorithms: include: true runs: run1: - slice_threshold: 0.3 - module_threshold: 0.05 + params: + slice_threshold: 0.3 + module_threshold: 0.05 - name: "strwr" include: true runs: run1: - alpha: [0.85] - threshold: [100, 200] + params: + alpha: [0.85] + threshold: [100, 200] - name: "rwr" include: true runs: run1: - alpha: [0.85] - threshold: [100, 200] + params: + alpha: [0.85] + threshold: [100, 200] - name: "bowtiebuilder" include: true @@ -141,12 +150,14 @@ algorithms: include: true runs: run1: - gamma: [10] + params: + gamma: [10] - name: "diamond" include: true runs: run1: - n: 1 + params: + n: 1 # Here we specify which pathways to run and other file location information. # DataLoader.py can currently only load a single dataset diff --git a/config/egfr.yaml b/config/egfr.yaml index b93c593c4..623b0121f 100644 --- a/config/egfr.yaml +++ b/config/egfr.yaml @@ -31,77 +31,89 @@ algorithms: include: true runs: run1: - k: - - 10 - - 20 - - 70 + params: + k: + - 10 + - 20 + - 70 - name: omicsintegrator1 include: true runs: run1: - b: - - 0.55 - - 2 - - 10 - d: 10 - g: 1e-3 - r: 0.01 - w: 0.1 - mu: 0.008 - dummy_mode: ["file"] + params: + b: + - 0.55 + - 2 + - 10 + d: 10 + g: 1e-3 + r: 0.01 + w: 0.1 + mu: 0.008 + dummy_mode: ["file"] - name: omicsintegrator2 include: true runs: run1: - b: 4 - g: 0 + params: + b: 4 + g: 0 run2: - b: 2 - g: 3 + params: + b: 2 + g: 3 - name: meo include: true runs: run1: - local_search: true - max_path_length: 3 - rand_restarts: 10 + params: + local_search: true + max_path_length: 3 + rand_restarts: 10 run2: - local_search: false - max_path_length: 2 - rand_restarts: 10 + params: + local_search: false + max_path_length: 2 + rand_restarts: 10 - name: allpairs include: true - name: domino include: true runs: run1: - slice_threshold: 0.3 - module_threshold: 0.05 + params: + slice_threshold: 0.3 + module_threshold: 0.05 - name: mincostflow include: true runs: run1: - capacity: 15 - flow: 80 + params: + capacity: 15 + flow: 80 run2: - capacity: 1 - flow: 6 + params: + capacity: 1 + flow: 6 run3: - capacity: 5 - flow: 60 + params: + capacity: 5 + flow: 60 - name: "strwr" include: true runs: run1: - alpha: [0.85] - threshold: [100, 200] + params: + alpha: [0.85] + threshold: [100, 200] - name: "rwr" include: true runs: run1: - alpha: [0.85] - threshold: [100, 200] + params: + alpha: [0.85] + threshold: [100, 200] - name: "bowtiebuilder" include: false diff --git a/docs/timeout.rst b/docs/timeout.rst index e7a85a1eb..a2e4b64f1 100644 --- a/docs/timeout.rst +++ b/docs/timeout.rst @@ -3,13 +3,17 @@ ########## The SPRAS global configuration can take optional per-algorithm timeouts. -For example, to give the AllPairs algorithm a 1 day timeout: +For example, to give a specific run of the PathLinker algorithm a 1 day +timeout: .. code:: yaml - - name: "allpairs" + - name: "pathlinker" include: true - timeout: 1d + runs: + run1: + timeout: 1d + k: 200 The timeout string parsing is delegated to `pytimeparse `__ (examples linked here). This @@ -20,3 +24,34 @@ due to running too long. **NOTE**: This feature only works with docker and apptainer/singularity at the time of writing, not dsub. + +********************* + Configuration notes +********************* + +Since ``timeout`` is a run parameter, it can also be moved to the top +level of an algorithm configuration, where that value will become the +default unless otherwise specified in a specific run. + +.. code:: yaml + + - name: "pathlinker" + include: true + timeout: 1d + runs: + run1: + # this uses timeout: 2d + timeout: 2d + k: 200 + run2: + # this uses timeout: 1d + k: 100 + +This is also useful for algorithms which take in no parameters, such as +``allpairs``: + +.. code:: yaml + + - name: "allpairs" + include: true + timeout: 1d diff --git a/docs/tutorial/beginner.rst b/docs/tutorial/beginner.rst index a0846666f..f4de401d9 100644 --- a/docs/tutorial/beginner.rst +++ b/docs/tutorial/beginner.rst @@ -161,13 +161,15 @@ Algorithms params: include: true run1: - b: 0.1 - d: 10 - g: 1e-3 + params: + b: 0.1 + d: 10 + g: 1e-3 run2: - b: [0.55, 2, 10] - d: [10, 20] - g: 1e-3 + params: + b: [0.55, 2, 10] + d: [10, 20] + g: 1e-3 When defining an algorithm in the configuration file, its name must match one of the supported SPRAS algorithms. Each algorithm includes an diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 5abb4dbf6..af2910ee1 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -22,6 +22,12 @@ # This contains the dynamically generated algorithm schema for use in `schema.py` __all__ = ['AlgorithmUnion'] +class RunSettings(BaseModel): + """The associated timeout with a run, parsed with `pytimeparse`.""" + timeout: Optional[str] = None + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) + def is_numpy_friendly(type: type[Any] | None) -> bool: """ Whether the passed in type can have any numpy helpers. @@ -100,12 +106,6 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod - Ranges and other convenient calls are expanded (see `python_evalish_coerce`) """ - # Get the default model instance by trying to serialize the empty dictionary - try: - model_default = model.model_validate({}) - except ValidationError: - model_default = None - # First, we need to take our 'model' and coerce it to permit parameter combinations. # This assumes that all of the keys are flattened, so we only get a structure like so: # class AlgorithmParams(BaseModel): @@ -151,12 +151,30 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod # Pass this as kwargs to create_model, which usually takes in parameters field_name=type. # We do need to cast create_model, since otherwise the type-checker complains that we may # have had a key that starts with __ in mapped_list_fields. The above assertion prevents this. - run_model = (cast(Any, create_model))( - f'{name}RunModel', + params_model = (cast(Any, create_model))( + f'{name}ParamModel', __config__=ConfigDict(extra='forbid'), **mapped_list_field ) + # Get the default model instance by trying to serialize the empty dictionary + try: + params_model_default = params_model.model_validate({}) + except ValidationError: + params_model_default = None + + # Then, we create a wrapping `run_model` which contains our params_model, + # as well as any associated options with an individual run. + run_model = create_model( + f'{name}RunModel', + params=(params_model, params_model_default), + __base__=RunSettings, + __config__=ConfigDict(extra='forbid') + ) + + # We use `model_validate` instead of the `run_model` constructor since `run_model` is based off of `RunSettings` + run_model_default = None if params_model_default is None else run_model.model_validate({"params": params_model_default}) + # Here is an example of how this would look like inside config.yaml # name: pathlinker # include: true @@ -167,7 +185,6 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod return create_model( f'{name}Model', name=Literal[name], - timeout=(Optional[str], None), include=bool, # For algorithms that have a default parameter config, we allow arbitrarily running an algorithm # if no runs are specified. For example, the following config @@ -175,8 +192,11 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod # include: true # will run, despite there being no entries in `runs`. # (create_model entries take in either a type or (type, default)). - runs=dict[str, run_model] if model_default is None else (dict[str, run_model], {"default": model_default}), - __config__=ConfigDict(extra='forbid') + runs=dict[str, run_model] if run_model_default is None else (dict[str, run_model], {"default": run_model_default}), + __config__=ConfigDict(extra='forbid'), + # Note that both entire algorithms and their runs inherit from `RunSettings`, to allow default runs such as + # `allpairs` to have run-specific settings (e.g. allpairs with timeout.) + __base__=RunSettings ) algorithm_models: list[type[BaseModel]] = [construct_algorithm_model(name, model.get_params_generic()) for name, model in algorithms.items()] diff --git a/spras/config/config.py b/spras/config/config.py index ddaea2b1e..f30a7aacb 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -62,8 +62,8 @@ def __init__(self, raw_config: dict[str, Any]): self.hash_length = parsed_raw_config.hash_length # Container settings used by PRMs. self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length) - # Dictionary of algorithms to their respective timeout in seconds - self.algorithm_timeouts: dict[str, Optional[int]] = dict() + # Dictionary of parameter hashes to their respective timeout in seconds + self.algorithm_param_timeouts: dict[str, Optional[int]] = dict() # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() @@ -176,16 +176,6 @@ def process_algorithms(self, raw_config: RawConfig): # Do not parse the rest of the parameters for this algorithm if it is not included continue - if alg.timeout: - # Coerce to an `int` if an int isn't possible. - timeout = parse(alg.timeout, granularity='seconds') - if not timeout: raise RuntimeError(f"Algorithm {alg} has unparsable timeout string {alg.timeout}.") - self.algorithm_timeouts[alg.name] = int(timeout) - else: - # As per the type signature, we still want to say explicitly that this algorithm's timeout - # is uninhabited. - self.algorithm_timeouts[alg.name] = None - runs: dict[str, Any] = alg.runs # Each set of runs should be 1 level down in the config file @@ -195,7 +185,7 @@ def process_algorithms(self, raw_config: RawConfig): # We create the product of all param combinations for each run param_name_list = [] # We convert our run parameters to a dictionary, allowing us to iterate over it - run_subscriptable = vars(runs[run_name]) + run_subscriptable = vars(runs[run_name].params) for param in run_subscriptable: param_name_list.append(param) # this is guaranteed to be list[Any] by algorithms.py @@ -231,6 +221,19 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_params[alg.name][params_hash] = run_dict + # We finalize by handling any associated information to each parameter hash. + # At the time, this is only timeouts: + timeout = runs[run_name].timeout or alg.timeout + if timeout: + # Coerce to an `int` if an int isn't possible. + parsed_timeout = parse(timeout, granularity='seconds') + if not parsed_timeout: raise RuntimeError(f"Algorithm {alg} has unparsable timeout string '{parsed_timeout}'.") + self.algorithm_param_timeouts[params_hash] = int(parsed_timeout) + else: + # As per the type signature, we still want to say explicitly that this algorithm's timeout + # is uninhabited. + self.algorithm_param_timeouts[params_hash] = None + def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return diff --git a/test/analysis/input/egfr.yaml b/test/analysis/input/egfr.yaml index c9ed5f735..46bbfe001 100644 --- a/test/analysis/input/egfr.yaml +++ b/test/analysis/input/egfr.yaml @@ -12,14 +12,16 @@ algorithms: include: true runs: run1: - k: [10, 20] + params: + k: [10, 20] - name: meo include: true runs: run1: - local_search: true - max_path_length: 3 - rand_restarts: 10 + params: + local_search: true + max_path_length: 3 + rand_restarts: 10 datasets: - data_dir: "input" edge_files: diff --git a/test/analysis/input/example.yaml b/test/analysis/input/example.yaml index 1a4514c00..e80097cf3 100644 --- a/test/analysis/input/example.yaml +++ b/test/analysis/input/example.yaml @@ -12,22 +12,25 @@ algorithms: include: true runs: run1: - k: range(100,201,100) + params: + k: range(100,201,100) - name: "meo" include: true runs: run1: - max_path_length: [3] - local_search: ["Yes"] - rand_restarts: [10] + params: + max_path_length: [3] + local_search: ["Yes"] + rand_restarts: [10] - name: "mincostflow" include: true runs: run1: - flow: [1] # The flow must be an int - capacity: [1] + params: + flow: [1] # The flow must be an int + capacity: [1] - name: "allpairs" include: true From 5291335f4fbb2e8de6d5270d3159c334dfb94bcc Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 09:51:39 +0000 Subject: [PATCH 39/59] test(test_config): use correct config --- test/test_config.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_config.py b/test/test_config.py index 41551c381..9e7ff40cc 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -66,27 +66,27 @@ def get_test_config(): "name": "omicsintegrator2", "include": True, "runs": { - "strings": {"dummy_mode": ["terminals", "others"], "b": 3}, + "strings": {"params": {"dummy_mode": ["terminals", "others"], "b": 3}}, # spacing in np.linspace is on purpose - "singleton_string_np_linspace": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2,)"}, - "str_array_np_logspace": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"} + "singleton_string_np_linspace": {"params": {"dummy_mode": "terminals", "b": "np.linspace(0, 5,2,)"}}, + "str_array_np_logspace": {"params": {"dummy_mode": ["others", "all"], "g": "np.logspace(1,1)"}} } }, { "name": "meo", "include": True, "runs": { - "numbersAndBoolsDuplicate": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, - "numbersAndBool": {"max_path_length": 2, "rand_restarts": [float(2.0), 3], "local_search": [True]}, - "numbersAndBools": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}, - "boolArrTest": {"local_search": [True, False], "max_path_length": "range(1, 3)"} + "numbersAndBoolsDuplicate": {"params": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}}, + "numbersAndBool": {"params": {"max_path_length": 2, "rand_restarts": [float(2.0), 3], "local_search": [True]}}, + "numbersAndBools": {"params": {"max_path_length": 1, "rand_restarts": [float(2.0), 3], "local_search": [True, False]}}, + "boolArrTest": {"params": {"local_search": [True, False], "max_path_length": "range(1, 3)"}} } }, { "name": "mincostflow", "include": True, "runs": { - "int64artifact": {"flow": "np.arange(5, 7)", "capacity": [2, 3]} + "int64artifact": {"params": {"flow": "np.arange(5, 7)", "capacity": [2, 3]}} } }, ], From efa45fcb7474f79f8f8532b74ac9fe03f6d6347c Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 10:06:10 +0000 Subject: [PATCH 40/59] test(generate_inputs): fix config --- test/generate-inputs/inputs/test_config.yaml | 37 ++++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/test/generate-inputs/inputs/test_config.yaml b/test/generate-inputs/inputs/test_config.yaml index 33900bdb8..a4aa1e819 100644 --- a/test/generate-inputs/inputs/test_config.yaml +++ b/test/generate-inputs/inputs/test_config.yaml @@ -11,40 +11,46 @@ algorithms: include: true runs: run1: - k: range(100,201,100) + params: + k: range(100,201,100) - name: "omicsintegrator1" include: true runs: run1: - b: [5, 6] - w: np.linspace(0,5,2) - d: 10 + params: + b: [5, 6] + w: np.linspace(0,5,2) + d: 10 - name: "omicsintegrator2" include: true runs: run1: - b: 4 - g: 0 + params: + b: 4 + g: 0 run2: - b: 2 - g: 3 + params: + b: 2 + g: 3 - name: "meo" include: true runs: run1: - max_path_length: 3 - local_search: true - rand_restarts: 10 + params: + max_path_length: 3 + local_search: true + rand_restarts: 10 - name: "mincostflow" include: true runs: run1: - flow: 1 # The flow must be an int - capacity: 1 + params: + flow: 1 # The flow must be an int + capacity: 1 - name: "allpairs" include: true @@ -53,8 +59,9 @@ algorithms: include: true runs: run1: - slice_threshold: 0.3 - module_threshold: 0.05 + params: + slice_threshold: 0.3 + module_threshold: 0.05 datasets: - # Labels can only contain letters, numbers, or underscores From cae2eddd83548913de5be33e5078c97ea1cbf552 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 03:13:22 -0700 Subject: [PATCH 41/59] docs(algorithms): fix doc ordering --- spras/config/algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index af2910ee1..ad575ba95 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -23,8 +23,10 @@ __all__ = ['AlgorithmUnion'] class RunSettings(BaseModel): - """The associated timeout with a run, parsed with `pytimeparse`.""" + """All of the non-parameter settings associated with a run.""" + timeout: Optional[str] = None + """The associated timeout with a run, parsed with `pytimeparse`.""" model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) From 0475d45bf4709a341466cba595296bbb9c9c90e7 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 10:34:09 +0000 Subject: [PATCH 42/59] refactor: move params index grabbing into func --- Snakefile | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/Snakefile b/Snakefile index a901f64bb..2534c12c1 100644 --- a/Snakefile +++ b/Snakefile @@ -48,11 +48,14 @@ def algo_has_mult_param_combos(algo): algorithms_mult_param_combos = [algo for algo in algorithms if algo_has_mult_param_combos(algo)] +# Gets the associated parameter hash out of a params wildcard +def params_index(params_hash): + return params_hash.replace('params-', '') + # Get the parameter dictionary for the specified # algorithm and parameter combination hash def reconstruction_params(algorithm, params_hash): - index = params_hash.replace('params-', '') - return algorithm_params[algorithm][index] + return algorithm_params[algorithm][params_index(params_hash)] # Log the parameter dictionary for this parameter configuration in a yaml file def write_parameter_log(algorithm, param_label, logfile): @@ -286,7 +289,7 @@ rule reconstruct: # Get the timeout from the config and use it as an input. # TODO: This has unexpected behavior when this rule succeeds but the timeout extends, # making this rule run again. - timeout = lambda wildcards: _config.config.algorithm_param_timeouts[wildcards.params.split("-")[1]] + timeout = lambda wildcards: _config.config.algorithm_param_timeouts[params_index(wildcards.params)] run: # Create a copy so that the updates are not written to the parameters logfile algorithm_params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() From d5ea035e7a3fc24de77206888721fc13d8ffa583 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 22:38:18 +0000 Subject: [PATCH 43/59] chore: bump down dsub --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 197fe2146..f069b1c85 100644 --- a/environment.yml +++ b/environment.yml @@ -36,4 +36,4 @@ dependencies: - sphinx-rtd-theme=3.1.0 - pip: - - dsub==0.5.2 + - dsub==0.4.13 From 8629e0976ff78208032f6695993c415de70ffac3 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 22:44:19 +0000 Subject: [PATCH 44/59] refactor: move generic parameter validation into its `get_params_generic` getter --- spras/prm.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/spras/prm.py b/spras/prm.py index 18f3c8a9f..63516d5fd 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -1,7 +1,8 @@ import os from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Generic, Mapping, Optional, TypeVar, cast, get_args +from types import get_original_bases +from typing import Any, Generic, Mapping, Optional, TypeVar, cast from pydantic import BaseModel @@ -52,10 +53,19 @@ def get_params_generic(cls) -> type[T]: For example, on `class PathLinker(PRM[PathLinkerParams])`, calling `PathLinker.get_params_generic()` returns `PathLinkerParams`. """ - # TODO: use the type-safe get_original_bases when we bump to >= Python 3.12 - # This is hacky reflection from https://stackoverflow.com/a/71720366/7589775 - # which grabs the class of type T by the definition of `__orig_bases__`. - return get_args(cast(Any, cls).__orig_bases__[0])[0] + original_bases = get_original_bases(cls) + + # Since we just used reflection, we provide a few mountain-dewey error messages here + # to protect against any developer confusion. + assert len(original_bases) == 1, "There were several generics passed into PRM, when precisely one is required." + T_class = original_bases[0] + + if not issubclass(T_class, BaseModel): + raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.") + + # We finalize with a cast, since issubclass does an over-eager casting to + # its specified type. + return cast(type[T], T_class) # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. @@ -66,11 +76,6 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o """ T_class = cls.get_params_generic() - # Since we just used reflection, we provide a mountain-dewey error message here - # to protect against any developer confusion. - if not issubclass(T_class, BaseModel): - raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.") - # Validates our untyped `args` parameter against our parameter class of type T # using BaseModel.model_validate (https://docs.pydantic.dev/latest/api/base_model/#pydantic.BaseModel.model_validate) # (Pydantic already provides nice error messages, so we don't need to worry about catching this.) From cfa90119b3d1a26c5790c36bec598d33d372272e Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 4 May 2026 22:53:58 +0000 Subject: [PATCH 45/59] fix: drop useless cast i was Very wrong about that --- spras/prm.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spras/prm.py b/spras/prm.py index 63516d5fd..a68074d15 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -60,12 +60,10 @@ def get_params_generic(cls) -> type[T]: assert len(original_bases) == 1, "There were several generics passed into PRM, when precisely one is required." T_class = original_bases[0] - if not issubclass(T_class, BaseModel): + if not issubclass(BaseModel, T_class): raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.") - # We finalize with a cast, since issubclass does an over-eager casting to - # its specified type. - return cast(type[T], T_class) + return T_class # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. From 1b4b789ec0fe509f1e2dde3ffef8edcdbd36b185 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 5 May 2026 01:23:18 +0000 Subject: [PATCH 46/59] fix: actually correct subclass logic get_original_bases does _not_ have the same signature as its internal reflection counterpart! --- spras/dataset.py | 5 ++--- spras/prm.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/spras/dataset.py b/spras/dataset.py index ddf74736f..3e9975f3d 100644 --- a/spras/dataset.py +++ b/spras/dataset.py @@ -1,7 +1,7 @@ import os import pickle as pkl import warnings -from typing import Union +from typing import Self, Union import pandas as pd @@ -60,9 +60,8 @@ def to_file(self, file: LoosePathLike): with open(file, "wb") as f: pkl.dump(self, f) - # NOTE: When we bump to Python 3.13, we can use the reference Dataset instead of the literal "Dataset" for typing. @classmethod - def from_file(cls, file: Union[LoosePathLike, "Dataset"]): + def from_file(cls, file: Union[LoosePathLike, Self]): """ Loads dataset object from a pickle file or another `Dataset` object. Usage: dataset = Dataset.from_file(pickle_file) diff --git a/spras/prm.py b/spras/prm.py index a68074d15..5d595e0c0 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -2,7 +2,7 @@ from abc import ABC, abstractmethod from pathlib import Path from types import get_original_bases -from typing import Any, Generic, Mapping, Optional, TypeVar, cast +from typing import Any, Generic, Mapping, Optional, TypeVar, cast, get_args from pydantic import BaseModel @@ -53,17 +53,22 @@ def get_params_generic(cls) -> type[T]: For example, on `class PathLinker(PRM[PathLinkerParams])`, calling `PathLinker.get_params_generic()` returns `PathLinkerParams`. """ + # This gives us (PRM[PathLinkerParams], ) original_bases = get_original_bases(cls) # Since we just used reflection, we provide a few mountain-dewey error messages here # to protect against any developer confusion. - assert len(original_bases) == 1, "There were several generics passed into PRM, when precisely one is required." - T_class = original_bases[0] + assert len(original_bases) == 1, f"{cls} inherits from several classes, when precisely one is required." + original_bases_args = get_args(original_bases[0]) + assert len(original_bases_args) == 1, "There were several generics passed into PRM, when precisely one is required." + T_class, = original_bases_args - if not issubclass(BaseModel, T_class): + if not issubclass(T_class, BaseModel): raise RuntimeError("The generic passed into PRM is not a pydantic.BaseModel.") - return T_class + # Finally, we cast, since issubclass overeagerly restricts T_class to type[BaseModel] + # instead of type[T] without imposing the restriction that T inherits from BaseModel + return cast(type[T], T_class) # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. From 41ecd1c5d80dbf92cd8dd3060b4bc688252bcf43 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 5 May 2026 01:30:06 +0000 Subject: [PATCH 47/59] fix(responsenet): drop use of axis on drop pandas requires precisely one of (argument-based list specification) or (axis specification) --- spras/responsenet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spras/responsenet.py b/spras/responsenet.py index f53b59843..f8bc3598c 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -136,7 +136,7 @@ def parse_output(raw_pathway_file, standardized_pathway_file, params): df = raw_pathway_df(raw_pathway_file, sep='\t', header=0) if not df.empty: df.columns = ['Node1', 'Node2', 'Flow'] - df = df.drop(columns=['Flow'], axis=1) + df = df.drop(columns=['Flow']) df = add_rank_column(df) # ResponseNet's outputs should be treated as undirected outputs. df = reinsert_direction_col_undirected(df) From 5da3967eda45c4bf42958ddbc22565d9eff6e4a8 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Tue, 5 May 2026 04:29:23 +0000 Subject: [PATCH 48/59] ci: bump miniconda Trying to resolve mac errors --- .github/workflows/release.yml | 2 +- .github/workflows/test-spras.yml | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cf4ee6157..fbb30271c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -14,7 +14,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 diff --git a/.github/workflows/test-spras.yml b/.github/workflows/test-spras.yml index db6d1b0fc..b5bd1b74e 100644 --- a/.github/workflows/test-spras.yml +++ b/.github/workflows/test-spras.yml @@ -12,13 +12,13 @@ jobs: os: [macos-latest, windows-latest] steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install conda environment - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v4 with: activate-environment: spras environment-file: environment.yml - auto-activate-base: false + auto-activate: false miniconda-version: 'latest' - name: Log conda environment # Log conda environment contents @@ -49,11 +49,11 @@ jobs: sudo docker image prune --all --force sudo docker builder prune -a - name: Install conda environment - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v4 with: activate-environment: spras environment-file: environment.yml - auto-activate-base: false + auto-activate: false miniconda-version: 'latest' - name: Install spras in conda env # Install spras in the environment using pip @@ -85,11 +85,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - name: Install conda environment - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v4 with: activate-environment: spras environment-file: environment.yml - auto-activate-base: false + auto-activate: false miniconda-version: 'latest' # Install spras in the environment using pip - name: Install spras in conda env @@ -110,11 +110,11 @@ jobs: - name: Checkout repository uses: actions/checkout@v3 - name: Install conda environment - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v4 with: activate-environment: spras environment-file: environment.yml - auto-activate-base: false + auto-activate: false miniconda-version: 'latest' - name: Run pre-commit shell: bash --login {0} From 114b6088277c2a4a238dc0e09ac6c86c0355b3b0 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 7 May 2026 06:34:52 +0000 Subject: [PATCH 49/59] feat: conditional runs --- Snakefile | 36 +++++++++++++++++++++++++++++------- spras/config/algorithms.py | 7 +++++++ spras/config/config.py | 21 +++++++++++++++++++++ spras/errors.py | 7 +++++-- 4 files changed, 62 insertions(+), 9 deletions(-) diff --git a/Snakefile b/Snakefile index 2534c12c1..708c5c311 100644 --- a/Snakefile +++ b/Snakefile @@ -10,7 +10,7 @@ from spras.evaluation import Evaluation from spras.analysis import ml, summary, cytoscape from spras.config.revision import detach_spras_revision import spras.config.config as _config -from spras.errors import mark_error, mark_success, is_error, TimeoutArtifactError +from spras.errors import mark_error, mark_success, is_error, TimeoutArtifactError, FailedDependencyError # Snakemake updated the behavior in the 6.5.0 release https://github.com/snakemake/snakemake/pull/1037 # and using the wrong separator prevents Snakemake from matching filenames to the rules that can produce them @@ -269,13 +269,27 @@ def collect_prepared_input(wildcards): return prepared_inputs +def collect_dependent_artifact_info(wildcards): + # Get the associated runs that this run depends on + dependent_runs = _config.config.conditional_run_dependencies[params_index(wildcards.params)] + return [ + SEP.join([out_dir, f'{wildcards.dataset}-{wildcards.algorithm}-params-{params}', 'artifact-log.json']) + for params in dependent_runs + ] + def filter_successful(files): """Convenient function for filtering iterators by whether or not their items are error files.""" return [file for file in files if not is_error(file)] +def filter_error(files): + """This function is precisely described as the list difference of `files` and `filter_successful(files)`""" + return [file for file in files if is_error(file)] + # Run the pathway reconstruction algorithm rule reconstruct: - input: collect_prepared_input + input: + prepared_files=collect_prepared_input, + required_artifact_info=collect_dependent_artifact_info # Each reconstruct call should be in a separate output subdirectory that is unique for the parameter combination so # that multiple instances of the container can run simultaneously without overwriting the output files # Overwriting files can happen because the pathway reconstruction algorithms often generate output files with the @@ -291,10 +305,21 @@ rule reconstruct: # making this rule run again. timeout = lambda wildcards: _config.config.algorithm_param_timeouts[params_index(wildcards.params)] run: + successful_runs = filter_successful(input.required_artifact_info) + errorful_runs = filter_error(input.required_artifact_info) + # NOTE: conditionals happen as a big OR statement, so we are looking for at least one success. + if len(successful_runs) == 0 and len(errorful_runs) != 0: + # We don't raise the error here (analogous to `--keep-going`, except we avoid unnecessarily re-running this rule.) + mark_error(output.artifact_info, FailedDependencyError(failing_dependencies=errorful_runs)) + # and we touch pathway_file still: Snakemake doesn't have optional files, so we output a 'artifact info' file, + # which contains the status (success/failure) of specific Snakemake jobs. + # We filter for the successful files (such as ones that didn't time out) with the `filter_successful` function. + Path(output.pathway_file).touch() + # Create a copy so that the updates are not written to the parameters logfile algorithm_params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() # Declare the input files as a dictionary. - inputs = dict(zip(runner.get_required_inputs(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)), *{input}, strict=True)) + inputs = dict(zip(runner.get_required_inputs(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm)), *{input.prepared_files}, strict=True)) # Remove the _spras_run_name parameter added for keeping track of the run name for parameters.yml if '_spras_run_name' in algorithm_params: algorithm_params.pop('_spras_run_name') @@ -302,11 +327,8 @@ rule reconstruct: runner.run(detach_spras_revision(_config.config.immutable_files, wildcards.algorithm), inputs, output.pathway_file, algorithm_params, container_settings, params.timeout) mark_success(output.artifact_info) except TimeoutError as err: - # We don't raise the error here (analogous to `--keep-going`, except we avoid unnecessarily re-running this rule.) + # See the above notes on conditional runs for precisely why we write this as is. mark_error(output.artifact_info, TimeoutArtifactError(duration=params.timeout)) - # and we touch pathway_file still: Snakemake doesn't have optional files, so we output a 'artifact info' file, - # which contains the status (success/failure) of specific Snakemake jobs. - # We filter for the successful files (such as ones that didn't time out) with the `filter_successful` function. Path(output.pathway_file).touch() # Original pathway reconstruction output to universal output diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index ad575ba95..6c742cffb 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -28,6 +28,13 @@ class RunSettings(BaseModel): timeout: Optional[str] = None """The associated timeout with a run, parsed with `pytimeparse`.""" + conditionals: list[str] = Field(alias="if", default_factory=lambda: []) + """ + If any of the specified runs in this list succeed, then this run itself is permitted to run. + We refer to these as 'conditional runs,' and since Python reserves the `if` keyword, we call + them "conditionals" for short in-code, but users interface with this using `if`. + """ + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) def is_numpy_friendly(type: type[Any] | None) -> bool: diff --git a/spras/config/config.py b/spras/config/config.py index f30a7aacb..1625baf07 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -64,6 +64,8 @@ def __init__(self, raw_config: dict[str, Any]): self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length) # Dictionary of parameter hashes to their respective timeout in seconds self.algorithm_param_timeouts: dict[str, Optional[int]] = dict() + # Dictionary of algorithm runs with their associated conditional requirements + self.conditional_run_dependencies: dict[str, set[str]] = dict() # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() @@ -168,6 +170,12 @@ def process_algorithms(self, raw_config: RawConfig): # We copy raw_config.algorithms to avoid mutating the original config # when we attach the SPRAS revision to algorithm names later. for alg in raw_config.algorithms[:]: + # We later use these dictionary to build up our conditional runs dictionary, + # choosing to handle it in the algorithms loop to avoid having to handle run name + # conflicts throughout separate algorithms. + unexpanded_conditional_run_dependencies: dict[str, set[str]] = dict() + run_name_hashes: dict[str, set[str]] = dict() + alg.name = attach_spras_revision(self.immutable_files, alg.name) if alg.include: # This dict maps from parameter combinations hashes to parameter combination dictionaries @@ -182,6 +190,9 @@ def process_algorithms(self, raw_config: RawConfig): for run_name in runs.keys(): all_runs = [] + # Since we have to build up our runs first, we save the dictionary expansion until after we loop through all runs. + unexpanded_conditional_run_dependencies[run_name] = set(runs[run_name].conditionals).union(set(alg.conditionals)) + # We create the product of all param combinations for each run param_name_list = [] # We convert our run parameters to a dictionary, allowing us to iterate over it @@ -221,6 +232,9 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_params[alg.name][params_hash] = run_dict + run_name_hashes.setdefault(run_name, set()) + run_name_hashes[run_name].add(params_hash) + # We finalize by handling any associated information to each parameter hash. # At the time, this is only timeouts: timeout = runs[run_name].timeout or alg.timeout @@ -234,6 +248,13 @@ def process_algorithms(self, raw_config: RawConfig): # is uninhabited. self.algorithm_param_timeouts[params_hash] = None + for key, values in unexpanded_conditional_run_dependencies.items(): + for key_run_hash in run_name_hashes[key]: + self.conditional_run_dependencies.setdefault(key_run_hash, set()) + for value in values: + for value_run_hash in run_name_hashes[value]: + self.conditional_run_dependencies[key_run_hash].add(value_run_hash) + def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: return diff --git a/spras/errors.py b/spras/errors.py index 5b94ccf54..e45bf7faf 100644 --- a/spras/errors.py +++ b/spras/errors.py @@ -23,9 +23,12 @@ class TimeoutArtifactError(BaseModel): error_type: Literal['timeout'] = 'timeout' duration: int -# NOTE: when we have several errors, replace this with Union[TimeoutError, , ...] +class FailedDependencyError(BaseModel): + error_type: Literal['depended'] = 'depended' + failing_dependencies: list[str] + """All possible distinguished errors.""" -ArtifactErrorOptions = TimeoutArtifactError +ArtifactErrorOptions = Union[TimeoutArtifactError, FailedDependencyError] class ArtifactError(BaseModel): """ From 91fe5eb52e655047b1971b818448bfb20095a21d Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Thu, 7 May 2026 08:06:46 +0000 Subject: [PATCH 50/59] fix: actually pass in run settings why did i not do this before??? --- spras/allpairs.py | 6 ++++-- spras/btb.py | 6 ++++-- spras/config/algorithms.py | 11 ++++++++++- spras/config/config.py | 22 +++++++--------------- spras/diamond.py | 6 ++++-- spras/domino.py | 10 ++++++---- spras/meo.py | 8 +++++--- spras/mincostflow.py | 8 +++++--- spras/omicsintegrator1.py | 6 ++++-- spras/omicsintegrator2.py | 8 +++++--- spras/pathlinker.py | 8 +++++--- spras/prm.py | 9 +++++---- spras/responsenet.py | 8 +++++--- spras/runner.py | 7 ++++--- spras/rwr.py | 6 ++++-- spras/strwr.py | 6 ++++-- 16 files changed, 81 insertions(+), 54 deletions(-) diff --git a/spras/allpairs.py b/spras/allpairs.py index 6358c6064..938a8c992 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,6 +1,7 @@ import warnings from pathlib import Path +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log @@ -72,8 +73,9 @@ def generate_inputs(data: Dataset, filename_map): header=["#Interactor1", "Interactor2", "Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() AllPairs.validate_required_run_args(inputs) work_dir = '/apsp' @@ -110,7 +112,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) @staticmethod def parse_output(raw_pathway_file, standardized_pathway_file, params): diff --git a/spras/btb.py b/spras/btb.py index ce6551ea5..66a848853 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,5 +1,6 @@ from pathlib import Path +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log @@ -61,8 +62,9 @@ def generate_inputs(data, filename_map): # Skips parameter validation step @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() BowTieBuilder.validate_required_run_args(inputs) # Tests for pytest (docker container also runs this) @@ -120,7 +122,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) # Output is already written to raw-pathway.txt file diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index ad575ba95..a1cb0b7b2 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -22,10 +22,19 @@ # This contains the dynamically generated algorithm schema for use in `schema.py` __all__ = ['AlgorithmUnion'] +def validate_duration(value): + parsed_duration = value(value, granularity='seconds') + if not parsed_duration: raise RuntimeError(f"Encountered unparsable duration string '{value}'.") + +PyDateTimeDuration = Annotated[ + int, + BeforeValidator(validate_duration) +] + class RunSettings(BaseModel): """All of the non-parameter settings associated with a run.""" - timeout: Optional[str] = None + timeout: Optional[PyDateTimeDuration] = None """The associated timeout with a run, parsed with `pytimeparse`.""" model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) diff --git a/spras/config/config.py b/spras/config/config.py index f30a7aacb..8f9ba90b3 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -17,12 +17,12 @@ import itertools as it import warnings from pathlib import Path -from typing import Any, Optional +from typing import Any import numpy as np import yaml -from pytimeparse import parse +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.revision import attach_spras_revision, spras_revision from spras.config.schema import DatasetSchema, RawConfig @@ -62,8 +62,8 @@ def __init__(self, raw_config: dict[str, Any]): self.hash_length = parsed_raw_config.hash_length # Container settings used by PRMs. self.container_settings = ProcessedContainerSettings.from_container_settings(parsed_raw_config.containers, self.hash_length) - # Dictionary of parameter hashes to their respective timeout in seconds - self.algorithm_param_timeouts: dict[str, Optional[int]] = dict() + # Dictionary of parameter hashes to their respective run settings + self.algorithm_param_run_settings: dict[str, RunSettings] = dict() # A nested dict mapping algorithm names to dicts that map parameter hashes to parameter combinations. # Only includes algorithms that are set to be run with 'include: true'. self.algorithm_params: dict[str, dict[str, Any]] = dict() @@ -222,17 +222,9 @@ def process_algorithms(self, raw_config: RawConfig): self.algorithm_params[alg.name][params_hash] = run_dict # We finalize by handling any associated information to each parameter hash. - # At the time, this is only timeouts: - timeout = runs[run_name].timeout or alg.timeout - if timeout: - # Coerce to an `int` if an int isn't possible. - parsed_timeout = parse(timeout, granularity='seconds') - if not parsed_timeout: raise RuntimeError(f"Algorithm {alg} has unparsable timeout string '{parsed_timeout}'.") - self.algorithm_param_timeouts[params_hash] = int(parsed_timeout) - else: - # As per the type signature, we still want to say explicitly that this algorithm's timeout - # is uninhabited. - self.algorithm_param_timeouts[params_hash] = None + self.algorithm_param_run_settings[params_hash] = RunSettings( + timeout=runs[run_name].timeout or alg.timeout + ) def process_analysis(self, raw_config: RawConfig): if not raw_config.analysis: diff --git a/spras/diamond.py b/spras/diamond.py index 6b48575f9..b9db2bf6d 100644 --- a/spras/diamond.py +++ b/spras/diamond.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import ContainerError, prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -63,8 +64,9 @@ def generate_inputs(data, filename_map): edges_df.to_csv(filename_map["network"], columns=["Interactor1", "Interactor2"], index=False, header=None, sep=',') @staticmethod - def run(inputs, output_file, args, container_settings=None, timeout=None): + def run(inputs, output_file, args, container_settings=None, run_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() DIAMOnD.validate_required_run_args(inputs) work_dir = '/diamond' @@ -101,7 +103,7 @@ def run(inputs, output_file, args, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) except ContainerError as err: if err.streams_contain("KeyError: 'nix'"): raise RuntimeError(f"{err.stderr}\n" + \ diff --git a/spras/domino.py b/spras/domino.py index e38438e9e..b6c56612a 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -5,6 +5,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import BaseModel from spras.containers import ContainerError, prepare_volume, run_container_and_log @@ -79,9 +80,10 @@ def generate_inputs(data, filename_map): header=['ID_interactor_A', 'ppi', 'ID_interactor_B']) @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): - if not container_settings: container_settings = ProcessedContainerSettings() + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): if not args: args = DominoParams() + if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() DOMINO.validate_required_run_args(inputs) work_dir = '/spras' @@ -118,7 +120,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) except ContainerError as err: # Occurs when DOMINO gets passed some empty dataframe from network_file. # This counts as an empty input, so we return an empty output. @@ -153,7 +155,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) except ContainerError as err: # Occurs when DOMINO gets passed some empty dataframe from network_file. # This counts as an empty input, so we return an empty output. diff --git a/spras/meo.py b/spras/meo.py index a0738c1a2..3705a125c 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -4,6 +4,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -138,7 +139,7 @@ def generate_inputs(data, filename_map): # TODO document required arguments @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): """ Run Maximum Edge Orientation in the Docker image with the provided parameters. The properties file is generated from the provided arguments. @@ -147,8 +148,9 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): Only the edge output file is retained. All other output files are deleted. """ - if not container_settings: container_settings = ProcessedContainerSettings() if not args: args = MEOParams() + if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() MEO.validate_required_run_args(inputs) work_dir = '/spras' @@ -196,7 +198,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) properties_file_local.unlink(missing_ok=True) diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 2b31037e8..8f07b3b9c 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -71,9 +72,10 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): - if not container_settings: container_settings = ProcessedContainerSettings() + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): if not args: args = MinCostFlowParams() + if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() MinCostFlow.validate_required_run_args(inputs) # the data files will be mapped within this directory within the container @@ -123,7 +125,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) # Check the output of the container out_dir_content = sorted(out_dir.glob('*.sif')) diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 7ec8e3734..262e8fa9f 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log @@ -154,8 +155,9 @@ def generate_inputs(data, filename_map): # TODO add support for knockout argument # TODO add reasonable default values @staticmethod - def run(inputs, output_file, args, container_settings=None, timeout=None): + def run(inputs, output_file, args, container_settings=None, run_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() OmicsIntegrator1.validate_required_run_args(inputs, ["dummy_nodes"]) work_dir = '/spras' @@ -227,7 +229,7 @@ def run(inputs, output_file, args, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout, + run_settings.timeout, {'TMPDIR': mapped_out_dir}) conf_file_local.unlink(missing_ok=True) diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index 1e95c892b..c15c9d297 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -4,6 +4,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log @@ -105,9 +106,10 @@ def generate_inputs(data: Dataset, filename_map): header=['protein1', 'protein2', 'cost']) @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): - if not container_settings: container_settings = ProcessedContainerSettings() + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): if not args: args = OmicsIntegrator2Params() + if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() OmicsIntegrator2.validate_required_run_args(inputs) work_dir = '/spras' @@ -157,7 +159,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout, + run_settings.timeout, network_disabled=True) # TODO do we want to retain other output files? diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 4d7aeb8f5..43f1cbf5a 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -73,9 +74,10 @@ def generate_inputs(data, filename_map): header=["#Interactor1","Interactor2","Weight"]) @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): - if not container_settings: container_settings = ProcessedContainerSettings() + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): if not args: args = PathLinkerParams() + if not run_settings: run_settings = RunSettings() + if not container_settings: container_settings = ProcessedContainerSettings() PathLinker.validate_required_run_args(inputs) work_dir = '/spras' @@ -114,7 +116,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) # Rename the primary output file to match the desired output filename # Currently PathLinker only writes one output file so we do not need to delete others diff --git a/spras/prm.py b/spras/prm.py index 79c149dc1..5e4586773 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -5,6 +5,7 @@ from pydantic import BaseModel +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.dataset import Dataset from spras.util import LoosePathLike @@ -60,7 +61,7 @@ def get_params_generic(cls) -> type[T]: # This is used in `runner.py` to avoid a dependency diamond when trying # to import the actual algorithm schema. @classmethod - def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings, timeout: Optional[int]): + def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings, run_settings: RunSettings): """ This is similar to PRA.run, but `args` is a dictionary and not a pydantic structure. However, this method still re-validates `args` against the associated pydantic PRM argument model. @@ -77,18 +78,18 @@ def run_typeless(cls, inputs: dict[str, str | os.PathLike], output_file: str | o # (Pydantic already provides nice error messages, so we don't need to worry about catching this.) T_parsed = T_class.model_validate(args) - return cls.run(inputs, output_file, T_parsed, container_settings, timeout) + return cls.run(inputs, output_file, T_parsed, container_settings, run_settings) @staticmethod @abstractmethod - def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings, timeout: Optional[int]): + def run(inputs: dict[str, str | os.PathLike], output_file: str | os.PathLike, args: T, container_settings: ProcessedContainerSettings, run_settings: RunSettings): """ Runs an algorithm. @param inputs: specified inputs @param output_file: designated reconstructed pathway output @param args: (T) typed algorithm params @param container_settings: what settings should be associated with the individual container. - @param timeout: timeout in seconds to run the container with + @param run_settings: The particular run settings to use. See `RunSettings` for more info. See the algorithm-specific `PRM.generate_inputs` and `PRM.parse_output` for information about the input and output format. diff --git a/spras/responsenet.py b/spras/responsenet.py index b93dfb16a..6e70d99cd 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -2,6 +2,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( @@ -63,10 +64,11 @@ def generate_inputs(data, filename_map): header=False) @staticmethod - def run(inputs, output_file, args=None, container_settings=None, timeout=None): + def run(inputs, output_file, args=None, container_settings=None, run_settings=None): + if not args: args = ResponseNetParams() + if not run_settings: run_settings = RunSettings() if not container_settings: container_settings = ProcessedContainerSettings() ResponseNet.validate_required_run_args(inputs) - if not args: args = ResponseNetParams() # the data files will be mapped within this directory within the container work_dir = '/ResponseNet' @@ -113,7 +115,7 @@ def run(inputs, output_file, args=None, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) # Rename the primary output file to match the desired output filename out_file_suffixed.rename(output_file) diff --git a/spras/runner.py b/spras/runner.py index b3b469af7..64f552345 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -1,9 +1,10 @@ from os import PathLike -from typing import Any, Mapping, Optional +from typing import Any, Mapping # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.dataset import Dataset, DatasetSchema from spras.diamond import DIAMOnD @@ -46,7 +47,7 @@ def run( output_file: str | PathLike, args: dict[str, Any], container_settings: ProcessedContainerSettings, - timeout: Optional[int] + run_settings: RunSettings ): """ A generic interface to the algorithm-specific run functions @@ -54,7 +55,7 @@ def run( algorithm_runner = get_algorithm(algorithm) # We can't use config.config here else we would get a cyclic dependency. # Since args is a dict here, we use the 'run_typeless' utility PRM function. - algorithm_runner.run_typeless(inputs, output_file, args, container_settings, timeout) + algorithm_runner.run_typeless(inputs, output_file, args, container_settings, run_settings) def get_required_inputs(algorithm: str): diff --git a/spras/rwr.py b/spras/rwr.py index 2858316c1..284150d4d 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -4,6 +4,7 @@ import pandas as pd from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -52,8 +53,9 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_settings=None, timeout=None): + def run(inputs, output_file, args, container_settings=None, run_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() RWR.validate_required_run_args(inputs) with Path(inputs["network"]).open() as network_f: @@ -100,7 +102,7 @@ def run(inputs, output_file, args, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) # Rename the primary output file to match the desired output filename output_edges = Path(out_dir, 'output.txt') diff --git a/spras/strwr.py b/spras/strwr.py index 5226e5780..097553fd5 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -3,6 +3,7 @@ from pydantic import BaseModel, ConfigDict +from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset @@ -52,8 +53,9 @@ def generate_inputs(data, filename_map): edges.to_csv(filename_map['network'],sep='|',index=False,columns=['Interactor1','Interactor2'],header=False) @staticmethod - def run(inputs, output_file, args, container_settings=None, timeout=None): + def run(inputs, output_file, args, container_settings=None, run_settings=None): if not container_settings: container_settings = ProcessedContainerSettings() + if not run_settings: run_settings = RunSettings() ST_RWR.validate_required_run_args(inputs) with Path(inputs["network"]).open() as network_f: @@ -105,7 +107,7 @@ def run(inputs, output_file, args, container_settings=None, timeout=None): work_dir, out_dir, container_settings, - timeout) + run_settings.timeout) # Rename the primary output file to match the desired output filename output_edges = Path(out_dir, 'output.txt') From 2eb0d57af238467ddeac2d37019db1395e28b602 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 7 May 2026 16:59:52 +0000 Subject: [PATCH 51/59] fix: no cyclic imports this is why I didn't do this earlier... --- spras/allpairs.py | 2 +- spras/btb.py | 2 +- spras/config/algorithms.py | 20 ++------------------ spras/config/config.py | 2 +- spras/config/runs.py | 21 +++++++++++++++++++++ spras/diamond.py | 2 +- spras/domino.py | 2 +- spras/meo.py | 2 +- spras/mincostflow.py | 2 +- spras/omicsintegrator1.py | 2 +- spras/omicsintegrator2.py | 2 +- spras/pathlinker.py | 2 +- spras/prm.py | 2 +- spras/responsenet.py | 2 +- spras/runner.py | 2 +- spras/rwr.py | 2 +- spras/strwr.py | 2 +- 17 files changed, 38 insertions(+), 33 deletions(-) create mode 100644 spras/config/runs.py diff --git a/spras/allpairs.py b/spras/allpairs.py index 938a8c992..b4e542ad1 100644 --- a/spras/allpairs.py +++ b/spras/allpairs.py @@ -1,8 +1,8 @@ import warnings from pathlib import Path -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset diff --git a/spras/btb.py b/spras/btb.py index 66a848853..6ccfd3d5b 100644 --- a/spras/btb.py +++ b/spras/btb.py @@ -1,7 +1,7 @@ from pathlib import Path -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.config.util import Empty from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index a1cb0b7b2..06bdc1cc3 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -5,7 +5,7 @@ """ import ast import copy -from typing import Annotated, Any, Callable, Literal, Optional, Union, cast, get_args +from typing import Annotated, Any, Callable, Literal, Union, cast, get_args import numpy as np from pydantic import ( @@ -17,28 +17,12 @@ create_model, ) +from spras.config.runs import RunSettings from spras.runner import algorithms # This contains the dynamically generated algorithm schema for use in `schema.py` __all__ = ['AlgorithmUnion'] -def validate_duration(value): - parsed_duration = value(value, granularity='seconds') - if not parsed_duration: raise RuntimeError(f"Encountered unparsable duration string '{value}'.") - -PyDateTimeDuration = Annotated[ - int, - BeforeValidator(validate_duration) -] - -class RunSettings(BaseModel): - """All of the non-parameter settings associated with a run.""" - - timeout: Optional[PyDateTimeDuration] = None - """The associated timeout with a run, parsed with `pytimeparse`.""" - - model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) - def is_numpy_friendly(type: type[Any] | None) -> bool: """ Whether the passed in type can have any numpy helpers. diff --git a/spras/config/config.py b/spras/config/config.py index 8f9ba90b3..dbc68b5b7 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -22,9 +22,9 @@ import numpy as np import yaml -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings from spras.config.revision import attach_spras_revision, spras_revision +from spras.config.runs import RunSettings from spras.config.schema import DatasetSchema, RawConfig from spras.util import LoosePathLike, NpHashEncoder, hash_params_sha1_base32 diff --git a/spras/config/runs.py b/spras/config/runs.py new file mode 100644 index 000000000..9ed3205cf --- /dev/null +++ b/spras/config/runs.py @@ -0,0 +1,21 @@ +from typing import Annotated, Optional + +from pydantic import BaseModel, BeforeValidator, ConfigDict + + +def validate_duration(value): + parsed_duration = value(value, granularity='seconds') + if not parsed_duration: raise RuntimeError(f"Encountered unparsable duration string '{value}'.") + +PyDateTimeDuration = Annotated[ + int, + BeforeValidator(validate_duration) +] + +class RunSettings(BaseModel): + """All of the non-parameter settings associated with a run.""" + + timeout: Optional[PyDateTimeDuration] = None + """The associated timeout with a run, parsed with `pytimeparse`.""" + + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) diff --git a/spras/diamond.py b/spras/diamond.py index b9db2bf6d..65f1ac9be 100644 --- a/spras/diamond.py +++ b/spras/diamond.py @@ -2,8 +2,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import ContainerError, prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( diff --git a/spras/domino.py b/spras/domino.py index b6c56612a..1e419ecf6 100644 --- a/spras/domino.py +++ b/spras/domino.py @@ -5,8 +5,8 @@ import pandas as pd from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.config.util import BaseModel from spras.containers import ContainerError, prepare_volume, run_container_and_log from spras.interactome import ( diff --git a/spras/meo.py b/spras/meo.py index 3705a125c..003d6a621 100644 --- a/spras/meo.py +++ b/spras/meo.py @@ -4,8 +4,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( add_directionality_constant, diff --git a/spras/mincostflow.py b/spras/mincostflow.py index 8f07b3b9c..60f579003 100644 --- a/spras/mincostflow.py +++ b/spras/mincostflow.py @@ -3,8 +3,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, diff --git a/spras/omicsintegrator1.py b/spras/omicsintegrator1.py index 262e8fa9f..85f444b85 100644 --- a/spras/omicsintegrator1.py +++ b/spras/omicsintegrator1.py @@ -3,8 +3,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import MissingDataError diff --git a/spras/omicsintegrator2.py b/spras/omicsintegrator2.py index c15c9d297..7d11d2678 100644 --- a/spras/omicsintegrator2.py +++ b/spras/omicsintegrator2.py @@ -4,8 +4,8 @@ import pandas as pd from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.config.util import CaseInsensitiveEnum from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset, MissingDataError diff --git a/spras/pathlinker.py b/spras/pathlinker.py index 43f1cbf5a..6a05509fc 100644 --- a/spras/pathlinker.py +++ b/spras/pathlinker.py @@ -3,8 +3,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( diff --git a/spras/prm.py b/spras/prm.py index 5e4586773..d7e207a06 100644 --- a/spras/prm.py +++ b/spras/prm.py @@ -5,8 +5,8 @@ from pydantic import BaseModel -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.dataset import Dataset from spras.util import LoosePathLike diff --git a/spras/responsenet.py b/spras/responsenet.py index 6e70d99cd..c3c599cd0 100644 --- a/spras/responsenet.py +++ b/spras/responsenet.py @@ -2,8 +2,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import prepare_volume, run_container_and_log from spras.interactome import ( convert_undirected_to_directed, diff --git a/spras/runner.py b/spras/runner.py index 64f552345..994cf5c7d 100644 --- a/spras/runner.py +++ b/spras/runner.py @@ -4,8 +4,8 @@ # supported algorithm imports from spras.allpairs import AllPairs from spras.btb import BowTieBuilder -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.dataset import Dataset, DatasetSchema from spras.diamond import DIAMOnD from spras.domino import DOMINO diff --git a/spras/rwr.py b/spras/rwr.py index 284150d4d..38bc45342 100644 --- a/spras/rwr.py +++ b/spras/rwr.py @@ -4,8 +4,8 @@ import pandas as pd from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( diff --git a/spras/strwr.py b/spras/strwr.py index 097553fd5..b3fd86c39 100644 --- a/spras/strwr.py +++ b/spras/strwr.py @@ -3,8 +3,8 @@ from pydantic import BaseModel, ConfigDict -from spras.config.algorithms import RunSettings from spras.config.container_schema import ProcessedContainerSettings +from spras.config.runs import RunSettings from spras.containers import prepare_volume, run_container_and_log from spras.dataset import Dataset from spras.interactome import ( From 15d00a6187e6967193e18ba56eeb8bb50d6ce997 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 7 May 2026 17:07:06 +0000 Subject: [PATCH 52/59] docs(timeout): use nested params --- docs/timeout.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/timeout.rst b/docs/timeout.rst index a2e4b64f1..e58b2e506 100644 --- a/docs/timeout.rst +++ b/docs/timeout.rst @@ -13,7 +13,8 @@ timeout: runs: run1: timeout: 1d - k: 200 + params: + k: 200 The timeout string parsing is delegated to `pytimeparse `__ (examples linked here). This @@ -42,10 +43,12 @@ default unless otherwise specified in a specific run. run1: # this uses timeout: 2d timeout: 2d - k: 200 + params: + k: 200 run2: # this uses timeout: 1d - k: 100 + params: + k: 100 This is also useful for algorithms which take in no parameters, such as ``allpairs``: From 73f7b8dfbdfc10d890d330c0b96c18cfb2d0c3d9 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 7 May 2026 17:18:46 +0000 Subject: [PATCH 53/59] fix: properly define validate_duration --- spras/config/runs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spras/config/runs.py b/spras/config/runs.py index 9ed3205cf..5d33568c0 100644 --- a/spras/config/runs.py +++ b/spras/config/runs.py @@ -1,11 +1,14 @@ from typing import Annotated, Optional from pydantic import BaseModel, BeforeValidator, ConfigDict +from pytimeparse import parse def validate_duration(value): - parsed_duration = value(value, granularity='seconds') + if isinstance(value, int): return value + parsed_duration = parse(value, granularity='seconds') if not parsed_duration: raise RuntimeError(f"Encountered unparsable duration string '{value}'.") + return parsed_duration PyDateTimeDuration = Annotated[ int, From 4a95d79df69f98f6e3e3db6c8ff0f6fa654de6c5 Mon Sep 17 00:00:00 2001 From: "Tristan F.-R." Date: Thu, 7 May 2026 17:20:58 +0000 Subject: [PATCH 54/59] fix(Snakefile): properly fetch run settings --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 2534c12c1..0abecbfd3 100644 --- a/Snakefile +++ b/Snakefile @@ -289,7 +289,7 @@ rule reconstruct: # Get the timeout from the config and use it as an input. # TODO: This has unexpected behavior when this rule succeeds but the timeout extends, # making this rule run again. - timeout = lambda wildcards: _config.config.algorithm_param_timeouts[params_index(wildcards.params)] + timeout = lambda wildcards: _config.config.algorithm_param_run_settings[params_index(wildcards.params)].timeout run: # Create a copy so that the updates are not written to the parameters logfile algorithm_params = reconstruction_params(wildcards.algorithm, wildcards.params).copy() From 976816a37533f88b8036b9a7637f4edfa9377317 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 8 May 2026 19:47:01 +0000 Subject: [PATCH 55/59] refactor: move cli.py to spras/cli --- spras/{ => cli}/cli.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spras/{ => cli}/cli.py (100%) diff --git a/spras/cli.py b/spras/cli/cli.py similarity index 100% rename from spras/cli.py rename to spras/cli/cli.py From 237ef5401e6fa79c9aa507c2067d6f4f65b98381 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 8 May 2026 19:47:08 +0000 Subject: [PATCH 56/59] refactor: adjust references to cli.py --- pyproject.toml | 4 ++-- spras/__main__.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c9d0ffded..0b48513b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,10 +49,10 @@ dev = [ "Issues" = "https://github.com/Reed-CompBio/spras/issues" [project.entry-points."pipx.run"] -spras = "spras.cli:run" +spras = "spras.cli.cli:run" [project.scripts] -spras = "spras.cli:run" +spras = "spras.cli.cli:run" [build-system] requires = ["setuptools>=64.0"] diff --git a/spras/__main__.py b/spras/__main__.py index 16b442b7e..40af05213 100644 --- a/spras/__main__.py +++ b/spras/__main__.py @@ -1,3 +1,3 @@ if __name__ == "__main__": - from spras.cli import run + from spras.cli.cli import run run() From 35387b480a275227d0e3d512e97d5e3bdd6b35de Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 8 May 2026 13:38:15 -0700 Subject: [PATCH 57/59] fix: add conditionals to runs --- spras/config/runs.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spras/config/runs.py b/spras/config/runs.py index 5d33568c0..7044f3146 100644 --- a/spras/config/runs.py +++ b/spras/config/runs.py @@ -1,6 +1,6 @@ from typing import Annotated, Optional -from pydantic import BaseModel, BeforeValidator, ConfigDict +from pydantic import BaseModel, BeforeValidator, ConfigDict, Field from pytimeparse import parse @@ -21,4 +21,11 @@ class RunSettings(BaseModel): timeout: Optional[PyDateTimeDuration] = None """The associated timeout with a run, parsed with `pytimeparse`.""" + conditionals: list[str] = Field(alias="if", default_factory=lambda: []) + """ + If any of the specified runs in this list succeed, then this run itself is permitted to run. + We refer to these as 'conditional runs,' and since Python reserves the `if` keyword, we call + them "conditionals" for short in-code, but users interface with this using `if`. + """ + model_config = ConfigDict(extra='forbid', use_attribute_docstrings=True) From 98830fb602182e514a65a4612f125f82d52da054 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Mon, 11 May 2026 05:04:12 +0000 Subject: [PATCH 58/59] refactor: preserve semantics of list --- spras/config/algorithms.py | 123 +++++++-------------- spras/config/config.py | 19 +--- spras/config/function_parsing.py | 49 +++++++++ spras/config/tunable.py | 181 +++++++++++++++++++++++++++++++ test/test_config.py | 2 +- 5 files changed, 272 insertions(+), 102 deletions(-) create mode 100644 spras/config/function_parsing.py create mode 100644 spras/config/tunable.py diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index 06bdc1cc3..c0fb53686 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -3,11 +3,9 @@ parameter combinations. This has been isolated from schema.py as it is not declarative, and rather mainly contains validators and lower-level pydantic code. """ -import ast import copy -from typing import Annotated, Any, Callable, Literal, Union, cast, get_args +from typing import Annotated, Any, Literal, Union, cast, get_args, get_origin -import numpy as np from pydantic import ( BaseModel, BeforeValidator, @@ -18,77 +16,23 @@ ) from spras.config.runs import RunSettings +from spras.config.tunable import FloatTunable, IntegerTunable, TunableSet from spras.runner import algorithms # This contains the dynamically generated algorithm schema for use in `schema.py` __all__ = ['AlgorithmUnion'] -def is_numpy_friendly(type: type[Any] | None) -> bool: - """ - Whether the passed in type can have any numpy helpers. - This is used to provide hints in the JSON schema, - and to determine whether or not to allow for easy ranges using - `python_evalish_coerce`. - """ - allowed_types = (int, float) - - # check basic types, then check optional types - return type in allowed_types or \ - any([arg for arg in get_args(type) if arg in allowed_types]) - -def python_evalish_coerce(value: Any) -> Any: - """ - Allows for using numpy and python calls: specifically, - `range`, `np.linspace`, `np.arange`, and `np.logspace` are supported. - - **Safety Note**: This does not prevent availability attacks: this can still exhaust - resources if wanted. This only prevents secret leakage. - """ - - if not isinstance(value, str): - return value - - # These strings are in the form of function calls `function.name(param1, param2, ...)`. - # Since we want to avoid `eval` (since this might be running in the secret-sensitive HTCondor), - # we need to parse these functions. - functions_dict: dict[str, Callable[[list[Any]], list[Union[int, float]]]] = { - 'range': lambda params: list(range(*params)), - "np.linspace": lambda params: list(np.linspace(*params)), - "np.arange": lambda params: list(np.arange(*params)), - "np.logspace": lambda params: list(np.logspace(*params)), - } - - # To do this, we get the AST of our string as an expression - # (filename='' is to make the error message more closely resemble that of eval.) - value_ast = ast.parse(value, mode='eval', filename='') +def determine_inner_type(outer_type: type): + # TODO: this doesn't handle annotated types. + if get_args(outer_type) != () and get_origin(outer_type) == Union: + # We map arbitrary unions Union[A, ...] -> Union[determine_inner_type(A), ..., TunableSet[Union[A, ...]]], + # where we include the extra entry at the end to allow for heterogeneous lists. + return Union[*(determine_inner_type(arg) for arg in get_args(outer_type)), TunableSet[outer_type]] - # Then we do some light parsing - we're only looking to do some literal evaluation - # (allowing light python notation) and some basic function parsing. Full python programs - # should just generate a config.yaml. - - # This should always be an Expression whose body is Call (a function). - if not isinstance(value_ast.body, ast.Call): - raise ValueError(f'This argument "{value}" was interpreted as a non-function-calling string: it should be a function call (e.g. range(100, 201, 50)), or an int or a float.') - - # We get the function name back as a string - function_name = ast.unparse(value_ast.body.func) - - # and we use the (non-availability) safe `ast.literal_eval` to support literals passed into functions. - arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] - - if function_name not in functions_dict: - raise ValueError(f"{function_name} is not an allowed function to be run! Allowed functions: {list(functions_dict.keys())}") - - return functions_dict[function_name](arguments) - -def list_coerce(value: Any) -> Any: - """ - Coerces to a value to a list if it isn't already. - Used as a BeforeValidator. - """ - if not isinstance(value, list): - return [value] - return value + if outer_type == float: return FloatTunable + if outer_type == int: return IntegerTunable + # We fall back to base `TunableSet` otherwise. + return TunableSet[outer_type] # This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection # and preserve rich type information at runtime. @@ -109,34 +53,43 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod # ... # and we want to transform this to: # class AlgorithmParamsCombination(BaseModel): - # key1: list[int] - # key2: list[list[str]] + # key1: TunableInteger + # key2: TunableSet[list[str]] + # ... + # where all key-types are children of `Tunable`. # However, we want to preserve certain conveniences (singleton values, fake python evaluation), # so we also make use of BeforeValidators to do so, and we pass over their preferences into the JSON schema. # (Note: This function does not worry about getting the cartesian product of this.) - # Map our fields to a list (assuming we have no nested keys), + # Map our fields to `Tunable`s (assuming we have no nested keys), # and specify our user convenience validators - mapped_list_field: dict[str, Annotated] = dict() + mapped_list_field: dict[str, tuple] = dict() for field_name, field in model.model_fields.items(): # We need to create a copy of the field, # as we need to make sure that it gets mapped to the list coerced version of the field. new_field = copy.deepcopy(field) new_field.validate_default = True - mapped_list_field[field_name] = (Annotated[ - list[field.annotation], - # This order isn't arbitrary. - # https://docs.pydantic.dev/latest/concepts/validators/#ordering-of-validators - # This runs second. This coerces any singletons to lists. - BeforeValidator(list_coerce, json_schema_input_type=Union[field.annotation, list[field.annotation]]), - # This runs first. This evaluates numpy utils for integer/float lists - BeforeValidator( - python_evalish_coerce, - # json_schema_input_type (sensibly) overwrites, so we have to specify the entire union again here. - json_schema_input_type=Union[field.annotation, list[field.annotation], str] - ) if is_numpy_friendly(field.annotation) else None - ], new_field) + assert field.annotation is not None, f"Field {field.title} ({field}) has a None annotation, but we require type annotations on fields." + field_type = determine_inner_type(field.annotation) + + mapped_list_field[field_name] = ( + Union[ + # We first try to validate the type directly + field_type, + # If that fails, we coerce it into a list beforehand then validate it again. + Annotated[ + field_type, + BeforeValidator( + lambda value: [value], + # and we ensure that in a JSON schema, this is properly marked + # as a singleton value. + json_schema_input_type=field.annotation + ), + ] + ], + new_field + ) # Runtime assertion check: mapped_list_field does not contain any `__-prefixed` fields for key in mapped_list_field.keys(): diff --git a/spras/config/config.py b/spras/config/config.py index 5b4196894..f12a7cb76 100644 --- a/spras/config/config.py +++ b/spras/config/config.py @@ -19,7 +19,6 @@ from pathlib import Path from typing import Any -import numpy as np import yaml from spras.config.container_schema import ProcessedContainerSettings @@ -197,26 +196,14 @@ def process_algorithms(self, raw_config: RawConfig): param_name_list = [] # We convert our run parameters to a dictionary, allowing us to iterate over it run_subscriptable = vars(runs[run_name].params) - for param in run_subscriptable: + # `param_values` is guaranteed to be `Tunable[Any]` by algorithms.py + for param, param_values in run_subscriptable.items(): param_name_list.append(param) - # this is guaranteed to be list[Any] by algorithms.py - param_values: list[Any] = run_subscriptable[param] - all_runs.append(param_values) + all_runs.append(param_values.to_list()) run_list_tuples = list(it.product(*all_runs)) param_name_tuple = tuple(param_name_list) for r in run_list_tuples: run_dict = dict(zip(param_name_tuple, r, strict=True)) - # TODO: Workaround for yaml.safe_dump in Snakefile write_parameter_log. - # We would like to preserve np info for larger floats and integers on the config, - # but this isn't strictly necessary for the pretty yaml logging that's happening - if we - # want to preserve the precision, we need to output this into yaml as strings. - for param, value in run_dict.copy().items(): - if isinstance(value, np.integer): - run_dict[param] = int(value) - if isinstance(value, np.floating): - run_dict[param] = float(value) - if isinstance(value, np.ndarray): - run_dict[param] = value.tolist() hash_run_dict = copy.deepcopy(run_dict) if self.immutable_files: # Incorporates the `spras_revision` into the hash diff --git a/spras/config/function_parsing.py b/spras/config/function_parsing.py new file mode 100644 index 000000000..dab3a5e0c --- /dev/null +++ b/spras/config/function_parsing.py @@ -0,0 +1,49 @@ +import ast +from typing import Callable + + +def python_evalish_coerce[T]( + value: str, + func: Callable[..., T], + desired_function_name: str +) -> T: + """ + A convenient function for parsing strings of the form + `function_name(arg1, arg2, named=arg3, ...)` using Python's `ast`, + and evaluating them against some anonymous function. We choose to immediately evaluate + against `func`, as Python's internal parameter validation logic encapsulates precisely + what we need for this function. + + Since these errors occur in the context of a union type, we don't worry too much + about trying to collect all function names for erroring purposes. + + @param value: The value to parse against. + @param func: The function to pass arguments to. + @param desired_function_name: The function name to look for. + + @returns The result of calling `func` with the arguments in `value` + """ + # To do this, we get the AST of our string as an expression + # (filename='' is to make the error message more closely resemble that of eval.) + value_ast = ast.parse(value, mode='eval', filename='') + + # Then we do some light parsing - we're only looking to do some literal evaluation + # (allowing light python notation) and some basic function parsing. Full python programs + # should just generate a config.yaml. + + # This should always be an Expression whose body is Call (a function). + if not isinstance(value_ast.body, ast.Call): + raise ValueError(f'This argument "{value}" was interpreted as a non-function-calling string: it should be a function call (e.g. range(100, 201, 50)), or an int or a float.') + + # We get the function name back as a string + function_name = ast.unparse(value_ast.body.func) + + if desired_function_name != function_name: + raise ValueError(f"Tried looking for a function of the name {desired_function_name}, but got {function_name} instead.") + + # and we use the (non-availability) safe `ast.literal_eval` to support literals passed into functions. + arguments = [ast.literal_eval(arg) for arg in value_ast.body.args] + # TODO: unclear when keyword.arg can be None? We filter for none-None values to satisfy the type checker. + kv_arguments = {keyword.arg: ast.literal_eval(keyword.value) for keyword in value_ast.body.keywords if keyword.arg is not None} + + return func(*arguments, **kv_arguments) diff --git a/spras/config/tunable.py b/spras/config/tunable.py new file mode 100644 index 000000000..bde58ec22 --- /dev/null +++ b/spras/config/tunable.py @@ -0,0 +1,181 @@ +""" +A collection of 'tunable' functions, or functions which can be tuned to +produce 'midpoints.' + +We consider parameter tuning a first-class feature of SPRAS configs, +so all algorithm list parameters are all marked with `Tunable`. +""" + +from abc import ABC, abstractmethod +from collections.abc import Set +from typing import Annotated, Any, Callable, List, Self, Union + +import numpy +from pydantic import BaseModel, Field, RootModel, model_validator + +from spras.config.function_parsing import python_evalish_coerce + + +class Tunable[T](ABC): + @abstractmethod + def tune(self) -> Self: + """ + Creates a tuned version of `self` which contains some sembalance of 'midpoints' on top of `self.` + + We impose a restriction: for all `P` implementing `Tunable`, any valid implementation of `P#tune` must satisfy that + for all `p : P`, we can construct some `p' = p.tune()` such that: + 1. `to_list(p)` is a sublist of `to_list(p')`, + 2. for every x, y in `to_list(p)` such that no z in `to_list(p)` satisfies x < z < y, there either: + - exists a unique a z' in `to_list(p')` such that x < z' < y. + - no such x < z' < y exists for all z' in to_list(p'). + + We do this for the sake of parameter tuning (to avoid situations where what runs depend on what other runs is ambiguous), + though this can be extended in the future to avoid that. + """ + # TODO: enforce at runtime + pass + + @abstractmethod + def to_list(self) -> List[T]: + pass + +class TunableSet[S](RootModel[List[S]], Tunable[S]): + """ + A thin wrapper class to allow generic sets to 'pretend' to be tunable. + Note that `Tunableset#tune` is the identity function. + """ + + def tune(self): + # NOTE: We use `type(self)` to access the constructor [This is annoying and I'm not a particular fan of it]. + # This happens throughout this file. + return type(self)(self.root) + + def to_list(self): + return self.root + +def parse_from_function[T](data: Any, func: Callable[..., T], desired_function_name: str) -> T: + """Convenience wrapper around `python_evalish_coerce` for trivial before model_validator declarations.""" + if not isinstance(data, str): + return data + + return python_evalish_coerce(data, func, desired_function_name) + +class BaseTunable[T](BaseModel, Tunable[T]): + """No-op class to avoid double inheritance issues for IntegerAdapter""" + pass + +# TODO: The way we do `parse_from_string` for these methods is lenghty and bad for types, +# and frankly, this section is a lot of length "declaring definitions" boilerplate, with the only substance +# happening in their respective `tune` methods: can we make this better? +class Range(BaseModel, Tunable[int]): + """A `range`-backed list of integers.""" + start: int = 0 + stop: int + step: int = 1 + + @model_validator(mode='before') + @classmethod + def parse_from_string(cls, data: Any) -> Any: + return parse_from_function(data, lambda start, stop, step=1: {"start": start, "stop": stop, "step": step}, "range") + + def tune(self): + return type(self)(start=self.start, stop=self.stop, step=int(self.step / 2.0)) + + def to_list(self): + return list(range(self.start, self.stop, self.step)) + +class LinSpace(BaseTunable[float]): + """ + A tuning wrapper for numpy.linspace: + see its associated documentation for parameter information. + """ + + start: float + stop: float + num: int = 50 + endpoint: bool + + @model_validator(mode='before') + @classmethod + def parse_from_string(cls, data: Any) -> Any: + return parse_from_function( + data, + lambda start, stop, num=50, endpoint=True: {"start": start, "stop": stop, "num": num, "endpoint": endpoint}, + "np.linspace" + ) + + def tune(self): + return type(self)(start=self.start, stop=self.stop, num=self.num * 2, endpoint=self.endpoint) + + def to_list(self): + return list(numpy.linspace(self.start, self.stop, self.num, endpoint=self.endpoint).tolist()) + +class ARange(BaseTunable[float]): + """ + A tuning wrapper for numpy.arange: + see its associated documentation for parameter information. + """ + + start: float + stop: float + step: float = 1.0 + + @model_validator(mode='before') + @classmethod + def parse_from_string(cls, data: Any) -> Any: + return parse_from_function( + data, + lambda start, stop, step=1.0: {"start": start, "stop": stop, "step": step}, + "np.arange" + ) + + def tune(self): + return type(self)(start=self.start, stop=self.stop, step=self.step / 2) + + def to_list(self): + return list(numpy.arange(self.start, self.stop, self.step).tolist()) + +class LogSpace(BaseTunable[float]): + """ + A tuning wrapper for numpy.logspace: + see its associated documentation for parameter information. + """ + + start: float + stop: float + num: int = 50 + endpoint: bool = True + base: float = 10.0 + + @model_validator(mode='before') + @classmethod + def parse_from_string(cls, data: Any) -> Any: + return parse_from_function( + data, + lambda start, stop, num=50, endpoint=True, base=10.0: + {"start": start, "stop": stop, "num": num, "endpoint": endpoint, "base": base}, + "np.logspace" + ) + + def tune(self): + # TODO: does logspace admit a proper `tune` implementation? + return type(self)(start=self.start, stop=self.stop, num=self.num, endpoint=self.endpoint, base=self.base) + + def to_list(self): + return list(numpy.logspace(self.start, self.stop, self.num, endpoint=self.endpoint, base=self.base).tolist()) + +class IntegerAdapter[S : (BaseTunable[float], BaseTunable[int])](RootModel[S], Tunable[int]): + """Takes some Tunable[float/int] backed by a BaseModel and turns it into a Tunable[int]""" + + def tune(self): + return type(self)(self.root.tune()) + + def to_list(self): + return list({int(elem) for elem in self.root.to_list()}) + + +FloatTunable = Annotated[Union[Range, LinSpace, ARange, LogSpace, TunableSet[float]], Field(union_mode="left_to_right")] + +# We have to annoyingly spread IntegerAdapter across every type we care about, to avoid multiple inheritance issues +# TODO: maybe a better way to do this? +IntegerTunable = Annotated[Union[Range, IntegerAdapter[LinSpace], IntegerAdapter[ARange], IntegerAdapter[LogSpace], TunableSet[int]], Field(union_mode="left_to_right")] diff --git a/test/test_config.py b/test/test_config.py index 9e7ff40cc..12b4a65f9 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -110,7 +110,7 @@ def get_test_config(): return test_raw_config -def value_test_util(alg: str, run_name: str, param_type: type[BaseModel], configurations: Iterable[BaseModel]): +def value_test_util[T: BaseModel](alg: str, run_name: str, param_type: type[T], configurations: Iterable[T]): """ Utility test function to be able to test against certain named runs under algorithms. This is, unfortunately, a very holistic function that depends From 9d00065ab6fbb4925c2c5df3a1a6618289beaec3 Mon Sep 17 00:00:00 2001 From: "Tristan F." Date: Fri, 22 May 2026 21:42:26 +0000 Subject: [PATCH 59/59] feat: init parameter tuning --- spras/cli/tune.py | 211 +++++++++++++++++++++++++++++++++++++ spras/cli/util.py | 17 +++ spras/config/algorithms.py | 39 ++++--- spras/config/tunable.py | 8 +- 4 files changed, 256 insertions(+), 19 deletions(-) create mode 100644 spras/cli/tune.py create mode 100644 spras/cli/util.py diff --git a/spras/cli/tune.py b/spras/cli/tune.py new file mode 100644 index 000000000..74c5485b9 --- /dev/null +++ b/spras/cli/tune.py @@ -0,0 +1,211 @@ +""" +Generic algorithm tuning. We spend a good chunk of this file setting up configuration tuning, +then finally use it by wrapping it in a CLI subcommand that takes in a configuration. +""" + +from dataclasses import dataclass, field +import itertools +from typing import Any, Callable, List, Mapping, NamedTuple, Optional +from pydantic import BaseModel, TypeAdapter +from spras.cli.util import window +from spras.config.schema import RawConfig +from spras.config.tunable import Tunable +from spras.util import NpHashEncoder, hash_params_sha1_base32 + +class Neighborly[C, N](NamedTuple): + """A neighborly element takes note of itself and its desired neighbors.""" + current: C + neighbors: List[N] = field(default_factory=list) + +def type_algorithm_params(params: BaseModel) -> dict[str, Tunable[Any]]: + """ + Applies runtime assertions to `params` and coerces it down into + the looser `dict` structure + """ + unfolded_params: dict[str, Tunable[Any]] = vars(params) + for key in unfolded_params.keys(): + # We assert at runtime our (implicit) cast of `unfolded_params` + assert issubclass(type(unfolded_params[key]), Tunable) + return unfolded_params + +@dataclass +class TunableNeighborly[S](Tunable[Neighborly[S, S]]): + """ + NOTE: This does not store data in the form of `Tunable[Neighborly[S]]`, + but that representation can be fetched through the `TunableNeighborly#to_list` method. + """ + wrapped: Tunable[S] + neighborly_elements: Mapping[S, List[S]] = field(default_factory=dict) + + def tune(self): + old_wrapped = self.wrapped.to_list() + new_wrapped = self.wrapped.tune() + new_neighborly_elements = dict(self.neighborly_elements) + for prev, curr, nxt in window(new_wrapped.to_list(), n = 3): + if curr in old_wrapped: continue + new_neighborly_elements[curr] = [prev, nxt] + + return type(self)(new_wrapped, new_neighborly_elements) + + def to_list(self): + return [ + Neighborly(elem, neighbors=self.neighborly_elements[elem] if elem in self.neighborly_elements else []) + for elem in self.wrapped.to_list() + ] + +def flatten_params( + params: Mapping[str, List[Neighborly]], + hash_function: Callable[[dict[str, Any]], str] +) -> dict[str, Neighborly[dict[str, Any], str]]: + """ + Turns parameter dictionary lists of the form: + a: [Neighborly(av1, [...]), Neighborly(av2, [...]), ...] + b: [Neighborly(bv1, [...]), Neighborly(bv2, [...]), ...] + To a dictionary of the form + {hash: Neighborly({a: av1, b: bv1}, [hash1, ...]), ...} + + @param params: a dictionary with a description as described above. + @param hash_function: the hash function to use to compute hashes for the return dictionary + """ + final_dictionary: dict[str, Neighborly[dict[str, Any], str]] = dict() + # Each parameter is a tuple of the form (a: Neighborly(...), b: Neighborly(...), ...) + for parameter_combination in itertools.product(*params.values()): + # We want to collect a list of neighbors of each parameter combination + neighbors: list[dict[str, Any]] = [] + # We transform it into {a: Neighborly(...), b: Neighborly(...), ...} + zipped_parameter_combination = dict(zip(params.keys(), parameter_combination)) + # and create a version of it with no neighbors + forgetful_parameter_combination = {k: v.current for k, v in zipped_parameter_combination.items()} + for key, value in zipped_parameter_combination.items(): + for neighbor in value.neighbors: + # Then, add the adjacent parameter combination to the `forgetful_parameter_combination`. + adjacent_parameter_combination = dict(forgetful_parameter_combination) + adjacent_parameter_combination[key] = neighbor + neighbors.append(adjacent_parameter_combination) + final_dictionary[hash_function(forgetful_parameter_combination)] = Neighborly( + forgetful_parameter_combination, + neighbors=[hash_function(neighbor) for neighbor in neighbors] + ) + return final_dictionary + +# The following is a list of utility functions to encapsulate the above behavior +# in decreasing granularity. + +def prepare_params( + params: Mapping[str, Tunable] +) -> dict[str, TunableNeighborly]: + """Attatches empty neighbor information to some given parameters.""" + return {k: TunableNeighborly(v) for k, v in params.items()} + +def forget_tuning_params( + params: Mapping[str, Tunable[Neighborly]] +) -> dict[str, List[Neighborly]]: + """Removes tuning information about some given parameters""" + return {k: v.to_list() for k, v in params.items()} + +def tune_params( + params: Mapping[str, Tunable], + hash_function: Callable[[dict[str, Any]], str], + tune_count: int = 1 +) -> dict[str, Neighborly[dict[str, Any], str]]: + """ + Tunes some `params` dictionary some `tune_count` number of times. + + @param params: a dictionary of parameter names to their `Tunable` values. + @param hash_function: The function to use to hash parameter dictionaries. + @param tune_count: The number of times to tune. + @returns: A dictionary from hashes to parameter combinations, with associated neighbor information + of what runs a run should depend on (see conditional runs.) + """ + if tune_count < 1: + raise ValueError(f"tune_count must be a positive integer: got {tune_count} instead.") + # We first attach empty neighbor information to associated parameters + prepared_params = prepare_params(params) + # Then we perform our tuning + tune_ready_params: Optional[dict[str, TunableNeighborly]] = None + for _ in range(tune_count): + tune_ready_params = {k: v.tune() for k, v in prepared_params.items()} + assert tune_ready_params is not None, "Empty range? This should have been caught by the first assumption." + # We then forget tuning information + tuned_params = forget_tuning_params(tune_ready_params) + # then finally flatten the associated parameters + return flatten_params(tuned_params, hash_function) + +def preferred_model_dump(model: BaseModel) -> Any: + """ + A custom invokation of BaseModel#model_dump which uses some preferred defaults to keep + generated configuration sizes to a minimum. + """ + return model.model_dump(by_alias=True, exclude_defaults=True, exclude_unset=True) + +def model_vars(model: Any) -> dict[str, Any]: + """Ignores model variables that are the default or unset: this acts as a kind of `vars`.""" + dumped_model = preferred_model_dump(model) + return {k: v for k, v in dict(model).items() if k in dumped_model.keys()} + +# Beyond this lies code that depends on types generated in algorithms.py. +# NOTE: If algorithms.py is refactored to be type-safe, this should be refactored as well. +def tune_run( + run: Any, + hash_function: Callable[[dict[str, Any]], str], + tune_count: int = 1, + # TODO: refactor to a lambda? + process_run: Optional[Callable[[str], str]] = None +) -> dict[str, Any]: + """ + Takes a run (a dictionary with at least a key "params") and returns a dictionary of + hashes to new runs, preserving the associated non-params run properties + while adding in new conditional runs. + + @param run: A pydantic object of runs. + @param run_prefix: The prefix to attatch to all hashed parameter combinations. + """ + if not process_run: process_run = lambda _: "" + tuned_params = tune_params(model_vars(run.params), hash_function, tune_count) + other_run_settings = {k: v for k, v in preferred_model_dump(run).items() if k != "params"} + return {process_run(k): { + "params": v.current, + **other_run_settings, + "if": [ + *[process_run(neighbor) for neighbor in v.neighbors], + *(other_run_settings["if"] if "if" in other_run_settings else []) + ] + } for k, v in tuned_params.items()} + +def tune_runs( + runs: Any, + hash_function: Callable[[dict[str, Any]], str], + tune_count: int = 1, +) -> dict[str, Any]: + all_runs: dict[str, Any] = dict() + for run_name, run_model in runs.items(): + all_runs = {**all_runs, **tune_run(run_model, hash_function, tune_count, lambda run: f"{run_name}_{run}")} + return all_runs + +def tune_algorithm_union( + algorithm: Any, + hash_function: Callable[[dict[str, Any]], str], + tune_count: int = 1, +) -> Any: + dumped_model = preferred_model_dump(algorithm) + dumped_model["runs"] = tune_runs(algorithm.runs, hash_function, tune_count) + return TypeAdapter(type(algorithm)).validate_python(dumped_model) + +def tune( + config: RawConfig, + hash_function: Callable[[dict[str, Any]], str], + tune_count: int = 1, +) -> RawConfig: + return config.model_copy(update={ + "algorithms": tune_algorithm_union([algorithm for algorithm in config.algorithms], hash_function, tune_count) + }) + +# TODO: isolate and deduplicate from config.py +def hash_func(hash_length: int, params: dict[str, Any]) -> str: + """ + General algorithm parameter dictionary hasher. + """ + return hash_params_sha1_base32(params, hash_length, cls=NpHashEncoder) + +"""The curried variant of `hash_func`.""" +hash_factory = lambda hash_length: lambda params: hash_func(hash_length, params) diff --git a/spras/cli/util.py b/spras/cli/util.py new file mode 100644 index 000000000..448067b79 --- /dev/null +++ b/spras/cli/util.py @@ -0,0 +1,17 @@ +from itertools import islice +from typing import Generator, Iterable + +# Adopted from https://stackoverflow.com/a/6822773/7589775 +# with a proper type signature. +def window[T](sequence: Iterable[T], n: int) -> Generator[tuple[T, ...], None, None]: + """ + Returns a sliding window (of width n) over data from the iterable + s -> (s0,s1,...s[n-1]), (s1,s2,...,sn), ... + """ + it = iter(sequence) + result = tuple(islice(it, n)) + if len(result) == n: + yield result + for elem in it: + result = result[1:] + (elem,) + yield result diff --git a/spras/config/algorithms.py b/spras/config/algorithms.py index c0fb53686..f364741c6 100644 --- a/spras/config/algorithms.py +++ b/spras/config/algorithms.py @@ -11,12 +11,13 @@ BeforeValidator, ConfigDict, Field, + PlainSerializer, ValidationError, create_model, ) from spras.config.runs import RunSettings -from spras.config.tunable import FloatTunable, IntegerTunable, TunableSet +from spras.config.tunable import FloatTunable, IntegerTunable, TunableList from spras.runner import algorithms # This contains the dynamically generated algorithm schema for use in `schema.py` @@ -27,12 +28,12 @@ def determine_inner_type(outer_type: type): if get_args(outer_type) != () and get_origin(outer_type) == Union: # We map arbitrary unions Union[A, ...] -> Union[determine_inner_type(A), ..., TunableSet[Union[A, ...]]], # where we include the extra entry at the end to allow for heterogeneous lists. - return Union[*(determine_inner_type(arg) for arg in get_args(outer_type)), TunableSet[outer_type]] + return Union[*(determine_inner_type(arg) for arg in get_args(outer_type)), TunableList[outer_type]] if outer_type == float: return FloatTunable if outer_type == int: return IntegerTunable # We fall back to base `TunableSet` otherwise. - return TunableSet[outer_type] + return TunableList[outer_type] # This is the most 'hacky' part of this code, but, thanks to pydantic, we avoid reflection # and preserve rich type information at runtime. @@ -74,19 +75,27 @@ def construct_algorithm_model(name: str, model: type[BaseModel]) -> type[BaseMod field_type = determine_inner_type(field.annotation) mapped_list_field[field_name] = ( - Union[ - # We first try to validate the type directly - field_type, - # If that fails, we coerce it into a list beforehand then validate it again. - Annotated[ + Annotated[ + Union[ + # We first try to validate the type directly field_type, - BeforeValidator( - lambda value: [value], - # and we ensure that in a JSON schema, this is properly marked - # as a singleton value. - json_schema_input_type=field.annotation - ), - ] + # If that fails, we coerce it into a list beforehand then validate it again. + Annotated[ + field_type, + BeforeValidator( + lambda value: [value], + # and we ensure that in a JSON schema, this is properly marked + # as a singleton value. + json_schema_input_type=field.annotation + ), + ] + ], + # For cleaner serialization, we also serialize singleton types + # as single arrays rather than full arrays. This is especially useful + # for parameter tuning output. + PlainSerializer( + lambda value: value.to_list()[0] if len(value.to_list()) == 1 else value + ) ], new_field ) diff --git a/spras/config/tunable.py b/spras/config/tunable.py index bde58ec22..4fdd2f627 100644 --- a/spras/config/tunable.py +++ b/spras/config/tunable.py @@ -39,10 +39,10 @@ def tune(self) -> Self: def to_list(self) -> List[T]: pass -class TunableSet[S](RootModel[List[S]], Tunable[S]): +class TunableList[S](RootModel[List[S]], Tunable[S]): """ A thin wrapper class to allow generic sets to 'pretend' to be tunable. - Note that `Tunableset#tune` is the identity function. + Note that `TunableList#tune` is the identity function. """ def tune(self): @@ -174,8 +174,8 @@ def to_list(self): return list({int(elem) for elem in self.root.to_list()}) -FloatTunable = Annotated[Union[Range, LinSpace, ARange, LogSpace, TunableSet[float]], Field(union_mode="left_to_right")] +FloatTunable = Annotated[Union[Range, LinSpace, ARange, LogSpace, TunableList[float]], Field(union_mode="left_to_right")] # We have to annoyingly spread IntegerAdapter across every type we care about, to avoid multiple inheritance issues # TODO: maybe a better way to do this? -IntegerTunable = Annotated[Union[Range, IntegerAdapter[LinSpace], IntegerAdapter[ARange], IntegerAdapter[LogSpace], TunableSet[int]], Field(union_mode="left_to_right")] +IntegerTunable = Annotated[Union[Range, IntegerAdapter[LinSpace], IntegerAdapter[ARange], IntegerAdapter[LogSpace], TunableList[int]], Field(union_mode="left_to_right")]