From a714aea4c77a44f7f501f4c271938e80a1c18250 Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Mon, 6 May 2024 07:23:31 -0400 Subject: [PATCH 01/18] feat: added tool --- .../rt-cetsa-moltprot-tool/.bumpversion.cfg | 29 + regression/rt-cetsa-moltprot-tool/Dockerfile | 20 + regression/rt-cetsa-moltprot-tool/README.md | 24 + regression/rt-cetsa-moltprot-tool/VERSION | 1 + regression/rt-cetsa-moltprot-tool/plugin.json | 52 + .../rt-cetsa-moltprot-tool/pyproject.toml | 76 + .../rt-cetsa-moltprot-tool/run-plugin.sh | 20 + .../regression/rt_cetsa_moltprot/VERSION | 1 + .../regression/rt_cetsa_moltprot/__init__.py | 48 + .../regression/rt_cetsa_moltprot/__main__.py | 76 + .../regression/rt_cetsa_moltprot/core.py | 3780 +++++++++++++++++ .../regression/rt_cetsa_moltprot/models.py | 658 +++ 12 files changed, 4785 insertions(+) create mode 100644 regression/rt-cetsa-moltprot-tool/.bumpversion.cfg create mode 100755 regression/rt-cetsa-moltprot-tool/Dockerfile create mode 100644 regression/rt-cetsa-moltprot-tool/README.md create mode 100644 regression/rt-cetsa-moltprot-tool/VERSION create mode 100644 regression/rt-cetsa-moltprot-tool/plugin.json create mode 100644 regression/rt-cetsa-moltprot-tool/pyproject.toml create mode 100644 regression/rt-cetsa-moltprot-tool/run-plugin.sh create mode 100644 regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/VERSION create mode 100644 regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py create mode 100644 regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py create mode 100644 regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py create mode 100644 regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/models.py diff --git a/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg b/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg new file mode 100644 index 0000000..d858009 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py] diff --git a/regression/rt-cetsa-moltprot-tool/Dockerfile b/regression/rt-cetsa-moltprot-tool/Dockerfile new file mode 100755 index 0000000..dd390da --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.3.6 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".arrow" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_moltprot"] +CMD ["--help"] diff --git a/regression/rt-cetsa-moltprot-tool/README.md b/regression/rt-cetsa-moltprot-tool/README.md new file mode 100644 index 0000000..24a28c6 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/README.md @@ -0,0 +1,24 @@ +# RT_CETSA Moltprot Regression (v0.1.0) + +This WIPP plugin runs regression analysis for the RT-CETSA pipeline. +The input csv file should be sorted by `Temperature` column. + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes eight input argument and one output argument: + +| Name | Description | I/O | Type | +|--------------------|------------------------------------------------------------|--------|---------------| +| `--inpDir` | Input data collection to be processed by this plugin | Input | genericData | +| `--filePattern` | Pattern to parse tabular files | Input | string | +| `--outDir` | Output file | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/regression/rt-cetsa-moltprot-tool/VERSION b/regression/rt-cetsa-moltprot-tool/VERSION new file mode 100644 index 0000000..6e8bf73 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/regression/rt-cetsa-moltprot-tool/plugin.json b/regression/rt-cetsa-moltprot-tool/plugin.json new file mode 100644 index 0000000..e48b1d0 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/plugin.json @@ -0,0 +1,52 @@ +{ + "name": "RT-CETSA Moltprot", + "version": "0.1.0", + "title": "RT-CETSA Moltprot", + "description": "Run regression analysis for the RT-CETSA pipeline.", + "author": "Nicholas Schaub (nick.schaub@nih.gov), Najib Ishaq (najib.ishaq@nih.gov)", + "institution": "National Center for Advancing Translational Sciences, National Institutes of Health", + "repository": "https://github.com/PolusAI/tabular-tools", + "website": "https://ncats.nih.gov/preclinical/core/informatics", + "citation": "", + "containerId": "polusai/rt-cetsa-moltprot-tool:0.1.0", + "baseCommand": [ + "python3", + "-m", + "polus.tabular.regression.rt_cetsa_moltprot" + ], + "inputs": [ + { + "name": "inpDir", + "type": "genericData", + "description": "Input data collection to be processed by this plugin", + "required": true + }, + { + "name": "filePattern", + "type": "string", + "description": "Pattern to parse input files", + "default": ".+", + "required": false + } + ], + "outputs": [ + { + "name": "outDir", + "type": "genericData", + "description": "Output data collection" + } + ], + "ui": [ + { + "key": "inputs.inpDir", + "title": "Input collection", + "description": "Input data collection to be processed by this plugin" + }, + { + "key": "inputs.filePattern", + "title": "filePattern", + "description": "Pattern to parse input files", + "default": ".+" + } + ] +} diff --git a/regression/rt-cetsa-moltprot-tool/pyproject.toml b/regression/rt-cetsa-moltprot-tool/pyproject.toml new file mode 100644 index 0000000..8627e09 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/pyproject.toml @@ -0,0 +1,76 @@ +[tool.poetry] +name = "polus_tabular_regression_rt_cetsa_moltprot" +version = "0.1.0" +description = "Run regression analysis for the RT-CETSA pipeline." +authors = [ + "Nick Schaub ", + "Najib Ishaq ", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +filepattern = "^2.0.5" +pandas = "^2.2.2" +matplotlib = "^3.8.4" +scipy = "^1.13.0" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +extend-ignore = [ + "RET505", # Unnecessary `else` after `return` statement + "E501", # Line too long + "ANN001", # Missing type annotation + "D102", # Missing docstring in public method + "ANN201", # Missing return type annotation + "N806", # Variable in function should be lowercase + "D205", # 1 blank line required between summary line and description + "N803", # Argument name should be lowercase + "PLR0913", # Too many arguments + "D415", # First line should end with a period, question mark, or exclamation point + "PLR2004", # Magic value used in comparison + "B006", # Do not use mutable default arguments + "D107", # Missing docstring + "D101", # Missing docstring + "E731", # Do not assign a lambda expression, use a def + "E402", # Module level import not at top of file + "PTH123", # `open()` should be replaced with `Path.open()` + "PTH118", # `os.path.join()` should be replaced with `/` operator + "PTH100", # `os.path.abspath()` should be replaced with `Path.resolve()` + "PLR0915", # Too many statements + "PLR0912", # Too many branches + "C901", # Function is too complex + "T201", # `print` used + "E722", # Do not use bare 'except' + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling + "ANN202", # Missing return type annotation for private function + "ARG002", # Unused method argument + "N802", # Function name should be lowercase + "PTH103", # `os.makedirs()` should be replaced with `Path.mkdir(parents=True)` + "ANN003", # Missing type annotation for `**kwargs` + "B007", # Loop control variable not used within the loop body + "ANN204", # Missing return type annotation for magic method + "D417", # Missing argument descriptions in the docstring + "ANN205", # Missing return type annotation for static method + "PLR5501", # Use `elif` instead of `else` following `if` condition to avoid unnecessary indentation + "EM102", # Exception must not use an f-string literal + "D414", # Section has no content + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + "A001", # Variable `input` is shadowing a Python builtin + "A002", # Argument `input` is shadowing a Python builtin + "E741", # Ambiguous variable name: `l` + "PTH120", # `os.path.dirname()` should be replaced by `Path.parent` + "N816", # Variable `cfFilename` in global scope should not be mixedCase + "PTH109", # `os.getcwd()` should be replaced by `Path.cwd()` +] diff --git a/regression/rt-cetsa-moltprot-tool/run-plugin.sh b/regression/rt-cetsa-moltprot-tool/run-plugin.sh new file mode 100644 index 0000000..bea8c33 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash +version=$( pandas.DataFrame: + """Fit data to a model using Moltprot.""" + fit = core.MoltenProtFit( + filename=file_path, + input_type="csv", + ) + + fit.SetAnalysisOptions( + model="santoro1988", + baseline_fit=3, + baseline_bounds=3, + dCp=0, + onset_threshold=0.01, + savgol=10, + blanks=[], + exclude=[], + invert=False, + mfilt=None, + shrink=None, + trim_max=0, + trim_min=0, + ) + + fit.PrepareData() + fit.ProcessData() + + return fit.plate_results.sort_values("BS_factor") + + +def gen_out_path(file_path: pathlib.Path, out_dir: pathlib.Path) -> pathlib.Path: + """Generate the output path.""" + file_name = file_path.stem + "_moltprot" + POLUS_TAB_EXT + return out_dir / file_name diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py new file mode 100644 index 0000000..3c43a9f --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py @@ -0,0 +1,76 @@ +"""CLI for rt-cetsa-moltprot-tool.""" + +import json +import logging +import os +import pathlib + +import filepattern +import typer +from polus.tabular.regression.rt_cetsa_moltprot import fit_data +from polus.tabular.regression.rt_cetsa_moltprot import gen_out_path + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("polus.tabular.transforms.tabular_merger") +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) + +app = typer.Typer() + + +@app.command() +def main( + inp_dir: pathlib.Path = typer.Option( + ..., + help="Input directory containing the data files.", + exists=True, + dir_okay=True, + readable=True, + resolve_path=True, + ), + pattern: str = typer.Option( + ".+", + help="Pattern to match the files in the input directory.", + ), + preview: bool = typer.Option( + False, + help="Preview the files that will be processed.", + ), + out_dir: pathlib.Path = typer.Option( + ..., + help="Output directory to save the results.", + exists=True, + dir_okay=True, + writable=True, + resolve_path=True, + ), +) -> None: + """CLI for rt-cetsa-moltprot-tool.""" + # TODO: Add to docs that input csv file should be sorted by `Temperature` column. + logger.info("Starting the CLI for rt-cetsa-moltprot-tool.") + + logger.info(f"Input directory: {inp_dir}") + logger.info(f"File Pattern: {pattern}") + logger.info(f"Output directory: {out_dir}") + + fp = filepattern.FilePattern(inp_dir, pattern) + inp_files = [f[1][0] for f in fp()] + + if preview: + out_json = {"files": [gen_out_path(f, out_dir) for f in inp_files]} + with (out_dir / "preview.json").open("w") as f: + json.dump(out_json, f, indent=2) + return + + for f in inp_files: + logger.info(f"Processing file: {f}") + out_path = gen_out_path(f, out_dir) + df = fit_data(f) + df.to_csv(out_path, index=True) + + +if __name__ == "__main__": + app() diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py new file mode 100644 index 0000000..31ba0f1 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py @@ -0,0 +1,3780 @@ +"""Copyright 2018-2021 Vadim Kotov, Thomas C. Marlovits. + +This file is part of MoltenProt. + +MoltenProt is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +MoltenProt is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MoltenProt. If not, see . +""" + +### Citation +# a simple dict with strings that provide different citation formatting +citation = { + "long": "\nIf you found MoltenProt helpful in your work, please cite:\nKotov et al., Protein Science (2021)\ndoi: 10.1002/pro.3986\n", + "html": """

If you found MoltenProt helpful in your work, please cite:

+

Kotov et al., Protein Science (2021)

+

doi: 10.1002/pro.3986

""", + "short": "Citation: Kotov et al., Protein Science (2021) doi: 10.1002/pro.3986", +} + + +### Modules +# some useful mathematical functions +# For printing line number and file name +from inspect import currentframe +from inspect import getframeinfo + +# for generating htmls +from string import Template + +# plotting +import matplotlib.pyplot as plt +import numpy as np +from matplotlib import gridspec + +# for color conversion +from matplotlib.colors import rgb2hex + +# interpolation +from scipy.interpolate import interp1d + +# fitting routine from scipy +from scipy.optimize import curve_fit + +# median filtering +from scipy.signal import medfilt + +cf = currentframe() +cfFilename = getframeinfo(cf).filename # type: ignore[arg-type] + +# handling exceptions +# saving class instances to JSON format +import json + +# creating folders +import os +import sys + +# function to recognize module versions +from distutils.version import LooseVersion + +# for compression of output JSON +# for timestamps +from time import strftime + +# data processing +import pandas as pd + +# import the fitting models +from . import models + +# A variable for reliable access to other resources of MoltenProt (e.g. report template) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + +# MoltenProt is stored in a plain-text file VERSION (also used by setup.py) +# extract and save it to a variable +with open(os.path.join(__location__, "VERSION")) as version_file: + __version__ = version_file.read().strip() + +# get scipy version (some methods may not be available in earlier versions) +scipy_version = sys.modules["scipy"].__version__ + +# check if running from a PyInstaller bundle +if hasattr(sys, "frozen") and hasattr(sys, "_MEIPASS"): + from_pyinstaller = True +else: + from_pyinstaller = False + +# for parallel processing of some for loops using joblib (may hang up) +try: + from joblib import Parallel + from joblib import delayed + + # this is only needed if we want to auto-estimate the available cores - currently done in MoltenProtMain + # access module version without importing the whole module: + joblib_version = sys.modules["joblib"].__version__ + # check joblib version: only 0.12 and above can pickle instance methods (which is used in mp) + if LooseVersion(joblib_version) >= LooseVersion("0.12"): + parallelization = True + else: + print( + "Warning: available joblib version ({}) is incompatible with current code parallelization".format( + joblib_version, + ), + ) + print("Information: Consider updating to joblib 0.12 or above") + parallelization = False +except ImportError: + print("Warning: joblib module not found, parallelization of code is not possible.") + joblib_version = "None" # only for printing version info + parallelization = False + +# NOTE MoltenProtFit and MoltenProtFitMultiple have different parallelization approaches: +# MoltenProtFit - can only parallelize figure plotting and n_jobs=3 works well +# MoltenProtFitMultiple - reads and runs several MoltenProtFit instances in parallel (F330, F350 etc), +# but each of them gets only one job (i.e. MoltenProtFit.n_jobs is always 1) + +### Constants imported from models.py +R = models.R +T_std = models.T_std + +# Standard plate index +# NOTE currently the software requires the layout to contain A1-H12 index, even if only a small part is used +alphanumeric_index = [] +for k in ["A", "B", "C", "D", "E", "F", "G", "H"]: + for l in range(1, 13): + alphanumeric_index.append(k + str(l)) +# convert index to pandas Series and set its name +alphanumeric_index = pd.Series(data=alphanumeric_index) +alphanumeric_index.name = "ID" # type: ignore[attr-defined] + +# dictionary holding the default values and and their description for CLI interface/tooltips +# dictionary key is the name of the option, each entry contains a tuple of default parameter value and its descriptions +# NOTE for mfilt the default value is only used when the option is supplied in the CLI + +# default data preparation parameters +prep_defaults = { + "blanks": [], + "blanks_h": "Input the sample ID's with buffer-only control", # subtract blanks in prep step + "exclude": [], + "exclude_h": 'Specify the well(s) to omit during analysis; this option is intended for simple removal of some bad wells; if many samples must be excluded, use a layout and add "Ignore" to the annotation of the sample', + "invert": False, # DELETE? + "invert_h": "Invert the curve", + "mfilt": None, + "mfilt_h": "Apply median filter with specificed window size (in temperature units) to remove spikes; 4 degrees is a good starting value", + "shrink": None, + "shrink_h": "Average the data points to a given degree step;\n may help to make trends more apparent and speeds up the computation;\n typical values 0.5-3.0", + "trim_max": 0, + "trim_max_h": "Decrease the finishing temperature by this value", + "trim_min": 0, + "trim_min_h": "Increase the starting temperature by this value", +} + +# default analysis parameters +analysis_defaults = { + "model": "santoro1988", + "model_h": "Select a model for describing the experimental data", + "baseline_fit": 10, + "baseline_fit_h": "The length of the input data (in temperature degrees) for initial estimation of pre- and post-transition baselines", + "baseline_bounds": 3, + "baseline_bounds_h": "Baseline bounds are set as multiples of stdev of baseline parameters obtained in the pre-fitting routine; this should stabilize the fit and speed up convergence; set to 0 to remove any bounds for baselines", + "dCp": 0, + "dCp_h": "Heat capacity change of unfolding for all samples (in J/mol/K), used only in equilibrium models; this value overrides the respective column in the layout", + "onset_threshold": 0.01, + "onset_threshold_h": "Percent unfolded to define the onset of unfolding", + "savgol": 10, + "savgol_h": "Set window size (in temperature units) for Savitzky-Golay filter used to calculate the derivative", +} + +# all other settings for MoltenProtFit +defaults = { + "debug": False, # currently not exposed in the CLI + "debug_h": "Print developer information to the console", + "dec": ".", + "dec_h": "CSV decimal separator, enclosed in quotes", + "denaturant": "C", + "denaturant_h": "For plain CSV input only; specify temperature scale that drives denaturation, in K or C", + "j": 1, # TODO not related to the core functions, supply to respective methods (output etc) + "j_h": "Number of jobs to be spawned by parallelized parts of the code; should not be higher than the amount of CPU's in the computer; for most recent laptops a value of 3 is recommended", + "layout": None, # TODO should not be set in SetAnalysisOptions, but rather in __init__ + "layout_h": "CSV file with layout", + "sep": ",", + "sep_h": "CSV separator, enclosed in quotes", + "readout": "Signal", + "readout_h": "For plain CSV input only; specify type of input signal", + "spectrum": False, + "spectrum_h": "If true, columns in the input CSV will be treated as separate wavelengths of a spectrum", + "heatmap_cmap": "coolwarm_r", # a color-safe heatmap color with red being "bad" (low value) + "heatmap_cmap_h": "Matplotlib code for colormap that would be used to color-code heatmaps in reports or images", +} + +# dictionary with avialble models +# NOTE to be automatically identified the model has to be subclass or subsubclass of MoltenProtModel +avail_models = {} +for model in models.MoltenProtModel.__subclasses__(): + avail_models[model.short_name] = model # subclass of MoltenProtModel + for submodel in model.__subclasses__(): # subsubclass of MoltenProtModel + avail_models[submodel.short_name] = submodel + +# add a dummy model to indicate that the dataset should not be analysed +avail_models["skip"] = "skip" # type: ignore[assignment] + +### Utility functions + + +def normalize(input, new_min=0, new_max=1, from_input=False): + """Helper function to normalize a pandas Series. + + Make the data occupy a specified range (defined by new_min and new_max) + + Parameters + ---------- + input : pd.Series + input Series to normalize + new_min, new_max : float + range for normalization (default [0-1]) + from_input : bool + use absolute vales of min/max of the input series to create normalization range + this is used to perform inversion of curves (mirror around X-axis) + + References: + ---------- + The general formula for normalization is from here: + https://en.wikipedia.org/wiki/Normalization_(image_processing) + """ + if from_input: + new_min = min(abs(input)) + new_max = max(abs(input)) + + return (input - input.min()) * (new_max - new_min) / ( + input.max() - input.min() + ) + new_min + + +def to_odd(input, step): + """Helper function to convert the window length in temperature units to an odd number of datapoints. + + NOTE for user's simplicity the window size is specified in temperature units + for filters, however, an odd integer is needed + + Parameters + ---------- + input : float + number in degrees to convert to a number of datapoints + step : float + temperature step in the dataset + + Returns: + ------- + float + an odd number of datapoints which should more or less fit in the + requested temperature range + """ + # calculate the approximate amount of datapoints corresponding to the range + input = input / step + # convert input to an integer + input = int(input) + # if it can be divided by 2 without the remainder (Euclidean division) + # than it is even and needs to become odd + # % computes the remainder from Euclidean division + # the complementary operator is // + if not input % 2: + input = input + 1 + return input + + +def analysis_kwargs(input_dict): + """Takes a dict and returns a valid argument dict for SetAnalysisOptions + Unknown options are removed, default values are supplied. + """ + output = {} + for i, j in input_dict.items(): + if i in list(analysis_defaults.keys()) or i in list(prep_defaults.keys()): + output[i] = j + return output + + +def showVersionInformation(): + """Print dependency version info.""" + print(cfFilename, currentframe().f_lineno) + print(f"MoltenProt v. {__version__}") + if from_pyinstaller: + print("(PyInstaller bundle)") + pd.show_versions(as_json=False) # matplotlib is also there + print(f"joblib : {joblib_version}") + + +### Wrappers + + +def mp_read_excel(filename, sheetname, index_col): + """Read XLSX files to pandas DataFrames and return None if errors occur. + + Parameters + ---------- + filename + filename for reading + sheetname + which sheet to read from file + index_col + which column to use as index + + Returns: + ------- + pd.DataFrame from Excel file or None if reading failed + """ + try: + # NOTE in pandas below 0.24 even though the data should be considered index-free (index_col=None) + # the first column was sometimes used as index (corresponds to Time in Prometheus XLSX) + # for now explicitly pass some column to use as index + return pd.read_excel(filename, sheetname, index_col=index_col) + except: + # NOTE XLSX parsing module has its own error naming system, so individual errors cannot be catched unless xlrd module is imported + print( + 'Warning: sheet "{}" does not exist in the input file {}'.format( + sheetname, + filename, + ), + ) + return None + + +### JSON I/O functions and variables + + +def serialize(obj): + """Helper function for serialization of DataFrames and other non-standard objects.""" + if isinstance(obj, pd.core.frame.DataFrame): + # NOTE original data contains 12 decimal digits, while default encoding parameter is 10 + # pandas calculates with 16 decimal digits (or more?) and in this part there will be certain discrepancy + # between the runs (i.e. run analysis>save to json>run analysis again) + # in any case the impact on the ultimate result is not measurable + return { + "DataFrame": obj.to_json( + force_ascii=False, + double_precision=15, + orient="split", + ), + } + elif isinstance(obj, MoltenProtFit): + output = obj.__dict__ + # delete some method descriptions (only needed for parallel processing) + if "plotfig" in output: + del output["plotfig"] + + # delete layout attribute, it will be set from parent MPFM when loading back + if "layout" in output: + del output["layout"] + + return {"MoltenProtFit": output} + elif isinstance(obj, MoltenProtFitMultiple): + output = obj.__dict__ + # a better way to query a dict, see: + # https://docs.quantifiedcode.com/python-anti-patterns/correctness/not_using_get_to_return_a_default_value_from_a_dictionary.html + # TODO this deletion is probably not needed? + if output.get("PrepareAndAnalyseSingle"): + del output["PrepareAndAnalyseSingle"] + if output.get("WriteOutputSingle"): + del output["WriteOutputSingle"] + # add version info and timestamp + return { + "MoltenProtFitMultiple": output, + "version": __version__, + "timestamp": strftime("%c"), + } + return None + + +def deserialize(input_dict): + """Helper function to deserialize MoltenProtFit instances and DataFrames from JSON files (read into a dict). + + If the object contains any plate_ entries, cycle through them and convert to DataFrames + also applies to the layout entry (again, a DataFrame) + """ + if input_dict.get("MoltenProtFit"): + # read a MPF instance from the dict + # plate_ variables conversion occurs during intitialization. + # the downstream part of the json (e.g. a single MPFIT instance) + output = MoltenProtFit(None, input_type="from_dict") + output.__dict__.update(input_dict["MoltenProtFit"]) + return output + elif input_dict.get("MoltenProtFitMultiple"): + # this level also has version info and timestamp + # NOTE if needed, this section can be used to discard too old JSON sessions + if input_dict.get("version") and input_dict.get("timestamp"): + print( + "Information: JSON session was created with MoltenProt v. {} on {}".format( + input_dict["version"], + input_dict["timestamp"], + ), + ) + else: + print( + "Warning: JSON session contains no version and timestamp, is it an old one?", + ) + # create an empty instance and update its dict + output = MoltenProtFitMultiple() + output.__dict__.update(input_dict["MoltenProtFitMultiple"]) + + # set layouts in all datasets + output.UpdateLayout() + return output + elif input_dict.get("DataFrame"): + return pd.read_json(input_dict["DataFrame"], precise_float=True, orient="split") + return input_dict + + +### Classes + + +class MoltenProtFit: + """Stores and analyses a single dataset. + + Attributes: + ---------- + defaults : dict + dictionary holding the default values and and their description + hm_dic : dict + dictionary for important heatmap parameters + plate - the workhorse variable, contains the values currently being procesed + plate_raw - initially imported data, but without invalid wells or the ones excluded by user + plate_results - fit values and sort score for individual wells + plate_derivative - the derivative curve for raw (?) data + plate_fit - the curves computed based on the fit parameters + resultfolder - a subfolder created from the file name to write the output (assigned by MoltenProtMain.py) + dT - step of temperature + xlim - the range for x-axis on RFU(T) plots + filename - the name of the input file + Lists holding removed wells: + bad_fit - ID of wells that could not be fit + blanks - blank wells (subtracted prior to analysis) -> now part of SetAnalysisOptions + exclude - supplied by user through --exclude option -> now part of SetAnalysisOptions + bad_Tm - the melting temperature is out of range of T + readout_type - the signal in the assay + + Notes: + ----- + This is an internal class, do not create it manually. + All official messages must be sent through print_message() method (enforces selecting message type) + Output information style: + "Fatal: " - error that causes the program to stop + "Warning: " - something may be messed up, but the program can proceed + "Information: " any other message, that has nothing to do with the program flow + """ + + # dictionary for important heatmap parameters: + # > which values are good high or low + # > what is the title for plot + # > what are the tick labels + # TODO for Tm, dHm and the like, substitute text info to the min and max numbers + # TODO use this as is a general hm_dic for all types of analysis + hm_dic = { + "S": { + "lowerbetter": True, + "title": "Std. Error of Estimate Heatmap", + "tick_labels": ["Bad fit", "Reference", "Good fit"], + }, + "dHm_fit": { + "lowerbetter": False, + "title": "Unfolding Enthalpy Heatmap", + "tick_labels": ["Low dHm", "Reference", "High dHm"], + }, + "Tm_fit": { + "lowerbetter": False, + "title": "Melting Temperature Heatmap", + "tick_labels": ["Low Tm", "Reference", "High Tm"], + }, + "d_fit": { + "lowerbetter": True, + "title": "Slope Heatmap", + "tick_labels": ["Flat", "Reference", "Steep"], + }, + "Tm_init": { + "lowerbetter": False, + "title": "Melting Temperature Heatmap", + "tick_labels": ["Low Tm", "Reference", "High Tm"], + }, + "T_onset": { + "lowerbetter": False, + "title": "Onset Temperature Heatmap", + "tick_labels": ["Low T_ons", "Reference", "High T_ons"], + }, + "a_fit": { + "lowerbetter": True, + "title": "Slope Heatmap", + "tick_labels": ["Flat", "Reference", "Steep"], + }, + "Tagg_fit": { + "lowerbetter": False, + "title": "Aggregation Temperature Heatmap", + "tick_labels": ["Low Tagg", "Reference", "High Tagg"], + }, + "Tagg_init": { + "lowerbetter": False, + "title": "Aggregation Temperature Heatmap", + "tick_labels": ["Low Tagg", "Reference", "High Tagg"], + }, + "dG_std": { + "lowerbetter": False, + "title": "Standard Gibbs Free Energy of Unfolding", + "tick_labels": ["na", "na", "na"], + }, + "BS_factor": { + "lowerbetter": False, + "title": "Dimesionless Signal Window", + "tick_labels": ["Narrow", "Reference", "Wide"], + }, + } + + def __init__( + self, + filename, + scan_rate=None, + denaturant=defaults["denaturant"], + sep=defaults["sep"], + dec=defaults["dec"], + debug=defaults["debug"], + input_type="csv", + parent_filename="", + readout_type="Signal", + ) -> None: + """Parameters + ---------- + filename + for csv files the file to parse, when MPF is made by MPFM this will be substituted to the filename used to create MPFM + scan_rate + scan rate in degrees per min (required for kinetic models) + denaturant + temperature (C or K) or chemical (under construction) + sep,dec + csv import parameters + debug + print additional info + input_type + defines where the data is coming from: + > csv - default value, corresponds to the standard csv file + > from_xlsx - means that the instance of MoltenProtFit is being created by MoltenProtFitMultiple, input will be a ready-to-use DataFrame + parent_filename + the filename of the original xlsx file + readout_type + the name of the readout (for plot Y-axis) + + Notes: + ----- + """ + # set debugging mode: if the __version__ is "git", then this is a development file + # and debugging is done by default. In all other cases, it is False + # TODO add debug to CLI interface + if __version__[:3] == "git": + self.debug = True + else: + self.debug = debug + + if input_type == "from_dict": + # this would return an empty object + # all manipulations are done by deserialize function + pass + else: + # NOTE the logic of overwriting etc is handled by CLI/GUI code + self.resultfolder = None + # attribute to hold diagnostic messages. + self.protocolString = "" + self.scan_rate = scan_rate + self.fixed_params = None + # load the dataset into the plate variable + if input_type == "csv": + try: + self.plate = pd.read_csv( + filename, + sep=sep, + decimal=dec, + index_col="Temperature", + encoding="utf-8", + ) + except ValueError: + self.print_message("Input *.csv file is invalid!", "f") + print( + "Please check if column called 'Temperature' exists and separators are specified correctly.", + ) + # the filename is needed for report title + self.filename = filename + elif input_type == "from_xlsx": + self.plate = filename + # BUG this is only needed for report generation, a dummy value + # Ideally, it should be the name of parent xlsx file + self.filename = parent_filename + + # If requested, convert temperature of input files to Kelvins + if denaturant == "K": + # input temperature is already in internal units + self.denaturant = "K" + elif denaturant == "C": + # convert Celsius to Kelvins + self.plate.index = self.plate.index + 273.15 + self.denaturant = "K" + elif denaturant == "F": + msg = "NO Fahrenheit, please!" + raise ValueError(msg) + else: + # if temperature was not recognized, set denaturant to chemical type + # NOTE currently not implemented in MoltenProtFit + print(f"Assuming that {denaturant} is a chemical denaturant") + self.denaturant = denaturant + msg = "Chemical denaturation not implemented yet" + raise NotImplementedError(msg) + + # save a copy of raw data + self.plate_raw = self.plate.copy() + + # compute the (average) step of Temperature (required for conversion of temperature ranges to sets of datapoints) + self.dT = (max(self.plate.index) - min(self.plate.index)) / ( + len(self.plate.index) - 1 + ) + + # for plots only: compute xlim from Temperature range + self.xlim = [min(self.plate.index) - 5, max(self.plate.index) + 5] + + # currently only for plots: give a proper label to Y-axis + self.readout_type = readout_type + + self.bad_fit = [] # type: ignore[var-annotated] + self.bad_Tm = [] # type: ignore[var-annotated] + + def __getstate__(self): + """A special method to enable pickling of class methods (for parallel exectution).""" + output = self.__dict__ + output["plotfig"] = self.plotfig + return output + + def converter96(self, use_column, reference=None): + """Reads self.plate_results with well ID in the index (A1, A2 etc) and some values in columns (Tm_fit, etc) and returns a DataFrame emulating a 96-well plate where each well has a normalized respective column value. + + Parameters + ---------- + use_column + which column to use in self.plate_results + reference + well to use as reference + + Notes: + ----- + Normalization is done based on lowerbetter information from self.hm_dic: + 1 - means the highest possible value and the best value as well + 0 - means the worst and the lowest possible value + + If a reference is supplied, it is subtracted from the data + Reference code _was not maintained_ for a while and is probably faulty! + """ + # create a DataFrame emulating a 96-well plate + output = pd.DataFrame( + index=["A", "B", "C", "D", "E", "F", "G", "H"], + columns=list(range(1, 13)), + ) + + # check if the use_column value is a valid one + self.print_message( + "Creating heatmap for column {}".format( + self.plate_results[use_column].name, + ), + "i", + ) + for i in output.index: + for j in output.columns: + a = i + str(j) + if (use_column in self.plate_results.columns) and ( + a in self.plate_results.index + ): + output.loc[[i], [j]] = self.plate_results.loc[a, use_column] + + # if reference value is supplied, subtract it from the values + if reference is not None: + try: + # probe if the well is OK (not NaN), that's easier to do with the input DataFrame + output = output - self.plate_results[use_column][reference] + except KeyError: + self.print_message( + "Supplied reference is invalid, creating a reference-free heatmap", + "w", + ) + + # invert the values if lowerbetter=true + # just a precaution against outdated self.hm_dic + try: + if self.hm_dic[use_column]["lowerbetter"]: + output = output - max(output.max()) + output = output * -1 + except KeyError: + self.print_message( + "{} was not found in hm_dic, the colors may be wrong!".format( + use_column, + ), + "w", + ) + + # bring everything to range 0-1 + # normalize by max value (this way we make sure that Nans changed to 1000 are out of range) + # NOTE in some rare cases, when there is only one sample left and it has value 0 in plate96 + min_val = output.min().min() + max_val = output.max().max() + if min_val == max_val: + self.print_message( + "Only one sample left after pre-processing and fitting!", + "w", + ) + output = output * 0 + else: + output = (output - min_val) / (max_val - min_val) + # make all Nan's equal to 1000 + output.fillna(1000, inplace=True) + return output + + def heatmap( + self, + output_path, + plate96, + use_column, + heatmap_cmap=defaults["heatmap_cmap"], + lowerbetter=False, + title="Result Heatmap", + tick_labels=["Unstable", "Reference", "Stable"], + pdf_report=False, + save=True, + ): + """Create a heatmap. + + Parameters + ---------- + output_path + where to save the image + plate96 + dataframe created with method converter96 + use_column + which curve parameter to use for heatmap + heatmap_cmap + matplotlib colormap for heatmap + lowerbetter + indicated if lower values of a parameters correspond to higher stability (e.g. S) + title + the title of the heatmap + tick_labels + how to label the colorbar + pdf_report + if True, will only return a figure object (and size will be adjusted to meet A4) + save + if True, save image to disk + """ + # tweak colormap to have outside values colored as gray + cmap = plt.get_cmap(heatmap_cmap).copy() + cmap.set_over("0.5") + + # check if there are negative values + # (plate96_sort<0).any() returns a Series with True/False indicating presence of negative values in respective columns + # we then use any() for this series to get the True statement. Very weird way to do it indeed. + if ((plate96 < 0).any()).any(): + # negative values are present, so we're dealing with ref-based data set + + # now check if we are dealing with higherbetter or lowerbetter values + if lowerbetter: + # we have to invert sign so that all values that are less than the reference are positive + plate96 = plate96 * -1 + + # Normalize positive values by the highest value and negative values by the lowest + # this is not fully correct because the slope of color(value) dependence line is different for values + # below zero and above zero. Other ways to do it are too hacky anyway + plate96[plate96 > 0] = plate96[plate96 > 0] / max(plate96.max()) + plate96[plate96 < 0] = plate96[plate96 < 0] / abs(min(plate96.min())) + vmin = -1 + tick_values = [-1, 0, 1] + else: + vmin = min(plate96.min()) + tick_values = [vmin, 1] + # this format means how many decimal digits is allowed + # set tick labels from use_column + col_min = f"{self.plate_results[use_column].min():.3f}" + col_max = f"{self.plate_results[use_column].max():.3f}" + tick_labels = [col_max, col_min] if lowerbetter else [col_min, col_max] + + # Making not available values gray (e.g. bad fit or blanks) + # either convert pd to numpy array and make a mask, or + # create figure canvas + # A4 is 8.3 x 11.7 inches, for report the whole page is needed + # for other outputs we need half the hight (i.e. A5 in landscape orientation) + if pdf_report: + fig, ax = plt.subplots(3, 1, figsize=(8.3, 11.7)) + cbar_shrink = 0.3 + cbar_orient = "horizontal" + heatmap_axis = ax[0] + axis_aspect = ["auto"] + else: + fig = plt.figure(figsize=(8.3, 11.7 / 2), tight_layout=True) + cbar_shrink = 8 / 12 + cbar_orient = "vertical" + heatmap_axis = fig.gca() + axis_aspect = ["equal", "box"] + fig.suptitle( + title, + fontweight="bold", + ) + # create the heatmap + # NOTE in some rare cases when the heatmap consists of a single sample matplotlib will raise a warning: + """ + RuntimeWarning: invalid value encountered in less_equal + b = b[(b <= intv[1] + eps) & (b >= intv[0] - eps)] + RuntimeWarning: invalid value encountered in greater_equal + b = b[(b <= intv[1] + eps) & (b >= intv[0] - eps)] + """ + c = heatmap_axis.pcolor(plate96, edgecolors="k", cmap=cmap, vmin=vmin, vmax=1) + + # cycle through all wells and write there the ID + for i in plate96.index: + for j in plate96.columns: + x = plate96.columns.get_loc(j) + y = plate96.index.get_loc(i) + heatmap_axis.text( + x + 0.5, + y + 0.5, + i + str(j), + horizontalalignment="center", + verticalalignment="center", + ) + + # y axis has to be inverted so that well A1 is in top left corner + heatmap_axis.invert_yaxis() + # addtional hacks: enforce square size of wells, hide axes and ticks + heatmap_axis.set_aspect(*axis_aspect) + heatmap_axis.axis("off") + # create a colorbar with text labels + cbar = fig.colorbar( + c, + ax=heatmap_axis, + ticks=tick_values, + shrink=cbar_shrink, + orientation=cbar_orient, + ) + # set colorbar ticks depending on the requested orientation + if cbar_orient == "horizontal": + cbar.ax.set_xticklabels(tick_labels) + elif cbar_orient == "vertical": + cbar.ax.set_yticklabels(tick_labels) + # label is not needed, because it will be in the figure title + + if pdf_report: + # return the figure object for subsequent manipulations + return (fig, ax) + + if save: + plt.savefig( + os.path.join(output_path, "heatmap_" + str(use_column) + ".png"), + dpi=(200), + tight_layout=True, + ) + # clean up after plotting so that no parameters are carried over to genpics + plt.close("all") + return None + + def print_message(self, text, message_type): + """Prints messages and saves them to protocolString. + + Parameters + ---------- + text + message text + message_type : i/w/f + type of message (Information, Warning, Fatal) + """ + # line No and file name are only printed in debug mode and only for Fatals and Warnings + if self.debug and (message_type != "i"): + cf = currentframe() + cfFilename = getframeinfo(cf).filename + print(f"Line {cf.f_back.f_lineno}, in file {cfFilename}") + if message_type == "f": + print("Fatal: " + text + f" ({self.readout_type})") + sys.exit(1) + elif message_type == "w": + msg = "Warning: " + text + f" ({self.readout_type})" + print(msg) + msg = msg + "\n" + self.protocolString = self.protocolString + msg + elif message_type == "i": + msg = "Information: " + text + f" ({self.readout_type})" + print(msg) + msg = msg + "\n" + self.protocolString = self.protocolString + msg + else: + raise ValueError(f"Unknown message type '{message_type}'") + + def _calculate_raw_corr(self): + """Compute baseline-corrected raw data; requires presence of kN, bN, kU, bU in plate_results. + + Notes: + ----- + calculate fraction unfolded (funf) + How to derive the formula: + K_eq(T) = (Fn(T) - F(T)) / (F(T) - Fu(T)) + K_eq() = funf/fnat = funf / (1-funf) + funf = (Fn(T) - F(T)) / (Fn(T) - Fu(T)) + where T is temperature, F(T) assay signal, Fu and Fn - baseline signal + funf - fraction unfolded; fnat - fraction native + whichever formula is used for calculation, the result is the same + + TODO: the same calculation using plate_fit can be done to yield "fit" variant of funf + """ + # initiate an empty DataFrame + self.plate_raw_corr = pd.DataFrame(index=self.plate.index) + + # a helper function for pandas.apply() + def calculate_raw_corr_series(input_series): + # NOTE the badly fitted samples will be present in the column names of self.plate + # but they are not present in the index of self.plate_results, thus we need to check for it + # Obviously the calculation cannot be done for the curves that could not be fit + if input_series.name in self.plate_results.index: + # here we need transposed plate_results to have sample ID's in the columns + output = ( + input_series.index * self.plate_results["kN_fit"][input_series.name] + + self.plate_results["bN_fit"][input_series.name] + - input_series + ) / ( + input_series.index * self.plate_results["kN_fit"][input_series.name] + + self.plate_results["bN_fit"][input_series.name] + - input_series.index + * self.plate_results["kU_fit"][input_series.name] + - self.plate_results["bU_fit"][input_series.name] + ) + # calculation above produces an index object, which we have to convert to a pd.Series + output = pd.Series( + output, + index=self.plate.index, + name=input_series.name, + ) + self.plate_raw_corr = pd.concat( + [self.plate_raw_corr, pd.Series(output)], + axis=1, + ) + + self.plate.apply(calculate_raw_corr_series) + + def _estimate_baseline(self, input_series, fit_length, estimate_Tm=False): + """Estimates pre- and post-transition baselines for a series + function for a single Series (index is Temperature, name is sample ID, values are RFU's). + + Parameters + ---------- + input_series + pd.Series with Temperature in index, name is sample ID, values are the signal + fit_length + number of degrees to be used from the start or end of data for fitting + estimate_Tm + additionally estimate melting temperature Tm with a heuristic + """ + # convert fit length in temperature degrees to datapoint number (using self.dT) + fit_datapoints = int(fit_length / self.dT) + # NOTE do not use Nan's to prevent issues during fitting + pre_fit, pre_covm = np.polyfit( + input_series.dropna().iloc[:fit_datapoints].index, + input_series.dropna().iloc[:fit_datapoints], + 1, + cov=True, + ) + post_fit, post_covm = np.polyfit( + input_series.dropna().iloc[-fit_datapoints:].index, + input_series.dropna().iloc[-fit_datapoints:], + 1, + cov=True, + ) + self.plate_results.loc[["kN_init"], [input_series.name]] = pre_fit[0] + self.plate_results.loc[["bN_init"], [input_series.name]] = pre_fit[1] + self.plate_results.loc[["kU_init"], [input_series.name]] = post_fit[0] + self.plate_results.loc[["bU_init"], [input_series.name]] = post_fit[1] + pre_stdev = np.sqrt(np.diagonal(pre_covm)) + post_stdev = np.sqrt(np.diagonal(pre_covm)) + # estimate_Tm stdev of each parameter and write it to plate_results_stdev (used to set bounds) + self.plate_results_stdev.loc[["kN_init"], [input_series.name]] = pre_stdev[0] + self.plate_results_stdev.loc[["bN_init"], [input_series.name]] = pre_stdev[1] + self.plate_results_stdev.loc[["kU_init"], [input_series.name]] = post_stdev[0] + self.plate_results_stdev.loc[["bU_init"], [input_series.name]] = post_stdev[1] + + if estimate_Tm: + # now the Tm part - find the maximum of smoothened derivative for S-shaped curves (low-to-high) + # or the minimum for Z-shaped ones + # intersection of the difference line with zero -> intersection of baselines + dintersect = -(post_fit[1] - pre_fit[1]) / (post_fit[0] - pre_fit[0]) + # min/max/middle temperature range + tmin = min(input_series.index) + tmax = max(input_series.index) + tmid = (tmin + tmax) / 2 + # value of baseline difference at tmid + b_diff = (post_fit[0] - pre_fit[0]) * tmid + (post_fit[1] - pre_fit[1]) + + if (dintersect > tmin + fit_length) and (dintersect < tmax - fit_length): + # NOTE rule out intersecting baselines - in this case post-baseline is not always + # above or below the pre-baseline + self.plate_results.loc["Tm_init", input_series.name] = tmid + else: + if b_diff > 0: + # low-to-high curve - use max of the deriv as Tm_init + self.plate_results.loc[ + "Tm_init", + input_series.name, + ] = self.plate_derivative[input_series.name].idxmax() + elif b_diff < 0: + # high-to-low curve - use min of the deriv as Tm_init + self.plate_results.loc[ + "Tm_init", + input_series.name, + ] = self.plate_derivative[input_series.name].idxmin() + else: + # rare case - curves are identical raise a warning and use mid-range + self.print_message( + "Baselines are identical in sample {}".format( + input_series.name, + ), + "w", + ) + self.print_message( + "Using the middle of temperature range as Tm_init", + "i", + ) + self.plate_results.loc["Tm_init", input_series.name] = tmid + + def _calc_Tons(self, Tm_col, dHm_col, onset_threshold): + """Computes onset temperature Tons based on supplied column names with dHm and Tm + and adds the value to plate_results. + + Parameters + ---------- + Tm_col + column with Tm values (Tm_fit, Tm1_fit, etc) + dHm_col + column with slope values + onset_threshold + fraction unfolded that corresponds to onset of unfolded (e.g. 0.01 - 1% must be unfolded) + """ + self.plate_results["T_onset"] = 1 / ( + 1 / self.plate_results[Tm_col] + - R + / self.plate_results[dHm_col] + * np.log(onset_threshold / (1 - onset_threshold)) + ) + + # also calculate stdev for T_onset using error propagation from Tm_fit and dHm_fit + self.plate_results_stdev["T_onset"] = ( + np.sqrt( + (self.plate_results_stdev[Tm_col] / self.plate_results[Tm_col]) ** 2 + + (self.plate_results_stdev[dHm_col] / self.plate_results[dHm_col]) ** 2 + + ( + np.sqrt( + self.plate_results_stdev[dHm_col] ** 2 + + ( + R + * np.log(onset_threshold / (1 - onset_threshold)) + * self.plate_results_stdev[Tm_col] + ) + ** 2, + ) + / ( + self.plate_results[Tm_col] + - R + * np.log(onset_threshold / (1 - onset_threshold)) + * self.plate_results[Tm_col] + ) + ) + ** 2, + ) + * self.plate_results.T_onset + ) + + def plotfig( + self, + output_path, + wellID, + datatype="overview", + save=True, + show=False, + data_ax=None, + vline_legend=False, + ): + """Plot the curves from individual wells. + Creates two subplots - top the fit + data, lower - derivative. + + Parameters + ---------- + output_path + where to write the file (can be also a dummy value) + wellID + from which well to plot + datatype + what is being plotted: + > overview - plot experimental data , fit data and some fit parameters + > very_raw - a workaround parameter plotting from plate attribute; made for the GUI when the data is loaded, but not processed yet + > raw - plot unprocessed data + > normalized - data after preprocessing + > derivative - first derivative + > fitted - based on the equation + save + actually save the file + show + show image instead + data_ax + instead of creating new ones, plot in these axes (for PDF reports: disables derivative plot and legend plot) + vline_legend + if True then all vlines will be added to the legend (looks bad when individual images are saved) + """ + if data_ax is None: + # create the figure object + fig = plt.figure(1, figsize=(8, 7)) + # create a specification for the relative plot sizes + gs = gridspec.GridSpec(3, 1, height_ratios=[4, 2, 0.05], figure=fig) + # get objects of individual subplots + data_ax = fig.add_subplot(gs[0]) # experimental data, fit, etc + deriv_ax = fig.add_subplot(gs[1]) # the derivative + else: + data_ax = data_ax + deriv_ax = None + + # NOTE currently all internal manipulations are done in K + # and conversion back to original scale is not done + if self.denaturant == "K" or self.denaturant == "C": + degree_sign = "K" + else: + # NOTE this is a placeholder for chemical denaturant scale + pass + + # a carry-over from the cycle + i = wellID + + if datatype == "overview": + # plot the fit + # plot the experimental data + # TODO use markevery=n to plot every n-th datapoint + # format used to be kx, however, for bigger datasets this doesn't look good + data_ax.plot( + self.plate[i].index.values, + self.plate[i], + "k.", + mew=1, + label="Experiment", + ) # , markevery=40) + + data_ax.plot( + self.plate[i].index.values, + self.plate_fit[i], + label="Fit", + ) + + # a label for Y-axis + data_ax.set_ylabel(self.readout_type) + + # calculate the offsets for the plot based on the overall length + max_val = max(self.plate[i].dropna()) + min_val = min(self.plate[i].dropna()) + y_range = max_val - min_val + data_ax.set_ylim([min_val - 0.1 * y_range, max_val + 0.1 * y_range]) + + # force specific range for x-axis + data_ax.set_xlim(self.xlim) + else: + # HACK hide the derivative plot + if deriv_ax is not None: + deriv_ax.set_visible(False) + # set the source dataframe based on the supplied option + if datatype == "very_raw": + sourcedf = self.plate + ylabel = self.readout_type + elif datatype == "raw": + sourcedf = self.plate_raw + ylabel = self.readout_type + elif datatype == "normalized": + # after all processing normalized curves are in the plate variable + sourcedf = self.plate + ylabel = self.readout_type + elif datatype == "derivative": + sourcedf = self.plate_derivative + ylabel = "dRFU/dT" + elif datatype == "fitted": + sourcedf = self.plate_fit + ylabel = "Fitted RFU" + else: + self.print_message("Invalid plotting source requested", "w") + + if datatype == "derivative": + # for derivative plots plot all values + plt.plot(sourcedf[i].index.values, sourcedf[i], label=i) + else: + data_ax.plot( + sourcedf[i].index.values, + sourcedf[i], + "k.", + mew=1, + label="Experiment", + ) + data_ax.set_ylabel(ylabel) + + data_ax.set_xlabel(f"Temperature, {degree_sign}") + data_ax.set_title("Sample " + str(i), fontsize=12, y=1.05) + data_ax.grid(True, which="both") + + # commands specific only to overview mode: + if datatype == "overview": + # plot the determined baselines + # create np.poly1d objects with respective fit parameters + # TODO for line just 2 points are needed... + poly_pre = np.poly1d( + *self.plate_results.loc[[i], ["kN_fit", "bN_fit"]].values, + ) + poly_post = np.poly1d( + *self.plate_results.loc[[i], ["kU_fit", "bU_fit"]].values, + ) + data_ax.plot( + self.plate[i].index.values, + poly_post(self.plate[i].index.values), + label="Post- baseline", + linestyle="--", + ) + data_ax.plot( + self.plate[i].index.values, + poly_pre(self.plate[i].index.values), + label="Pre- baseline", + linestyle="--", + ) + + # visualization of the lines requested (specific to different model types) + # NOTE this would not print the value/stdev on the plot, has to be done separately + for parameter_name in self.plotlines: + if vline_legend: + data_ax.axvline( + self.plate_results[parameter_name][i], + ls="dotted", + c="b", + lw=3, + label=parameter_name, + ) + else: + # NOTE in this case lines are not labeled so that they are not listed in the legend + data_ax.axvline( + self.plate_results[parameter_name][i], + ls="dotted", + c="b", + lw=3, + ) + # add text with the parameter used to generate the line - doesn't look nice in some caes + data_ax.text( + self.plate_results[parameter_name][i], + data_ax.get_ylim()[0] + 0.05 * y_range, + " " + parameter_name, + fontsize=12, + ) + if deriv_ax is not None: + fig.legend(loc="lower center", ncol=4, fontsize=12) + # commands for derivative plot (used only in overview mode) + deriv_ax.plot( + self.plate[i].index.values, + self.plate_derivative[i], + color="k", + ) + deriv_ax.set_xlabel(f"Temperature, {degree_sign}") + deriv_ax.set_ylabel(f"d({self.readout_type})/dT") + + # delete the X-label on the data axes + data_ax.set_xlabel("") + + # xlim for derivative plot and data plot must be the same! + deriv_ax.set_xlim(self.xlim) + deriv_ax.grid(True, which="both") + + if deriv_ax is None: + # if data_ax was provided externally, then showing/saving should not be done + return + if show: + plt.show() + elif save: + plt.savefig(output_path + "/" + str(i) + ".png", dpi=(100)) + plt.close("all") + + ## internal methods for creating HTML report elements + def html_heatmap(self, heatmap_cmap, display): + """Returns a div-block with a heatmap of the sortby parameter (the last column in self.plate_results). + + Parameters + ---------- + heatmap_cmap + matplotlib colormap for heatmap + display + whether the heatmap is shown or not in the final HTML: + > table (standard view) + > none (not visible) + > block (compact view) + + Notes: + ----- + The heatmap that the user sees when opening the HTML will have display=table, all other will start as display=none + """ + # this string template corresponds to a single sample entry (has to be wrapped within rows) + # NOTE when gray cells are clicked a window still pops up but says that there is no such image + # due to limitations of possible attributes within html, the possible layout info is stored in the + # title attribute which also gets displayed as a tooltip (and will be also shown in the bottom of the heatmap) + sample = '
$ID
\n' + sample = Template(sample) + # row template + # TODO add extra spaces for readability of output HTML file + row = '
\n$SAMPLES
' + row = Template(row) + + # extract the heatmap from matplotlib + cmap = plt.get_cmap(heatmap_cmap) + + # by convention the model-specific final sorting parameter is stored in the last column + # TODO add support creating HTML heatmaps for an arbitrary column + colors = self.plate_results.iloc[:, -1] + colors = normalize(colors) + + # convert numbers to HEX colors + colors = colors.apply(lambda x: rgb2hex(cmap(x))) + + # outer cycle creates rows, inner cycle creates lines for 1-12 + row_output = "" + for i in ["A", "B", "C", "D", "E", "F", "G", "H"]: + line = "" + for j in range(1, 13): + # if a respective color exists then colorise the
+ # if not, then set it to "lightgray" + sample_id = str(i) + str(j) + # NOTE modify this line to add additional information (e.g. Tm) to the text under heatmap + CONDITION = self.layout["Condition"][sample_id] + + if sample_id in colors.index: + line = line + sample.substitute( + ID=sample_id, + COLOR=colors[sample_id], + CONDITION=CONDITION, + ) + else: + # lightgray blends with mid-range of coolwarm, so change to gray + line = line + sample.substitute( + ID=sample_id, + COLOR="gray", + CONDITION=CONDITION, + ) + row_output = row_output + row.substitute(ROWNAME="Row_" + i, SAMPLES=line) + + # once the heatmap itself is created, it is wrapped around in the additional table + # that would control if the hm is shown, define the title, etc + output_template = '
\n
\n

$TITLE_TEXT

\n Click on the wells to open plots in a separate window \n
\n $HEATMAP\n
' + output_template = Template(output_template) + title_text = "{}: heatmap of {} (model {})".format( + self.readout_type, + self.plate_results.iloc[:, -1].name, + self.model, + ) + return output_template.substitute( + DISPLAY=display, + IDENTIFIER=self.readout_type, + TITLE_TEXT=title_text, + HEATMAP=row_output, + ) + + def html_button(self): + """Returns an HTML button string with the readout name on top.""" + button_template = '\n' + button_template = Template(button_template) + return button_template.substitute(IDENTIFIER=self.readout_type) + + ## Methods for GUI communication + def printAnalysisSettings(self): + """Prints current analysis settings.""" + for setting, def_value in analysis_defaults.items(): + if setting[-2:] == "_h": + # filter out the help message entries + pass + else: + print( + "{} = {} (default: {})".format( + setting, + getattr(self, setting), + def_value, + ), + ) + print("\n") + + def analysisHasBeenDone(self): + """Check if analysis compleded and self.plate_results is created.""" + return "plate_results" in self.__dict__ + + def testWellID(self, wellID, ignore_results=False): + """Check if a well exists in self.plate_results (return True), otherwise return False. + + Parameters + ---------- + wellID + sample ID to check + ignore_results: bool + if True, will look for the ID's in plate_raw, even if self.plate_results exists + """ + # check if analysis was done, and depending on that choose the index to check + if "plate_results" in self.__dict__: + index_tocheck = self.plate_results.index + else: + index_tocheck = self.plate_raw.columns + + if ignore_results: + index_tocheck = self.plate_raw.columns + + return wellID in index_tocheck + + def getResultsColumns(self): + """Returns a tuple of columns from self.plate_results that are relevant for GUI.""" + if "plate_results" in self.__dict__: + output = [self.plate_results.iloc[:, -1].name, *self.plotlines] + + # BS-factor is more useful than S, but not always available + if "BS_factor" in self.plate_results.columns: + output = [*output, "BS_factor"] + else: + output = [*output, "S"] + return output + else: + self.print_message("No plate_results attribute found", "w") + self.print_message("Please perform analysis first", "i") + return None + + ## Big methods for data input, processing and output + def SetAnalysisOptions( + self, + model=analysis_defaults["model"], + baseline_fit=analysis_defaults["baseline_fit"], + baseline_bounds=analysis_defaults["baseline_bounds"], + dCp=analysis_defaults["dCp"], + onset_threshold=analysis_defaults["onset_threshold"], + savgol=analysis_defaults["savgol"], + # these are pre-processing options (defaults stored in a different dict) + blanks=prep_defaults["blanks"], # TODO set in layout instead? + exclude=prep_defaults["exclude"], # TODO set in layout instead? + invert=prep_defaults["invert"], + mfilt=prep_defaults["mfilt"], + shrink=prep_defaults["shrink"], + trim_max=prep_defaults["trim_max"], + trim_min=prep_defaults["trim_min"], + # TODO layouts must be handled somewhere else... + layout=None, + layout_input_type="csv", + ): + """Sets in the MoltenProt instance all analysis-related parameters that will then be used by methods PrepareData() and ProcessData(). For parameter description see analysis_defaults and prep_defaults dicts. + + References: + ---------- + https://homepages.inf.ed.ac.uk/rbf/HIPR2/median.htm + the median filter: how it works and why it may be better than the mean + """ + self.model = model + self.baseline_fit = baseline_fit + self.baseline_bounds = baseline_bounds + # current value for onset_threshold (used in santoro1988(d) to obtain values of T_onset) + self.onset_threshold = 0.01 + # the value for savgol window to compute the derivative + self.savgol = savgol + + self.blanks = blanks + self.exclude = exclude + self.invert = invert + # to avoid confustion, medfilt is the imported method, mfilt is the respective analysis flag + self.mfilt = mfilt + self.shrink = shrink + # NOTE changes made by trim_min/trim_max are saved directly to self.plate + self.trim_min = trim_min + self.trim_max = trim_max + + # NOTE to prevent carry-over from previous run (e.g. in JSON) reset the bad fit list + self.bad_fit = [] + + # TESTING setting layout and dCp moved to a separate method + self.SetLayout(layout=layout, layout_input_type=layout_input_type, dCp=dCp) + + def SetLayout( + self, + layout=None, + layout_input_type="csv", + dCp=analysis_defaults["dCp"], + ): + """Sets layout and dCp.""" + if layout is not None: + if layout_input_type == "csv": + try: + # the format for layout is more strict: it is always a csv with commas as separators + # and index column called ID; more restrictions will follow + # TODO add check for the size of layout DataFrame - should be always 96 + # NOTE it may be better to use ";" as csv separator, because it would be easier to write stuff + self.layout = pd.read_csv(layout, index_col="ID", encoding="utf_8") + except: + self.print_message( + "Unsupported layout format! No layout info will be available", + "w", + ) + self.layout = None + elif layout_input_type == "from_xlsx": + self.layout = layout + # Read blanks and skipped samples from the layout information + self.blanks = list( + self.layout[self.layout["Condition"].astype(str) == "Blank"].index, + ) + self.exclude = list( + self.layout[self.layout["Condition"].astype(str) == "Ignore"].index, + ) + else: + # in all other cases set the instance attribute to None + self.layout = None + + # heat capacity change values for the whole plate + # self.dCp can be one of the following: "from_layout" or value specified by the user + # TODO overwrite layout value instead + if dCp >= 0: + # user-set dCp from CLI overrides dCp supplied in the layout + self.print_message(f"dCp for all samples is set to {dCp}", "i") + self.dCp = dCp + elif self.layout is not None: + # if there is a layout, there may or may not be dCp values + self.print_message( + "Using per-sample dCp values as provided in the layout (invalid values will be turned to 0)", + "i", + ) + # ensure that dCp values are numbers, but not something else + # if an invalid value occurs, set it to the default value of 0 + self.layout["dCp"] = pd.to_numeric(self.layout["dCp"], errors="coerce") + self.layout["dCp"].fillna(0, inplace=True) + # also negative values must be turned to 0 + self.layout.loc[layout["dCp"] < 0, ["dCp"]] = 0 + self.dCp = "from_layout" + else: + msg = f"Incorrect value for dCp ({dCp})!" + raise ValueError(msg) + + def SetFixedParameters(self, fixed_params): + """Takes a dataframe with alphanumeric columns (A1-H12) and index being names of + the parameters (e.g. Tf_fit, Ea_fit) and sets it as the attribute self.fixed_params. + + Notes: + ----- + No sanity checks of the input are currently done + """ + self.fixed_params = fixed_params + + def PrepareData(self): + """Prepares input data for processing.""" + # copy raw data to the main plate + # NOTE this is primarily needed for json i/o, to ensure that the analysis runs + # are the same after save/load cycle + # also the dT step size must be reset + self.plate = self.plate_raw.copy() + self.dT = (max(self.plate.index) - min(self.plate.index)) / ( + len(self.plate.index) - 1 + ) + + # Remove all-Nan columns + self.plate = self.plate.dropna(how="all", axis=1) + + # remove the user-specified unneeded wells + if self.exclude: + self.plate = self.plate.drop( + self.plate.columns.intersection(self.exclude), + axis=1, + ) + + # trim data from the beggining or end + if self.trim_min: + self.plate = self.plate[ + self.plate.index >= min(self.plate.index) + self.trim_min + ] + if self.trim_max: + self.plate = self.plate[ + self.plate.index <= max(self.plate.index) - self.trim_max + ] + + # invert the curves + if self.invert: + # reflect the curve relative to the x-axis + self.plate = self.plate * -1 + # normalize + self.plate = self.plate.apply(normalize, from_input=True) + + # if blank wells were specified, average them and subtract from the remaining data + if self.blanks: + self.print_message("Subtracting background...", "i") + try: + bg = self.plate[self.blanks].mean(axis=1) + # remove (drop) buffer-only columns from the dataset + self.plate = self.plate.drop( + self.plate.columns.intersection(self.blanks), + axis=1, + ) + # subtract the background + self.plate = self.plate.sub(bg, axis=0) + except KeyError: + self.print_message("Same well was supplied as Blank and Ignore", "f") + print( + "Please check parameters for --blank --exclude and the layout file", + ) + + # apply median filter + if self.mfilt: + # convert the temperature range to an odd integer window size + self.mfilt = to_odd(self.mfilt, self.dT) + + # check if the window is bigger than the whole dataset + if len(self.plate.index) < self.mfilt: + # NOTE the output may be confusing to the user, because user gives degrees, but gets datapoints + msg = f"Specified medfilt window size ({self.mfilt} datapoints) is bigger than the dataset length ({len(self.plate.index)} datapoints)!" + raise ValueError( + msg, + ) + + self.plate = self.plate.apply(medfilt, kernel_size=self.mfilt) + + # NOTE this must be done after median filtering (spikes are bad for averaging) + if self.shrink is not None: + if self.shrink > self.dT: + # create the range for Temperature binning (will be also used as the index) + bin_range = np.arange( + np.floor(min(self.plate.index)), + np.ceil(max(self.plate.index)) + self.shrink, + self.shrink, + ) + # temporary DataFrame for binning + self.plate_binned = pd.DataFrame( + index=bin_range[:-1], + columns=self.plate.columns, + dtype="float64", + ) + + # average values using the temperature range (plate_binned index, plate_binned index + bin step) + for i in self.plate_binned.index: + self.plate_binned.loc[i, :] = self.plate[ + (self.plate.index > i) & (self.plate.index < i + self.shrink) + ].mean() + + self.plate = self.plate_binned + # remove possible empty rows + self.print_message( + f"Input data binned down to {self.shrink} degree step", + "i", + ) + # required for proper derivative computation + self.dT = self.shrink + else: + msg = f"Requested shrinking step ({self.shrink} degrees) is less than the average temperature step ({self.dT} degrees)" + raise ValueError( + msg, + ) + + # compute the derivative + # NOTE the best way to do it for any type of data - savgol filtering + # which is not present in older scipy versions. In this case use a less correct approach + # NOTE it is not smart to save converted values of savgol to self.savgol, rather use internal window_length var + try: + from scipy.signal import savgol_filter + + # convert window size in temperature units to an odd integer + window_length = to_odd(self.savgol, self.dT) + # check if the window is bigger than the whole dataset + if len(self.plate.index) < window_length: + # NOTE the output may be confusing to the user, because user gives degrees, but gets datapoints + msg = f"Specified savgol window size ({window_length} datapoints) is bigger than the dataset length ({len(self.plate.index)} datapoints)!" + raise ValueError( + msg, + ) + + # NOTE an additional check for savgol requirements (4 is polyorder used by default) + if window_length < 4: + msg = f"Specified savgol window size ({window_length} datapoints) is smaller than the polynomial order of the filter (4); increase savgol window size, or do not shrink the data" + raise ValueError( + msg, + ) + """ + NOTE default mode for savgol is interp, which just uses polynomial fit of the last window to + create the data values beyond the end of the dataset. This may create tails in the derivative + that interfere with peak detection. "nearest" mode just takes the first value and uses it + for the missing parts of the window at data edges; see here for other modes: + https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.signal.savgol_filter.html + """ + self.plate_derivative = self.plate.apply( + savgol_filter, + window_length=window_length, + polyorder=4, + deriv=1, + mode="nearest", + ) + except: + """ + NOTE usually, this exception would happen with old scipy version + However, this also happened in pyinstaller bundle for Windows: + LinAlgError: SVD did not converge in Linear Least Squares + The derivate is not a crucial element (fits usually converge even with a random Tm guess), + so it should be OK to proceed. + """ + self.print_message( + "Cannot compute a smoothened derivative, fit results can be suboptimal", + "w", + ) + self.print_message("Falling back to the difference method", "i") + self.plate_derivative = (self.plate - self.plate.shift(periods=1)) / self.dT + + def ProcessData(self): + """Performs curve fitting and creates results dataframes. + + Notes: + ----- + The names for the parameters and the contents of plate_results variable are different for various methods, so they are set up based on the selected analysis + """ + if self.model == "skip": + self.print_message("Dataset was omitted from analysis", "i") + # check if there are plate_results attributes from previous analysis and delete them + # NOTE there are more leftover attributes which will be saved to JSON, but without plate_results the dataset should not be recognized as a processed one + if self.analysisHasBeenDone(): + del self.plate_results + del self.plate_results_stdev + # return statement allows to silently terminate the ProcessData method + return + + # check if model name is correct + if self.model not in avail_models.keys(): + msg = f"Unknown model '{self.model}'" + raise ValueError(msg) + + model = avail_models[self.model](scan_rate=self.scan_rate) + + self.print_message("Processing data...", "i") + + # Set here scan rate for the model + # None - not specified (e.g. for CSV), doesn't matter for equilibrium and empirical models; can be set in CLI + # XLSX - parsed from Excel and always available + # kinetic models should raise a ValueError + model.scan_rate = self.scan_rate + + # the function to do curve fitting + f = model.fun + # generate parameter names + result_index = [] + for i in model.param_names(): + result_index.append(i + "_init") + for i in model.param_names(): + result_index.append(i + "_fit") + result_index.append("S") + + # select dataframe to be fit (e.g. can be changed to plate_derivative) + df_for_fitting = self.plate + # create dataframe to store results and populate with initial parameter values + # NOTE by default empty dataframes get "object" data type, which can cause problems when using numpy + # if the datatype is specified explicitly, the problem should not exist + self.plate_results = pd.DataFrame( + index=result_index, + columns=self.plate.columns.values, + dtype="float64", + ) + + # create a dataframe to store stdev for fit parameters + # for simplicity the dataframe will also have initial parameters included + # they will be dropped in the end of processing + self.plate_results_stdev = pd.DataFrame( + index=result_index, + columns=self.plate.columns.values, + dtype="float64", + ) + + # an empty p0 variable + p0 = [] + # run a cycle through all columns and calculate fits + self.print_message("Fitting curves...", "i") + + for i in df_for_fitting.columns.values: + # drop Nan values to prevent crashes of fitting + data = df_for_fitting[i].dropna() + T = np.float64(data.index.values) + # guess initial parameters + p0 = model.param_init(data) + self.plate_results[i][0 : len(p0)] = p0 + # get parameter bounds + param_bounds = model.param_bounds(data) + # if the model has these parameters, then run a more precise initial parameter estimation + if {"kN", "bN", "kU", "bU"}.issubset(model.param_names()): + # furthermore, if there is a Tm than run smart Tm pre-estimation + estimate_Tm = "Tm" in model.param_names() + self._estimate_baseline( + data, + fit_length=self.baseline_fit, + estimate_Tm=estimate_Tm, + ) + + # Set bounds for kN, bN, kU, bU to be a multiple of stdev of baseline-prefitting + # NOTE this assumes that the parameters are the first four in the list of bounds/p0 + # originally param_bounds is a tuple, so it has to be converted to a list... + if self.baseline_bounds > 0: + param_bounds = list(param_bounds) + param_bounds[0] = list(param_bounds[0]) + param_bounds[1] = list(param_bounds[1]) + param_bounds[0][:4] = list( + self.plate_results[i][:4] + - self.plate_results_stdev[i][:4] * self.baseline_bounds, + ) + param_bounds[1][:4] = list( + self.plate_results[i][:4] + + self.plate_results_stdev[i][:4] * self.baseline_bounds, + ) + elif self.baseline_bounds < 0: + msg = f"Expected a non-negative int for baseline_bounds, but got {self.baseline_bounds}" + raise ValueError( + msg, + ) + + # if this MoltenProtFit instance has any fixed parameter dataframe, + # they will be supplied to the model + if self.fixed_params is not None: + model.set_fixed(list(self.fixed_params[i])) + try: + # for some reason, pandas Series for initial parameters have to be converted to a list + # the fit gives two arrays: fit parameters (p) and covariance matrix (covm for stdev estimation) + + # depending on scipy version enforce the parameter limits or not + if (LooseVersion(scipy_version) >= LooseVersion("0.17")) and ( + param_bounds is not None + ): + # NOTE adding ftol=0.01 and xtol=0.01 may in some cases speed up the fitting (loosens the convergence criteria) + # default values 1e-8 are a bit too conservative; in preliminary tests the speedup was marginal + p, covm = curve_fit( + f, + T, + data, + list(self.plate_results[i][0 : len(p0)]), + bounds=param_bounds, + ) + else: + p, covm = curve_fit( + f, + T, + data.values, + list(self.plate_results[i][0 : len(p0) + 1]), + ) + self.plate_results[i][len(p0) : (len(p0)) * 2] = p + # this is the official way to compute stdev error (from scipy docs) + self.plate_results_stdev[i][len(p0) : (len(p0)) * 2] = np.sqrt( + np.diagonal(covm), + ) + except RuntimeError: + # these probably correspond to bad fitting + self.print_message( + f"Curve fit for {i} failed, probably invalid transition.", + "w", + ) + # generate a list of bad fits + self.bad_fit.append(i) + except ValueError as e: + self.print_message(e.__str__(), "w") + # catch some problems of fitting with santoro1988d + self.print_message( + f"Curve fit for {i} failed unexpectedly (ValueError)", + "w", + ) + # generate a list of bad fits + self.bad_fit.append(i) + + # supply sqrt(n) for S calculation + # to calculate RMSE we don't care about the amount of parameters, + # however, they must be included for standrard error of estimate + # (more info here: http://people.duke.edu/~rnau/compare.htm) + try: + self.plate_results[i].loc["S"] = np.sqrt(len(T) - len(p0)) + except TypeError: + self.print_message( + f"Curve for sample {i} has just one value!", + "w", + ) + # add to bad samples list + self.bad_fit.append(i) + + # drop the wells that could not be fit + self.plate_results.drop(self.bad_fit, axis=1, inplace=True) + self.plate_results_stdev.drop(self.bad_fit, axis=1, inplace=True) + + # empty DataFrame with temperature index to store computed fitted curves + self.plate_fit = pd.DataFrame(index=self.plate.index) + + def calculate_fit(input_series): + """Helper function to compute fit curves + input DataFrame is self.plate_results. + """ + # use the fit parameters from plate_results and the index of plate_fit to compute fit curves + self.plate_fit = pd.concat( + [ + self.plate_fit, + pd.Series( + f( + self.plate_fit.index, + *list(input_series[len(p0) : (len(p0)) * 2]), + ), + index=self.plate_fit.index, + name=input_series.name, + ), + ], + axis=1, + ) + + # apply the helper function + self.plate_results.apply(calculate_fit) + + # calculate S and put it into plate_results + self.print_message("Estimating S...\n", "i") + + # compute S using df_for_fitting + plate_S = (df_for_fitting - self.plate_fit) ** 2 + plate_S = np.sqrt(plate_S.sum()) + self.plate_results.loc["S", :] = plate_S / self.plate_results.loc["S", :] + + # transpose the DataFrame to have sample wells in rows + self.plate_results = self.plate_results.T + self.plate_results_stdev = self.plate_results_stdev.T + + # compute BS-factor to assess how wide is the signal window relative to the noise + # this requires knowledge of baseline parameters and Tm + if {"kN", "bN", "kU", "bU", "Tm"}.issubset(model.param_names()): + # Use S as an overall proxy for assay noise + self.plate_results["BS_factor"] = 1 - 6 * self.plate_results.S / abs( + self.plate_results.kU_fit * self.plate_results.Tm_fit + + self.plate_results.bU_fit + - ( + self.plate_results.kN_fit * self.plate_results.Tm_fit + + self.plate_results.bN_fit + ), + ) + + # compute baseline-corrected raw data if standard baseline parameters are available + if {"kN", "bN", "kU", "bU"}.issubset(model.param_names()): + self._calculate_raw_corr() + + # add layout information to the results + if self.layout is not None: + self.plate_results = pd.concat( + [self.layout, self.plate_results], + join="inner", + axis=1, + ) + + # remove *_init columns and S from self.plate_results_stdev + self.plate_results_stdev.drop( + labels=result_index[: len(p0)], + axis=1, + inplace=True, + ) + self.plate_results_stdev.drop(labels="S", axis=1, inplace=True) + + # based on the model, calculate additional curve characteristics and set the vertical lines for plots + if model.sortby == "dG_std": + # Calculate T_onset + self._calc_Tons("Tm_fit", "dHm_fit", self.onset_threshold) + # dG and dCp component + self.CalculateThermodynamic() + # list of vertical lines to be plotted + self.plotlines = ["T_onset", "Tm_fit"] + elif model.sortby == "dG_comb_std": + # Calculate T_onset + self._calc_Tons("T1_fit", "dHm1_fit", self.onset_threshold) + # Calculate T2 from dT2_1_fit + # TODO error propagation + self.plate_results["T2_fit"] = ( + self.plate_results["T1_fit"] + self.plate_results["dT2_1_fit"] + ) + # NOTE dCp is hard to determine for the intermediate, so it is completely neglected + # dG is calculated for each reaction (N<->I and I<->U) and then combined following the principle of thermodynamic coupling + self.plate_results["dG_comb_std"] = self.plate_results["dHm1_fit"] * ( + 1 - T_std / self.plate_results["T1_fit"] + ) + self.plate_results["dHm2_fit"] * ( + 1 - T_std / self.plate_results["T2_fit"] + ) + # list of vertical lines to be plotted + self.plotlines = ["T_onset", "T1_fit", "T2_fit"] + elif model.sortby == "T_eucl": + # computes Euclidean distance for Tm/T_onset + # Tm and T_ons are on the same scale and are orthogonal characteristics of the sigmoidal curve + # thus, a sample with the most optimal combination is the one that is most far away from T=0 + self.plate_results["T_eucl"] = np.sqrt( + self.plate_results["Tm_fit"] ** 2 + + self.plate_results["T_onset_fit"] ** 2, + ) + # list of vertical lines to be plotted + self.plotlines = ["T_onset_fit", "Tm_fit"] + elif model.sortby == "T_eucl_comb": + self.plate_results["T_eucl_comb"] = np.sqrt( + self.plate_results["T1_fit"] ** 2 + + self.plate_results["T_onset1_fit"] ** 2, + ) + np.sqrt( + self.plate_results["T2_fit"] ** 2 + + self.plate_results["T_onset2_fit"] ** 2, + ) + self.plotlines = [ + "T_onset1_fit", + "T1_fit", + "T_onset2_fit", + "T2_fit", + ] + elif model.sortby == "pk_ratio_std": + # the kF/kR at std temperature; take as -log10 to have higher values for higher stability + self.plate_results["pk_ratio_std"] = -np.log10( + model.arrhenius( + T_std, + self.plate_results.TfF_fit, + self.plate_results.EaF_fit, + ) + / model.arrhenius( + T_std, + self.plate_results.TfR_fit, + self.plate_results.EaR_fit, + ), + ) + self.plotlines = ["TfF_fit", "TfR_fit"] + elif model.sortby == "pk_std": + # For irreversible reactions calculate the unfolding rate constant at std temperature + # based on Tf and Ea values + # NOTE by convention higher sorting value indicates higher stability, but it is not the case + # for rate constant of reaction N -> U; a common trick in chemistry is to calculate the + # negative log10 + self.plate_results["pk_std"] = -np.log10( + model.arrhenius( + T_std, + self.plate_results.Tf_fit, + self.plate_results.Ea_fit, + ), + ) + self.plotlines = ["Tf_fit"] + else: + self.print_message( + "Model {} contains no measure for final sorting".format( + model.short_name, + ), + "w", + ) + self.plotlines = [] + + # sort_values based on the model's sortby property + if model.sortby is not None: + self.plate_results.sort_values( + by=model.sortby, + inplace=True, + ascending=False, + ) + + # for proper json i/o + self.plate_results.index.name = "ID" + self.plate_results_stdev.index.name = "ID" + + def CalculateThermodynamic(self): + """Calculates some thermodynamic characteristics of data and append them to self.plate_results: + > dG_std - Gibbs free energy of unfolding at standard temperature (298 K), extrapolated using the values of dCp + > dCp - heat capacity change of unfolding (supplied by user either in the layout or in the command line) + > dHm - enthalpy of unfolding at Tm (calculated during fitting). + """ + # create temporary dataframe to store td results + column_names = ["dCp", "dG_std"] + # NOTE by default empty dataframes get "object" data type, which can cause problems when using numpy + # if the datatype is specified explicitly, the problem should not exist + # NOTE since dCp is set externally, it will be just copy-pasted and no stdev can be computed + td_results = pd.DataFrame( + index=self.plate_results.index, + columns=column_names, + dtype="float64", + ) + # create a separate dataframe for storing stdev + td_results_stdev = pd.DataFrame( + index=self.plate_results.index, + columns=[column_names[1]], + dtype="float64", + ) + + if self.dCp == "from_layout": + td_results["dCp"] = self.layout["dCp"] + else: + td_results["dCp"] = self.dCp + + # issue a warning about inaccuracy of extrapolated dG with dCp=0 + if any(td_results["dCp"] == 0): + self.print_message( + "One or more dCp values are set to 0; this results in an overestimate of dG_std", + "w", + ) + + # dG_std is extrapolated to standard temperature using the model described in Becktel and Schellman, 1987 + # Tm is chosen as the reference temperature for equation (4), which also means that dS(Tm) = dH(Tm)/Tm + td_results["dG_std"] = self.plate_results["dHm_fit"] * ( + 1 - T_std / self.plate_results["Tm_fit"] + ) - td_results["dCp"] * ( + self.plate_results["Tm_fit"] + - T_std + + T_std * np.log(T_std / self.plate_results["Tm_fit"]) + ) + td_results_stdev["dG_std"] = np.sqrt( + self.plate_results_stdev["dHm_fit"] ** 2 + + (self.plate_results["dHm_fit"] * T_std / self.plate_results["Tm_fit"]) + ** 2 + * ( + (self.plate_results_stdev["dHm_fit"] / self.plate_results["dHm_fit"]) + ** 2 + + (self.plate_results_stdev["Tm_fit"] / self.plate_results["Tm_fit"]) + ** 2 + ) + + (td_results["dCp"] * T_std) ** 2 + * ( + (self.plate_results_stdev["Tm_fit"] / T_std) ** 2 + + (self.plate_results_stdev["Tm_fit"] / self.plate_results["Tm_fit"]) + ** 2 + ), + ) + + # calculate contribution of dCp to the real dG + td_results["dCp_component"] = ( + T_std + - self.plate_results["Tm_fit"] + - T_std * np.log(T_std / self.plate_results["Tm_fit"]) + ) + td_results_stdev["dCp_component"] = np.sqrt( + (self.plate_results_stdev["Tm_fit"]) ** 2 + + ( + T_std + * self.plate_results_stdev["Tm_fit"] + / self.plate_results["Tm_fit"] + ) + ** 2, + ) + + # append td_results to self.plate_results and self.plate_results_stdev + self.plate_results = pd.concat( + [self.plate_results, td_results.loc[:, ["dCp_component", "dG_std"]]], + axis=1, + sort=True, + ) + + self.plate_results_stdev = pd.concat( + [self.plate_results_stdev, td_results_stdev], + axis=1, + sort=True, + ) + + def CombineResults(self, tm_stdev_filt, bs_filt, merge_dup, tm_key): + """A helper method to filter results and optionally average duplicates. + + Parameters + ---------- + tm_stdev_filt + samples with stdev for Tm above this value will be discarded + tm_key + the name for column with Tm + bs_filt + samples with BS-factor above this value will be kept + merge_dup + whether to merge duplicates (based on annotations in the layout) + + Returns: + ------- + results - joined plate_results/stdev DataFrame with optional filtering and duplicate averaging + + Notes: + ----- + * not all plate_results can contain BS-factor, so this part of filtering will be skipped + """ + # NOTE to prevent irreversible changes to plate_results* attributes, force a copy + results = self.plate_results.copy() + + # we may have 2 types of stdev, one from fitting and another one from duplicate averaging + stdev_fit = self.plate_results_stdev.copy() + + # add suffix _stdev to prevent duplicate column names in joined df's + stdev_fit.columns = stdev_fit.columns + "_stdev" + + # NOTE previously, merge_dup was run _before_ stdev filtering, but this messes up + # subsequent description of the procedure. Also, this operation joins 2 processes + # (removal of bad fits and removal of invalid dups) in one, which is bad + # a better way: first filter to remove crappy fits + # then merge dups and use stdev that comes from them. + # This approach also obsoletes the methods for stdev "joining" + + # use stdev_fit dataframe to get ID's of samples that have improper values + if tm_stdev_filt > 0: + drop_tm_stdev = stdev_fit[tm_key + "_fit_stdev"][ + stdev_fit[tm_key + "_fit_stdev"] > tm_stdev_filt + ].index + results = results.drop(drop_tm_stdev, axis=0) + if bs_filt > 0: + try: + results = results[results["BS_factor"] >= bs_filt] + except KeyError: + self.print_message( + "Column 'BS_factor' not found in the results DataFrame, cannot filter", + "w", + ) + + if merge_dup: + # TODO add an option for discarding samples that do not have a duplicate + # averaging capillary numbers doesn't make sense, convert them to strings + if "Capillary" in results.columns: + results.Capillary = results.Capillary.apply(str) + + # processing steps: remove duplicates with different aggregation functions: + # - text data (ID, Capillary) - create a comma-separated string of values + # - result data - compute mean, stdev and n (amount of duplicated values) + + # prepare DataFrames: + # split result data to text and numeric + text_data = results.select_dtypes(include=["object"]) + numeric_data = results.select_dtypes(include=["float64", "int64"]) + + # for text data convert ID index to a new column (a bit hacky) + text_data.reset_index(inplace=True) + text_data.set_index("ID", inplace=True, drop=False) + + # since the column for de-duplication is text, it has to be re-created for numeric_data and stdev_data + numeric_data = pd.concat([numeric_data, text_data["Condition"]], axis=1) + + # perform deduplication (grouping) + + # for text data we declare a lambda function + text_data = text_data.groupby("Condition").agg(lambda x: ",".join(list(x))) + + # results data - count of each sample + numeric_data_count = numeric_data.groupby("Condition").agg("count") + # now also stdev (add a proper suffix to index) + numeric_data_stdev = numeric_data.groupby("Condition").agg(np.std) + numeric_data_stdev.columns = numeric_data_stdev.columns + "_stdev" + # and finally average + numeric_data = numeric_data.groupby("Condition").agg(np.mean) + + # add n information to the text_data + # BUG if after filtering the dataset is empty, this operation produces an error + # this is caught with try/except + try: + text_data["n"] = numeric_data_count.iloc[:, [0]] + except ValueError as e: + self.print_message(e.__str__(), "w") + self.print_message( + "No data left after filtering in CombineResults()", + "w", + ) + + # update the results dataframe + results = pd.concat([text_data, numeric_data, numeric_data_stdev], axis=1) + else: + results = pd.concat([results, stdev_fit], axis=1) + return results + + def RenameResults(self, datatype="sct"): + """#HACK rename Tm to Tagg in for scattering data + # also chemical denaturation renaming can be done here. + """ + if datatype == "sct": + rename_dict = {"Tm_init": "Tagg_init", "Tm_fit": "Tagg_fit"} + elif datatype == "chem": + rename_dict = {"Tm_init": "dGH2O_init", "Tm_fit": "dGH2O_fit"} + msg = "Chemical denaturation renaming is not there yet" + raise NotImplementedError(msg) + + self.plate_results.rename(columns=rename_dict, inplace=True) + self.plate_results_stdev.rename(columns=rename_dict, inplace=True) + + @staticmethod + def _trim_string(string, length=30, symmetric=True): + """Helper method to trim a string to a specific length by removing the middle part. + + Arguments: + --------- + string + string to be trimmed + length + length of the data to keep + symmetric + if true, also show the end of the string + """ + # ensure that input is a str + string = str(string) + # if too short, return as is + if len(string) <= length: + return string + if symmetric: + return string[: length // 2] + " ... " + string[-length // 2 :] + else: + return string[:length] + "..." + + def _plotfig_pdf(self, samples, failed=False): + """A helper method for smart packing of individual sample plots into figures. + + Parameters + ---------- + samples + a valid list of samples to be plotted + failed + indicate if samples are from failed fits + + Returns: + ------- + A list of Figure objects to be added to pages list + """ + if failed: + suptitle = "Excluded/failed samples" + datatype = "raw" + else: + suptitle = "Successful fits" + datatype = "overview" + pages = [] + n_results = len(samples) + # intialize figures, axes, and plot counter + plot_fig, plot_axs = plt.subplots( + 4, + 3, + sharex=False, + sharey=False, + figsize=(8.3, 11.7), + ) + plot_fig.suptitle(suptitle, fontweight="bold", fontsize="x-large") + plot_axs = list(plot_axs.flat) + plot_counter = 0 + for sample in samples: + if plot_counter < 11: + self.plotfig( + "dummy_output", + sample, + datatype=datatype, + save=False, + show=False, + data_ax=plot_axs[plot_counter], + vline_legend=True, + ) + plot_counter += 1 + # if the 11th plot was plotted, append old figure and initialize a new one + # this should be also triggered if less than 11 is plotted + if (plot_counter == 11) or (plot_counter == n_results): + # make legend in the next axes + plot_axs[plot_counter].legend( + handles=plot_axs[plot_counter - 1].get_lines(), + mode="expand", + ncol=1, + ) + # hide remaining unused axes + while plot_counter <= 11: + plot_axs[plot_counter].set_axis_off() + plot_counter += 1 + plot_fig.tight_layout( + rect=(0.02, 0.05, 0.98, 0.95), + ) # rect leaves some margins empty + pages.append(plot_fig) + plot_fig, plot_axs = plt.subplots( + 4, + 3, + sharex=False, + sharey=False, + figsize=(8.3, 11.7), + ) + plot_fig.suptitle( + suptitle + " (continued)", + fontweight="bold", + fontsize="x-large", + ) + plot_axs = list(plot_axs.flat) + plot_counter = 0 + return pages + + def PdfReport(self, outfile): + """Generate and write a multi-page PDF report. + + Parameters + ---------- + outfile + location of the output, overwrite without confirmation + + Notes: + ----- + * heatmap and converter96 are ancient methods, so they are just carefully wrapped around + * each page is a figure object + * multi-page pdf as per mpl [docs](https://matplotlib.org/stable/gallery/misc/multipage_pdf.html) + """ + from matplotlib.backends.backend_pdf import PdfPages + from matplotlib.table import table as mpl_table + + result_columns = ( + self.getResultsColumns() + ) # the first value should be the recommended sorting parameter followed by vlines and BS/S + # add condition column + result_columns = ["Condition", *result_columns] + sort_parameter = result_columns[1] + # preprocess the result df + result_table = self.plate_results.loc[ + :, + result_columns, + ].copy() # to prevent edits to the original DF + result_table = np.round(result_table, 2) # round the numeric data + result_table["Condition"] = result_table["Condition"].apply( + self._trim_string, + length=10, + symmetric=False, + ) + result_table_colors = result_table.copy() # color table 1.0 is white 0 is black + result_table_colors.loc[:, :] = "1.0" + result_table_colors.iloc[::2] = "0.75" + result_table_colors = result_table_colors.values + result_index = result_table.index + result_table = result_table.values # convert to a list of lists + n_results = len(result_table) # total number of results + pages = [] + + ## Page 1: Heatmap of the respective sortby parameter, top 15 results, run info + plate96 = self.converter96(sort_parameter, reference=None) + page1, page1_ax = self.heatmap( + "dummy_output", + plate96, + sort_parameter, + save=False, + pdf_report=True, + ) + page1_ax[0].set_title( + f"Heatmap of {sort_parameter}", + loc="left", + fontweight="bold", + ) + # mpl tables cannot do word wrapping, so trim the file name + filename = self._trim_string(self.filename) + mp_version = __version__ + if from_pyinstaller: + mp_version += " (PyInstaller bundle)" + timestamp = strftime("%c") + # failed fits and user-excluded samples + excluded = self._get_failed_samples() + if len(excluded) > 0: + excluded_str = self._trim_string(", ".join(list(excluded))) + else: + excluded_str = "None" + + info_table = [ + ["Timestamp", timestamp], + ["Input file", filename], + ["Scan rate, degrees/min", self.scan_rate], + ["MoltenProt version", mp_version], + ["Analysis model", self.model], + ["Excluded/failed samples", excluded_str], + ] + info_table_ax = page1_ax[2] + info_table = mpl_table( + info_table_ax, + info_table, + loc="upper left", + edges="open", + cellLoc="left", + ) + + # using the solution from here to set proper font size in the table: + # https://stackoverflow.com/questions/15514005/how-to-change-the-tables-fontsize-with-matplotlib-pyplot + info_table.auto_set_font_size(False) + info_table_ax.set_title("Run info", loc="left", fontweight="bold") + top10_table_ax = page1_ax[1] + mpl_table( + top10_table_ax, + result_table[:15, :], + loc="upper left", + colLabels=result_columns, + cellLoc="left", + rowLabels=result_index[:15], + cellColours=result_table_colors[:15, :], + ) + top10_table_ax.set_title("Top 15 results", loc="left", fontweight="bold") + + info_table_ax.set_axis_off() + top10_table_ax.set_axis_off() + + # finalize page 1 + page1.suptitle( + f"MoltenProt Report: {self.readout_type}", + fontweight="bold", + fontsize="x-large", + ) + # add citation to the bottom of the page + page1.text(0.5, 0.05, citation["short"], ha="center") # fontstyle='italic', + pages.append(page1) + + ## Full result table - create if more than 15 results (but less than 48) + if n_results > 15: + page2, page2_ax = plt.subplots(1, 1, figsize=(8.3, 11.7)) + mpl_table( + page2_ax, + result_table[:48], + loc="upper left", + colLabels=result_columns, + cellLoc="left", + rowLabels=result_index[:48], + cellColours=result_table_colors[:48], + ) + page2_ax.set_axis_off() + page2_ax.set_title( + f"Result table (sorted by {sort_parameter})", + loc="left", + fontweight="bold", + ) + pages.append(page2) + + # if more than 48 samples are present + if n_results > 48: + page3, page3_ax = plt.subplots(1, 1, figsize=(8.3, 11.7)) + mpl_table( + page3_ax, + result_table[48:], + loc="upper left", + colLabels=result_columns, + cellLoc="left", + rowLabels=result_index[48:], + cellColours=result_table_colors[48:], + ) + page3_ax.set_axis_off() + page3_ax.set_title( + "Result table (continued)", + loc="left", + fontweight="bold", + ) + pages.append(page3) + + ## pages with plots of individual curves + pages += self._plotfig_pdf(self.plate_results.index) + ## same as above, but for failed fits + pages += self._plotfig_pdf(excluded, failed=True) + + # write output + page_no = 1 + page_count = len(pages) + with PdfPages(outfile) as pdf_file: + for i in pages: + # add page number and save + i.text( + 0.5, + 0.025, + f"-- Page {page_no} of {page_count} --", + fontstyle="italic", + ha="center", + ) + page_no += 1 + pdf_file.savefig(i) + + # clean up mpl objects + plt.close("all") + + def _get_failed_samples(self): + """Return a list of samples that were either excluded or not fit. + + Notes: + ----- + * will raise a value error if the analysis is not done + """ + return self.plate_raw.columns.difference(self.plate_results.index) + + def WriteOutput( + self, + print10=False, + xlsx=False, + genpics=False, + heatmaps=[], + hm_ref=None, + heatmap_cmap=defaults["heatmap_cmap"], + resources_prefix="", + n_jobs=1, + no_data=False, + pdf=False, + ): + """Write the results to the disk. + + Parameters + ---------- + print10 + print 10 best samples to stdout + xlsx + write output in XLSX rather than CSV + genpics + generate figures for all samples + heatmaps + generate heatmaps for selected plate_results columns + hm_ref + reference well for heatmap + heatmap_cmap + matplotlib colormap for heatmap + resources_prefix + extra string to add when saving files (e.g. to prevent overwrites in multi-dataset case) + n_jobs + number of subprocesses to run figure plotting in parallel + no_data + do not output any data + pdf + write a report in PDF format + """ + # estimate how many samples could not go through the processing by finding the + # difference between the indexes of original dataset and the final results + # BUG if the sample is registered in xlsx annotations, but not present in the raw data plate + # (i.e. it was marked as bad capillary by Prometheus), then it will not be shown here + # such samples can be easily diagnosed through reports: even the raw-only signal is not plotted + # BUG failed samples have only ID's, but not layout info + failed_samples = self._get_failed_samples() + + # if there is a least one empty sample create respective results dataframes for it + # and temporarily append them to the main results (will not persist as a class attribute) + # NOTE index objects are immutable, so a new empty df has to be created and concatenated + if len(failed_samples) > 0: + failed_results = pd.DataFrame( + index=failed_samples, + columns=self.plate_results.columns, + ) + failed_results_stdev = pd.DataFrame( + index=failed_samples, + columns=self.plate_results_stdev.columns, + ) + output_results = pd.concat([self.plate_results, failed_results]) + output_results_stdev = pd.concat( + [self.plate_results_stdev, failed_results_stdev], + ) + else: + output_results = self.plate_results + output_results_stdev = self.plate_results_stdev + + output_path = self.resultfolder + + if print10: + # print the 10 best conditions to the console + self.print_message("\n\nBest 10 conditions (best->worst):", "i") + print(", ".join(list(self.plate_results[:10].index)) + "\n") + + # the checks for the folder existence are done in the main script + self.print_message("Writing results...", "i") + if not no_data: + if xlsx: + # create a holding object for *.xlsx export + writer = pd.ExcelWriter( + os.path.join(output_path, resources_prefix + "_Results.xlsx"), + ) + + self.plate_raw.to_excel(writer, "Raw data") + self.plate.to_excel(writer, "Preprocessed data") + self.plate_fit.to_excel(writer, "Fit curves") + self.plate_raw_corr.to_excel(writer, "Baseline-corrected") + output_results.to_excel(writer, "Fit parameters") + output_results_stdev.to_excel(writer, "Standard deviations") + + # this is the longest part in XLSX saving procedure + writer.save() + else: + # convert plate_results* dataframes to *.csv's + output_results.to_csv( + os.path.join(output_path, resources_prefix + "_results.csv"), + sep=",", + index_label="Parameters", + encoding="utf-8", + ) + output_results_stdev.to_csv( + os.path.join(output_path, resources_prefix + "_results_stdev.csv"), + sep=",", + index_label="Parameters", + encoding="utf-8", + ) + self.plate_fit.to_csv( + os.path.join(output_path, resources_prefix + "_fit.csv"), + sep=",", + encoding="utf-8", + ) + self.plate.to_csv( + os.path.join(output_path, resources_prefix + "_preproc_curves.csv"), + sep=",", + encoding="utf-8", + ) + self.plate_raw_corr.to_csv( + os.path.join(output_path, resources_prefix + "_raw_corr.csv"), + sep=",", + encoding="utf-8", + ) + # PDF report + if pdf: + self.PdfReport(os.path.join(output_path, resources_prefix + "_report.pdf")) + + # generate heatmaps + if len(heatmaps) > 0: + if "all" in heatmaps: + # columns shared (presumably) between all models + heatmaps = ["S", *self.plotlines] + for i in heatmaps: + try: + plate96 = self.converter96(i, hm_ref) + # check if the column name is in the hm_dic, if not use default options + try: + self.heatmap( + output_path, + plate96, + i, + lowerbetter=self.hm_dic[i]["lowerbetter"], + title=self.hm_dic[i]["title"], + tick_labels=self.hm_dic[i]["tick_labels"], + heatmap_cmap=heatmap_cmap, + ) + except KeyError: + try: + # print that lowerbetter for this heatmap can be wrong + self.print_message( + "{} was not found in hm_dic, the colors may be wrong!".format( + i, + ), + "w", + ) + self.heatmap( + output_path, + plate96, + i, + title="Heatmap of " + str(i), + ) + except: + self.print_message( + "Re-check the parameters for --heatmap option", + "w", + ) + except ValueError: + # NOTE a temporary fix for cases when a wrong heatmap parameter was supplied + self.print_message( + f'"{i}" is an invalid option for heatmap', + "w", + ) + print("Valid options are:") + print(", ".join(list(self.plate_results.columns))) + + # clean up after plotting so that no parameters are carried over to downstream plottings + plt.close("all") + + # generate plots of individual samples + # BUG for multi-dataset instance using --genfigs will overwrite images + if genpics: + self.print_message("Generating figures... This may take a while...", "i") + # depending on the value of parallelization either run a single or multi-processor command + if parallelization and n_jobs > 1: + # with picklable plotfig method: + Parallel(n_jobs=n_jobs)( + delayed(self.plotfig)(output_path=output_path, wellID=i) + for i in self.plate_fit.columns.values + ) + # plot raw data of failed samples + if len(failed_samples) > 0: + Parallel(n_jobs=n_jobs)( + delayed(self.plotfig)( + output_path=output_path, + datatype="raw", + wellID=i, + ) + for i in failed_samples + ) + else: + for i in self.plate_fit.columns.values: + # save to default path + self.plotfig(output_path, i) + + # plot failed samples, if any + # TODO add the derivative curve so that the plots look nicer + if len(failed_samples) > 0: + for i in failed_samples: + self.plotfig(output_path=output_path, datatype="raw", wellID=i) + + +class MoltenProtFitMultiple: + """The main class in MoltenProt; contains one or more datasets (i.e. MoltenProtFit instances) + and coordinates their processing. + """ + + def __init__( + self, + scan_rate=None, + denaturant=None, + layout=None, + source=None, + ) -> None: + """Only core settings are defined at the level of initialization: + + layout:DF - annotation for all samples + denaturant:str - can be either a unit of temperature (C, K) or denaturant name (GuHCl, Urea) + all internal processing is done with temperature in Kelvins + source: string - the name of the file that was used to get the data (currently not the real path) + debug level? + + All datasets must be added using a dedicated method. + Defaults to None, which is useful to re-create instances from JSON files + """ + self.layout = layout + if layout is not None: + self.layout_raw = layout.copy() # a backup copy of the original layout + else: + self.layout_raw = None + self.source = source + self.scan_rate = scan_rate + self.denaturant = denaturant + self.datasets = {} # type: ignore[var-annotated] + + def __getstate__(self): + """A special method to enable pickling of class methods.""" + output = self.__dict__ + output["PrepareAndAnalyseSingle"] = self.PrepareAndAnalyseSingle + output["WriteOutputSingle"] = self.WriteOutputSingle + return output + + def print_message(self, text, message_type): + """Print a message and add it to the protocolString of all datasets.""" + if message_type == "w": + prefix = "Warning: " + elif message_type == "i": + prefix = "Information: " + else: + msg = f"Unknown message type '{message_type}'" + raise ValueError(msg) + message = f"{prefix}{text} (All)" + print(message) + message += "\n" + for i, j in self.datasets.items(): + j.protocolString += message + + def GetAnalysisSettings(self): + """Get information on the analysis settings (shared settings and per-dataset model settings). + + Returns: + ------- + a tuple of two variables: + > dict with current analysis settings + > dataframe with dataset/model settings (compatible with QtTableWidget) + """ + # NOTE currently all settings except for model are the same for all datasets + # start from default settings + analysis_settings = analysis_kwargs(analysis_defaults) + model_settings = pd.Series( + index=self.GetDatasets(), + name="Model", + data="santoro1988", + ) + model_settings.index.name = "Dataset" + + for dataset_name, dataset in self.datasets.items(): + if dataset.analysisHasBeenDone(): + analysis_settings = analysis_kwargs(dataset.__dict__) + model_settings[dataset_name] = dataset.model + # unprocessed MPF may not have the model attribute + if hasattr(dataset, "model") and dataset.model == "skip": + model_settings[dataset_name] = dataset.model + + # reset index to comply with QtTableWidget + model_settings = model_settings.reset_index() + + return (analysis_settings, model_settings) + + def AddDataset(self, data, readout): + """Add an individual Dataset (as MP_fit instance in datasets attribute). + + Parameters + ---------- + data: pd.DataFrame + a ready-togo DataFrame with Index called "Temperature" or "Denaturant" (not implemented yet) + readout: str + a name for the dataset to be added; will be used to call internal data and thus has to be unique; spaces will be substituted with underscore + + Notes: + ----- + TODO add an option to indicate that the data is about aggregation (to rename to Tagg) + TODO set layouts here as well? + """ + if readout in list(self.datasets.keys()): + msg = "Readout '{}' is already present in this MoltenProtFitMultiple instance!" + raise RuntimeError( + msg, + ) + + # in readout name convert spaces to underscores + readout = readout.replace(" ", "_") + # NOTE parent_filename is only used in reports + self.datasets[readout] = MoltenProtFit( + data, + scan_rate=self.scan_rate, + input_type="from_xlsx", + parent_filename=self.source, + denaturant=self.denaturant, + readout_type=readout, + ) + + def DelDataset(self, todelete): + """Remove a dataset from the instance. + + Parameters + ---------- + todelete + which dataset to delete + + Notes: + ----- + This permanently removes a dataset, so the data cannot be recovered. If the data should be retained, then set model to 'skip' in Analysis + """ + if todelete in self.GetDatasets(): + del self.datasets[todelete] + else: + self.print_message(f"Dataset '{todelete}' not found", "w") + + def GetDatasets(self, no_skip=False): + """Returns available dataset names. + + Parameters + ---------- + no_skip + if True, then datasets with model 'skip' will not be included + """ + if no_skip: + output = [] + for dataset_name, dataset in self.datasets.items(): + # NOTE model attribute is only added after processing + if hasattr(dataset, "model") and dataset.model == "skip": + continue + output.append(dataset_name) + return tuple(output) + else: + return tuple(self.datasets.keys()) + + def UpdateLayout(self): + """For GUI communication only, update the layout of the datasets after the "master" layout in MPFM was changed. + + Notes: + ----- + BUG this method does not use the MPF.SetLayout, which makes code redundant + layouts are set only at SetAnalysisOptions state + """ + for dataset_id, mp_fit in self.datasets.items(): + mp_fit.layout = self.layout + # also update the layout info in plate_results (if present) + if hasattr(mp_fit, "plate_results"): + mp_fit.plate_results["Condition"] = self.layout["Condition"] + + def ResetLayout(self): + """Change the master layout in MPFM to layout_raw (recorded during parsing of XLSX in newer versions of moltenprot) and update all MPF instances.""" + if self.layout_raw is not None: + self.layout = ( + self.layout_raw.copy() + ) # need to copy, because all edits to the layout will propagate to layout_raw, and it will not be "original" + self.UpdateLayout() + else: + self.print_message("Attribute layout_raw is None, nothing to reset", "w") + + def SetScanRate(self, scan_rate): + """Sets a new scan rate (degC/min) to all datasets.""" + # too high precision is not relevant + self.scan_rate = round(scan_rate, 2) + for dataset_id, mp_fit in self.datasets.items(): + mp_fit.scan_rate = self.scan_rate + + def RenameResultsColumns(self, which, mapping): + """E.g. in scattering data, the output is not Tm, but Tagg + Also, for chemical denaturation the columns can be completely different + NOTE currently done via RenameResults of MoltenProtFit. + + Parameters + ---------- + which : str + to which dataset to apply + mapping : dict + a dictionary with keys being original names and values being new names + """ + self.print_message( + f"Renaming results columns in dataset {which}\n{mapping}", + "i", + ) + self.datasets[which].plate_results.rename(columns=mapping, inplace=True) + self.datasets[which].plate_results_stdev.rename(columns=mapping, inplace=True) + + def SetAnalysisOptions(self, which="all", printout=False, **kwargs): + """Set analysis options for a single dataset. + + Parameters + ---------- + which : str + To which dataset the options are applied; all means apply same settings for all datasets + printout : bool + print the settings + **kwargs + args for MoltenProtFit.SetAnalysisOptions() + """ + # Add layout-related options to kwargs + if kwargs.get("blanks"): + # check if the current layout has any blanks listed and remove those + self.layout.Condition = self.layout.Condition.replace("Blank", "") + try: + self.layout.loc[kwargs["blanks"], "Condition"] = "Blank" + except KeyError: + self.print_message( + "One or more blank samples have invalid IDs, no blank info will be available", + "w", + ) + # add to all datasets' protocol string + + # NOTE if the same sample is listed as blank and exclude, it will have only exclude in the end + # i.e. exclusion by user has higher priority + if kwargs.get("exclude"): + # check if the current layout has any excluded samples and remove those + self.layout.Condition = self.layout.Condition.replace("Ignore", "") + try: + self.layout.loc[kwargs["exclude"], "Condition"] = "Ignore" + except KeyError: + self.print_message( + "One or more excluded samples have invalid IDs, no exclusion done", + "w", + ) + + kwargs["layout"] = self.layout + kwargs["layout_input_type"] = "from_xlsx" + # NOTE the differences from 1x processing: + # the layout is shared, so we use self.layout, and also tell MoltenProtFit instance that the layout is already a DataFrame (using layout_input_type) + if which == "all": + for i, j in self.datasets.items(): + j.SetAnalysisOptions(**kwargs) + else: + self.datasets[which].SetAnalysisOptions(**kwargs) + + if printout: + print(f"Data type is {i}") + self.datasets[which].printAnalysisSettings() + + def PrepareAndAnalyseSingle(self, which): + """Run data processing pipeline on a single dataset. + + Parameters + ---------- + which + dataset name + """ + self.datasets[which].PrepareData() + self.datasets[which].ProcessData() + + # NOTE return statement is only needed for parallelized code (MoltenProtFitMultiple instance + # gets overwritten and computed results are not stored) + return self.datasets[which] + + def PrepareAndAnalyseAll(self, n_jobs=1): + """Run analysis on all datasets. + + Parameters + ---------- + n_jobs : int + how many parallel processes to start + """ + analysis_tuple = self.GetDatasets() + + # parallelization of analysis routine + if parallelization and n_jobs > 1: + results_tuple = Parallel(n_jobs=n_jobs)( + delayed(self.PrepareAndAnalyseSingle)(i) for i in analysis_tuple + ) + for i, j in zip(analysis_tuple, results_tuple): + self.datasets[i] = j + else: + for i in analysis_tuple: + self.PrepareAndAnalyseSingle(i) + + def CombineResults(self, outfile, tm_stdev_filt=-1, bs_filt=-1, merge_dup=False): + """Join all plate_results/stdev DataFrames and write to a single XLSX file. + + Parameters + ---------- + outfile : str + where to write the output (a full path) + tm_stdev_filt : float + samples with Tm stdev above this value will be discarded + bs_filt : float + samples with BS-factor below this value will be discarded + merge_dup : bool + whether to aggregate samples with identical annotations in layout + + Notes: + ----- + TODO: add more generic filtering options + """ + analysis_tuple = self.GetDatasets() + + # initiate xlsx writer objects + writer = pd.ExcelWriter(outfile) + + for i in analysis_tuple: + tm_key = "Tagg" if i == "Scattering" else "Tm" + + # skip non-processed datasets (model=skip) + if self.datasets[i].model != "skip": + output = self.datasets[i].CombineResults( + tm_stdev_filt=tm_stdev_filt, + bs_filt=bs_filt, + merge_dup=merge_dup, + # merge_stdev=merge_stdev, DEPRECATED? + tm_key=tm_key, + ) + # NOTE Excel sheets are now named identically to input dataset names + output.to_excel(writer, i) + + # write XLSX files to the result folder: + writer.save() + + def GenerateReport(self, heatmap_cmap, template_path=None): + """Creates an interactive HTML report (as a string). + + Parameters + ---------- + heatmap_cmap + matplotlib colormap for heatmap + template_path + the HTML template + + Returns: + ------- + A string made from report template where placeholders were substituted to actual HTML code + """ + # use default template if none provided + if template_path is None: + template_path = os.path.join(__location__, "resources/report.template") + + # open the html template + with open(template_path) as template_file: + template = template_file.read() + + # convert it to a string.Template instance + template = Template(template) + + heatmap_table = "" + buttons = "" + + # for the first heatmap display is table, for the rest it is none + display_heatmap = "table" + + # for a single-dataset MPFm do not show buttons + display_buttons = "none" if len(self.datasets) == 1 else "" + + # cycle through all datasets and get glue up strings to the starting button or heatmap string + for dataset_name, dataset in self.datasets.items(): + # skip non-processed datasets + if dataset.model != "skip": + heatmap_table += dataset.html_heatmap(heatmap_cmap, display_heatmap) + if display_heatmap == "table": + display_heatmap = "none" + first_heatmap = dataset_name + buttons += dataset.html_button() + + return template.substitute( + FILE=self.source, + FIRST_HEATMAP=first_heatmap, + HEATMAP_TABLE=heatmap_table, + DISPLAY_BUTTONS=display_buttons, + BUTTONS=buttons, + VERSION=__version__, + TIMESTAMP=strftime("%c"), + ) + + def WriteOutputSingle( + self, + which, + outfolder, + subfolder=False, + **kwargs, # keyword args for WriteOutput + ): + """Write output to disc for a single dataset. + + Parameters + ---------- + which : dataset to process + outfolder : folder for output + heatmap_cmap : str + matplotlib colormap for heatmap + xlsx : bool + write output in XLSX format (default is CSV) + genpics : bool + create figures for samples + heatmaps : list + create heatmaps for the column in list + subfolder : bool + write output in outfolder (default) or create a subfolder called "which_resources" + n_jobs : int + how many parallel processes can be spawned + no_data : bool + no data output + + Notes: + ----- + * No output generated for datasets with model "skip" + * Do not use this method directly, use WriteOutputAll instead + """ + if self.datasets[which].model == "skip": + pass + else: + if subfolder: + outfolder = os.path.join(outfolder, which + "_resources") + os.makedirs(outfolder, exist_ok=True) + + # HACK to minimize edits to MoltenProtFit assingment of outfolder is done via the attribute + self.datasets[which].resultfolder = outfolder + self.datasets[which].WriteOutput( + resources_prefix=which, + **kwargs, + ) + # delete the attribute completely + del self.datasets[which].resultfolder + + def WriteOutputAll( + self, + outfolder, + # report, + xlsx=False, + genpics=False, + heatmaps=[], + report_format=None, + heatmap_cmap=defaults["heatmap_cmap"], + n_jobs=1, + no_data=False, + session=False, + ): + """Write output to disc for all associated datasets. + + Parameters + ---------- + outfolder : str + the folder where report.html will be placed and per-dataset subfolders + xlsx : bool + write output in XLSX format (default is CSV) + report : bool DEPRECATED + generate HTML report + summary : bool DEPRECATED + create a compact summary XLSX file + report_format : None or str 'pdf', 'html', 'xlsx' + n_jobs : int + how many output processes to run + genpics : bool + create figures for samples + heatmaps : list + create heatmaps for the column in list + heatmap_cmap : str + matplotlib colormap for heatmap + session : bool + save MP session in JSON format + """ + # generate and populate a dict of output settings + output_kwargs = {} + + if len(self.datasets) == 1: + # for single-dataset instances (and no reports planned) + # write everything to outdir + output_kwargs["subfolder"] = False + + if xlsx: + output_kwargs["xlsx"] = True + if heatmaps: + # heatmaps should be a list with one or more elements + output_kwargs["heatmaps"] = heatmaps + if genpics: + output_kwargs["genpics"] = True + + output_kwargs["heatmap_cmap"] = heatmap_cmap + output_kwargs["no_data"] = no_data + + # NOTE since reports are pre-defined data bundles, they may override some of the previous settings + if report_format == "html": + # generate a reporthtml string + reporthtml = self.GenerateReport(heatmap_cmap=heatmap_cmap) + # write all datatets to dedicated subfolders + output_kwargs["subfolder"] = True + # write the HTML of the report to outdir + with open(os.path.join(outfolder, "report.html"), "w") as file: + file.write(reporthtml) + output_kwargs["xlsx"] = True + output_kwargs["genpics"] = True + output_kwargs["no_data"] = False + elif report_format == "xlsx": + self.CombineResults(os.path.join(outfolder, "report.xlsx"), -1, -1, False) + elif report_format == "pdf": + output_kwargs["pdf"] = True + + # write output in parallel or serially + if parallelization and n_jobs > 1: + if len(self.GetDatasets()) == 1: + # if there is only a single associated dataset, it makes sense to enable parallel figure plotting + self.WriteOutputSingle( + self.GetDatasets()[0], + outfolder, + n_jobs=n_jobs, + **output_kwargs, + ) + else: + Parallel(n_jobs=n_jobs)( + delayed(self.WriteOutputSingle)(i, outfolder, **output_kwargs) + for i in self.GetDatasets() + ) + else: + # resultfolder was cleaned previously or created fresh so we just have to supply a proper prefix + for i in self.GetDatasets(): + self.WriteOutputSingle(i, outfolder, **output_kwargs) + + # NOTE JSON dumping must be done _AFTER_ all parallelized jobs! + if session: + mp_to_json(self, output=os.path.join(outfolder, "MP_session.json")) + + +class MoltenProtFitMultipleLE(MoltenProtFitMultiple): + """A special class to handle the lumry_eyring model + The main difference is that Scattering signal is required and + the datasets are processed sequentially. + """ + + def PrepareAndAnalyseAll(self, n_jobs=1): + """First the scattering data is fit to get Ea and Tf for reaction U->A + Then they are supplied as fixed parameters to fit reaction N <-kF, kR -> U + This fit is done in all other datasets. + """ + # rename dataset if refolded data was used + if "Scattering (Unfolding)" in self.GetDatasets(): + self.datasets["Scattering"] = self.datasets.pop("Scattering (Unfolding)") + + # check if Scattering signal is present + if "Scattering" not in self.GetDatasets(): + msg = "lumry_eyring model requires a Scattering dataset" + raise ValueError(msg) + + # run analysis of Scattering data with irrev model (has to be changed from whatever was supplied) + self.datasets["Scattering"].model = "irrev" + self.PrepareAndAnalyseSingle("Scattering") + # cycle through all other datasets and add fixed parameters + for dataset in self.GetDatasets(): + if dataset != "Scattering": + # NOTE in GUI it is possible that some datasets are not skipped, are not Scattering and + # do not have lumry_eyring model, for those SetFixedParameters will cause AttributeError, + # because non-LE models do not use this feature + if self.datasets[dataset].model == "lumry_eyring": + self.datasets[dataset].SetFixedParameters( + self.datasets["Scattering"] + .plate_results.loc[:, ["Tf_fit", "Ea_fit"]] + .T, + ) + self.PrepareAndAnalyseSingle(dataset) + + +class MoltenProtFitMultipleRefold(MoltenProtFitMultiple): + """Contains special tweaks to work with refolding data from Prometheus NT.48. + + * Analyse all datasets, but not the refolding datasets (addDataset for normal unfolding, addRefoldDataset + for refolding datasets, maintain them separately, but use the same keys in datasets dict) + * take the baseline parameters from unfolding datasets and draw fraction refolded + * for fraction refolded generate a separate report HTML file + * how to show this in GUI? + + Eventually: + * add fraction_irreversibly_unfolded to the equation and fit refolding data + using the parameters fro unfolding as starting values + """ + + pass + + +### Data parsers +""" +Functions to create a MoltenProtFitMultiple instance from a specific +experimental data file. +""" + + +def _csv_helper(filename, sep, dec): + """Pre-processing steps for reading CSV files. + + Returns: + ------- + pd.DataFrame with Temperature as index + """ + try: + data = pd.read_csv( + filename, + sep=sep, + decimal=dec, + index_col="Temperature", + encoding="utf-8", + ) + except ValueError as e: + print(e) + msg = "Input *.csv file is invalid!\nCheck if column called 'Temperature' exists and separators are specified correctly" + raise ValueError( + msg, + ) + + # check if index contains duplicates and drop those + if data.index.duplicated().any(): + print( + "Warning: Temperature scale contains duplicates, all but first occurence are dropped", + ) + data = data.loc[data.index.drop_duplicates(), :] + return data + + +def parse_plain_csv( + filename, + scan_rate=None, + sep=defaults["sep"], + dec=defaults["dec"], + denaturant=defaults["denaturant"], + readout=defaults["readout"], + layout=defaults["layout"], +): + """Parse a standard CSV file with columns Temperature, A1, A2, ... + + Parameters + ---------- + filename : str + path to csv file + sep,dec - csv import parameters + denaturant : str + temperature in input file assumed to be Celsius (default value C), but could be also in K + readout : str + name for the experimental technique (e.g. CD or F330), will be used as key in dataset dict + layout : str or None + specify a special *.csv file which defines the plate layout (i.e. what conditions are in each sample) + + Returns: + ------- + MoltenProtFitMultiple instance + + #TODO add removal of zeros in A01, A02, etc + """ + # read the CSV into a DataFrame + data = _csv_helper(filename, sep, dec) + + # read layout (if provided) + if layout is not None: + try: + layout = pd.read_csv(layout, index_col="ID", encoding="utf_8") + except: + print( + "Warning: unsupported layout format! No layout info will be available", + ) + layout = None + + if layout is None: + # if no layout provided or could not be read, create an empty layout DataFrame + layout = pd.DataFrame(index=alphanumeric_index, columns=["Condition"]) + + # initialize and return a MoltenProtFitMultiple instance + output = MoltenProtFitMultiple( + scan_rate=scan_rate, + denaturant=denaturant, + layout=layout, + source=filename, + ) + output.AddDataset(data, readout) + return output + + +def parse_spectrum_csv( + filename, + scan_rate=None, + denaturant=defaults["denaturant"], + sep=defaults["sep"], + dec=defaults["dec"], + readout=defaults["readout"], +): + """Parse CSV file with columns Temperature,wavelengths... + + Parameters + ---------- + filename : str + path to csv file + sep,dec - csv import parameters + denaturant : str + temperature in input file assumed to be Celsius (default value C), but could be also in K + readout : str + name for the experimental technique (e.g. CD or F330), will be used as key in dataset dict + + Returns: + ------- + MoltenProtFitMultiple instance + + Notes: + ----- + * Temperature axis is not sorted + * Layouts are generated automatically from column names (assumed to be respective wavelengths) + """ + data = _csv_helper(filename, sep, dec) + + # if data is too big, take a random subset + if len(data.columns) > 96: + print( + f"Warning: too many wavelengths in the spectrum ({len(data.columns)}), selecting random 96", + ) + data = data.sample(n=96, axis=1) + # to be on the safe side, sort columns ascending + data = data.loc[:, sorted(data.columns)] + # apply the alphanumeric index + data = data.T + data["ID"] = list(alphanumeric_index[: len(data)]) + data.index.name = "Condition" + data.reset_index(inplace=True) + + # set ID as index + data.set_index("ID", inplace=True) + # extract layout info and drop from the main df + # initialize the layout dataframe + layout = pd.DataFrame(index=alphanumeric_index, columns=["Condition"]) + layout.index.name = "ID" + layout.loc[data.index, "Condition"] = data.loc[:, "Condition"].copy() + data.drop(["Condition"], axis=1, inplace=True) + data = data.T + + # initialize and return a MoltenProtFitMultiple instance + output = MoltenProtFitMultiple( + scan_rate=scan_rate, + denaturant=denaturant, + layout=layout, + source=filename, + ) + output.AddDataset(data, readout) + return output + + +def parse_prom_xlsx(filename, raw=False, refold=False, LE=False, deltaF=True): + """Parse a processed file from Prometheus NT.48. In these files temperature + is always in Celsius and the readouts are more or less known. Layout is read + from the overview sheet. + + Parameters + ---------- + filename : str + path to xlsx file + raw : bool + if the data is "raw" or "processed" in terms of the manufacturer's GUI + refold : bool + if refolding ramp was used (default False) + LE : bool + instead of standard instance, create the one with Lumry-Eyring model + deltaF : bool + compute an alternative signal-enhanced readout: F350-F330 difference + it is an extensive readout (proportional to protein conc, like F330 or F350), + which also makes the transitions more pronounced (like Ratio) + + Returns: + ------- + MoltenProtFitMultiple instance + + Notes: + ----- + * Parsing relies on sheet names in English + * Current implementation can successfully parse raw XLSX as long as there are less than 96 data columns (which is 3 times the number of capillaries). The data will be contaminated with straight lines of temperature and time + + Todo: + * the sheets have a standardized order in the file: Overview, Readout1_(unfolding), Readout1_(unfolding)_derivative ..., Readout1_(refolding), Readout1_(refolding)_derivative ... ; this can be used to parse data independently of sheet labels; also, the sheets containing "deriv" can be auto-excluded + """ + # force the input file to have absolute path (to be stored in JSON session) + filename = os.path.abspath(filename) + + # read the whole Excel file - get an Ordered Dict + input_xlsx = pd.read_excel(filename, None) + + # parse the layout + # layout contains 3 columns: Condition, Capillary and dCp + # the capillary info can be appended during report generation, but not earlier (needed for Blanks/References etc) + # NOTE if the user manipulated the Overview sheet, additional non-data rows can be read in and produce a messy layout DF. This doesn't seem to affect the processing + if "Overview" not in input_xlsx: + msg = f"Input file {filename} contains no overview sheet" + raise ValueError(msg) + + layout = input_xlsx["Overview"] + layout.reset_index(inplace=True) + # read scan rate from first row in column "Temperature Slope" + # NOTE without conversion to float scan_rate (even if 1) will be saved to JSON as "null"! + scan_rate = float(layout["Temperature Slope"].iloc[0]) + + layout = layout.reindex(["Capillary", "Sample ID", "dCp"], axis=1) + layout.rename(columns={"Sample ID": "Condition"}, inplace=True) + # concatenate A1-H12 and description, then use the "ID" column as the new index + layout = pd.concat([layout, alphanumeric_index], axis=1) + layout.set_index("ID", inplace=True) + + # initialize a MoltenProtFitMultiple instance + if LE: + output = MoltenProtFitMultipleLE( + scan_rate=scan_rate, + layout=layout, + denaturant="C", + source=filename, + ) + else: + output = MoltenProtFitMultiple( + scan_rate=scan_rate, + layout=layout, + denaturant="C", + source=filename, + ) + + # cycle through available readouts and add them to MPMultiple + if refold: + # Full list of readouts, currently only unfolding can be processed + # TODO add a class to process refolding data in conjunction with unfolding + readouts = ( + "Ratio (Unfolding)", + "330nm (Unfolding)", + "350nm (Unfolding)", + "Scattering (Unfolding)", + "Ratio (Refolding)", + "330nm (Refolding)", + "350nm (Refolding)", + "Scattering (Refolding)", + ) + output.print_message( + "Currently refolding data is treated separately from unfolding data", + "w", + ) + else: + readouts = ("Ratio", "330nm", "350nm", "Scattering") + + # NOTE to avoid multiple checks of the scan rate (temp and time scale are the same for all readouts) + refined_scan_rate = None + for i in readouts: + if i in list(input_xlsx.keys()): + data = input_xlsx[i] + """ + Convert the read sheet from *.xlsx + The first column ("Unnamed: 0") contains several rows with NaN values that correspond to one or more columns of the annotations; there is at least one Called Sample ID, and then additional user-defined names. Those have to be removed + The next row contains the value "u'Time [s]'", and it becomes the first row once the previous operation is done. + """ + data = data[data.iloc[:, 0].notna()] + data = data.iloc[1:, :] + + if raw: + # warn the user that there is a potentially harmful data modification + output.print_message( + "Import of raw data requires interpolation to have all readings on the same temperature scale, i.e. the data gets irreversibly modified", + "w", + ) + # in proc data currently there will be: shared time, shared temp, readings + # in raw data there are 3 columns for each sample: time, temperature, readings + # NOTE in some older versions of the raw data the time and temperature are actually the same! + # care must be taken if scan_rate is determined from such files + + # extract readings, temperatures and times + readings = data.iloc[:, 2::3].copy() + temps = data.iloc[:, 1::3] + times = data.iloc[:, 0::3] + # the time and temperature of the first sample will be used in the final scale + time_scale = times.iloc[:, 0].astype(float) + temp_scale = temps.iloc[:, 0].astype(float) + # cycle through all samples and perform interpolation + for col_ix in range(len(readings.columns)): + r_col = readings.columns[col_ix] + t_col = temps.columns[col_ix] + # interpolation function + interpolator = interp1d( + temps[t_col], + readings[r_col], + bounds_error=False, + ) + # interpolated values for readings' + readings.loc[:, r_col] = interpolator(temp_scale) + # add time and temperature of the first sample to the output data + data = pd.concat([time_scale, temp_scale, readings], axis=1) + + # determine true scan rate by running a linear fit of temperature vs time + if refined_scan_rate is None: + temp_vs_time = data.iloc[:, :2].astype(float).dropna() + slope, intercept = np.polyfit( + temp_vs_time.iloc[:, 0], + temp_vs_time.iloc[:, 1], + 1, + ) + refined_scan_rate = slope * 60 # convert degC/sec to degC/min + + # remove the first column with time + data.drop(data.columns[0], inplace=True, axis=1) + # rename the first column to "Temperature" + data.rename(columns={data.columns[0]: "Temperature"}, inplace=True) + # set Temperature as the index column + data.set_index("Temperature", inplace=True) + # convert column names to A1-D12 + data.columns = list(alphanumeric_index.iloc[0 : len(data.columns)]) + # for compatibility with future pandas versions we must make sure that data type is float32 + data = data.apply(pd.to_numeric, errors="coerce") + + # Create a MoltenProtFit instance with this DataFrame as data source + output.AddDataset(data, i) + else: + output.print_message(f"Readout {i} not found", "w") + + if deltaF: + # check if F330 and F350 are available + if {"330nm", "350nm"}.issubset(output.GetDatasets()): + readout_ids = ("330nm", "350nm") + elif {"330nm_(Unfolding)", "350nm_(Unfolding)"}.issubset( + output.GetDatasets(), + ): + readout_ids = ("330nm_(Unfolding)", "350nm_(Unfolding)") + else: + output.print_message( + "Cannot compute readout deltaF: either F330 or F350 is missing in the input data", + "w", + ) + readout_ids = None + + if readout_ids is not None: + # HACK Temperature in other datasets is already in Kelvins, but denaturant is set to C + # to prevent adding extra 273.15 to index temporarily set denaturant to K + output.denaturant = "K" + output.AddDataset( + output.datasets[readout_ids[1]].plate + - output.datasets[readout_ids[0]].plate, + "deltaF", + ) + output.denaturant = "C" + + # Check if any datasets could be properly added + if len(output.datasets) < 1: + msg = f"Input file {filename} contains no data" + raise ValueError(msg) + + # assign refined scan_rate + if abs(refined_scan_rate - output.scan_rate) > 0.2: + # in range 1-7 degC/min the difference between nominal and true rate is less than 0.2 deg/min + output.print_message( + f"The difference between nominal ({output.scan_rate}) and estimated ({refined_scan_rate}) scan rate >0.2 degrees/min", + "w", + ) + output.SetScanRate(refined_scan_rate) + + return output + + +def mp_from_json(input_file): + """Read a json file and if successful return a ready-to-use MoltenProtFit instance. + + Parameters + ---------- + input_file + input file in JSON format + + Notes: + ----- + BUG column ordering is messed up after JSON I/O + """ + with open(input_file) as file: + return json.load(file, object_hook=deserialize) + + +def mp_to_json(object_inst, output=None): + """Convert an MoltenProtFit/MPFMultiple instance to a JSON file. + + Parameters + ---------- + object_inst : + MP instance to be converted to JSON + output : string or None + if None, return a JSON string + if output=='self', then use object_inst.resultfolder attribute + otherwise use str as a location where to write + + Returns: + ------- + string + only if output parameter is None + + Notes: + ----- + BUG Column sorting is usually A1 A2..., but after json i/o it is A1 A10... + """ + if output is None: + # indent=4 makes the output JSON more human-readable + return json.dumps(object_inst, default=serialize, indent=4) + + # if output is some kind of string we can use it to write the output + if output == "self": + # DELETE only useful for old MPFMultiple + # a special case is when self.resultfolder is used + # in all other just use the user-provided string + output = os.path.join(object_inst.resultfolder, "MP_session.json") + + with open(output, "w") as file: + json.dump(object_inst, file, default=serialize, indent=4) + + """ + # add compression: output file is smaller, but the process itself is slower + with gzip.GzipFile(self.resultfolder+'/MP_session.json.gz', 'w') as fout: + fout.write(json.dumps(self, default=serialize, indent=4)) + """ + return None diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/models.py b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/models.py new file mode 100644 index 0000000..ac1ec99 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/models.py @@ -0,0 +1,658 @@ +"""Copyright 2020,2021 Vadim Kotov, Thomas C. Marlovits. + +This file is part of MoltenProt. + +MoltenProt is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +MoltenProt is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with MoltenProt. If not, see . +""" +""" +NOTE +This file formalizes models used in MoltenProt fitting (MPModel class) +User-defined models can be added here +""" + +# finding number of arguments in a function +from inspect import signature + +import numpy as np + +# numeric integration +from scipy.integrate import solve_ivp + +### Constants +R = 8.314 # universtal gas constant +T_std = 298.15 # standard temperature, in Kelvins + + +class MoltenProtModel: + # dummy function is defined here, because it does not make sense for the + # instances to be able to have a different function + # the first argument HAS to be the X-axis (independent var) + fun = lambda x, k, b: k * x + b + + # same applies for the model description - no point to make an instance-specific value + _description = "A dummy MoltenProt model" + + # since a single class is created for each model, then the long name is already in the class name + # here we can also add a short name used in the main module + short_name = "model_template" + + # what measure to use in final sorting - calculated in MoltenProtFit + # NOTE all measures are selected in such a way that higher values correspond to higher stability + # if None, then final sorting is skipped + sortby = None + + def __init__(self, scan_rate=None) -> None: + """In a general case scan rate is not relevant, so it is set to None + For kinetic data it needs to be set for sure. + """ + self.scan_rate = scan_rate + + def __repr__(self) -> str: + """For interactive prompts or print() - show the model description.""" + return self._description + + def __str__(self) -> str: + """When converting to a string return the short name.""" + return self.short_name + + def param_names(self): + """Return parameter names encoded in the function declaration as a list.""" + params = signature(self.fun).parameters + # skip parameter self + return list(params)[1:] + + def param_init(self, input_data=None): + """Return starting parameters based on the input values + input_data must be a pd.Series with an index (e.g. Temperature) + returns None if the parameters should be guessed by curve_fit. + """ + if input_data is None: + return + + def param_bounds(self, input_data=None): + """Return the bounds based on the input data. + input_data must be a pd.Series with an index (e.g. Temperature) + returns (-np.inf, np.inf) if no bounds are to be set. + """ + if input_data is None: + return (-np.inf, np.inf) + return None + + +class EquilibriumTwoState(MoltenProtModel): + short_name = "santoro1988" + _description = "N <-> U" + sortby = "dG_std" # type: ignore[assignment] + + # original function + # d -> dHm + # NOTE parameter names do matter. If kN,bN,kU,bU and Tm are present, then _estimate_baseline routine + # will be run by MoltenProtFit instance to get the best possible starting values + def fun(self, T, kN, bN, kU, bU, dHm, Tm): + return (kN * T + bN + (kU * T + bU) * np.exp(dHm / R * (1 / Tm - 1 / T))) / ( + 1 + np.exp(dHm / R * (1 / Tm - 1 / T)) + ) + + def param_bounds(self, input_data=None): + # if no data supplied, run the default action from the master class + # otherwise compute bounds from plate index or hard-coded + if input_data is None: + return super().param_bounds(None) + else: + return ( + (-np.inf, -np.inf, -np.inf, -np.inf, 60000, min(input_data.index)), + (np.inf, np.inf, np.inf, np.inf, 4000000, max(input_data.index)), + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and 45 degree slope + # original implementation did not put values for Tm, which was computed dynamically in main code + # here a good starting Tm is just the middle of th range (or 0 if no data provided) + # NOTE for custom models any parameter initialization code should be implemented here + if input_data is None: + return (1, 0, 2, 0, 100000, 0) + else: + return ( + 1, + 0, + 2, + 0, + 100000, + min(input_data.index) + + (min(input_data.index) + max(input_data.index)) / 2.0, + ) + + +class EquilibriumThreeState(MoltenProtModel): + short_name = "santoro1988i" + _description = "N <-> I <-> U" + # in theory total stability of the protein is the sum of stabilities of N and I + sortby = "dG_comb_std" # type: ignore[assignment] + + def fun(self, T, kN, bN, kU, bU, kI, dHm1, T1, dHm2, dT2_1): + # dT2_1 = T2 - T1, i.e. the distance between the two transitions + return ( + kN * T + + bN + + kI * np.exp(dHm1 / R * (1 / T1 - 1 / T)) + + (kU * T + bU) + * np.exp(dHm1 / R * (1 / T1 - 1 / T)) + * np.exp(dHm2 / R * (1 / (T1 + dT2_1) - 1 / T)) + ) / ( + 1 + + np.exp(dHm1 / R * (1 / T1 - 1 / T)) + + np.exp(dHm1 / R * (1 / T1 - 1 / T)) + * np.exp(dHm2 / R * (1 / (T1 + dT2_1) - 1 / T)) + ) + + def param_bounds(self, input_data=None): + # TESTING preliminary results show that no limits for dHm are better in intermediate mode + if input_data is None: + return super().param_bounds(None) + else: + # by definition T2 follows T1, so dT2_1 is > 0 + # the upper bound for dT2_1 is 1/2 of the full temperature range (i.e. the limit on max distance between the two Tms) + return ( + ( + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index), + -np.inf, + 0, + # allow dT2_1 to be +/- + ), + ( + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index), + np.inf, + (max(input_data.index) - min(input_data.index)) / 2, + ), + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and are 45 degree slope + # For dT2_1 we start from the assumption that dT2_1=0, i.e. there is no 2nd transition + if input_data is None: + return (1, 0, 2, 0, 1, 100000, 0, 100000, 0) + else: + # T1 is heuristically placed in the middle of the temp range + temp_range = max(input_data.index) - min(input_data.index) + return ( + 1, + 0, + 2, + 0, + 1, + 100000, + min(input_data.index) + 0.5 * temp_range, + 100000, + 0, + ) + + +class EmpiricalTwoState(MoltenProtModel): + short_name = "santoro1988d" + _description = "Same as santoro1988, but fits Tm and T_onset" + sortby = "T_eucl" # type: ignore[assignment] + # NOTE onset threshold is hard-coded to 0.01, i.e. onset point is 1% unfolded + onset_threshold = 0.01 + + def fun(self, T, kN, bN, kU, bU, T_onset, Tm): + return ( + kN * T + + bN + + (kU * T + bU) + * np.exp( + (T - Tm) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset - Tm), + ) + ) / ( + 1 + + np.exp( + (T - Tm) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset - Tm), + ) + ) + + def param_bounds(self, input_data=None): + if input_data is None: + return super().param_bounds(None) + else: + return ( + ( + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index), + min(input_data.index), + ), + ( + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index) - 2, + max(input_data.index), + ), + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and are 45 degree slope + if input_data is None: + return (1, 0, 2, 0, 1, 2) + else: + return ( + 1, + 0, + 2, + 0, + min(input_data.index) + 10, + (min(input_data.index) + max(input_data.index)) / 2.0, + ) + + +class EmpiricalThreeState(MoltenProtModel): + short_name = "santoro1988di" + _description = "Same as santoro1988i, but fits Tm and T_onset" + # similar to thermodynamic 3-state model: sum up Euclidean temperature distance + # for both reaction steps + sortby = "T_eucl_comb" # type: ignore[assignment] + # NOTE onset threshold is hard-coded to 0.01, i.e. onset point is 1% unfolded + onset_threshold = 0.01 + + def fun(self, T, kN, bN, kU, bU, kI, T_onset1, T1, T_onset2, T2): + return ( + kN * T + + bN + + kI + * np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + + (kU * T + bU) + * np.exp( + (T - T2) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset2 - T2), + ) + * np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + ) / ( + 1 + + np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + + np.exp( + (T - T2) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset2 - T2), + ) + * np.exp( + (T - T1) + * np.log(self.onset_threshold / (1 - self.onset_threshold)) + / (T_onset1 - T1), + ) + ) + + def param_bounds(self, input_data=None): + if input_data is None: + return super().param_bounds(None) + else: + return ( + [ + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index), + min(input_data.index), + min(input_data.index), + min(input_data.index), + ], + [ + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index), + max(input_data.index), + max(input_data.index), + max(input_data.index), + ], + ) + + def param_init(self, input_data=None): + # Initial parameters - pre baseline has no intercept and are 45 degree slope + if input_data is None: + return (1, 0, 2, 0, 1, 1, 1, 1, 1) + else: + temp_range = max(input_data.index) - min(input_data.index) + # with these starting values first transition is supposed to start in the beginning of the curve + # and the second transition starts in the end of the curve. Then they should meet + return ( + 1, + 0, + 2, + 0, + 1, + min(input_data.index) + 0.2 * temp_range, + min(input_data.index) + 0.4 * temp_range, + max(input_data.index) - 0.4 * temp_range, + max(input_data.index) - 0.2 * temp_range, + ) + + +class IrreversibleTwoState(MoltenProtModel): + short_name = "irrev" + _description = "N -> U" + sortby = "pk_std" # type: ignore[assignment] + xn = 1 # y0 (starting condition) for differential equation + + def __init__(self, scan_rate) -> None: + # scan rate is an essential parameter and must thus be set explicitly + if scan_rate is not None: + self.scan_rate = scan_rate + else: + msg = f"{self.short_name} model requires scan_rate to be set" + raise ValueError( + msg, + ) + + def arrhenius(self, t, Tf, Ea): + """Arrhenius equiation: defines dependence of reaction rate constant k on temperature + In this version of the equation we use Tf (a temperature of k=1) + to get rid of instead of pre-exponential constant A. + """ + return np.exp(-Ea / R * (1 / t - 1 / Tf)) + + def ode(self, t, xn, Tf, Ea): + """Ordinary differential equation for fraction native versus temperature + dxn/dT = -1/v*k(T)*xn. + + start_value xn - should be always 1 because at the start of assay we assume everything is folded + v - scan rate to convert minutes of scan rate to degrees (default 1) + xn - fraction native (xn + xagg = 1) + k(T) - temperature-dependent rate constant of aggregation + """ + return -1 / self.scan_rate * self.arrhenius(t, Tf, Ea) * xn + + def fun(self, t, kN, bN, kU, bU, Tf, Ea): + """Returns aggregation signal at given temperature + Signal(T) = (kN*T + bN)*xn +(kU*T + bU)*xu + k, b - baseline parameters (N or U state) + xn, xu - fraciton native/unfolded, xn + xu = 1 + in other words: + Signal(T) = kU*T + bU + (kN*T + bN - kU*T - bU) * xn. + """ + # step 1: numerically integrate agg_ode for given parameters - gives xn(T) + ivp_result = solve_ivp( + self.ode, + t_span=[min(t), max(t)], + t_eval=t, + y0=[self.xn], + args=(Tf, Ea), + method="BDF", + ) + + # step 2: return the result of the signal + return kU * t + bU + (kN * t + bN - kU * t - bU) * ivp_result.y[0, :] + + def param_init(self, input_data=None): + if input_data is None: + # without input data it's hard to guess starting values + # but it seems that starting with a high Tf may help + # kN, bN, kU, bU, Tf, Ea + return (0, 1, 0, 1, 400, 100000) + else: + # the baselines will have a better guess in MoltenProt + # since Tf may or may not coincide with the derivative peak + # it is taken as the middle of the temp range + return ( + 0, + 1, + 0, + 1, + min(input_data.index) + + (max(input_data.index) - min(input_data.index)) / 2.0, + 50000, + ) + + def param_bounds(self, input_data=None): + # NOTE it may happen that Tf is ouside the temperature range, but the curve is still OK + # also, MoltenProt calculations are always done in Kelvins + # thus, the default bounds are quite relaxed + if input_data is None: + return ( + (-np.inf, -np.inf, -np.inf, -np.inf, 1, 0), + (np.inf, np.inf, np.inf, np.inf, np.inf, np.inf), + ) + else: + return ( + (-np.inf, -np.inf, -np.inf, -np.inf, min(input_data.index) - 100, 0), + (np.inf, np.inf, np.inf, np.inf, max(input_data.index) + 100, np.inf), + ) + + +""" +NOTE given the current computational workload even of a simple kinetic equiation implementation of more complex +equations makes little sense + +class IrreversibleThreeState(IrreversibleTwoState): + short_name = 'irrev' + _description = "N -> I -> U" +""" + +# TODO another (potentially useful) variant could be to implement N <-kF,kR-> U without any A +# the class can be called ReversibleTwoState, because there are only U and N, and U can sometimes become N + + +class LumryEyring(IrreversibleTwoState): + short_name = "lumry_eyring" + _description = "N <- kF,kR -> U -> A" + + # the kF/kR at std temperature; take as -log10 to have higher values for higher stability + sortby = "pk_ratio_std" # type: ignore[assignment] + + # fmt: off + # ~_~ + z0 = (0,0) # starting values for system of 2x ode + # /,www,\ + # ||wWw|| an owl?! + # \|||/ + # ~~~m'm~~~ + # fmt: on + def __init__(self, scan_rate, tfea=[None, None]) -> None: + # scan rate is an essential parameter and must thus be set explicitly + if scan_rate is not None: + self.scan_rate = scan_rate + else: + msg = f"{self.short_name} model requires scan_rate to be set" + raise ValueError( + msg, + ) + + # tfea are characteristics of the irreversible aggregation reaction (Tf and Ea) + # and are obtained separately; they are fixed during fitting + self.tfea = tfea + + def set_fixed(self, tfea): + """Set parameters that are needed in the fit equation, but not being fit + tfea must be a list with two values [Tf, Ea] (floats) + NOTE only the length of the input is checked, but not the type of list elements. + """ + if len(tfea) != 2: + msg = "LE model requires tfea to be a list of two floats" + raise ValueError(msg) + self.tfea = tfea + + def ode( + self, + t, + z0, + TfF, + EaF, + TfR, + EaR, + Tf2, + Ea2, + ): + """A function to process a system of differential equations x and y packaged into array z = [x,y] + Implements a general case of Lumry-Eyring equation (case D in Mazurenko2017) + N <- kF, kR -> U - k2 -> A + there is an quasi-equilibrium between N and U, but irreversible conversion to A + As described in the paper: + + fN + fU + fA = 1 + dfAdT = 1/v * k2(T) * fU(T) ...................................... (eq. y) + dfUdT = 1/v* ( kF(T)*fN - (kR(T)+k2(T))*fU ) + + since fN = 1 - fU - fA, can rewrite dfUdT like this: + + dfUdT = 1/v * ( kF(T) * (1 - fU - fA) - ( kR(T)+k2(T) )*fU ) ..... (eq. x) + + the formula for rate in reaction i would then be: + ki(T) = exp(-Eai/R * (1/T - 1/Tfi)) + i can be F or R for N<->U and 2 for U->A + + t - temperature scale (converted to kinetic time using v, the scan rate (global constant) + z0 - starting values for equations in the system [fU=0, fA=0] + """ + # unpack initial values + x0, y0 = z0 + + dxdt = ( + 1 + / self.scan_rate + * ( + self.arrhenius(t, TfF, EaF) * (1 - x0 - y0) + - (self.arrhenius(t, TfR, EaR) + self.arrhenius(t, Tf2, Ea2)) * x0 + ) + ) + dydt = 1 / self.scan_rate * self.arrhenius(t, Tf2, Ea2) * x0 + + return (dxdt, dydt) + + # def fun(self, T, TfF, EaF, TfR, EaR, kNF, bNF, kUF, kAF, bAF): + def fun(self, T, kN, bN, kU, bU, kI, TfF, EaF, TfR, EaR): + """Uses pre-computed Tf and Ea for scattering data to model fluorescence signal + NOTE the original order and naming of the parameters is as follows: + T, TfF, EaF, TfR, EaR, kNF, bNF, kUF, kAF, bAF + T - temperature/time + TfF - temperature at which the rate constant kF=1 (reaction N->U) + EaF - activation energy for N->U + TfR,EaR - Tf and Ea for reaction U->N + kNF, bNF - slope and intercept for the baseline of state N + kUF - slope for the fluorescence of state U (assumed to be short-lived and not abundant, see Bedouelle2016) + kAF, bAF - slope/intercept for the baseline of state A, which ultimately makes up the post-transition baseline. + + For MoltenProt to recognize pre- and post- baseline parameters the have to be placed first + and renamed as follows: + kNF, bNF - kN, bN + kAF, bAF - kU, bU + kUF - kI (similar to three-state cases above) + + NOTE this is not the only way to define the law of signal; for instance, we can assume + that the fluorescence has a similar time dependence for states U and A; then kI (aka kUF) is not needed + and the law of signal will be: + (kN * T + bN) * fN + (kU*T + bU) * (fA + fU) + """ + ivp_result = solve_ivp( + self.ode, + t_span=[min(T), max(T)], + y0=self.z0, + args=(TfF, EaF, TfR, EaR, self.tfea[0], self.tfea[1]), + t_eval=T, + method="BDF", + ) + + # based on diff eqn compute fractions of each state + fU = ivp_result.y[0, :] + fA = ivp_result.y[1, :] + fN = 1 - fU - fA + # return modelled fluorescence signal + return (kN * T + bN) * fN + kI * fU + (kU * T + bU) * fA + + def param_init(self, input_data=None): + if input_data is None: + # without input data it's hard to guess starting values + # kN, bN, kU, bU, kI, TfF, EaF, TfR, EaR + return (0, 1, 0, 1, 0, 400, 100000, 400, 100000) + else: + # the baselines will have a better guess in MoltenProt + # Tf is probably not similar to Tm, however, it makes sense to try the middle of the range + temp_range = max(input_data.index) - min(input_data.index) + return ( + 0, + 1, + 0, + 1, + 0, + min(input_data.index) + temp_range / 2.0, + 50000, + min(input_data.index) + temp_range / 2.0, + 50000, + ) + + def param_bounds(self, input_data=None): + # NOTE it may happen that Tf is ouside the temperature range, but the curve is still OK + # also, MoltenProt calculations are always done in Kelvins + # thus, the default bounds are quite relaxed + # NOTE Tf should not be zero, see the Arrhenius formula + if input_data is None: + return ((-np.inf, -np.inf, -np.inf, -np.inf, -np.inf, 1, 0, 1, 0), np.inf) + else: + return ( + ( + -np.inf, + -np.inf, + -np.inf, + -np.inf, + -np.inf, + min(input_data.index) - 100, + 0, + min(input_data.index) - 100, + 0, + ), + ( + np.inf, + np.inf, + np.inf, + np.inf, + np.inf, + max(input_data.index) + 100, + np.inf, + max(input_data.index) + 100, + np.inf, + ), + ) From 08097f3a134f907a236170883c1f6f35028b73ed Mon Sep 17 00:00:00 2001 From: Najib Ishaq Date: Mon, 6 May 2024 07:51:21 -0400 Subject: [PATCH 02/18] chore: added ict.yml file --- .../rt-cetsa-moltprot-tool/.bumpversion.cfg | 2 + regression/rt-cetsa-moltprot-tool/README.md | 12 ++--- regression/rt-cetsa-moltprot-tool/ict.yml | 51 +++++++++++++++++++ regression/rt-cetsa-moltprot-tool/plugin.json | 8 +-- .../rt-cetsa-moltprot-tool/run-plugin.sh | 4 +- 5 files changed, 65 insertions(+), 12 deletions(-) create mode 100644 regression/rt-cetsa-moltprot-tool/ict.yml diff --git a/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg b/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg index d858009..5c3e4be 100644 --- a/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg +++ b/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg @@ -27,3 +27,5 @@ replace = version = "{new_version}" [bumpversion:file:README.md] [bumpversion:file:src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py] + +[bumpversion:file:ict.yml] diff --git a/regression/rt-cetsa-moltprot-tool/README.md b/regression/rt-cetsa-moltprot-tool/README.md index 24a28c6..3d85f45 100644 --- a/regression/rt-cetsa-moltprot-tool/README.md +++ b/regression/rt-cetsa-moltprot-tool/README.md @@ -16,9 +16,9 @@ If WIPP is running, navigate to the plugins page and add a new plugin. Paste the This plugin takes eight input argument and one output argument: -| Name | Description | I/O | Type | -|--------------------|------------------------------------------------------------|--------|---------------| -| `--inpDir` | Input data collection to be processed by this plugin | Input | genericData | -| `--filePattern` | Pattern to parse tabular files | Input | string | -| `--outDir` | Output file | Output | genericData | -| `--preview` | Generate JSON file with outputs | Output | JSON | +| Name | Description | I/O | Type | +|-------------|----------------------------------------------------|--------|-------------| +| `--inpDir` | Input data collection to be processed by this tool | Input | genericData | +| `--pattern` | Pattern to parse input files | Input | string | +| `--outDir` | Output file | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/regression/rt-cetsa-moltprot-tool/ict.yml b/regression/rt-cetsa-moltprot-tool/ict.yml new file mode 100644 index 0000000..bf5c3fe --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/ict.yml @@ -0,0 +1,51 @@ +author: +- Nick Schaub +- Najib Ishaq +contact: nick.schaub@nih.gov +container: polusai/rt-cetsa-moltprot-tool:0.1.0 +description: Run regression analysis for the RT-CETSA pipeline. +entrypoint: python3 -m polus.tabular.regression.rt_cetsa_moltprot +inputs: +- description: Input data collection to be processed by this tool + format: + - inpDir + name: inpDir + required: true + type: path +- description: Filepattern to parse input files + format: + - pattern + name: pattern + required: false + type: string +- description: Generate an output preview. + format: + - preview + name: preview + required: false + type: boolean +name: polusai/RTCETSAMoltprot +outputs: +- description: Output collection + format: + - outDir + name: outDir + required: true + type: path +repository: https://github.com/PolusAI/tabular-tools +specVersion: 1.0.0 +title: RT-CETSA Moltprot +ui: +- description: Input data collection + key: inputs.inpDir + title: Input data collection + type: path +- description: Filepattern to parse input files + key: inputs.pattern + title: pattern + type: text +- description: Generate an output preview. + key: inputs.preview + title: Preview example output of this plugin + type: checkbox +version: 0.1.0 diff --git a/regression/rt-cetsa-moltprot-tool/plugin.json b/regression/rt-cetsa-moltprot-tool/plugin.json index e48b1d0..fc1f91d 100644 --- a/regression/rt-cetsa-moltprot-tool/plugin.json +++ b/regression/rt-cetsa-moltprot-tool/plugin.json @@ -18,11 +18,11 @@ { "name": "inpDir", "type": "genericData", - "description": "Input data collection to be processed by this plugin", + "description": "Input data collection to be processed by this tool", "required": true }, { - "name": "filePattern", + "name": "pattern", "type": "string", "description": "Pattern to parse input files", "default": ".+", @@ -43,8 +43,8 @@ "description": "Input data collection to be processed by this plugin" }, { - "key": "inputs.filePattern", - "title": "filePattern", + "key": "inputs.pattern", + "title": "pattern", "description": "Pattern to parse input files", "default": ".+" } diff --git a/regression/rt-cetsa-moltprot-tool/run-plugin.sh b/regression/rt-cetsa-moltprot-tool/run-plugin.sh index bea8c33..97d7005 100644 --- a/regression/rt-cetsa-moltprot-tool/run-plugin.sh +++ b/regression/rt-cetsa-moltprot-tool/run-plugin.sh @@ -4,7 +4,7 @@ datapath=$(readlink --canonicalize data) # Inputs inpDir=/data/input -filePattern=".*" +pattern=".*" # Output paths outDir=/data/output @@ -16,5 +16,5 @@ docker run --mount type=bind,source=${datapath},target=/data/ \ --env POLUS_LOG=${LOGLEVEL} \ polusai/rt-cetsa-moltprot-tool:${version} \ --inpDir ${inpDir} \ - --filePattern ${filePattern} \ + --pattern ${pattern} \ --outDir ${outDir} From ec94a098eab4d766a57dc7b7971c8ba23e850a56 Mon Sep 17 00:00:00 2001 From: agerardin Date: Mon, 13 May 2024 11:02:59 -0400 Subject: [PATCH 03/18] fix: moltenprot tool. --- regression/rt-cetsa-moltprot-tool/README.md | 12 ++--- regression/rt-cetsa-moltprot-tool/plugin.json | 10 ++-- .../regression/rt_cetsa_moltprot/__init__.py | 8 +-- .../regression/rt_cetsa_moltprot/__main__.py | 49 ++++++++++++++++--- 4 files changed, 55 insertions(+), 24 deletions(-) diff --git a/regression/rt-cetsa-moltprot-tool/README.md b/regression/rt-cetsa-moltprot-tool/README.md index 3d85f45..7038d99 100644 --- a/regression/rt-cetsa-moltprot-tool/README.md +++ b/regression/rt-cetsa-moltprot-tool/README.md @@ -16,9 +16,9 @@ If WIPP is running, navigate to the plugins page and add a new plugin. Paste the This plugin takes eight input argument and one output argument: -| Name | Description | I/O | Type | -|-------------|----------------------------------------------------|--------|-------------| -| `--inpDir` | Input data collection to be processed by this tool | Input | genericData | -| `--pattern` | Pattern to parse input files | Input | string | -| `--outDir` | Output file | Output | genericData | -| `--preview` | Generate JSON file with outputs | Output | JSON | +| Name | Description | I/O | Type | +|-----------------|----------------------------------------------------|--------|-------------| +| `--inpDir` | Input data collection to be processed by this tool | Input | genericData | +| `--filePattern` | File Pattern to parse input files | Input | string | +| `--outDir` | Output file | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/regression/rt-cetsa-moltprot-tool/plugin.json b/regression/rt-cetsa-moltprot-tool/plugin.json index fc1f91d..92dcd14 100644 --- a/regression/rt-cetsa-moltprot-tool/plugin.json +++ b/regression/rt-cetsa-moltprot-tool/plugin.json @@ -22,9 +22,9 @@ "required": true }, { - "name": "pattern", + "name": "filePattern", "type": "string", - "description": "Pattern to parse input files", + "description": "File Pattern to parse input files", "default": ".+", "required": false } @@ -43,9 +43,9 @@ "description": "Input data collection to be processed by this plugin" }, { - "key": "inputs.pattern", - "title": "pattern", - "description": "Pattern to parse input files", + "key": "inputs.filePattern", + "title": "File Pattern", + "description": "File Pattern to parse input files", "default": ".+" } ] diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py index 6d3ec01..b7d7e48 100644 --- a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py +++ b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py @@ -39,10 +39,4 @@ def fit_data(file_path: pathlib.Path) -> pandas.DataFrame: fit.PrepareData() fit.ProcessData() - return fit.plate_results.sort_values("BS_factor") - - -def gen_out_path(file_path: pathlib.Path, out_dir: pathlib.Path) -> pathlib.Path: - """Generate the output path.""" - file_name = file_path.stem + "_moltprot" + POLUS_TAB_EXT - return out_dir / file_name + return fit diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py index 3c43a9f..4a36b08 100644 --- a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py +++ b/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py @@ -8,7 +8,6 @@ import filepattern import typer from polus.tabular.regression.rt_cetsa_moltprot import fit_data -from polus.tabular.regression.rt_cetsa_moltprot import gen_out_path # Initialize the logger logging.basicConfig( @@ -18,6 +17,8 @@ logger = logging.getLogger("polus.tabular.transforms.tabular_merger") logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) +POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") + app = typer.Typer() @@ -25,6 +26,7 @@ def main( inp_dir: pathlib.Path = typer.Option( ..., + "--inpDir", help="Input directory containing the data files.", exists=True, dir_okay=True, @@ -33,14 +35,17 @@ def main( ), pattern: str = typer.Option( ".+", + "--filePattern", help="Pattern to match the files in the input directory.", ), preview: bool = typer.Option( False, + "--preview", help="Preview the files that will be processed.", ), out_dir: pathlib.Path = typer.Option( ..., + "--outDir", help="Output directory to save the results.", exists=True, dir_okay=True, @@ -56,20 +61,52 @@ def main( logger.info(f"File Pattern: {pattern}") logger.info(f"Output directory: {out_dir}") + if POLUS_TAB_EXT != ".csv": + msg = "this tool can currently only process csv files." + raise ValueError(msg) + fp = filepattern.FilePattern(inp_dir, pattern) inp_files = [f[1][0] for f in fp()] + for f in inp_files: + if not f.suffix == POLUS_TAB_EXT: + raise ValueError( + f"this tool can only process {POLUS_TAB_EXT} files. Got {f}", + ) + if preview: - out_json = {"files": [gen_out_path(f, out_dir) for f in inp_files]} + outputs: list[str] = [] + for f in inp_files: + fit_params_path = f.stem + "_moltenprot_params" + POLUS_TAB_EXT + fit_curves_path = f.stem + "_moltenprot_curves" + POLUS_TAB_EXT + outputs = [*outputs, fit_params_path, fit_curves_path] + out_json = {"files": outputs} with (out_dir / "preview.json").open("w") as f: json.dump(out_json, f, indent=2) return for f in inp_files: - logger.info(f"Processing file: {f}") - out_path = gen_out_path(f, out_dir) - df = fit_data(f) - df.to_csv(out_path, index=True) + logger.info(f"Processing plate timeserie: {f}") + fit = fit_data(f) + fit_params_path = out_dir / (f.stem + "_moltenprot_params" + POLUS_TAB_EXT) + fit_curves_path = out_dir / (f.stem + "_moltenprot_curves" + POLUS_TAB_EXT) + + # sort fit_params by row/column + fit_params = fit.plate_results + fit_params["_index"] = fit_params.index + fit_params["letter"] = fit_params.apply(lambda row: row._index[:1], axis=1) + fit_params["number"] = fit_params.apply( + lambda row: row._index[1:], + axis=1, + ).astype(int) + fit_params = fit_params.drop(columns="_index") + fit_params = fit_params.sort_values(["letter", "number"]) + fit_params.to_csv(fit_params_path, index=True) + + # keep only 2 signicant digits for temperature index + fit_curves = fit.plate_raw_corr + fit_curves.index = fit_curves.index.map(lambda t: round(t, 2)) + fit_curves.to_csv(fit_curves_path, index=True) if __name__ == "__main__": From c0a0bad2a76742aada39cc8c18880a64752e3f85 Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 14 May 2024 03:16:03 -0400 Subject: [PATCH 04/18] feat: containerized and cwl-able --- .../rt-cetsa-moltprot-tool/.bumpversion.cfg | 12 ++++--- regression/rt-cetsa-moltprot-tool/Dockerfile | 6 ++-- regression/rt-cetsa-moltprot-tool/README.md | 5 ++- .../rt-cetsa-moltprot-tool/build-docker.sh | 4 +++ regression/rt-cetsa-moltprot-tool/ict.yml | 3 +- regression/rt-cetsa-moltprot-tool/plugin.json | 6 ++-- .../rt-cetsa-moltprot-tool/pyproject.toml | 3 +- .../rt_cetsa_moltenprot.cwl | 32 +++++++++++++++++++ .../rt-cetsa-moltprot-tool/run-plugin.sh | 6 ++-- .../{VERSION => RT_CETSA_VERSION} | 0 .../regression/rt_cetsa_moltprot/__init__.py | 5 ++- .../regression/rt_cetsa_moltprot/__main__.py | 2 +- .../regression/rt_cetsa_moltprot/core.py | 2 +- 13 files changed, 64 insertions(+), 22 deletions(-) create mode 100755 regression/rt-cetsa-moltprot-tool/build-docker.sh create mode 100644 regression/rt-cetsa-moltprot-tool/rt_cetsa_moltenprot.cwl mode change 100644 => 100755 regression/rt-cetsa-moltprot-tool/run-plugin.sh rename regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/{VERSION => RT_CETSA_VERSION} (100%) diff --git a/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg b/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg index 5c3e4be..b3d8c10 100644 --- a/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg +++ b/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg @@ -16,16 +16,18 @@ values = [bumpversion:part:dev] +[bumpversion:file:VERSION] + [bumpversion:file:pyproject.toml] search = version = "{current_version}" replace = version = "{new_version}" -[bumpversion:file:plugin.json] - -[bumpversion:file:VERSION] - [bumpversion:file:README.md] -[bumpversion:file:src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py] +[bumpversion:file:src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py] + +[bumpversion:file:plugin.json] [bumpversion:file:ict.yml] + +[bumpversion:file:rt_cetsa_moltenprot.cwl] diff --git a/regression/rt-cetsa-moltprot-tool/Dockerfile b/regression/rt-cetsa-moltprot-tool/Dockerfile index dd390da..db91d4c 100755 --- a/regression/rt-cetsa-moltprot-tool/Dockerfile +++ b/regression/rt-cetsa-moltprot-tool/Dockerfile @@ -1,9 +1,9 @@ -FROM polusai/bfio:2.3.6 +FROM polusai/bfio:2.1.9 # environment variables defined in polusai/bfio ENV EXEC_DIR="/opt/executables" ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".arrow" +ENV POLUS_TAB_EXT=".csv" ENV POLUS_LOG="INFO" # Work directory defined in the base container @@ -16,5 +16,5 @@ COPY src ${EXEC_DIR}/src RUN pip3 install ${EXEC_DIR} --no-cache-dir -ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_moltprot"] +ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_moltenprot"] CMD ["--help"] diff --git a/regression/rt-cetsa-moltprot-tool/README.md b/regression/rt-cetsa-moltprot-tool/README.md index 7038d99..50ad929 100644 --- a/regression/rt-cetsa-moltprot-tool/README.md +++ b/regression/rt-cetsa-moltprot-tool/README.md @@ -1,7 +1,6 @@ -# RT_CETSA Moltprot Regression (v0.1.0) +# RT_CETSA MoltenProt (v0.1.0) -This WIPP plugin runs regression analysis for the RT-CETSA pipeline. -The input csv file should be sorted by `Temperature` column. +This WIPP plugin runs moltenprot regression for the RT-CETSA pipeline. ## Building diff --git a/regression/rt-cetsa-moltprot-tool/build-docker.sh b/regression/rt-cetsa-moltprot-tool/build-docker.sh new file mode 100755 index 0000000..80539db --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Antoine Gerardin ", "Najib Ishaq ", ] readme = "README.md" diff --git a/regression/rt-cetsa-moltprot-tool/rt_cetsa_moltenprot.cwl b/regression/rt-cetsa-moltprot-tool/rt_cetsa_moltenprot.cwl new file mode 100644 index 0000000..ee45fd6 --- /dev/null +++ b/regression/rt-cetsa-moltprot-tool/rt_cetsa_moltenprot.cwl @@ -0,0 +1,32 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + filePattern: + inputBinding: + prefix: --filePattern + type: string? + preview: + inputBinding: + prefix: --preview + type: boolean? + outDir: + inputBinding: + prefix: --outDir + type: Directory +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/rt-cetsa-moltenprot-tool:0.1.0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/regression/rt-cetsa-moltprot-tool/run-plugin.sh b/regression/rt-cetsa-moltprot-tool/run-plugin.sh old mode 100644 new mode 100755 index 97d7005..73c9245 --- a/regression/rt-cetsa-moltprot-tool/run-plugin.sh +++ b/regression/rt-cetsa-moltprot-tool/run-plugin.sh @@ -1,6 +1,6 @@ #!/bin/bash version=$( Date: Tue, 14 May 2024 05:57:29 -0400 Subject: [PATCH 05/18] feat: add rt_cetsa analysis. --- .../rt-cetsa-analysis-tool/.bumpversion.cfg | 33 ++++++++ regression/rt-cetsa-analysis-tool/Dockerfile | 20 +++++ regression/rt-cetsa-analysis-tool/README.md | 23 ++++++ regression/rt-cetsa-analysis-tool/VERSION | 1 + .../rt-cetsa-analysis-tool/build-docker.sh | 4 + regression/rt-cetsa-analysis-tool/ict.yml | 52 +++++++++++++ regression/rt-cetsa-analysis-tool/plugin.json | 52 +++++++++++++ .../rt-cetsa-analysis-tool/pyproject.toml | 77 +++++++++++++++++++ .../rt_cetsa_analysis.cwl | 32 ++++++++ .../rt-cetsa-analysis-tool/run-plugin.sh | 20 +++++ 10 files changed, 314 insertions(+) create mode 100644 regression/rt-cetsa-analysis-tool/.bumpversion.cfg create mode 100755 regression/rt-cetsa-analysis-tool/Dockerfile create mode 100644 regression/rt-cetsa-analysis-tool/README.md create mode 100644 regression/rt-cetsa-analysis-tool/VERSION create mode 100755 regression/rt-cetsa-analysis-tool/build-docker.sh create mode 100644 regression/rt-cetsa-analysis-tool/ict.yml create mode 100644 regression/rt-cetsa-analysis-tool/plugin.json create mode 100644 regression/rt-cetsa-analysis-tool/pyproject.toml create mode 100644 regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl create mode 100755 regression/rt-cetsa-analysis-tool/run-plugin.sh diff --git a/regression/rt-cetsa-analysis-tool/.bumpversion.cfg b/regression/rt-cetsa-analysis-tool/.bumpversion.cfg new file mode 100644 index 0000000..3bc5c4c --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/.bumpversion.cfg @@ -0,0 +1,33 @@ +[bumpversion] +current_version = 0.1.0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:VERSION] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:README.md] + +[bumpversion:file:src/polus/tabular/regression/rt_cetsa_analysis/__init__.py] + +[bumpversion:file:plugin.json] + +[bumpversion:file:ict.yml] + +[bumpversion:file:rt_cetsa_analysis.cwl] diff --git a/regression/rt-cetsa-analysis-tool/Dockerfile b/regression/rt-cetsa-analysis-tool/Dockerfile new file mode 100755 index 0000000..0e29286 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/Dockerfile @@ -0,0 +1,20 @@ +FROM polusai/bfio:2.1.9 + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] +CMD ["--help"] diff --git a/regression/rt-cetsa-analysis-tool/README.md b/regression/rt-cetsa-analysis-tool/README.md new file mode 100644 index 0000000..acde158 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/README.md @@ -0,0 +1,23 @@ +# RT_CETSA Analysis Tool (v0.1.0) + +This WIPP plugin runs statistical analysis for the RT-CETSA pipeline. + +## Building + +To build the Docker image for the conversion plugin, run +`./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the contents of `plugin.json` into the pop-up window and submit. + +## Options + +This plugin takes eight input argument and one output argument: + +| Name | Description | I/O | Type | +|-----------------|----------------------------------------------------|--------|-------------| +| `--inpDir` | Input data collection to be processed by this tool | Input | genericData | +| `--filePattern` | File Pattern to parse input files | Input | string | +| `--outDir` | Output file | Output | genericData | +| `--preview` | Generate JSON file with outputs | Output | JSON | diff --git a/regression/rt-cetsa-analysis-tool/VERSION b/regression/rt-cetsa-analysis-tool/VERSION new file mode 100644 index 0000000..6e8bf73 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/regression/rt-cetsa-analysis-tool/build-docker.sh b/regression/rt-cetsa-analysis-tool/build-docker.sh new file mode 100755 index 0000000..93d9b45 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Antoine Gerardin ", + "Najib Ishaq ", +] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +filepattern = "^2.0.5" +pandas = "^2.2.2" +matplotlib = "^3.8.4" +scipy = "^1.13.0" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pre-commit = "^3.1.0" +pytest = "^7.2.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.ruff] +extend = "../../ruff.toml" +extend-ignore = [ + "RET505", # Unnecessary `else` after `return` statement + "E501", # Line too long + "ANN001", # Missing type annotation + "D102", # Missing docstring in public method + "ANN201", # Missing return type annotation + "N806", # Variable in function should be lowercase + "D205", # 1 blank line required between summary line and description + "N803", # Argument name should be lowercase + "PLR0913", # Too many arguments + "D415", # First line should end with a period, question mark, or exclamation point + "PLR2004", # Magic value used in comparison + "B006", # Do not use mutable default arguments + "D107", # Missing docstring + "D101", # Missing docstring + "E731", # Do not assign a lambda expression, use a def + "E402", # Module level import not at top of file + "PTH123", # `open()` should be replaced with `Path.open()` + "PTH118", # `os.path.join()` should be replaced with `/` operator + "PTH100", # `os.path.abspath()` should be replaced with `Path.resolve()` + "PLR0915", # Too many statements + "PLR0912", # Too many branches + "C901", # Function is too complex + "T201", # `print` used + "E722", # Do not use bare 'except' + "B904", # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling + "ANN202", # Missing return type annotation for private function + "ARG002", # Unused method argument + "N802", # Function name should be lowercase + "PTH103", # `os.makedirs()` should be replaced with `Path.mkdir(parents=True)` + "ANN003", # Missing type annotation for `**kwargs` + "B007", # Loop control variable not used within the loop body + "ANN204", # Missing return type annotation for magic method + "D417", # Missing argument descriptions in the docstring + "ANN205", # Missing return type annotation for static method + "PLR5501", # Use `elif` instead of `else` following `if` condition to avoid unnecessary indentation + "EM102", # Exception must not use an f-string literal + "D414", # Section has no content + "RUF012", # Mutable class attributes should be annotated with `typing.ClassVar` + "A001", # Variable `input` is shadowing a Python builtin + "A002", # Argument `input` is shadowing a Python builtin + "E741", # Ambiguous variable name: `l` + "PTH120", # `os.path.dirname()` should be replaced by `Path.parent` + "N816", # Variable `cfFilename` in global scope should not be mixedCase + "PTH109", # `os.getcwd()` should be replaced by `Path.cwd()` +] diff --git a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl new file mode 100644 index 0000000..9bb5d18 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl @@ -0,0 +1,32 @@ +class: CommandLineTool +cwlVersion: v1.2 +inputs: + inpDir: + inputBinding: + prefix: --inpDir + type: Directory + filePattern: + inputBinding: + prefix: --filePattern + type: string? + preview: + inputBinding: + prefix: --preview + type: boolean? + outDir: + inputBinding: + prefix: --outDir + type: Directory +outputs: + outDir: + outputBinding: + glob: $(inputs.outDir.basename) + type: Directory +requirements: + DockerRequirement: + dockerPull: polusai/rt-cetsa-analysis-tool:0.1.0 + InitialWorkDirRequirement: + listing: + - entry: $(inputs.outDir) + writable: true + InlineJavascriptRequirement: {} diff --git a/regression/rt-cetsa-analysis-tool/run-plugin.sh b/regression/rt-cetsa-analysis-tool/run-plugin.sh new file mode 100755 index 0000000..c0aa711 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash +version=$( Date: Tue, 14 May 2024 09:20:01 -0400 Subject: [PATCH 06/18] feat: rt_Cetsa analysis tool, containerized. --- regression/rt-cetsa-analysis-tool/Dockerfile | 64 +- .../rt-cetsa-analysis-tool/build-docker.sh | 2 +- regression/rt-cetsa-analysis-tool/ict.yml | 42 +- regression/rt-cetsa-analysis-tool/plugin.json | 21 +- .../rt-cetsa-analysis-tool/pyproject.toml | 7 +- .../requirements/Requirements_1.R | 3 + .../requirements/Requirements_2.R | 1 + .../requirements/Requirements_3.R | 6 + .../requirements/Requirements_4.R | 1 + .../requirements/Requirements_5.R | 1 + .../requirements/Requirements_6.R | 1 + .../rt_cetsa_analysis.cwl | 16 +- .../rt-cetsa-analysis-tool/run-plugin.sh | 2 +- .../regression/rt_cetsa_analysis/__init__.py | 1 + .../regression/rt_cetsa_analysis/__main__.py | 96 + .../rt_cetsa_analysis/__main_filepattern__.py | 52 + .../regression/rt_cetsa_analysis/functions.R | 1559 +++++++++++++++++ .../regression/rt_cetsa_analysis/main.R | 59 + .../rt_cetsa_analysis/main_analysis.R | 92 + .../rt_cetsa_analysis/prepare_data.R | 113 ++ .../prepare_moltenprot_outputs.py | 1 + .../rt_cetsa_analysis/prepare_params.R | 78 + .../rt_cetsa_analysis/prepare_values.R | 64 + .../rt_cetsa_analysis/run_rscript.py | 42 + 24 files changed, 2277 insertions(+), 47 deletions(-) mode change 100755 => 100644 regression/rt-cetsa-analysis-tool/Dockerfile create mode 100644 regression/rt-cetsa-analysis-tool/requirements/Requirements_1.R create mode 100644 regression/rt-cetsa-analysis-tool/requirements/Requirements_2.R create mode 100644 regression/rt-cetsa-analysis-tool/requirements/Requirements_3.R create mode 100644 regression/rt-cetsa-analysis-tool/requirements/Requirements_4.R create mode 100644 regression/rt-cetsa-analysis-tool/requirements/Requirements_5.R create mode 100644 regression/rt-cetsa-analysis-tool/requirements/Requirements_6.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py diff --git a/regression/rt-cetsa-analysis-tool/Dockerfile b/regression/rt-cetsa-analysis-tool/Dockerfile old mode 100755 new mode 100644 index 0e29286..b41b225 --- a/regression/rt-cetsa-analysis-tool/Dockerfile +++ b/regression/rt-cetsa-analysis-tool/Dockerfile @@ -1,20 +1,60 @@ -FROM polusai/bfio:2.1.9 +FROM r-base:4.4.0 -# environment variables defined in polusai/bfio -ENV EXEC_DIR="/opt/executables" -ENV POLUS_IMG_EXT=".ome.tif" -ENV POLUS_TAB_EXT=".csv" -ENV POLUS_LOG="INFO" -# Work directory defined in the base container -WORKDIR ${EXEC_DIR} +ARG EXEC_DIR="/opt/executables" + +RUN apt-get update && apt-get install -y python3 + +#Create folders +RUN mkdir -p ${EXEC_DIR} + +# Install R packages external dependencies +RUN apt-get install -y libssl-dev \ + && apt-get install -y libcurl4-openssl-dev libfontconfig1-dev \ + && apt-get install -y libharfbuzz-dev libfribidi-dev \ + && apt-get install -y libxml2-dev \ + && apt-get install -y libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \ + && apt-get install -y cmake + +# Install required R packages +ADD requirements/Requirements_1.R ${EXEC_DIR}/Requirements_1.R +RUN Rscript ${EXEC_DIR}/Requirements_1.R + +ADD requirements/Requirements_2.R ${EXEC_DIR}/Requirements_2.R +RUN Rscript ${EXEC_DIR}/Requirements_2.R + +ADD requirements/Requirements_3.R ${EXEC_DIR}/Requirements_3.R +RUN Rscript ${EXEC_DIR}/Requirements_3.R + +RUN apt-get install -y --allow-downgrades libxcb-render0=1.15-1 libxcb-shm0=1.15-1 libxcb1=1.15-1 +RUN apt-get install -y libcairo2-dev + +ADD requirements/Requirements_4.R ${EXEC_DIR}/Requirements_4.R +RUN Rscript ${EXEC_DIR}/Requirements_4.R + +RUN apt-get install -y libgit2-dev + +ADD requirements/Requirements_6.R ${EXEC_DIR}/Requirements_6.R +RUN Rscript ${EXEC_DIR}/Requirements_6.R + +ADD requirements/Requirements_5.R ${EXEC_DIR}/Requirements_5.R +RUN Rscript ${EXEC_DIR}/Requirements_5.R COPY pyproject.toml ${EXEC_DIR} -COPY VERSION ${EXEC_DIR} COPY README.md ${EXEC_DIR} -COPY src ${EXEC_DIR}/src +RUN apt-get update && apt-get install -y python3 +RUN apt-get -y install python3-pip python3-venv -RUN pip3 install ${EXEC_DIR} --no-cache-dir +COPY src/ ${EXEC_DIR}/src + +RUN python3 -m venv ${EXEC_DIR}/.venv +# RUN chmod u+x ${EXEC_DIR}/.venv/bin/activate +# RUN ${EXEC_DIR}/.venv/bin/activate +RUN ${EXEC_DIR}/.venv/bin/pip3 install ${EXEC_DIR} --no-cache-dir + + +WORKDIR ${EXEC_DIR} -ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] +# CMD ["Rscript", "main.R"] +ENTRYPOINT ["/opt/executables/.venv/bin/python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] CMD ["--help"] diff --git a/regression/rt-cetsa-analysis-tool/build-docker.sh b/regression/rt-cetsa-analysis-tool/build-docker.sh index 93d9b45..aaecd7a 100755 --- a/regression/rt-cetsa-analysis-tool/build-docker.sh +++ b/regression/rt-cetsa-analysis-tool/build-docker.sh @@ -1,4 +1,4 @@ #!/bin/bash version=$( None: + """CLI for rt-cetsa-moltprot-tool.""" + # TODO: Add to docs that input csv file should be sorted by `Temperature` column. + logger.info("Starting the CLI for rt-cetsa-moltprot-tool.") + + logger.info(f"Input directory: {inp_dir}") + logger.info(f"params_pattern: {params_pattern}") + logger.info(f"values_pattern: {values_pattern}") + logger.info(f"platemap path: {platemap}") + logger.info(f"Output directory: {out_dir}") + + params = inp_dir / params_pattern + values = inp_dir / values_pattern + + if preview: + NotImplemented # noqa: B018 + + if not params.exists(): + raise FileNotFoundError(f"params file not found : {params}") + if not values.exists(): + raise FileNotFoundError(f"values file not found : {values}") + if not platemap.exists(): + raise FileNotFoundError(f"platemap file not found : {platemap}") + + run_rscript(params, values, platemap, out_dir) + + +if __name__ == "__main__": + app() diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py new file mode 100644 index 0000000..d150462 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py @@ -0,0 +1,52 @@ +"""Main with filepattern support.""" +# """CLI for rt-cetsa-moltprot-tool.""" + + +# # get env + +# # Initialize the logger +# logging.basicConfig( + + +# @app.command() +# def main( +# inp_dir: pathlib.Path = typer.Option( +# ..., +# "--inpDir", +# ), +# params_pattern: str = typer.Option( +# ".+", +# "--params", +# ), +# values_pattern: str = typer.Option( +# ".+", +# "--values", +# ), +# platemap_pattern: str = typer.Option( +# ".+", +# "--platemap", +# ), +# preview: bool = typer.Option( +# False, +# "--preview", +# ), +# out_dir: pathlib.Path = typer.Option( +# ..., +# "--outDir", +# ), +# ) -> None: +# """CLI for rt-cetsa-moltprot-tool.""" +# # TODO: Add to docs that input csv file should be sorted by `Temperature` column. + + +# if preview: +# with (out_dir / "preview.json").open("w") as f: + +# for params, values, platemap in zip(params_files, values_files, platemap_files): +# # TODO replace with exceptions +# if len(params[1]) != 1 or len(values[1]) != 1 or len(platemap[1]) != 1: +# raise Exception( +# msg, + + +# if __name__ == "__main__": diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R new file mode 100644 index 0000000..cb2be4d --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/functions.R @@ -0,0 +1,1559 @@ +# # @@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@ @@@@@@ +# # @@@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ +# # @@! @@@ @@! !@@ @@! @@! !@@ @@! @@@ +# # !@! @!@ !@! !@! !@! !@! !@! !@! @!@ +# # @!@!!@! @!! @!@!@!@!@ !@! @!!!:! @!! !!@@!! @!@!@!@! +# # !!@!@! !!! !!!@!@!!! !!! !!!!!: !!! !!@!!! !!!@!!!! +# # !!: :!! !!: :!! !!: !!: !:! !!: !!! +# # :!: !:! :!: :!: :!: :!: !:! :!: !:! +# # :: ::: :: ::: ::: :: :::: :: :::: :: :: ::: +# # Isothermal Analysis of CETSA/RT-CETSA Experimental Sets +# # +# # Plate assignment, data cleanup, and functions +# # Patents: PCT/US21/45184, HHS E-022-2022-0-US-01 + +print("####### loading all functions used in the analysis...") + +library(tidyverse) +library(readxl) +library(stringr) +library(drc) +library(ggthemes) +library(cowplot) +library(hrbrthemes) +library(ggpubr) +library(MESS) +library(devtools) +# load_all("."); +#' construct_grid +#' Construct a grid with compatable headers for MoltenProt file prep +#' +#' @param row_num Number of rows in microplate +#' @param col_num Number of columns in microplate +#' @param pad_num Add padding 0 to well address? +#' +#' @return df containing the grid +#' @export +#' +#' @examples +#' construct_grid(16,24) +#' construct_grid(32,48,TRUE) +construct_grid <- + function(row_num = 16, + col_num = 24, + pad_num = FALSE) { + if (pad_num == FALSE) { + grid <- + expand.grid(row = LETTERS[1:(row_num)], col = c(1:(col_num))) %>% + arrange(row) %>% + mutate(address = paste(row, col, sep = '')) %>% + dplyr::select(-c('row', 'col')) + } else { + letter <- LETTERS[1:(row_num)] + number <- c(1:(col_num)) + number <- str_pad(number, 2, pad = '0') + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), + col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + } + return(grid) + } + +#' prepMatLabforMolt +#' +#' Need to provide location of file, usually in ./data/, and sheet location +#' +#' @param file_loc Location of raw Matlab data file +#' @param sheet What sheet in the .xlsx is the data located +#' @param col_names Are there column names in the data sheet +#' @param start_temp Start temp for the experiment +#' @param end_temp End temp for the experiment +#' +#' @return df containing the raw values for the RT-CETSA experiment +#' @export +#' +#' @examples +#' prepMatLabforMolt(file_loc = './data/210318_plate3.xlsx', +#' start_temp = startTemp, +#' end_temp = endTemp) +prepMatLabforMolt <- function(file_loc = './data/rtcetsa_raw.xlsx', + sheet = 'Sheet1', + col_names = FALSE, + start_temp = 37, + end_temp = 90) { + if (file.exists(file_loc) == FALSE) { + stop('File does not exist at path supplied.') + } + df <- + read_excel( + path = file_loc, + sheet = sheet, + col_names = col_names, + .name_repair = 'unique' + ) + if (nrow(df) == 0 || ncol(df) == 0) { + stop('Imported file is empty. Please navigate to correct RT-CETSA file') + } + df <- df %>% + dplyr::select(-c('...1', '...2')) %>% + rownames_to_column() %>% + rename('well' = 'rowname') + + # Construct temperature index (t_n) and pivot around the data to tidy + tracker <- 1 + for (val in 2:ncol(df) - 1) { + names(df)[val + 1] <- paste('t_', val, sep = '') + tracker <- tracker + 1 + } + df <- df %>% + pivot_longer(., cols = 2:ncol(df)) %>% + pivot_wider(names_from = well) %>% + rename(., 'Temperature' = 'name') %>% + mutate(., Temperature = as.integer(gsub("[^0-9.]", "", Temperature))) + + #Create temperature index in line with experimental parameters supplied in main script + temperature_df <- + seq(start_temp, end_temp, by = ((end_temp - start_temp) / (nrow(df) - + 1))) %>% + round(., digits = 1) + for (i in 1:length(temperature_df)) + df$Temperature[i] <- temperature_df[i] + + # Assemble data for moltenprot analysis by splitting 384-well plate to 96-well plate with appropriate index + grid_96w <- construct_grid(row_num = 8, col_num = 12) + q1 <- df %>% + dplyr::select(., 1, 2:97) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q1)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + q2 <- df %>% + dplyr::select(., 1, 98:193) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q2)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + q3 <- df %>% + dplyr::select(., 1, 194:289) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q3)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + q4 <- df %>% + dplyr::select(., 1, 290:385) + tracker <- 1 + for (val in 1:nrow(grid_96w)) { + colnames(q4)[val + 1] <- grid_96w$address[val] + tracker <- tracker + 1 + } + write.csv(q1, './data/cleaned_expt1.csv', row.names = FALSE) + write.csv(q2, './data/cleaned_expt2.csv', row.names = FALSE) + write.csv(q3, './data/cleaned_expt3.csv', row.names = FALSE) + write.csv(q4, './data/cleaned_expt4.csv', row.names = FALSE) + + return(df) +} + +# Read in MoltenProt readout, with different column identities for different models +retrieveMoltenData <- + function(model = 'standard', + plate_format = 384) { + # Retrieve experimental data from processed file folders + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + if (model == 'standard') { + exp1_param <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp2_param <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp3_param <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp4_param <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + # Reformat ID column in each exp from MoltenProt format (A1, not A01) to arrange + exp1_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp1_param$ID, perl = TRUE) + exp1_param <- exp1_param %>% arrange(ID) + exp2_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp2_param$ID, perl = TRUE) + exp2_param <- exp2_param %>% arrange(ID) + exp3_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp3_param$ID, perl = TRUE) + exp3_param <- exp3_param %>% arrange(ID) + exp4_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp4_param$ID, perl = TRUE) + exp4_param <- exp4_param %>% arrange(ID) + # Combine all experiments and add identifiers + exp_param_full <- + exp1_param %>% rbind(., exp2_param, exp3_param, exp4_param) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + dplyr::select( + -c( + 'ID', + 'kN_init', + 'bN_init', + 'kU_init', + 'bU_init', + 'dHm_init', + 'Tm_init', + 'kN_fit', + 'bN_fit', + 'kU_fit', + 'bU_fit', + 'S', + 'dCp_component' + ) + ) %>% + bind_cols(col_by_row) %>% + relocate(c('row', 'col'), .after = well) %>% + dplyr::select(-'well') + exp_param_full <- well_assignment(exp_param_full, 384) + return(exp_param_full) + } + if (model == 'irrev') { + exp1_param <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp2_param <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp3_param <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + exp4_param <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Fit parameters') %>% + dplyr::select(-c('Condition')) + # Reformat ID column in each exp from MoltenProt format (A1, not A01) to arrange + exp1_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp1_param$ID, perl = TRUE) + exp1_param <- exp1_param %>% arrange(ID) + exp2_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp2_param$ID, perl = TRUE) + exp2_param <- exp2_param %>% arrange(ID) + exp3_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp3_param$ID, perl = TRUE) + exp3_param <- exp3_param %>% arrange(ID) + exp4_param$ID <- + gsub('([A-Z])(\\d)(?!\\d)', '\\10\\2\\3', exp4_param$ID, perl = TRUE) + exp4_param <- exp4_param %>% arrange(ID) + # Combine all experiments and add identifiers + exp_param_full <- + exp1_param %>% rbind(., exp2_param, exp3_param, exp4_param) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + dplyr::select(c( + 'well', + 'Ea_fit', + 'Tf_fit', + 'kN_fit', + 'bN_fit', + 'kU_fit', + 'bU_fit', + 'S' + )) %>% + bind_cols(col_by_row) %>% + relocate(c('row', 'col'), .after = well) %>% + dplyr::select(-'well') + exp_param_full <- well_assignment(exp_param_full, 384) + return(exp_param_full) + } + } + +# Gather base-line corrected fit curves for the 384-well plate and pivot plate +retrieve_FittedCurves <- + function(model = 'baseline-fit', + start_temp = 37, + end_temp = 90) { + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + if (model == 'baseline-fit') { + exp1_curve <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') + exp2_curve <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') %>% + dplyr::select(-c('Temperature')) + exp3_curve <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') %>% + dplyr::select(-c('Temperature')) + exp4_curve <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Baseline-corrected') %>% + dplyr::select(-c('Temperature')) + exp_curve_all <- + cbind( + xp1 = exp1_curve, + xp2 = exp2_curve, + xp3 = exp3_curve, + xp4 = exp4_curve + ) %>% + rename(., Temperature = xp1.Temperature) %>% + mutate(., Temperature = paste('val_t_', Temperature, sep = '')) + exp_curve_all <- exp_curve_all %>% + pivot_longer(cols = 2:ncol(exp_curve_all)) %>% + pivot_wider(names_from = Temperature) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + bind_cols(col_by_row) %>% + dplyr::select(-c('name', 'well', 'row', 'col')) %>% + add_tempheaders(., start_temp, end_temp) + message('Fit curves retrieved.') + return(exp_curve_all) + } + if (model == 'fit_curves') { + exp1_curve <- + read_excel('./data/cleaned_expt1/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') + exp2_curve <- + read_excel('./data/cleaned_expt2/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') %>% + dplyr::select(-c('Temperature')) + exp3_curve <- + read_excel('./data/cleaned_expt3/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') %>% + dplyr::select(-c('Temperature')) + exp4_curve <- + read_excel('./data/cleaned_expt4/Signal_resources/Signal_results.xlsx', + sheet = 'Fit curves') %>% + dplyr::select(-c('Temperature')) + exp_curve_all <- + cbind( + xp1 = exp1_curve, + xp2 = exp2_curve, + xp3 = exp3_curve, + xp4 = exp4_curve + ) %>% + rename(., Temperature = xp1.Temperature) %>% + mutate(., Temperature = paste('val_t_', Temperature, sep = '')) + exp_curve_all <- exp_curve_all %>% + pivot_longer(cols = 2:ncol(exp_curve_all)) %>% + pivot_wider(names_from = Temperature) %>% + rownames_to_column() %>% rename('well' = 'rowname') %>% + bind_cols(col_by_row) %>% + dplyr::select(-c('name', 'well', 'row', 'col')) %>% + add_tempheaders(., start_temp, end_temp) + message('Fit curves retrieved.') + return(exp_curve_all) + } + } + +# Construct full data frame with curve fit and parameters for analysis +bind_fulldf <- function(param_df, curve_df) { + df <- cbind(param_df, curve_df) + return(df) +} + +#Convert any columns containing Kelvin values from MoltenProt to Celsius +kelToCel <- function(df) { + df <- df %>% + mutate(Tm_fit = Tm_fit - 273.15) %>% + mutate(T_onset = T_onset - 273.15) +} +# # +# ISO-CETSA Functions (new) +# # + +# Add temperature headers to df +add_tempheaders <- function(df, + start_temp = 37, + end_temp = 90) { + temperature_df <- + seq(start_temp, end_temp, by = ((end_temp - start_temp) / (ncol(df) - 1))) %>% + round(., digits = 1) + for (i in 1:ncol(df)) { + colnames(df)[i] <- paste('t_', temperature_df[i], sep = '') + } + message('Temperature assignments changed for ', + ncol(df), + ' points.') + return(df) +} + +# Add row and column to a tidy dataframe (columns are each temperatures, rows are wells/conditions) +add_rowcol <- function(df, well_num) { + if (well_num == 96) { + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:8), col = sprintf('%.2d', 1:12)) %>% + arrange(., row) + } + else if (well_num == 384) { + col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), + col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + } + message('Row + Column assignments created for ', + well_num, + '-well plate') + df <- cbind(col_by_row, df) + return(df) +} + +# Add well assignmnets for each plate +well_assignment <- function(df, well_num) { + if (well_num == 96) { + letter <- LETTERS[1:8] + number <- c(1:12) + number <- str_pad(number, 2, pad = '0') + tracker <- 1 + temp_df <- tibble(well = c(1:384)) + for (val in letter) { + for (num in number) { + temp_df$well[tracker] <- paste(val, num, sep = '') + tracker <- tracker + 1 + } + } + } + else if (well_num == 384) { + letter <- LETTERS[1:16] + number <- c(1:24) + number <- str_pad(number, 2, pad = '0') + tracker <- 1 + temp_df <- tibble(well = c(1:384)) + for (val in letter) { + for (num in number) { + temp_df$well[tracker] <- paste(val, num, sep = '') + tracker <- tracker + 1 + } + } + } + message('Well assignments created for ', well_num, '-well plate.') + df <- cbind(temp_df, df) + return(df) +} + +# Assign compound ids and concentration from platemap +plate_assignment <- function(df, platemap_file) { + id_df <- read_excel(platemap_file, sheet = 'sample') %>% + dplyr::select(-1) %>% + pivot_longer(., cols = 1:ncol(.)) %>% + rename(ncgc_id = value) %>% + dplyr::select(-c('name')) + id_df$ncgc_id <- gsub('empty', 'vehicle', id_df$ncgc_id) + conc_df <- read_excel(platemap_file, sheet = 'conc') %>% + dplyr::select(-1) %>% + pivot_longer(., cols = 1:ncol(.)) %>% + rename(conc = value) %>% + dplyr::select(-c('name')) + df <- cbind(id_df, conc_df, df) + message('Plate assignment attached to dataframe.') + df$row <- as.numeric(df$row) + df$col <- as.numeric(df$col) + return(df) +} + +# Calculate AUC for each well +print("####### loading calculate_auc...") +calculate_auc <- function(df) { + #Retrieve temperatures to be used for AUC determination. + auc.df <- df %>% + dplyr::select(matches('t_\\d')) + + #Initialize the AUC column + df$auc <- NA + + # Pivot and clean each row for AUC model + for (i in 1:nrow(auc.df)) { + curveVals <- auc.df[i,] %>% + pivot_longer(cols = everything(), + names_to = 'temp', + values_to = 'response') + curveVals$temp <- curveVals$temp %>% + sub('t_', '', .) + curveVals$temp <- as.numeric(curveVals$temp) + df$auc[i] <- auc(x = curveVals$temp, y = curveVals$response) + } + message('AUC Values calculated for ', nrow(auc.df), ' wells.') + return(df) +} + +control_grouping <- function(df, control = 'DMSO', pc = 'control') { + control_df <- filter(df, ncgc_id == control | ncgc_id == pc) + if (nrow(control_df) == 0) { + message('No control wells found. Review control input to function.') + } else + if (nrow(control_df) > 0) { + control_df <- control_df %>% + dplyr::select(-'conc') + return(control_df) + } +} + +control_variability <- + function(df, nc = 'vehicle', pc = 'control') { + #Filter out positive and negative controls into their own df + nc.controls.df <- df %>% + filter(ncgc_id == nc) %>% + dplyr::select(-c('ncgc_id', 'well', 'row', 'col')) + pc.controls.df <- df %>% + filter(ncgc_id == pc) %>% + dplyr::select(-c('ncgc_id', 'well', 'row', 'col')) + + #Calculate means, sd, and %CV + nc.mean.df <- + apply(nc.controls.df[1:ncol(nc.controls.df)], 2, mean) + nc.sd.df <- apply(nc.controls.df[1:ncol(nc.controls.df)], 2, sd) + pc.mean.df <- + apply(pc.controls.df[1:ncol(pc.controls.df)], 2, mean) + pc.sd.df <- apply(pc.controls.df[1:ncol(pc.controls.df)], 2, sd) + + #Calculate %CV + nc.var.df <- tibble(nc.mean = nc.mean.df, nc.sd = nc.sd.df) %>% + mutate(nc.cv = (nc.sd / nc.mean) * 100) + pc.var.df <- tibble(pc.mean = pc.mean.df, pc.sd = pc.sd.df) %>% + mutate(pc.cv = (pc.sd / pc.mean) * 100) + analysis_method <- colnames(nc.controls.df) + var_df <- cbind(analysis_method, nc.var.df, pc.var.df) + message('Control group variability analyzed.') + return(var_df) + } + +# Returns thermogram with mean/sd of DMSO curve across temps +control_thermogram <- function(df, pcTm, ncTm) { + subset_df <- subset(df, grepl('t_', analysis_method)) %>% + mutate(temp = as.numeric(gsub('t_', '', analysis_method))) %>% + dplyr::select(-'analysis_method') + therm_plot <- ggplot(subset_df, aes(x = temp)) + + geom_line(aes(y = nc.mean), + size = 1.5, + alpha = 0.75, + color = '#88CCEE') + + geom_errorbar(aes(ymin = nc.mean - nc.sd, ymax = nc.mean + nc.sd), + size = 0.5, + width = 1) + + geom_point( + aes(y = nc.mean), + size = 3.25, + shape = 21, + color = 'black', + fill = '#88CCEE' + ) + + geom_line(aes(y = pc.mean), + size = 1.5, + alpha = 0.75, + color = '#882255') + + geom_errorbar(aes(ymin = pc.mean - pc.sd, ymax = pc.mean + pc.sd), + size = 0.5, + width = 1) + + geom_point( + aes(y = pc.mean), + size = 3.25, + shape = 21, + color = 'black', + fill = '#EE3377' + ) + + theme_minimal() + + labs(title = 'Control Thermograms', + x = 'Temperature [C]', + y = 'Fraction Unfolded') + print(therm_plot) + return(therm_plot) +} + +# Controls analysis and z' output for groups +# Possible outputs: +# output = 'plot': Cowplot of controls +# output = 'df': Control dataframe +control_analysis <- + function(df, + nc = 'vehicle', + pc = 'control', + output = '', + controlDF) { + controls.df <- df %>% + filter(ncgc_id == nc | ncgc_id == pc) + + #Calculate Z' from controls for each parameter + test_params <- + c('Tm_fit', + 'auc') + Tm.nc.mean <- + mean(controls.df$Tm_fit[controls.df$ncgc_id == nc]) + Tm.nc.sd <- sd(controls.df$Tm_fit[controls.df$ncgc_id == nc]) + Tm.pc.mean <- + mean(controls.df$Tm_fit[controls.df$ncgc_id == pc]) + Tm.pc.sd <- sd(controls.df$Tm_fit[controls.df$ncgc_id == pc]) + Tm.z <- + 1 - (((3 * Tm.pc.sd) + (3 * Tm.nc.sd)) / abs(Tm.pc.mean - Tm.nc.mean)) + + message('Z\' for Tm: ', signif(Tm.z)) + auc.nc.mean <- mean(controls.df$auc[controls.df$ncgc_id == nc]) + auc.nc.sd <- sd(controls.df$auc[controls.df$ncgc_id == nc]) + auc.pc.mean <- mean(controls.df$auc[controls.df$ncgc_id == pc]) + auc.pc.sd <- sd(controls.df$auc[controls.df$ncgc_id == pc]) + auc.z <- + 1 - (((3 * auc.pc.sd) + (3 * auc.nc.sd)) / abs(auc.pc.mean - auc.nc.mean)) + message('Z\' for AUC: ', signif(auc.z)) + + if (output == 'plot') { + Tm.plot <- + ggplot(controls.df, aes(x = ncgc_id, y = Tm_fit, fill = ncgc_id)) + + geom_boxplot(outlier.alpha = 0, size = 0.75) + + geom_jitter(shape = 21, size = 3) + + theme_minimal() + + scale_fill_hue() + + labs(title = 'Controls | Tagg', + subtitle = paste('Z\': ', signif(Tm.z), sep = '')) + + theme( + legend.position = 'none', + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold'), + axis.text.y = element_text(size = 10), + axis.title.y = element_text(size = 12, face = 'bold'), + plot.title = element_text(size = 12, face = 'bold') + ) + auc.plot <- + ggplot(controls.df, aes(x = ncgc_id, y = auc, fill = ncgc_id)) + + geom_boxplot(outlier.alpha = 0, size = 0.75) + + geom_jitter(shape = 21, size = 3) + + theme_minimal() + + scale_fill_hue() + + labs(title = 'Controls | AUC', + subtitle = paste('Z\': ', signif(auc.z), sep = '')) + + theme( + legend.position = 'none', + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold'), + axis.text.y = element_text(size = 10), + axis.title.y = element_text(size = 12, face = 'bold'), + plot.title = element_text(size = 12, face = 'bold') + ) + right.grid <- + plot_grid(Tm.plot, auc.plot, ncol = 1) + control.grid <- + plot_grid( + control_thermogram(controlDF, ncTm = Tm.nc.mean, pcTm = Tm.pc.mean), + right.grid, + ncol = 2, + nrow = 1 + ) + out <- paste(outdir,'controls.png',sep="/") + ggsave(out, dpi = 'retina', scale = 1.5) + return(control.grid) + } + if (output == 'df') { + means <- + c(Tm.nc.mean, + auc.nc.mean) + parameters <- c('Tm_fit', 'auc') + output.df <- tibble(parameters, means) + return(output.df) + } + } + + +# Dose-response curve fit with LL.4 log-logistic fit +# otrace = TRUE; output from optim method is displayed. Good for diagnostic +# trace = TRUE; trace from optim displayed +# robust fitting +# robust = 'lms': doesn't handle outlier/noisy data well +dr_fit <- function(df) { + try(expr = { + drm( + resp ~ conc, + data = df, + type = 'continuous', + fct = LL.4(), + control = drmc( + errorm = FALSE, + maxIt = 10000, + noMessage = TRUE, + ) + ) + }) +} + +# Deprecated for now... +# Perform drm on each compound at each temperature +dr_analysis <- + function(df, + control = 'DMSO', + export_label = '', + plot = TRUE) { + # Construct df from the unique compound ids (less control) with empty analysis parameters + model_df <- + tibble(compound = (unique(filter( + df, ncgc_id != control + )$ncgc_id))) %>% + filter(compound != 'control') + for (i in 6:ncol(df)) { + col.nm <- colnames(df)[i] + model_df[, col.nm] <- NA + } + + # Make a long df with the parameters (colnames above) + modelfit_df <- tibble(colnames(model_df)[2:ncol(model_df)]) + names(modelfit_df)[1] <- 'analysis' + + # Loop through each column in every row of modelfit_df and create a drm model for each and + # add statistics and readouts to a temp df that is bound to modelfit_df + for (i in 1:nrow(model_df)) { + # Create a working df with the raw data from compound[i] + temp_df <- + filter(df, df$ncgc_id == model_df$compound[(i)]) %>% + dplyr::select(-c('well', 'row', 'col')) + print(paste('Analyzing: ', model_df$compound[i]), sep = '') + + # This temp df will hold the statistics that we read out from each model, and is reset every time. + # Parameters to include: + # ec50: EC50 reading of the curve fit + # pval: curve fit pvalue + # noEffect: p-value of the noEffect test of the dose-response + # hill: LL4 parameter B + # ec50: LL4 parameter A + # lowerlim: LL4 parameter C + # upperlim: LL4 parameter D + temp_modelfit_df <- modelfit_df[1] %>% + mutate( + ec50 = 0, + noEffect = 0, + hill = 0, + lowerlim = 0, + upperlim = 0, + ec50 = 0 + ) + # Iterate through columns + for (n in 3:ncol(temp_df)) { + #Make df for drm model by selecting concentration and appropriate column + dr_df <- temp_df %>% dplyr::select(c(2, n)) + colnames(dr_df)[1] <- 'conc' + colnames(dr_df)[2] <- 'resp' + temp.model <- + drm( + resp ~ conc, + data = dr_df, + fct = LL.4(), + control = drmc( + errorm = FALSE, + maxIt = 500, + noMessage = TRUE + ) + ) + # Construct fitted curve for plotting + pred.fit <- + expand.grid(pr.x = exp(seq(log(min( + dr_df[1] + )), log(max( + dr_df[1] + )), length = 1000))) + # Seems necessary to make loop continue through curves that can't be fit... NEED TO STUDY + if ("convergence" %in% names(temp.model) == FALSE) { + pm <- + predict(object = temp.model, + newdata = pred.fit, + interval = 'confidence') + pred.fit$p <- pm[, 1] + pred.fit$pmin <- pm[, 2] + pred.fit$pmax <- pm[, 3] + # Plot out dose response curve if conditional met in function + if (plot == TRUE) { + dr_plot <- ggplot(dr_df, aes(x = conc, y = resp)) + + geom_line( + data = pred.fit, + aes(x = pr.x, y = p), + size = 1.5, + color = 'black' + ) + + geom_point( + size = 4, + shape = 21, + fill = 'orange', + color = 'black' + ) + + scale_x_log10() + + theme_cowplot() + + labs( + title = paste( + 'Analysis of ', + model_df$compound[i], + ' by ', + colnames(temp_df)[n], + sep = '' + ), + subtitle = paste( + 'EC50: ', + signif(temp.model$coefficients[4], 3), + ' nM', + '\n', + 'Significance of noEffect Test: ', + signif(noEffect(temp.model)[3], 3), + sep = '' + ), + x = 'Concentration' + ) + print(dr_plot) + + out <- paste(outdir,'dr_curves', export_label, sep="/") + png( + filename = paste( + out, + '_', + model_df$compound[i], + colnames(temp_df)[n], + '.png', + sep = '' + ), + width = 3200, + height = 1800, + res = 300 + ) + print(dr_plot) + dev.off() + } + print(n) + # Extract fit parameters for the dr model + temp_modelfit_df$ec50[(n - 2)] <- + signif(temp.model$coefficients[4], 3) + temp_modelfit_df$noEffect[(n - 2)] <- + signif(noEffect(temp.model)[3], 3) + temp_modelfit_df$hill[(n - 2)] <- + signif(temp.model$coefficients[1], 3) + temp_modelfit_df$lowerlim[(n - 2)] <- + signif(temp.model$coefficients[2], 3) + temp_modelfit_df$upperlim[(n - 2)] <- + signif(temp.model$coefficients[3], 3) + } + } + modelfit_df <- modelfit_df %>% cbind(., temp_modelfit_df[2:6]) + names(modelfit_df)[names(modelfit_df) == 'ec50'] <- + paste('ec50_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'noEffect'] <- + paste('noEffect_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'hill'] <- + paste('hill_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'lowerlim'] <- + paste('lowerlim_', model_df$compound[i], sep = '') + names(modelfit_df)[names(modelfit_df) == 'upperlim'] <- + paste('upperlim_', model_df$compound[i], sep = '') + } + return(modelfit_df) + } + + + +# Extract residual sum of squares for dmso columns +# Returns df with all dmso values ready for model fit +dmso_rss <- function(df, control = 'DMSO') { + df_rss <- df %>% + dplyr::select(starts_with("t_")) %>% + pivot_longer(cols = everything()) + colnames(df_rss)[1] <- 'conc' + colnames(df_rss)[2] <- 'resp' + df_rss$conc <- as.integer(gsub('t_', '', df_rss$conc)) + message('Fitting DMSO thermogram...') + rss_model <- dr_fit(df_rss) + rss_dmso <- sum(residuals(rss_model) ^ 2) + message('DMSO RSS: ', signif(rss_dmso, 6)) + plot( + rss_model, + type = 'all', + cex = 0.5, + main = paste('DMSO Thermogram Fit\n', 'RSS: ', signif(rss_dmso, 5), sep = + ''), + sub = paste('DMSO RSS: ', signif(rss_dmso, 5), sep = ''), + xlab = 'Temperature', + ylab = 'Fraction Unfolded', + ylim = c(-0.25, 1.25) + ) + return(df_rss) +} + +compare_models <- function(df, dmso.rss.df, plot = FALSE) { + temp_df <- df %>% dplyr::select(-one_of( + 'Ea_fit', + 'Tf_fit', + 'kN_fit', + 'bN_fit', + 'kU_fit', + 'bU_fit', + 'S' + )) %>% + filter(ncgc_id != 'DMSO' & ncgc_id != 'ignore') + rss_df <- temp_df %>% + dplyr::select(-starts_with('t_')) + rss_df$null.rss <- NA + rss_df$alt.rss <- NA + rss_df$rss.diff <- NA + + dmso.model <- dr_fit(dmso.rss.df) + dmso.rss <- sum(residuals(dmso.model) ^ 2) + for (i in 1:nrow(temp_df)) { + cmpnd.df <- temp_df[i, ] %>% + dplyr::select(starts_with('t_')) %>% + pivot_longer(cols = everything()) + colnames(cmpnd.df)[1] <- 'conc' + colnames(cmpnd.df)[2] <- 'resp' + cmpnd.df$conc <- as.integer(gsub('t_', '', cmpnd.df$conc)) + + # Fitting the null model + null.model <- bind_rows(dmso.rss.df, cmpnd.df) + null.drm <- dr_fit(null.model) + null.rss <- sum(residuals(null.drm) ^ 2) + rss_df$null.rss[i] <- null.rss + message( + 'Null model for ', + temp_df$ncgc_id[i], + ' at concentration \' ', + temp_df$conc[i], + '\': ', + signif(null.rss, 6) + ) + if (plot == TRUE) { + plot(null.drm, + type = 'all', + cex = 0.5, + main = 'Null model fit') + } + # Fitting the alternate model + cmpnd.drm <- dr_fit(cmpnd.df) + cmpnd.rss <- sum(residuals(cmpnd.drm) ^ 2) + alt.rss <- sum(cmpnd.rss, dmso.rss) + rss_df$alt.rss[i] <- alt.rss + message ( + 'Alternate Model for ', + temp_df$ncgc_id[i], + ' at concentration \' ', + temp_df$conc[i], + ' \': ', + signif(alt.rss, 6) + ) + rss.diff <- null.rss - alt.rss + message('RSS.0 - RSS.1: ', signif(rss.diff, 6)) + rss_df$rss.diff[i] <- rss.diff + } + return(rss_df) +} + +# Calculate the traditional melting parameters from the full + rss model and output df +# of the parameters for each compound +calculate_meltingparams <- function (df, control = 'vehicle') { + #Standard parameters to test: + test_params <- c('dHm_fit', 'Tm_fit', 'dG_std', 'T_onset') + + #Set up to loop through entire dataframe for each of the above params + for (i in 1:length(test_params)) { + #Initialize the column name + current_param <- test_params[i] + df[, paste(current_param, '_diff', sep = '')] <- NA + + #First, calculate the mean of control columns + mean_control <- mean(df[[current_param]][df$ncgc_id == control]) + + #Then, subtract this mean value from each well in the plate in a new column. + #Can't figure out how to mutate with a pasted column name... + for (i in 1:nrow(df)) { + df[i, paste(current_param, '_diff', sep = '')] <- + df[i, current_param] - mean_control + } + + #Print out mean and stdev of vehicle for each condition + std_control <- sd(df[[current_param]][df$ncgc_id == control]) + message('Vehicle mean for ', + current_param, + ': ', + mean_control, + ' (SD: ', + std_control, + ')') + } + return(df) +} + +# Print out the volcano plots for each parameter and RSS vs. p-val +plot_volcanos <- function(df, save = TRUE) { + test_params <- + c('Tm_fit.maxDiff', + 'auc.maxDiff') + test_pval <- + c('Tm_fit.maxDiff', + 'auc.maxDiff') + + # Plot out RSS Difference(x) vs. Parameter Difference(y) + # Conditional fill: grey/alpha if not significant in either + # grey/alpha if not significant in either #DDDDDD + # teal if by parameter only #009988 + # orange if by NPARC only #EE7733 + # wine if by both #882255 + # NEED TO CODE THIS BETTER WTF + for (i in 1:length(test_params)) { + current_param <- test_params[i] + current_pval <- test_pval[i] + plot.df <- df %>% + dplyr::select(compound, + rss.diff, + mannwhit.pval, + one_of(current_param), + one_of(current_pval)) + # Assign significance testing outcomes + plot.df$sigVal <- + case_when((plot.df$mannwhit.pval < 0.05 & + plot.df[, current_pval] < 0.05) ~ 'Both', + (plot.df$mannwhit.pval < 0.05 & + plot.df[, current_pval] >= 0.05) ~ 'RSS NPARC', + (plot.df$mannwhit.pval >= 0.05 & + plot.df[, current_pval] < 0.05) ~ 'Parameter', + (plot.df$mannwhit.pval >= 0.05 & + plot.df[, current_pval] >= 0.05) ~ 'Insignificant' + ) + + fillvalues <- + c('Both', 'RSS NPARC', 'Parameter', 'Insignificant') + colors <- c('#882255', '#EE7733', '#009988', '#DDDDDD') + volcano_plot <- + ggplot(plot.df, + aes(x = rss.diff, + y = plot.df[, current_param], + label = compound)) + + geom_point(shape = 21, + aes(fill = sigVal), + size = 5) + + theme_minimal() + + labs( + title = paste('Residual Variance vs. ', current_param, sep = ''), + y = paste(current_param, ' Experimental - Vehicle Mean', sep = ''), + x = 'RSS0 - RSS1 NPARC', + fill = 'Significance Detected' + ) + + scale_fill_manual(breaks = fillvalues, values = colors) + + theme(legend.position = 'bottom') + print(volcano_plot) + out <- paste(outdir,"/",current_param, sep="") + ggsave( + paste(out, '_volcano.png', sep = ''), + dpi = 'retina', + scale = 1.25 + ) + } +} + +# Plot out RSS Difference by p-value for the MannWhitney +rss.pval.plot <- function (df, savePlot = FALSE) { + plot.df <- df %>% + dplyr::select(compound, rss.diff, mannwhit.pval, mannwhit.ec50) + plot.df$mannwhit.pval <- log2(plot.df$mannwhit.pval) + + rss.plot <- + ggplot(plot.df, + aes(x = rss.diff, y = mannwhit.pval, fill = mannwhit.ec50)) + + geom_point(shape = 21, size = 3.5) + + theme_minimal() + + scale_fill_gradient(low = '#EE3377', + high = '#88CCEE', + na.value = 'grey20') + + labs( + title = 'RSS vs. Mann Whitney P-val', + x = 'RSS0 - RSS1', + y = 'Log2 Mann Whitney P-val', + fill = 'NPARC EC50' + ) + print(rss.plot) + if (savePlot == TRUE) { + out <- paste(outdir,'/rssPvalcomp.png',sep="") + ggsave(out, + dpi = 'retina', + scale = 1.25 + ) + } +} + +# +parameter_doseresponse <- + function(df, control = 'vehicle', plot = TRUE) { + #First calculate the mean for the control/vehicle condition, and construct df with this. + parameter_df <- + tibble(compound = (unique(filter( + df, ncgc_id != control + )$ncgc_id))) + for (i in 1:nrow(parameter_df)) { + + } + } + +calculate_zscore <- + function(df, control = 'vehicle', plot = FALSE) { + test_params <- + c('Tm_fit') + for (i in 1:length(test_params)) { + current_param <- test_params[i] + mean_control <- + mean(df[[current_param]][df$ncgc_id == control]) + std_control <- sd(df[[current_param]][df$ncgc_id == control]) + df[, paste(current_param, '_zscore', sep = '')] <- NA + for (i in 1:nrow(df)) { + df[i, paste(current_param, '_zscore', sep = '')] <- + (df[i, current_param] - mean_control) / std_control + } + } + return(df) + } + +convert_zscore <- function(df, control = 'vehicle', plot = FALSE) { + test_params <- + c('Tm_fit_zscore') + for (i in 1:length(test_params)) { + current_param <- test_params[i] + mean_control <- mean(df[[current_param]][df$ncgc_id == control]) + std_control <- sd(df[[current_param]][df$ncgc_id == control]) + df[, paste(current_param, '_pval', sep = '')] <- NA + + #Calculate p value for normal distribution from z score for each column. + for (i in 1:nrow(df)) { + df[i, paste(current_param, '_pval', sep = '')] <- + 2 * pnorm(-abs(df[i, current_param])) + } + } + return(df) +} + +# Fit the null model to a set of conc ~ resp values. +# Returns: RSS values +# Requires; df with concentration and response values +# df[1]: concentration values +# df[2]: response values +fit_nullmodel <- function(df, + plot.model, + graphTitle = '') { + null.model <- lm(resp ~ 1, data = df) + #Plot if TRUE. For diagnostic use mostly... + if (plot.model == TRUE) { + out <- paste(outdir,'models', sep="/") + try(jpeg(filename = paste(out, graphTitle, '.jpg', sep = + ''))) + try(plot( + df$conc, + df$resp, + main = graphTitle, + pch = 21, + cex = 3, + col = 'black', + bg = 'orange', + lwd = 3 + )) + try(abline(null.model, col = 'black', lwd = 3)) + try(dev.off() + ) + } + #Return squared residuals for null model + # message('NUll Model RSS: ', + # (sum(residuals(null.model) ^ 2))) + return(sum(residuals(null.model) ^ 2)) +} + +# Fit the alternate model log logistic to a set of conc ~ resp values. +# Returns: RSS values +# Requires: same as null_model +fit_altmodel <- function(df, + plot.model, + graphTitle = '') { + alt.model <- dr_fit(df) + if (plot.model == TRUE) { + out <- paste(outdir,'models', sep="/") + try(jpeg(filename = paste(out, graphTitle, '.jpg', sep = + ''))) + try(plot( + alt.model, + main = graphTitle, + pch = 21, + cex = 3, + col = 'black', + bg = 'cyan', + lwd = 3 + )) + try(dev.off() + ) + } + # message('Alternate Model RSS: ', + # (sum(residuals(alt.model) ^ 2))) + return(sum(residuals(alt.model) ^ 2)) +} + +# Derive RSS values for null and alternate model for each compound from full_df +compute.rss.models <- + function(df, + control = 'DMSO', + plotModel = TRUE, + rssPlot = TRUE, + drPlot = TRUE) { + #Construct tibble of unique compounds names + rss.df <- tibble(compound = (unique( + filter(df, ncgc_id != control | ncgc_id != 'vehicle')$ncgc_id + ))) %>% + filter(compound != 'control') %>% + filter(compound != 'vehicle') + rss.df$null.model.n <- NA + rss.df$alt.model.n <- NA + rss.df$null.model.sum <- NA + rss.df$alt.model.sum <- NA + rss.df$null.model.sd <- NA + rss.df$alt.model.sd <- NA + rss.df$rss.diff <- NA + rss.df$mannwhit.pval <- NA + rss.df$mannwhit.ec50 <- NA + + for (i in 1:nrow(rss.df)) { + #Construct df for current compound + fit.df <- df %>% filter(ncgc_id == toString(rss.df[i, 1])) %>% + dplyr::select(ncgc_id, conc, starts_with('t_')) %>% + dplyr::select(!contains('onset')) + + #Plot out dose-response thermogram here? + if (drPlot == TRUE) { + dr.thermogram(fit.df, target = rss.df$compound[i]) + } + + #Construct a df to hold the rss values until final calculations of mean,sd,N + cmpnd.fit.df <- fit.df %>% + dplyr::select(starts_with('t_')) + cmpnd.fit.df <- tibble(temp = colnames(cmpnd.fit.df)) + cmpnd.fit.df$null <- NA + cmpnd.fit.df$alt <- NA + + #Iterate through each temperature, construct df, perform rss analysis, and add to cmpnd.fit.df + for (t in 3:ncol(fit.df)) { + current.fit.df <- fit.df %>% + dplyr::select(1:2, colnames(fit.df)[t]) + colnames(current.fit.df)[3] <- 'resp' + cmpnd.fit.df$null[t - 2] <- + fit_nullmodel(current.fit.df, + plot.model = plotModel, + graphTitle = as.character(paste( + current.fit.df[1, 1], ' Null Model at ', colnames(fit.df)[t], sep = '' + ))) + cmpnd.fit.df$alt[t - 2] <- + fit_altmodel(current.fit.df, + plot.model = plotModel, + graphTitle = as.character(paste( + current.fit.df[1, 1], ' Alternate Model at ', colnames(fit.df)[t], sep = '' + ))) + } + # RSS0-RSS1 + cmpnd.fit.df <- cmpnd.fit.df %>% + mutate(diff = null - alt) + #Now, we calculate and assign rss values for both models in the rss.df for this compound. + rss.df$null.model.n[i] <- length(na.omit(cmpnd.fit.df$null)) + rss.df$alt.model.n[i] <- length(na.omit(cmpnd.fit.df$alt)) + rss.df$null.model.sum[i] <- sum(cmpnd.fit.df$null) + rss.df$alt.model.sum[i] <- sum(cmpnd.fit.df$alt) + rss.df$null.model.sd[i] <- sd(cmpnd.fit.df$null) + rss.df$alt.model.sd[i] <- sd(cmpnd.fit.df$alt) + rss.df$rss.diff[i] <- + sum(cmpnd.fit.df$null) - sum(cmpnd.fit.df$alt) + + #Perform Mann-Whitney iU test on alternative vs. null model dataframe for compound. + mann.whit <- + wilcox.test(x = cmpnd.fit.df$null, + y = cmpnd.fit.df$alt, + exact = TRUE) + rss.df$mannwhit.pval[i] <- mann.whit$p.value + + #Message out RSS0-RSS1 and p value + message('RSS Difference for ', + rss.df[i, 1], + ': ', + rss.df$rss.diff[i]) + message('Mann-Whitney U Test p-val: ', + rss.df$mannwhit.pval[i]) + + # Construct drc model and derive ec50 if p-val is significant + if (rss.df$mannwhit.pval[i] <= 0.05) { + #Find what temperature is the max point + rss.max.temp <- + cmpnd.fit.df$temp[cmpnd.fit.df$diff == max(cmpnd.fit.df$diff)] + #Construct df ready for drc at max temperature + rss.drc.df <- fit.df %>% + dplyr::select(conc, one_of(rss.max.temp)) + colnames(rss.drc.df)[2] <- 'resp' + rss.drc.model <- dr_fit(rss.drc.df) + ec50.temp <- rss.drc.model$coefficients[4] + if (length(ec50.temp != 0)) { + rss.df$mannwhit.ec50[i] <- signif(ec50.temp, 3) + } + } + + #Plot the RSS values across the temperature range if true + if (rssPlot == TRUE) { + # First, clean up the temperatures + cmpnd.fit.df$temp <- sub('t_', '', cmpnd.fit.df$temp) + cmpnd.fit.df$temp <- as.numeric(cmpnd.fit.df$temp) + + #Plot RSS as + rss.plot <- ggplot(cmpnd.fit.df, aes(x = temp, y = diff)) + + geom_point(shape = 21, + size = 4, + fill = '#AA4499') + + theme_minimal() + + labs( + title = paste(current.fit.df[1, 1], ' RSS Difference', sep = ''), + subtitle = paste('Mann-Whitney U pval: ', signif(rss.df$mannwhit.pval[i])), + sep = '', + y = 'RSS0-RSS1', + x = 'Temperature [C]' + ) + print(rss.plot) + out <- paste(outdir,'models', current.fit.df[1, 1], sep="/") + ggsave( + filename = paste(out, '_rss.png', sep =''), + scale = 1.25, + dpi = 'retina' + ) + } + } + return(rss.df) + } + +# Compute the rss difference and significance for each of the parameters +compute_parameter.rssmodel <- function(df, plotModel = FALSE) { + #Test parameters for standard model + test_params <- + c('Tm_fit', + 'auc') + + #Construct df of unique compounds and initialize parameter readouts. + param.rss.df <- tibble(compound = (unique( + filter(df, ncgc_id != 'control' & ncgc_id != 'vehicle')$ncgc_id + ))) + param.rss.df$Tm_fit.ec50 <- as.numeric(NA) + param.rss.df$Tm_fit.pval <- as.numeric(NA) + param.rss.df$Tm_fit.maxDiff <- as.numeric(NA) + param.rss.df$auc.ec50 <- as.numeric(NA) + param.rss.df$auc.pval <- as.numeric(NA) + param.rss.df$auc.maxDiff <- as.numeric(NA) + + control.means <- control_analysis(df, output = 'df') + + for (i in 1:nrow(param.rss.df)) { + cmpnd.fit.df <- df %>% + filter(ncgc_id == param.rss.df$compound[i]) + #Now iterate through columns in test_params + for (p in 1:length(test_params)) { + current_param <- test_params[p] + current.fit.df <- cmpnd.fit.df %>% + dplyr::select(ncgc_id, conc, I(test_params[p])) + colnames(current.fit.df)[3] <- 'resp' + current.model <- dr_fit(current.fit.df) + + #Workaround to avoid drm that can't converge + if (class(current.model) != 'list') { + param.rss.df[i, paste(current_param, '.pval', sep = '')] <- + noEffect(current.model)[3] + + param.rss.df[i, paste(current_param, '.ec50', sep = '')] <- + summary(current.model)$coefficients[4] + + #Calculate the maximum difference in param and subtract negative control mean from it. + current.fit.df$absDiff <- + abs(current.fit.df$resp - control.means$means[control.means$parameters == + current_param]) + param.rss.df[i, paste(current_param, '.maxDiff', sep = '')] <- + current.fit.df$resp[current.fit.df$absDiff == max(current.fit.df$absDiff)] - control.means$means[control.means$parameters == + current_param] + + message('Analyzing Compound ', param.rss.df[i, 1], '...') + message(current_param) + message('EC50: ', param.rss.df[i, paste(current_param, '.ec50', sep = + '')]) + message('No Effect ANOVA p-val: ', signif(param.rss.df[i, paste(current_param, '.pval', sep = + '')]), 1) + if (plotModel == TRUE) { + out <- paste(outdir,'models', param.rss.df[i, 1], sep="/") + png( + filename = paste( + out, + '_', + current_param, + '.png', + sep = '' + ), + bg = 'transparent' + ) + plot( + current.model, + main = paste( + param.rss.df[i, 1], + '\n', + ' NoEffect pval: ', + signif(param.rss.df[i, paste(current_param, '.pval', sep = + '')]), + '\n', + 'EC50: ', + signif(param.rss.df[i, paste(current_param, '.ec50', sep = + '')]), + '\n', + current_param + ) + ) + dev.off() + } + } + } + } + return(param.rss.df) +} + +# Creates a thermogram with all concentrations of a target for plotting +# Must match exact ncgc_id in well assignment.. +dr.thermogram <- function(df, target = '') { + # Create df with the compound, conc, and temperature columns + # df <- df %>% + # dplyr::select(ncgc_id, conc, matches('t_\\d')) %>% + # filter(., ncgc_id == target) + df <- df %>% + pivot_longer(cols = 3:ncol(df), + names_to = 'temp', + values_to = 'resp') + df$temp <- as.numeric(sub('t_', '', df$temp)) + + dr.plot <- ggplot(df, aes( + y = resp, + x = temp, + fill = as.factor(signif(conc)), + group_by(signif(conc)) + )) + + geom_line(color = 'black', + alpha = 0.8, + size = 1) + + geom_point(shape = 21, size = 3) + + theme_minimal() + + scale_color_viridis_d() + + labs( + title = paste('Dose-Response Thermogram for ', target, sep = ''), + x = 'Temperature [C]', + y = 'Response', + fill = 'Concentration' + ) + + theme() + print(dr.plot) + out <- paste(outdir,'models', 'dr_', sep="/") + ggsave( + filename = paste(out, target, '.png', sep = ''), + scale = 1.25, + dpi = 'retina' + ) + return(dr.plot) +} + +# Export heatmaps of EC50 and P-values across analysis parameters +# Pass in parameters df +parameter_heatmaps <- function(df, plotHeat = FALSE) { + ec50.heat.df <- df %>% + dplyr::select(compound, contains('ec50')) %>% + pivot_longer(cols = !compound, + names_to = 'parameter', + values_to = 'ec50') %>% + mutate(ec50 = log10(ec50)) + ec50.heat.df$parameter <- ec50.heat.df$parameter %>% + sub('.ec50', '', .) + + ec50.heat.plot <- + ggplot(ec50.heat.df, + aes( + x = parameter, + y = compound, + fill = ec50, + label = signif(ec50) + )) + + geom_tile(color = 'black') + + geom_text(alpha = 0.85, size = 2.5) + + theme_minimal() + + scale_fill_gradientn(colors = c('#EE3377', '#DDCC77', '#88CCEE'), ) + + labs(title = 'EC50 Parameter Comparison', + fill = 'Log EC50') + + theme( + axis.title.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold') + ) + + pval.heat.df <- df %>% + dplyr::select(compound, contains('pval')) %>% + pivot_longer(cols = !compound, + names_to = 'parameter', + values_to = 'pval') %>% + mutate(sigVal = ifelse(pval < (0.05 / length(unique( + df + ))), 'Significant', 'Insignificant')) + pval.heat.df$parameter <- pval.heat.df$parameter %>% + sub('.pval', '', .) + pval.heat.plot <- + ggplot(pval.heat.df, + aes( + x = parameter, + y = compound, + fill = sigVal, + label = signif(pval) + )) + + geom_tile(color = 'black') + + geom_text(alpha = 0.85, size = 2.5) + + theme_minimal() + + labs(title = 'P-Value Parameter Comparison', + fill = 'P-Value') + + theme( + axis.title.y = element_blank(), + axis.title.x = element_blank(), + axis.text.x = element_text(size = 12, face = 'bold'), + + ) + if (plotHeat == TRUE) { + print(pval.heat.plot) + out <- paste(outdir, 'pval_heatmap.png' , sep="/") + ggsave(out, + dpi = 'retina', + scale = 1.25) + print(ec50.heat.plot) + out <- paste(outdir, 'ec50_heatmap.png' , sep="/") + ggsave(out, + dpi = 'retina', + scale = 1.25) + } +} + +# Mutates a binary variable testing each analysis method for significance +# 0 if insignificant +# 1 if significant +determineSig <- function(df, alpha = 0.05) { + analysisMethods <- dplyr::select(df, contains('pval')) + analysisMethods <- colnames(analysisMethods) + analysisMethodsNames <- sub('pval', 'pval.sig', analysisMethods) + sigVal <- alpha / nrow(df) + for (i in 1:length(analysisMethods)) { + df[, analysisMethodsNames[i]] <- + ifelse(df[, analysisMethods[i]] < sigVal, 1, 0) + df[is.na(df)] <- 0 + } + return(df) +} + +# Use after determineSig from above +rankOrder <- function(df) { + methodSig <- dplyr::select(df, contains('pval.sig')) + methodSig <- colnames(methodSig) + methodRank <- sub('pval.sig', 'rankOrder', methodSig) + methods <- sub('.rankOrder', '', methodRank) + methodsEC <- paste(methods, '.ec50', sep = '') + + for (i in 1:length(methods)) { + rank.df <- filter(df, df[, (methodSig[i])] == 1) + rank.df[, methodRank[i]] <- + as.integer(rank(rank.df[, methodsEC[i]])) + df <- left_join(df, rank.df) + } + return(df) +} diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R new file mode 100644 index 0000000..118d0cf --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R @@ -0,0 +1,59 @@ +suppressWarnings(library(argparse)) +suppressWarnings(library(logging)) + + +# Initialize the logger +basicConfig() + +# Setup the argument parsing +addFitParams <- function(parser) { + print("addFitParams") + parser$add_argument("--params", type = "character",help="Fit params csv file") + invisible(NULL) +} +addBaselineCorrected <- function(parser) { + parser$add_argument("--values", type = "character",help="Baseline corrected csv file") + invisible(NULL) +} +addPlateMap <- function(parser) { + parser$add_argument("--platemap", type = "character",help="platemap excel file") + invisible(NULL) +} +addOutputArgs <- function(parser) { + parser$add_argument("--outdir", type = "character",help="Output csv file") + invisible(NULL) +} +getAllParser <- function() { + parser <- ArgumentParser(description="ALL PARSER") + addFitParams(parser) + addBaselineCorrected(parser) + addPlateMap(parser) + addOutputArgs(parser) + return(parser) +} + +# Parse the arguments +parser <- getAllParser() +args <- parser$parse_args() + + +#Path to csvfile directory +params <- args$params +loginfo('params = %s', params) + +values <- args$values +loginfo('values = %s', values) + +plate_map <- args$platemap +loginfo('platemap = %s', plate_map) + +outdir <- args$outdir +loginfo('outdir = %s', outdir) + +loginfo('params (fit params) = %s', params) +loginfo('values (baseline corrected): %s', values) +loginfo('platemap file (plate metadata): %s', plate_map) +loginfo('outdir (output directory): %s', outdir) + +source('./prepare_data.R') +source('./main_analysis.R') diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R new file mode 100644 index 0000000..53ef628 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R @@ -0,0 +1,92 @@ +# # @@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@ @@@@@@ +# # @@@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ +# # @@! @@@ @@! !@@ @@! @@! !@@ @@! @@@ +# # !@! @!@ !@! !@! !@! !@! !@! !@! @!@ +# # @!@!!@! @!! @!@!@!@!@ !@! @!!!:! @!! !!@@!! @!@!@!@! +# # !!@!@! !!! !!!@!@!!! !!! !!!!!: !!! !!@!!! !!!@!!!! +# # !!: :!! !!: :!! !!: !!: !:! !!: !!! +# # :!: !:! :!: :!: :!: :!: !:! :!: !:! +# # :: ::: :: ::: ::: :: :::: :: :::: :: :: ::: +# # NonParametric Multiparameter Analysis of CETSA/RT-CETSA Experimental Sets +# # +# # Written by: Michael Ronzetti {NIH/NCATS/UMD} +# # Patents: PCT/US21/45184, HHS E-022-2022-0-US-01 +# # Main Analysis + +library(tidyverse) +library(readxl) +library(stringr) +library(drc) +library(ggthemes) +library(cowplot) +library(hrbrthemes) +library(ggpubr) +library(MESS) +library(devtools) + +# BECAUSE OF BUG +pdf(file = NULL) + + +# EXPERIMENTAL PARAMETERS AND SETUP +# +# Input experiment parameters here + +startTemp <- 37 +endTemp <- 95 +plate_format <- 384 +control <- 'vehicle' +pc <- 'control' + +# read in preprocessed data +merge_molten_filename <- "test_merged_molten_final.csv" +output_path <- paste(outdir, merge_molten_filename, sep="/") +loginfo('reading preprocessed data from : %s', output_path) +source('./functions.R') +full_df <- calculate_auc(full_df) + +# Perform some preliminary control group analysis of variability +control_df <- + control_grouping(full_df, control, pc) # Pull out control compound datapoints +control_var <- + control_variability(control_df) # Read out the control group variability +controlPlot <- + control_analysis( + full_df, + nc = 'vehicle', + pc = 'control', + output = 'plot', + controlDF = control_var + ) +print(controlPlot) + +#Calculate melting parameter difference for each well from MoltenProt +# full_df <- calculate_meltingparams(full_df) %>% +# calculate_zscore() %>% +# convert_zscore + +#Derive RSS values for null and alternate model for each compound from full_df +rss <- compute.rss.models(full_df, rssPlot = FALSE, drPlot = FALSE, plotModel = FALSE) + +#Perform dose-response for each thermogram parameter +parameters <- compute_parameter.rssmodel(full_df, plotModel = FALSE) + +#Merge these plots for further analysis +signif.df <- merge(rss, parameters) +colnames(signif.df)[9] <- 'mannwhit.pval' +signif.df <- determineSig(signif.df) +signif.df <- rankOrder(signif.df) + +# Volcano plots comparing the different parameters of analysis against the NPARC RSS Difference +# Colored by significance test and whether the compound passes any. +plot_volcanos(signif.df, save = FALSE) + +# Plot of RSS Differences vs. p-values for NPARC +rss.pval.plot(signif.df, savePlot = TRUE) + +#Heatmap of compounds vs. different measurement styles. +parameter_heatmaps(signif.df, plotHeat = TRUE) + +#Write out signif.df and full_df +write.csv(x = full_df, file = paste(outdir,'full_df.csv',sep="/")) +write.csv(x = signif.df, file = paste(outdir,'signif_df.csv',sep="/")) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R new file mode 100644 index 0000000..5a74700 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R @@ -0,0 +1,113 @@ +# # @@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@ @@@@@@ +# # @@@@@@@@ @@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@ @@@@@@@ @@@@@@@@ +# # @@! @@@ @@! !@@ @@! @@! !@@ @@! @@@ +# # !@! @!@ !@! !@! !@! !@! !@! !@! @!@ +# # @!@!!@! @!! @!@!@!@!@ !@! @!!!:! @!! !!@@!! @!@!@!@! +# # !!@!@! !!! !!!@!@!!! !!! !!!!!: !!! !!@!!! !!!@!!!! +# # !!: :!! !!: :!! !!: !!: !:! !!: !!! +# # :!: !:! :!: :!: :!: :!: !:! :!: !:! +# # :: ::: :: ::: ::: :: :::: :: :::: :: :: ::: +# # NonParametric Multiparameter Analysis of CETSA/RT-CETSA Experimental Sets +# # +# # Written by: Michael Ronzetti {NIH/NCATS/UMD} +# # Patents: PCT/US21/45184, HHS E-022-2022-0-US-01 +# # Main Analysis + +suppressWarnings(library(logging)) +suppressWarnings(library(tidyverse)) +suppressWarnings(library(readxl)) +suppressWarnings(library(stringr)) +suppressWarnings(library(drc)) +suppressWarnings(library(ggthemes)) +suppressWarnings(library(cowplot)) +suppressWarnings(library(hrbrthemes)) +suppressWarnings(library(ggpubr)) +suppressWarnings(library(MESS)) +suppressWarnings(library(devtools)) + +loginfo('loading moltenprot fit params from : %s', params) +loginfo('loading moltenprot baseline corrected from : %s', values) +loginfo('loading platemap fit params from : %s', plate_map) + + +source('./prepare_params.R') + +# full_param <- read_csv(params, +# show_col_types = FALSE +# ) + +source('./prepare_values.R') + +# curve_df <- read_csv(values, +# show_col_types = FALSE +# ) + +platemap_filepath = plate_map + +# Assign compound ids and concentration from platemap +plate_assignment <- function(df, platemap_file) { + # read sample sheet from plate file + id_df <- read_excel(platemap_file, sheet = 'sample') %>% + # remove first column + dplyr::select(-1) %>% + # pivot to get row, col coordinates as columns + pivot_longer(cols = 1:ncol(.)) %>% + # rename the columns with all ids + rename(ncgc_id = value) %>% + # remove name column + dplyr::select(-c('name')) + # NOTE `EMPTY` are considered as vehicle + id_df$ncgc_id <- gsub('empty', 'vehicle', id_df$ncgc_id) + + # read the concentration from the file + conc_df <- read_excel(platemap_file, sheet = 'conc') %>% + # remove first colum + dplyr::select(-1) %>% + # pivot + pivot_longer(., cols = 1:ncol(.)) %>% + rename(conc = value) %>% + dplyr::select(-c('name')) + + # add the columns to the datset + df <- cbind(id_df, conc_df, df) + message('Plate assignment attached to dataframe.') + + # make sure we have numeric value? (unecessary?) + df$row <- as.numeric(df$row) + df$col <- as.numeric(df$col) + return(df) +} + + +full_df <- full_param + + + +full_df <- plate_assignment(full_df, platemap_filepath) + +# Construct full data frame with curve fit and parameters for analysis +bind_fulldf <- function(param_df, curve_df) { + df <- cbind(param_df, curve_df) + return(df) +} + +# Concat dataframes. +loginfo('concat dataframes') +full_df <- bind_fulldf(full_df, curve_df) + +#Convert any columns containing Kelvin values from MoltenProt to Celsius +kelToCel <- function(df) { + df <- df %>% + mutate(Tm_fit = Tm_fit - 273.15) %>% + mutate(T_onset = T_onset - 273.15) +} + +# TODO move that before for each dataset +# full_df <- full_df %>% dplyr::select(-c('...1')) %>% dplyr::select(-c('...1')) +full_df <- kelToCel(full_df) + +merge_molten_filename <- "test_merged_molten_final.csv" +output_path <- paste(outdir, merge_molten_filename, sep="/") +loginfo('writing results to : %s', output_path) + +write.csv(full_df,output_path) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py new file mode 100644 index 0000000..8f284cc --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py @@ -0,0 +1 @@ +"""Preprocess pandas dataframmes in python.""" diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R new file mode 100644 index 0000000..59247de --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R @@ -0,0 +1,78 @@ +suppressWarnings(library(logging)) +library(tidyverse) + +# params = "/Users/antoinegerardin/RT-CETSA-Analysis/.data/final_outputs/moltenprot/plate_(1-59)_moltenprot_params.csv" + +loginfo('loading moltenprot params from : %s', params) + +# create a dataframe with two columns (row, col) for plate of (16,24) +col_by_row <- expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:24)) %>% +# sort by row number +arrange(., row) + +# NOTE this process creates spurious columns that should be removed +exp_param <- read_csv(params, +show_col_types = FALSE +) + +# create a col with sequential ids for indexing +# named it `well` +exp_param <- exp_param %>% rownames_to_column() %>% rename('well' = 'rowname') + +# select columns we need for this analysis +exp_param <- exp_param %>% + dplyr::select( + c( + 'well', + 'dHm_fit', + 'Tm_fit', + 'BS_factor', + 'T_onset', + 'dG_std' + )) + +# add row col info to results (after the well column) +exp_param <- exp_param %>% bind_cols(col_by_row) %>% relocate(c('row', 'col'), .after = well) + +# NOTE why do we create well at the first place? +# remove the well column +exp_param <- exp_param %>% dplyr::select(-'well') + +# NOTE Basically regenerate the battleship coordinates based on the current ordering +# Add well assignments for each plate +well_assignment <- function(df, well_num) { + if (well_num == 96) { + letter <- LETTERS[1:8] + number <- c(1:12) + number <- str_pad(number, 2, pad = '0') + tracker <- 1 + temp_df <- tibble(well = c(1:384)) + for (val in letter) { + for (num in number) { + temp_df$well[tracker] <- paste(val, num, sep = '') + tracker <- tracker + 1 + } + } + } + else if (well_num == 384) { + letter <- LETTERS[1:16] + number <- c(1:24) + number <- str_pad(number, 2, pad = '0') + tracker <- 1 + temp_df <- tibble(well = c(1:384)) + for (val in letter) { + for (num in number) { + temp_df$well[tracker] <- paste(val, num, sep = '') + tracker <- tracker + 1 + } + } + } + message('Well assignments created for ', well_num, '-well plate.') + df <- cbind(temp_df, df) + return(df) +} + +# exp_param <- well_assignment(exp_param, 384) +full_param <- well_assignment(exp_param, 384) + +# write.csv(exp_param, "test_exp_param_full.csv") diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R new file mode 100644 index 0000000..a12d3ae --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R @@ -0,0 +1,64 @@ + +# values = "/Users/antoinegerardin/RT-CETSA-Analysis/.data/final_outputs/moltenprot/plate_(1-59)_moltenprot_curves.csv" + +suppressWarnings(library(logging)) +library(tidyverse) + +loginfo('loading moltenprot values from : %s', values) + +# NOTE this process creates spurious columns that should be removed +exp_curve_all <- read_csv(values, +show_col_types = FALSE +) + +# same, row, col grid. +col_by_row <- + expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:24)) %>% + arrange(., row) + + +# rename to Temperature +exp_curve_all <- exp_curve_all %>% +# add prefix +mutate(., Temperature = paste('val_t_', Temperature, sep = '')) + +exp_curve_all <- exp_curve_all %>% + # pivot transform columns into row combinations (vy creating a column name) + pivot_longer(cols = 2:ncol(exp_curve_all)) %>% + # pivoting again to get temperature as columns + pivot_wider(names_from = Temperature) %>% + # create a id column called well + rownames_to_column() %>% rename('well' = 'rowname') %>% + # add the grid coordinates + bind_cols(col_by_row) %>% + # remove all unused cols + dplyr::select(-c('name', 'well', 'row', 'col')) + + +# Add temperature headers to df +# TODO REVIEW this is sketchy +add_tempheaders <- function(df, + start_temp = 37, + end_temp = 90) { + # generate temperature intervals + temperature_df <- + seq(start_temp, end_temp, by = ((end_temp - start_temp) / (ncol(df) - 1))) %>% + round(., digits = 1) + # rewrite all temperatures! + # TODO CHECK that, that's quite sketchy. Should convert existing temp + for (i in 1:ncol(df)) { + colnames(df)[i] <- paste('t_', temperature_df[i], sep = '') + } + message('Temperature assignments changed for ', + ncol(df), + ' points.') + return(df) +} + +start_temp = 37 +end_temp = 90 +# exp_curve_all <- add_tempheaders(exp_curve_all, start_temp, end_temp) +curve_df <- add_tempheaders(exp_curve_all, start_temp, end_temp) +message('Fit curves retrieved.') + +# write.csv(exp_curve_all, "test_exp_curve_all.csv") diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py new file mode 100644 index 0000000..4166b0d --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py @@ -0,0 +1,42 @@ +"""Run R scripts.""" + +import logging +import os +import subprocess +from pathlib import Path + +POLUS_LOG = os.environ.get("POLUS_LOG", logging.INFO) + +logger = logging.getLogger("rt_cetsa_analysis") +logger.setLevel(POLUS_LOG) + + +def run_rscript( + params_filepath: Path, + values_filepath: Path, + platemap_filepath: Path, + out_dir: Path, +): + """Run R script.""" + print( + "run rscript with args: ", + params_filepath, + values_filepath, + platemap_filepath, + out_dir, + ) + + cmd = [ + "Rscript", + "./main.R", + "--params", + params_filepath.as_posix(), + "--values", + values_filepath.as_posix(), + "--platemap", + platemap_filepath.as_posix(), + "--outdir", + out_dir.as_posix(), + ] + + subprocess.run(args=cmd, cwd="src/polus/tabular/regression/rt_cetsa_analysis/") From a64ec3ccff4fdabeed55dd5e31f5dd316b583634 Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 05:13:50 -0400 Subject: [PATCH 07/18] feat: update for running tools in rt_cetsa dashboard. --- regression/rt-cetsa-analysis-tool/.gitignore | 1 + regression/rt-cetsa-analysis-tool/Dockerfile | 13 +++----- .../rt-cetsa-analysis-tool/pyproject.toml | 2 +- .../rt_cetsa_analysis.cwl | 4 +++ .../regression/rt_cetsa_analysis/__main__.py | 4 ++- .../regression/rt_cetsa_analysis/main.R | 22 +++++++++++-- .../rt_cetsa_analysis/main_analysis.R | 8 +---- .../rt_cetsa_analysis/prepare_data.R | 14 -------- .../prepare_moltenprot_outputs.py | 32 +++++++++++++++++++ .../rt_cetsa_analysis/prepare_params.R | 8 +---- .../rt_cetsa_analysis/prepare_values.R | 19 ++++------- .../rt_cetsa_analysis/run_rscript.py | 29 +++++++++++++---- .../rt_cetsa_analysis/simple_main.R | 30 +++++++++++++++++ .../regression/rt_cetsa_analysis/test.R | 0 .../rt-cetsa-analysis-tool/tests/__init__.py | 1 + .../rt-cetsa-analysis-tool/tests/conftest.py | 13 ++++++++ .../tests/test_run_rscript.py | 20 ++++++++++++ 17 files changed, 159 insertions(+), 61 deletions(-) create mode 100644 regression/rt-cetsa-analysis-tool/.gitignore create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R create mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/test.R create mode 100644 regression/rt-cetsa-analysis-tool/tests/__init__.py create mode 100644 regression/rt-cetsa-analysis-tool/tests/conftest.py create mode 100644 regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py diff --git a/regression/rt-cetsa-analysis-tool/.gitignore b/regression/rt-cetsa-analysis-tool/.gitignore new file mode 100644 index 0000000..c32211a --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/.gitignore @@ -0,0 +1 @@ +tests/out diff --git a/regression/rt-cetsa-analysis-tool/Dockerfile b/regression/rt-cetsa-analysis-tool/Dockerfile index b41b225..6311372 100644 --- a/regression/rt-cetsa-analysis-tool/Dockerfile +++ b/regression/rt-cetsa-analysis-tool/Dockerfile @@ -44,17 +44,12 @@ COPY pyproject.toml ${EXEC_DIR} COPY README.md ${EXEC_DIR} RUN apt-get update && apt-get install -y python3 RUN apt-get -y install python3-pip python3-venv - COPY src/ ${EXEC_DIR}/src - -RUN python3 -m venv ${EXEC_DIR}/.venv -# RUN chmod u+x ${EXEC_DIR}/.venv/bin/activate -# RUN ${EXEC_DIR}/.venv/bin/activate -RUN ${EXEC_DIR}/.venv/bin/pip3 install ${EXEC_DIR} --no-cache-dir - +RUN pip3 install ${EXEC_DIR} --no-cache-dir --break-system-packages WORKDIR ${EXEC_DIR} # CMD ["Rscript", "main.R"] -ENTRYPOINT ["/opt/executables/.venv/bin/python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] -CMD ["--help"] +CMD ["python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] +# CMD ["--help"] +# CMD ["bash"] diff --git a/regression/rt-cetsa-analysis-tool/pyproject.toml b/regression/rt-cetsa-analysis-tool/pyproject.toml index 92595cf..323a14f 100644 --- a/regression/rt-cetsa-analysis-tool/pyproject.toml +++ b/regression/rt-cetsa-analysis-tool/pyproject.toml @@ -13,7 +13,7 @@ packages = [{include = "polus", from = "src"}] [tool.poetry.dependencies] python = ">=3.9,<3.12" # filepattern = "2.0.4" -typer = "^0.12.3" +typer = "^0.7" [tool.poetry.group.dev.dependencies] bump2version = "^1.0.1" diff --git a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl index e49f32b..149b3a3 100644 --- a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl +++ b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl @@ -1,5 +1,6 @@ class: CommandLineTool cwlVersion: v1.2 +baseCommand: ["python3", "-m", "polus.tabular.regression.rt_cetsa_analysis"] inputs: inpDir: inputBinding: @@ -31,6 +32,9 @@ outputs: glob: $(inputs.outDir.basename) type: Directory requirements: + EnvVarRequirement: + envDef: + WORKDIR: /opt/executables/ DockerRequirement: dockerPull: polusai/rt-cetsa-analysis-simple-tool:0.1.0 InitialWorkDirRequirement: diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py index 30cd581..4264c23 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py @@ -68,7 +68,7 @@ def main( ) -> None: """CLI for rt-cetsa-moltprot-tool.""" # TODO: Add to docs that input csv file should be sorted by `Temperature` column. - logger.info("Starting the CLI for rt-cetsa-moltprot-tool.") + logger.info("Starting the CLI for rt-cetsa-moltenprot-tool.") logger.info(f"Input directory: {inp_dir}") logger.info(f"params_pattern: {params_pattern}") @@ -79,6 +79,8 @@ def main( params = inp_dir / params_pattern values = inp_dir / values_pattern + logger.info(f"{inp_dir}") + if preview: NotImplemented # noqa: B018 diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R index 118d0cf..f8742cc 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main.R @@ -7,21 +7,24 @@ basicConfig() # Setup the argument parsing addFitParams <- function(parser) { - print("addFitParams") parser$add_argument("--params", type = "character",help="Fit params csv file") invisible(NULL) + print("params arg ok ") } addBaselineCorrected <- function(parser) { parser$add_argument("--values", type = "character",help="Baseline corrected csv file") invisible(NULL) + print("values arg ok ") } addPlateMap <- function(parser) { parser$add_argument("--platemap", type = "character",help="platemap excel file") invisible(NULL) + print("platemap arg ok ") } addOutputArgs <- function(parser) { - parser$add_argument("--outdir", type = "character",help="Output csv file") + parser$add_argument("--outdir", type = "character",help="Output directory") invisible(NULL) + print("outdir arg ok ") } getAllParser <- function() { parser <- ArgumentParser(description="ALL PARSER") @@ -33,8 +36,21 @@ getAllParser <- function() { } # Parse the arguments +loginfo('collect all parsing routines...') parser <- getAllParser() -args <- parser$parse_args() +loginfo('parse data...') + +print(parser) + +tryCatch( + args <- parser$parse_args(), + error = function(e){ + print("There was an error: ") + print(e) + } +) + + #Path to csvfile directory diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R index 53ef628..c15c59a 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/main_analysis.R @@ -27,22 +27,17 @@ library(devtools) # BECAUSE OF BUG pdf(file = NULL) - # EXPERIMENTAL PARAMETERS AND SETUP # # Input experiment parameters here - startTemp <- 37 endTemp <- 95 plate_format <- 384 control <- 'vehicle' pc <- 'control' -# read in preprocessed data -merge_molten_filename <- "test_merged_molten_final.csv" -output_path <- paste(outdir, merge_molten_filename, sep="/") -loginfo('reading preprocessed data from : %s', output_path) source('./functions.R') + full_df <- calculate_auc(full_df) # Perform some preliminary control group analysis of variability @@ -58,7 +53,6 @@ controlPlot <- output = 'plot', controlDF = control_var ) -print(controlPlot) #Calculate melting parameter difference for each well from MoltenProt # full_df <- calculate_meltingparams(full_df) %>% diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R index 5a74700..f08e82e 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_data.R @@ -32,16 +32,8 @@ loginfo('loading platemap fit params from : %s', plate_map) source('./prepare_params.R') -# full_param <- read_csv(params, -# show_col_types = FALSE -# ) - source('./prepare_values.R') -# curve_df <- read_csv(values, -# show_col_types = FALSE -# ) - platemap_filepath = plate_map # Assign compound ids and concentration from platemap @@ -105,9 +97,3 @@ kelToCel <- function(df) { # TODO move that before for each dataset # full_df <- full_df %>% dplyr::select(-c('...1')) %>% dplyr::select(-c('...1')) full_df <- kelToCel(full_df) - -merge_molten_filename <- "test_merged_molten_final.csv" -output_path <- paste(outdir, merge_molten_filename, sep="/") -loginfo('writing results to : %s', output_path) - -write.csv(full_df,output_path) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py index 8f284cc..dad0616 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py @@ -1 +1,33 @@ """Preprocess pandas dataframmes in python.""" + +from pathlib import Path + +import pandas as pd +from scipy.constants import convert_temperature + +pd.options.display.precision = 1 + + +inp_dir = Path( + "/Users/antoinegerardin/RT-CETSA-Analysis/.data/final_outputs/moltenprot/", +) +params = "plate_(1-59)_moltenprot_params.csv" +values = "plate_(1-59)_moltenprot_curves.csv" + +df = pd.read_csv(inp_dir / values) +df["Temperature"] = ( + df["Temperature"] + .map(lambda temp: convert_temperature(temp, "Kelvin", "Celsius")) + .round(1) + .astype(str) +) +df["Temperature"] = "t_" + df["Temperature"] + +print(list(df.columns)) +print(df.index) +df = df.transpose() +print(df.head) + +print(list(df.columns)) + +df.to_csv("transformed_curves.csv") diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R index 59247de..c21372d 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_params.R @@ -1,8 +1,6 @@ suppressWarnings(library(logging)) library(tidyverse) -# params = "/Users/antoinegerardin/RT-CETSA-Analysis/.data/final_outputs/moltenprot/plate_(1-59)_moltenprot_params.csv" - loginfo('loading moltenprot params from : %s', params) # create a dataframe with two columns (row, col) for plate of (16,24) @@ -10,7 +8,6 @@ col_by_row <- expand.grid(row = sprintf('%.2d', 1:16), col = sprintf('%.2d', 1:2 # sort by row number arrange(., row) -# NOTE this process creates spurious columns that should be removed exp_param <- read_csv(params, show_col_types = FALSE ) @@ -38,7 +35,7 @@ exp_param <- exp_param %>% bind_cols(col_by_row) %>% relocate(c('row', 'col'), . # remove the well column exp_param <- exp_param %>% dplyr::select(-'well') -# NOTE Basically regenerate the battleship coordinates based on the current ordering +# NOTE Basically regenerate the alphanumerical coordinates based on the current ordering # Add well assignments for each plate well_assignment <- function(df, well_num) { if (well_num == 96) { @@ -72,7 +69,4 @@ well_assignment <- function(df, well_num) { return(df) } -# exp_param <- well_assignment(exp_param, 384) full_param <- well_assignment(exp_param, 384) - -# write.csv(exp_param, "test_exp_param_full.csv") diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R index a12d3ae..866423d 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_values.R @@ -1,12 +1,9 @@ - -# values = "/Users/antoinegerardin/RT-CETSA-Analysis/.data/final_outputs/moltenprot/plate_(1-59)_moltenprot_curves.csv" - suppressWarnings(library(logging)) library(tidyverse) loginfo('loading moltenprot values from : %s', values) -# NOTE this process creates spurious columns that should be removed + exp_curve_all <- read_csv(values, show_col_types = FALSE ) @@ -17,13 +14,12 @@ col_by_row <- arrange(., row) -# rename to Temperature exp_curve_all <- exp_curve_all %>% # add prefix mutate(., Temperature = paste('val_t_', Temperature, sep = '')) exp_curve_all <- exp_curve_all %>% - # pivot transform columns into row combinations (vy creating a column name) + # pivot transform columns into row combinations pivot_longer(cols = 2:ncol(exp_curve_all)) %>% # pivoting again to get temperature as columns pivot_wider(names_from = Temperature) %>% @@ -34,9 +30,6 @@ exp_curve_all <- exp_curve_all %>% # remove all unused cols dplyr::select(-c('name', 'well', 'row', 'col')) - -# Add temperature headers to df -# TODO REVIEW this is sketchy add_tempheaders <- function(df, start_temp = 37, end_temp = 90) { @@ -55,10 +48,10 @@ add_tempheaders <- function(df, return(df) } +# NOTE: HARDCODED VALUE INTERVAL +# Also, this should comes from moltenprot start_temp = 37 end_temp = 90 -# exp_curve_all <- add_tempheaders(exp_curve_all, start_temp, end_temp) -curve_df <- add_tempheaders(exp_curve_all, start_temp, end_temp) -message('Fit curves retrieved.') -# write.csv(exp_curve_all, "test_exp_curve_all.csv") +curve_df <- add_tempheaders(exp_curve_all, start_temp, end_temp) +message('Done preparing fit curves.') diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py index 4166b0d..5ce5bd8 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py @@ -6,6 +6,7 @@ from pathlib import Path POLUS_LOG = os.environ.get("POLUS_LOG", logging.INFO) +WORKDIR = os.environ.get("WORKDIR", "") logger = logging.getLogger("rt_cetsa_analysis") logger.setLevel(POLUS_LOG) @@ -26,17 +27,33 @@ def run_rscript( out_dir, ) + cwd = Path(__file__).parent + if WORKDIR: + cwd = ( + Path(WORKDIR) + / "src" + / "polus" + / "tabular" + / "regression" + / "rt_cetsa_analysis/" + ) + + logger.info(f"################## current working directory : {cwd.as_posix()}") + + # "Rscript", + # "./main.R", + # "--params", + # "--values", + # "--platemap", + # "--outdir", + cmd = [ "Rscript", - "./main.R", - "--params", + "./simple_main.R", params_filepath.as_posix(), - "--values", values_filepath.as_posix(), - "--platemap", platemap_filepath.as_posix(), - "--outdir", out_dir.as_posix(), ] - subprocess.run(args=cmd, cwd="src/polus/tabular/regression/rt_cetsa_analysis/") + subprocess.run(args=cmd, cwd=cwd) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R new file mode 100644 index 0000000..b30eb7b --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R @@ -0,0 +1,30 @@ + +suppressWarnings(library(logging)) + + +# Initialize the logger +basicConfig() + +args = commandArgs(trailingOnly=TRUE) +print(args[1]) +print(args[2]) +print(args[3]) +print(args[4]) + +params <- args[1] +values <- args[2] +plate_map <- args[3] +outdir <- args[4] + +loginfo('params = %s', params) +loginfo('values = %s', values) +loginfo('platemap = %s', plate_map) +loginfo('outdir = %s', outdir) + +loginfo('params (fit params) = %s', params) +loginfo('values (baseline corrected): %s', values) +loginfo('platemap file (plate metadata): %s', plate_map) +loginfo('outdir (output directory): %s', outdir) + +source('./prepare_data.R') +source('./main_analysis.R') diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/test.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/test.R new file mode 100644 index 0000000..e69de29 diff --git a/regression/rt-cetsa-analysis-tool/tests/__init__.py b/regression/rt-cetsa-analysis-tool/tests/__init__.py new file mode 100644 index 0000000..d420712 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests.""" diff --git a/regression/rt-cetsa-analysis-tool/tests/conftest.py b/regression/rt-cetsa-analysis-tool/tests/conftest.py new file mode 100644 index 0000000..4983315 --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/tests/conftest.py @@ -0,0 +1,13 @@ +"""Set up.""" +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py b/regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py new file mode 100644 index 0000000..fb8cb0b --- /dev/null +++ b/regression/rt-cetsa-analysis-tool/tests/test_run_rscript.py @@ -0,0 +1,20 @@ +"""Tests.""" +from pathlib import Path + +import pytest +from polus.tabular.regression.rt_cetsa_analysis.run_rscript import run_rscript + + +@pytest.mark.skipif("not config.getoption('slow')") +def test_run_rscript(): + """Run R script.""" + inpDir = Path.cwd() / "tests" / "data" + params = "plate_(1-59)_moltenprot_params.csv" + values = "plate_(1-59)_moltenprot_curves.csv" + platemap = Path.cwd() / "tests" / "data" / "platemap.xlsx" + outDir = Path.cwd() / "tests" / "out" + + params = inpDir / params + values = inpDir / values + + run_rscript(params, values, platemap, outDir) From 1a34c2ab9b2a29d4e8924e052c8c4f097d3979a7 Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 05:50:51 -0400 Subject: [PATCH 08/18] feat: update analysis tool before PR. --- regression/rt-cetsa-analysis-tool/ict.yml | 8 +-- regression/rt-cetsa-analysis-tool/plugin.json | 17 ++++-- .../regression/rt_cetsa_analysis/__main__.py | 54 ++++++++++++------- .../rt_cetsa_analysis/__main_filepattern__.py | 52 ------------------ .../prepare_moltenprot_outputs.py | 33 ------------ .../rt_cetsa_analysis/run_rscript.py | 18 +------ .../rt_cetsa_analysis/simple_main.R | 4 -- .../regression/rt_cetsa_analysis/test.R | 0 8 files changed, 55 insertions(+), 131 deletions(-) delete mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py delete mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py delete mode 100644 regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/test.R diff --git a/regression/rt-cetsa-analysis-tool/ict.yml b/regression/rt-cetsa-analysis-tool/ict.yml index b001e9f..353fc84 100644 --- a/regression/rt-cetsa-analysis-tool/ict.yml +++ b/regression/rt-cetsa-analysis-tool/ict.yml @@ -12,13 +12,13 @@ inputs: - params name: params required: true - type: path + type: string - description: Baseline corrected csv files format: - values name: values required: true - type: path + type: string - description: platemap excel file format: - platemap @@ -46,11 +46,11 @@ ui: - description: Fit params csv file key: inputs.params title: Fit params csv file - type: path + type: string - description: baseline corrected csv file key: inputs.values title: baseline corrected csv file - type: path + type: string - description: platemap excel file key: inputs.platemap title: platemap excel file diff --git a/regression/rt-cetsa-analysis-tool/plugin.json b/regression/rt-cetsa-analysis-tool/plugin.json index d59519a..f16fd81 100644 --- a/regression/rt-cetsa-analysis-tool/plugin.json +++ b/regression/rt-cetsa-analysis-tool/plugin.json @@ -48,10 +48,19 @@ "description": "Input data collection to be processed by this plugin" }, { - "key": "inputs.filePattern", - "title": "File Pattern", - "description": "File Pattern to parse input files", - "default": ".+" + "key": "inputs.params", + "title": "Params", + "description": "Params" + }, + { + "key": "inputs.values", + "title": "Values", + "description": "Values" + }, + { + "key": "inputs.platemap", + "title": "Plate Map", + "description": "Plate Map" } ] } diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py index 4264c23..e4af4bb 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main__.py @@ -1,5 +1,6 @@ -"""CLI for rt-cetsa-moltprot-tool.""" +"""CLI for rt-cetsa-analysis tool.""" +import json import logging import os from pathlib import Path @@ -33,15 +34,15 @@ def main( readable=True, resolve_path=True, ), - params_pattern: str = typer.Option( + params_filename: str = typer.Option( ..., "--params", - help="name of the molten fit params csv file in the input directory.", + help="name of the moltenprot fit params csv file in the input directory.", ), - values_pattern: str = typer.Option( + values_filename: str = typer.Option( ..., "--values", - help="name of the baseline corrected values csv file in the input directory.", + help="name of the moltenprot baseline corrected values csv file in the input directory.", ), platemap: Path = typer.Option( ..., @@ -66,30 +67,47 @@ def main( resolve_path=True, ), ) -> None: - """CLI for rt-cetsa-moltprot-tool.""" - # TODO: Add to docs that input csv file should be sorted by `Temperature` column. - logger.info("Starting the CLI for rt-cetsa-moltenprot-tool.") + """CLI for rt-cetsa-analysis tool.""" + logger.info("Starting the CLI for rt-cetsa-analysis tool.") logger.info(f"Input directory: {inp_dir}") - logger.info(f"params_pattern: {params_pattern}") - logger.info(f"values_pattern: {values_pattern}") + logger.info(f"params: {params_filename}") + logger.info(f"values: {values_filename}") logger.info(f"platemap path: {platemap}") logger.info(f"Output directory: {out_dir}") - params = inp_dir / params_pattern - values = inp_dir / values_pattern + if params_filename: + params = inp_dir / params_filename + elif (inp_dir / "params.csv").exists(): + params = inp_dir / "params.csv" + else: + raise ValueError( + f"No 'params.csv' moltenprot parameters file found in {inp_dir}.", + ) - logger.info(f"{inp_dir}") - - if preview: - NotImplemented # noqa: B018 + if values_filename: + values = inp_dir / values_filename + elif (inp_dir / "values.csv").exists(): + values = inp_dir / "values.csv" + else: + raise ValueError(f"No 'values.csv' moltenprot values file found in {inp_dir}.") if not params.exists(): raise FileNotFoundError(f"params file not found : {params}") if not values.exists(): raise FileNotFoundError(f"values file not found : {values}") - if not platemap.exists(): - raise FileNotFoundError(f"platemap file not found : {platemap}") + + logger.info(f"params filename: {params}") + logger.info(f"values filename: {values}") + logger.info(f"platemap path: {platemap}") + logger.info(f"Output directory: {out_dir}") + + if preview: + outputs: list[str] = ["signif_df.csv"] + out_json = {"files": outputs} + with (out_dir / "preview.json").open("w") as f: + json.dump(out_json, f, indent=2) + return run_rscript(params, values, platemap, out_dir) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py deleted file mode 100644 index d150462..0000000 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__main_filepattern__.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Main with filepattern support.""" -# """CLI for rt-cetsa-moltprot-tool.""" - - -# # get env - -# # Initialize the logger -# logging.basicConfig( - - -# @app.command() -# def main( -# inp_dir: pathlib.Path = typer.Option( -# ..., -# "--inpDir", -# ), -# params_pattern: str = typer.Option( -# ".+", -# "--params", -# ), -# values_pattern: str = typer.Option( -# ".+", -# "--values", -# ), -# platemap_pattern: str = typer.Option( -# ".+", -# "--platemap", -# ), -# preview: bool = typer.Option( -# False, -# "--preview", -# ), -# out_dir: pathlib.Path = typer.Option( -# ..., -# "--outDir", -# ), -# ) -> None: -# """CLI for rt-cetsa-moltprot-tool.""" -# # TODO: Add to docs that input csv file should be sorted by `Temperature` column. - - -# if preview: -# with (out_dir / "preview.json").open("w") as f: - -# for params, values, platemap in zip(params_files, values_files, platemap_files): -# # TODO replace with exceptions -# if len(params[1]) != 1 or len(values[1]) != 1 or len(platemap[1]) != 1: -# raise Exception( -# msg, - - -# if __name__ == "__main__": diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py deleted file mode 100644 index dad0616..0000000 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/prepare_moltenprot_outputs.py +++ /dev/null @@ -1,33 +0,0 @@ -"""Preprocess pandas dataframmes in python.""" - -from pathlib import Path - -import pandas as pd -from scipy.constants import convert_temperature - -pd.options.display.precision = 1 - - -inp_dir = Path( - "/Users/antoinegerardin/RT-CETSA-Analysis/.data/final_outputs/moltenprot/", -) -params = "plate_(1-59)_moltenprot_params.csv" -values = "plate_(1-59)_moltenprot_curves.csv" - -df = pd.read_csv(inp_dir / values) -df["Temperature"] = ( - df["Temperature"] - .map(lambda temp: convert_temperature(temp, "Kelvin", "Celsius")) - .round(1) - .astype(str) -) -df["Temperature"] = "t_" + df["Temperature"] - -print(list(df.columns)) -print(df.index) -df = df.transpose() -print(df.head) - -print(list(df.columns)) - -df.to_csv("transformed_curves.csv") diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py index 5ce5bd8..c7ca0c4 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/run_rscript.py @@ -19,15 +19,8 @@ def run_rscript( out_dir: Path, ): """Run R script.""" - print( - "run rscript with args: ", - params_filepath, - values_filepath, - platemap_filepath, - out_dir, - ) - cwd = Path(__file__).parent + if WORKDIR: cwd = ( Path(WORKDIR) @@ -38,14 +31,7 @@ def run_rscript( / "rt_cetsa_analysis/" ) - logger.info(f"################## current working directory : {cwd.as_posix()}") - - # "Rscript", - # "./main.R", - # "--params", - # "--values", - # "--platemap", - # "--outdir", + logger.info(f"current working directory : {cwd.as_posix()}") cmd = [ "Rscript", diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R index b30eb7b..239e87c 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/simple_main.R @@ -6,10 +6,6 @@ suppressWarnings(library(logging)) basicConfig() args = commandArgs(trailingOnly=TRUE) -print(args[1]) -print(args[2]) -print(args[3]) -print(args[4]) params <- args[1] values <- args[2] diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/test.R b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/test.R deleted file mode 100644 index e69de29..0000000 From 1b9a4c613e996665d67de13853598395cf1dfbaf Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 05:53:00 -0400 Subject: [PATCH 09/18] chore: refactor moltenprot --- .../.bumpversion.cfg | 0 regression/rt-cetsa-moltenprot-tool/.gitignore | 0 .../Dockerfile | 1 + .../README.md | 0 .../VERSION | 0 .../build-docker.sh | 0 .../ict.yml | 2 +- .../plugin.json | 2 +- .../pyproject.toml | 2 +- .../rt_cetsa_moltenprot.cwl | 0 .../run-plugin.sh | 2 +- .../regression/rt_cetsa_moltenprot/MOLTENPROT_VERSION} | 0 .../polus/tabular/regression/rt_cetsa_moltenprot}/__init__.py | 0 .../polus/tabular/regression/rt_cetsa_moltenprot}/__main__.py | 4 ++-- .../src/polus/tabular/regression/rt_cetsa_moltenprot}/core.py | 2 +- .../polus/tabular/regression/rt_cetsa_moltenprot}/models.py | 0 16 files changed, 8 insertions(+), 7 deletions(-) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/.bumpversion.cfg (100%) create mode 100644 regression/rt-cetsa-moltenprot-tool/.gitignore rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/Dockerfile (96%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/README.md (100%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/VERSION (100%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/build-docker.sh (100%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/ict.yml (95%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/plugin.json (95%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/pyproject.toml (98%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/rt_cetsa_moltenprot.cwl (100%) rename regression/{rt-cetsa-moltprot-tool => rt-cetsa-moltenprot-tool}/run-plugin.sh (87%) rename regression/{rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/RT_CETSA_VERSION => rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/MOLTENPROT_VERSION} (100%) rename regression/{rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot => rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot}/__init__.py (100%) rename regression/{rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot => rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot}/__main__.py (95%) rename regression/{rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot => rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot}/core.py (99%) rename regression/{rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot => rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot}/models.py (100%) diff --git a/regression/rt-cetsa-moltprot-tool/.bumpversion.cfg b/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg similarity index 100% rename from regression/rt-cetsa-moltprot-tool/.bumpversion.cfg rename to regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg diff --git a/regression/rt-cetsa-moltenprot-tool/.gitignore b/regression/rt-cetsa-moltenprot-tool/.gitignore new file mode 100644 index 0000000..e69de29 diff --git a/regression/rt-cetsa-moltprot-tool/Dockerfile b/regression/rt-cetsa-moltenprot-tool/Dockerfile similarity index 96% rename from regression/rt-cetsa-moltprot-tool/Dockerfile rename to regression/rt-cetsa-moltenprot-tool/Dockerfile index db91d4c..4e6d5bc 100755 --- a/regression/rt-cetsa-moltprot-tool/Dockerfile +++ b/regression/rt-cetsa-moltenprot-tool/Dockerfile @@ -15,6 +15,7 @@ COPY README.md ${EXEC_DIR} COPY src ${EXEC_DIR}/src RUN pip3 install ${EXEC_DIR} --no-cache-dir +RUN pip3 install . ENTRYPOINT ["python3", "-m", "polus.tabular.regression.rt_cetsa_moltenprot"] CMD ["--help"] diff --git a/regression/rt-cetsa-moltprot-tool/README.md b/regression/rt-cetsa-moltenprot-tool/README.md similarity index 100% rename from regression/rt-cetsa-moltprot-tool/README.md rename to regression/rt-cetsa-moltenprot-tool/README.md diff --git a/regression/rt-cetsa-moltprot-tool/VERSION b/regression/rt-cetsa-moltenprot-tool/VERSION similarity index 100% rename from regression/rt-cetsa-moltprot-tool/VERSION rename to regression/rt-cetsa-moltenprot-tool/VERSION diff --git a/regression/rt-cetsa-moltprot-tool/build-docker.sh b/regression/rt-cetsa-moltenprot-tool/build-docker.sh similarity index 100% rename from regression/rt-cetsa-moltprot-tool/build-docker.sh rename to regression/rt-cetsa-moltenprot-tool/build-docker.sh diff --git a/regression/rt-cetsa-moltprot-tool/ict.yml b/regression/rt-cetsa-moltenprot-tool/ict.yml similarity index 95% rename from regression/rt-cetsa-moltprot-tool/ict.yml rename to regression/rt-cetsa-moltenprot-tool/ict.yml index da600de..5b987c2 100644 --- a/regression/rt-cetsa-moltprot-tool/ict.yml +++ b/regression/rt-cetsa-moltenprot-tool/ict.yml @@ -3,7 +3,7 @@ author: - Antoine Gerardin - Najib Ishaq contact: nick.schaub@nih.gov -container: polusai/rt-cetsa-moltprot-tool:0.1.0 +container: polusai/rt-cetsa-moltproten-tool:0.1.0 description: Run moltenprot regression for the RT-CETSA pipeline. entrypoint: python3 -m polus.tabular.regression.rt_cetsa_moltprot inputs: diff --git a/regression/rt-cetsa-moltprot-tool/plugin.json b/regression/rt-cetsa-moltenprot-tool/plugin.json similarity index 95% rename from regression/rt-cetsa-moltprot-tool/plugin.json rename to regression/rt-cetsa-moltenprot-tool/plugin.json index 63877b2..8b44a5a 100644 --- a/regression/rt-cetsa-moltprot-tool/plugin.json +++ b/regression/rt-cetsa-moltenprot-tool/plugin.json @@ -8,7 +8,7 @@ "repository": "https://github.com/PolusAI/tabular-tools", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/rt-cetsa-moltprot-tool:0.1.0", + "containerId": "polusai/rt-cetsa-moltenproten-tool:0.1.0", "baseCommand": [ "python3", "-m", diff --git a/regression/rt-cetsa-moltprot-tool/pyproject.toml b/regression/rt-cetsa-moltenprot-tool/pyproject.toml similarity index 98% rename from regression/rt-cetsa-moltprot-tool/pyproject.toml rename to regression/rt-cetsa-moltenprot-tool/pyproject.toml index aa90b1f..51f1640 100644 --- a/regression/rt-cetsa-moltprot-tool/pyproject.toml +++ b/regression/rt-cetsa-moltenprot-tool/pyproject.toml @@ -1,5 +1,5 @@ [tool.poetry] -name = "polus_tabular_regression_rt_cetsa_moltprot" +name = "polus_tabular_regression_rt_cetsa_moltenprot" version = "0.1.0" description = "Run moltenprot for the RT-CETSA pipeline." authors = [ diff --git a/regression/rt-cetsa-moltprot-tool/rt_cetsa_moltenprot.cwl b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl similarity index 100% rename from regression/rt-cetsa-moltprot-tool/rt_cetsa_moltenprot.cwl rename to regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl diff --git a/regression/rt-cetsa-moltprot-tool/run-plugin.sh b/regression/rt-cetsa-moltenprot-tool/run-plugin.sh similarity index 87% rename from regression/rt-cetsa-moltprot-tool/run-plugin.sh rename to regression/rt-cetsa-moltenprot-tool/run-plugin.sh index 73c9245..948a180 100755 --- a/regression/rt-cetsa-moltprot-tool/run-plugin.sh +++ b/regression/rt-cetsa-moltenprot-tool/run-plugin.sh @@ -14,7 +14,7 @@ LOGLEVEL=INFO docker run --mount type=bind,source=${datapath},target=/data/ \ --env POLUS_LOG=${LOGLEVEL} \ - polusai/rt-cetsa-moltenprot-tool:${version} \ + polusai/rt-cetsa-moltenproten-tool:${version} \ --inpDir ${inpDir} \ --filePattern ${pattern} \ --outDir ${outDir} diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/RT_CETSA_VERSION b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/MOLTENPROT_VERSION similarity index 100% rename from regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/RT_CETSA_VERSION rename to regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/MOLTENPROT_VERSION diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py similarity index 100% rename from regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__init__.py rename to regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py similarity index 95% rename from regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py rename to regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py index 569bf40..cc867ae 100644 --- a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/__main__.py +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py @@ -7,7 +7,7 @@ import filepattern import typer -from polus.tabular.regression.rt_cetsa_moltprot import fit_data +from polus.tabular.regression.rt_cetsa_moltenprot import fit_data # Initialize the logger logging.basicConfig( @@ -54,7 +54,6 @@ def main( ), ) -> None: """CLI for rt-cetsa-moltprot-tool.""" - # TODO: Add to docs that input csv file should be sorted by `Temperature` column. logger.info("Starting the CLI for rt-cetsa-moltprot-tool.") logger.info(f"Input directory: {inp_dir}") @@ -101,6 +100,7 @@ def main( ).astype(int) fit_params = fit_params.drop(columns="_index") fit_params = fit_params.sort_values(["letter", "number"]) + fit_params = fit_params.drop(columns=["letter", "number"]) fit_params.to_csv(fit_params_path, index=True) # keep only 2 signicant digits for temperature index diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/core.py similarity index 99% rename from regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py rename to regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/core.py index e1160e5..79d51f7 100644 --- a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/core.py +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/core.py @@ -82,7 +82,7 @@ # MoltenProt is stored in a plain-text file VERSION (also used by setup.py) # extract and save it to a variable -with open(os.path.join(__location__, "RT_CETSA_VERSION")) as version_file: +with open(os.path.join(__location__, "MOLTENPROT_VERSION")) as version_file: __version__ = version_file.read().strip() # get scipy version (some methods may not be available in earlier versions) diff --git a/regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/models.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/models.py similarity index 100% rename from regression/rt-cetsa-moltprot-tool/src/polus/tabular/regression/rt_cetsa_moltprot/models.py rename to regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/models.py From b50a8841136985c158dd536d49b5b2486f0ddbc3 Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 06:32:44 -0400 Subject: [PATCH 10/18] feat: review api. Removed filepattern. --- regression/rt-cetsa-moltenprot-tool/ict.yml | 12 +-- .../rt-cetsa-moltenprot-tool/plugin.json | 12 ++- .../rt_cetsa_moltenprot.cwl | 2 +- .../rt_cetsa_moltenprot/__main__.py | 89 +++++++++---------- 4 files changed, 55 insertions(+), 60 deletions(-) diff --git a/regression/rt-cetsa-moltenprot-tool/ict.yml b/regression/rt-cetsa-moltenprot-tool/ict.yml index 5b987c2..3c4b744 100644 --- a/regression/rt-cetsa-moltenprot-tool/ict.yml +++ b/regression/rt-cetsa-moltenprot-tool/ict.yml @@ -13,10 +13,10 @@ inputs: name: inpDir required: true type: path -- description: Filepattern to parse input files +- description: intensities file format: - - pattern - name: pattern + - intensities + name: intensities required: false type: string - description: Generate an output preview. @@ -41,9 +41,9 @@ ui: key: inputs.inpDir title: Input data collection type: path -- description: Filepattern to parse input files - key: inputs.pattern - title: pattern +- description: Intensities file + key: inputs.intensities + title: intensities type: text - description: Generate an output preview. key: inputs.preview diff --git a/regression/rt-cetsa-moltenprot-tool/plugin.json b/regression/rt-cetsa-moltenprot-tool/plugin.json index 8b44a5a..034b34b 100644 --- a/regression/rt-cetsa-moltenprot-tool/plugin.json +++ b/regression/rt-cetsa-moltenprot-tool/plugin.json @@ -22,10 +22,9 @@ "required": true }, { - "name": "filePattern", + "name": "intensities", "type": "string", - "description": "File Pattern to parse input files", - "default": ".+", + "description": "Filename for the intensities file.", "required": false } ], @@ -43,10 +42,9 @@ "description": "Input data collection to be processed by this plugin" }, { - "key": "inputs.filePattern", - "title": "File Pattern", - "description": "File Pattern to parse input files", - "default": ".+" + "key": "inputs.intensities", + "title": "Intensities file", + "description": "Intensities file" } ] } diff --git a/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl index ee45fd6..98f47eb 100644 --- a/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl +++ b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl @@ -7,7 +7,7 @@ inputs: type: Directory filePattern: inputBinding: - prefix: --filePattern + prefix: --intensities type: string? preview: inputBinding: diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py index cc867ae..283e787 100644 --- a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py @@ -5,7 +5,6 @@ import os import pathlib -import filepattern import typer from polus.tabular.regression.rt_cetsa_moltenprot import fit_data @@ -33,15 +32,10 @@ def main( readable=True, resolve_path=True, ), - pattern: str = typer.Option( - ".+", - "--filePattern", - help="Pattern to match the files in the input directory.", - ), - preview: bool = typer.Option( - False, - "--preview", - help="Preview the files that will be processed.", + int_filename: str = typer.Option( + None, + "--intensities", + help="name of the intensities file (optional).", ), out_dir: pathlib.Path = typer.Option( ..., @@ -52,61 +46,64 @@ def main( writable=True, resolve_path=True, ), + preview: bool = typer.Option( + False, + "--preview", + help="Preview the files that will be processed.", + ), ) -> None: """CLI for rt-cetsa-moltprot-tool.""" logger.info("Starting the CLI for rt-cetsa-moltprot-tool.") logger.info(f"Input directory: {inp_dir}") - logger.info(f"File Pattern: {pattern}") logger.info(f"Output directory: {out_dir}") + # NOTE we may eventually deal with other types. if POLUS_TAB_EXT != ".csv": msg = "this tool can currently only process csv files." raise ValueError(msg) - fp = filepattern.FilePattern(inp_dir, pattern) - inp_files = [f[1][0] for f in fp()] - - for f in inp_files: - if not f.suffix == POLUS_TAB_EXT: - raise ValueError( - f"this tool can only process {POLUS_TAB_EXT} files. Got {f}", + if int_filename is not None: + intensities_file = inp_dir / int_filename + if not intensities_file.exists(): + raise FileNotFoundError(intensities_file) + else: + if len(list(inp_dir.iterdir())) != 1: + raise FileExistsError( + f"There should be a single intensities file in {inp_dir}", ) + intensities_file = next(inp_dir.iterdir()) + logger.info(f"Using intensities file: {intensities_file}") if preview: - outputs: list[str] = [] - for f in inp_files: - fit_params_path = f.stem + "_moltenprot_params" + POLUS_TAB_EXT - fit_curves_path = f.stem + "_moltenprot_curves" + POLUS_TAB_EXT - outputs = [*outputs, fit_params_path, fit_curves_path] + outputs = ["params" + POLUS_TAB_EXT, "values" + POLUS_TAB_EXT] out_json = {"files": outputs} with (out_dir / "preview.json").open("w") as f: json.dump(out_json, f, indent=2) return - for f in inp_files: - logger.info(f"Processing plate timeserie: {f}") - fit = fit_data(f) - fit_params_path = out_dir / (f.stem + "_moltenprot_params" + POLUS_TAB_EXT) - fit_curves_path = out_dir / (f.stem + "_moltenprot_curves" + POLUS_TAB_EXT) - - # sort fit_params by row/column - fit_params = fit.plate_results - fit_params["_index"] = fit_params.index - fit_params["letter"] = fit_params.apply(lambda row: row._index[:1], axis=1) - fit_params["number"] = fit_params.apply( - lambda row: row._index[1:], - axis=1, - ).astype(int) - fit_params = fit_params.drop(columns="_index") - fit_params = fit_params.sort_values(["letter", "number"]) - fit_params = fit_params.drop(columns=["letter", "number"]) - fit_params.to_csv(fit_params_path, index=True) - - # keep only 2 signicant digits for temperature index - fit_curves = fit.plate_raw_corr - fit_curves.index = fit_curves.index.map(lambda t: round(t, 2)) - fit_curves.to_csv(fit_curves_path, index=True) + fit = fit_data(intensities_file) + + fit_params_path = out_dir / ("params" + POLUS_TAB_EXT) + fit_curves_path = out_dir / ("values" + POLUS_TAB_EXT) + + # sort fit_params by row/column + fit_params = fit.plate_results + fit_params["_index"] = fit_params.index + fit_params["letter"] = fit_params.apply(lambda row: row._index[:1], axis=1) + fit_params["number"] = fit_params.apply( + lambda row: row._index[1:], + axis=1, + ).astype(int) + fit_params = fit_params.drop(columns="_index") + fit_params = fit_params.sort_values(["letter", "number"]) + fit_params = fit_params.drop(columns=["letter", "number"]) + fit_params.to_csv(fit_params_path, index=True) + + # keep only 2 signicant digits for temperature index + fit_curves = fit.plate_raw_corr + fit_curves.index = fit_curves.index.map(lambda t: round(t, 2)) + fit_curves.to_csv(fit_curves_path, index=True) if __name__ == "__main__": From 15be75388ee0edbcc2a69519c8be25ce71ed642d Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 06:33:20 -0400 Subject: [PATCH 11/18] =?UTF-8?q?Bump=20version:=200.1.0=20=E2=86=92=200.2?= =?UTF-8?q?.0-dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg | 2 +- regression/rt-cetsa-moltenprot-tool/README.md | 2 +- regression/rt-cetsa-moltenprot-tool/VERSION | 2 +- regression/rt-cetsa-moltenprot-tool/ict.yml | 4 ++-- regression/rt-cetsa-moltenprot-tool/plugin.json | 4 ++-- regression/rt-cetsa-moltenprot-tool/pyproject.toml | 2 +- regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl | 2 +- .../polus/tabular/regression/rt_cetsa_moltenprot/__init__.py | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg b/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg index b3d8c10..e58bfbe 100644 --- a/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg +++ b/regression/rt-cetsa-moltenprot-tool/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0 +current_version = 0.2.0-dev0 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/regression/rt-cetsa-moltenprot-tool/README.md b/regression/rt-cetsa-moltenprot-tool/README.md index 50ad929..4ebd854 100644 --- a/regression/rt-cetsa-moltenprot-tool/README.md +++ b/regression/rt-cetsa-moltenprot-tool/README.md @@ -1,4 +1,4 @@ -# RT_CETSA MoltenProt (v0.1.0) +# RT_CETSA MoltenProt (v0.2.0-dev0) This WIPP plugin runs moltenprot regression for the RT-CETSA pipeline. diff --git a/regression/rt-cetsa-moltenprot-tool/VERSION b/regression/rt-cetsa-moltenprot-tool/VERSION index 6e8bf73..ce0f6f8 100644 --- a/regression/rt-cetsa-moltenprot-tool/VERSION +++ b/regression/rt-cetsa-moltenprot-tool/VERSION @@ -1 +1 @@ -0.1.0 +0.2.0-dev0 diff --git a/regression/rt-cetsa-moltenprot-tool/ict.yml b/regression/rt-cetsa-moltenprot-tool/ict.yml index 3c4b744..fee0d6d 100644 --- a/regression/rt-cetsa-moltenprot-tool/ict.yml +++ b/regression/rt-cetsa-moltenprot-tool/ict.yml @@ -3,7 +3,7 @@ author: - Antoine Gerardin - Najib Ishaq contact: nick.schaub@nih.gov -container: polusai/rt-cetsa-moltproten-tool:0.1.0 +container: polusai/rt-cetsa-moltproten-tool:0.2.0-dev0 description: Run moltenprot regression for the RT-CETSA pipeline. entrypoint: python3 -m polus.tabular.regression.rt_cetsa_moltprot inputs: @@ -49,4 +49,4 @@ ui: key: inputs.preview title: Preview example output of this plugin type: checkbox -version: 0.1.0 +version: 0.2.0-dev0 diff --git a/regression/rt-cetsa-moltenprot-tool/plugin.json b/regression/rt-cetsa-moltenprot-tool/plugin.json index 034b34b..c55d213 100644 --- a/regression/rt-cetsa-moltenprot-tool/plugin.json +++ b/regression/rt-cetsa-moltenprot-tool/plugin.json @@ -1,6 +1,6 @@ { "name": "RT-CETSA Moltprot", - "version": "0.1.0", + "version": "0.2.0-dev0", "title": "RT-CETSA Moltprot", "description": "Run moltenprot for the RT-CETSA pipeline.", "author": "Nicholas Schaub (nick.schaub@nih.gov), Antoine Gerardin (antoine.gerardin@nih.gov), Najib Ishaq (najib.ishaq@nih.gov)", @@ -8,7 +8,7 @@ "repository": "https://github.com/PolusAI/tabular-tools", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/rt-cetsa-moltenproten-tool:0.1.0", + "containerId": "polusai/rt-cetsa-moltenproten-tool:0.2.0-dev0", "baseCommand": [ "python3", "-m", diff --git a/regression/rt-cetsa-moltenprot-tool/pyproject.toml b/regression/rt-cetsa-moltenprot-tool/pyproject.toml index 51f1640..ee6d96b 100644 --- a/regression/rt-cetsa-moltenprot-tool/pyproject.toml +++ b/regression/rt-cetsa-moltenprot-tool/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "polus_tabular_regression_rt_cetsa_moltenprot" -version = "0.1.0" +version = "0.2.0-dev0" description = "Run moltenprot for the RT-CETSA pipeline." authors = [ "Nick Schaub ", diff --git a/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl index 98f47eb..acae74f 100644 --- a/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl +++ b/regression/rt-cetsa-moltenprot-tool/rt_cetsa_moltenprot.cwl @@ -24,7 +24,7 @@ outputs: type: Directory requirements: DockerRequirement: - dockerPull: polusai/rt-cetsa-moltenprot-tool:0.1.0 + dockerPull: polusai/rt-cetsa-moltenprot-tool:0.2.0-dev0 InitialWorkDirRequirement: listing: - entry: $(inputs.outDir) diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py index 62b4e6c..c9c9edf 100644 --- a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py @@ -1,6 +1,6 @@ """RT_CETSA Moltprot Regression Tool.""" -__version__ = "0.1.0" +__version__ = "0.2.0-dev0" import os import pathlib From 74e7928cc15d63b07c9c61eb3ace71efe4f4beab Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 06:38:15 -0400 Subject: [PATCH 12/18] fix: add version in __init__.py --- .../src/polus/tabular/regression/rt_cetsa_analysis/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py index 98fb3e2..a4634cf 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py @@ -1 +1,3 @@ """RT CETSA Analysis.""" + +__version__ = "0.1.0" From a6f05046a0a76d327d8cd26c19ef90a3104a7e83 Mon Sep 17 00:00:00 2001 From: agerardin Date: Tue, 21 May 2024 06:38:19 -0400 Subject: [PATCH 13/18] =?UTF-8?q?Bump=20version:=200.1.0=20=E2=86=92=200.2?= =?UTF-8?q?.0-dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- regression/rt-cetsa-analysis-tool/.bumpversion.cfg | 2 +- regression/rt-cetsa-analysis-tool/README.md | 2 +- regression/rt-cetsa-analysis-tool/VERSION | 2 +- regression/rt-cetsa-analysis-tool/ict.yml | 4 ++-- regression/rt-cetsa-analysis-tool/plugin.json | 4 ++-- regression/rt-cetsa-analysis-tool/pyproject.toml | 2 +- regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl | 2 +- .../polus/tabular/regression/rt_cetsa_analysis/__init__.py | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/regression/rt-cetsa-analysis-tool/.bumpversion.cfg b/regression/rt-cetsa-analysis-tool/.bumpversion.cfg index 3bc5c4c..daa7920 100644 --- a/regression/rt-cetsa-analysis-tool/.bumpversion.cfg +++ b/regression/rt-cetsa-analysis-tool/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0 +current_version = 0.2.0-dev0 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? diff --git a/regression/rt-cetsa-analysis-tool/README.md b/regression/rt-cetsa-analysis-tool/README.md index acde158..87fd543 100644 --- a/regression/rt-cetsa-analysis-tool/README.md +++ b/regression/rt-cetsa-analysis-tool/README.md @@ -1,4 +1,4 @@ -# RT_CETSA Analysis Tool (v0.1.0) +# RT_CETSA Analysis Tool (v0.2.0-dev0) This WIPP plugin runs statistical analysis for the RT-CETSA pipeline. diff --git a/regression/rt-cetsa-analysis-tool/VERSION b/regression/rt-cetsa-analysis-tool/VERSION index 6e8bf73..ce0f6f8 100644 --- a/regression/rt-cetsa-analysis-tool/VERSION +++ b/regression/rt-cetsa-analysis-tool/VERSION @@ -1 +1 @@ -0.1.0 +0.2.0-dev0 diff --git a/regression/rt-cetsa-analysis-tool/ict.yml b/regression/rt-cetsa-analysis-tool/ict.yml index 353fc84..86dc427 100644 --- a/regression/rt-cetsa-analysis-tool/ict.yml +++ b/regression/rt-cetsa-analysis-tool/ict.yml @@ -3,7 +3,7 @@ author: - Antoine Gerardin - Najib Ishaq contact: nick.schaub@nih.gov -container: polusai/rt-cetsa-analysis-simple-tool:0.1.0 +container: polusai/rt-cetsa-analysis-simple-tool:0.2.0-dev0 description: Run statisical analysis for the RT-CETSA pipeline. entrypoint: python3 -m polus.tabular.regression.rt_cetsa_analysis inputs: @@ -59,4 +59,4 @@ ui: key: inputs.preview title: Preview example output of this plugin type: checkbox -version: 0.1.0 +version: 0.2.0-dev0 diff --git a/regression/rt-cetsa-analysis-tool/plugin.json b/regression/rt-cetsa-analysis-tool/plugin.json index f16fd81..7f25700 100644 --- a/regression/rt-cetsa-analysis-tool/plugin.json +++ b/regression/rt-cetsa-analysis-tool/plugin.json @@ -1,6 +1,6 @@ { "name": "RT-CETSA Analysis", - "version": "0.1.0", + "version": "0.2.0-dev0", "title": "RT-CETSA Analysis", "description": "Run statistical analysis for the RT-CETSA pipeline.", "author": "Nicholas Schaub (nick.schaub@nih.gov), Antoine Gerardin (antoine.gerardin@nih.gov), Najib Ishaq (najib.ishaq@nih.gov)", @@ -8,7 +8,7 @@ "repository": "https://github.com/PolusAI/tabular-tools", "website": "https://ncats.nih.gov/preclinical/core/informatics", "citation": "", - "containerId": "polusai/rt-cetsa-analysis-simple-tool:0.1.0", + "containerId": "polusai/rt-cetsa-analysis-simple-tool:0.2.0-dev0", "baseCommand": [ "python3", "-m", diff --git a/regression/rt-cetsa-analysis-tool/pyproject.toml b/regression/rt-cetsa-analysis-tool/pyproject.toml index 323a14f..355a020 100644 --- a/regression/rt-cetsa-analysis-tool/pyproject.toml +++ b/regression/rt-cetsa-analysis-tool/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "polus_tabular_regression_rt_cetsa_analysis" -version = "0.1.0" +version = "0.2.0-dev0" description = "Run statiscal analysis for the RT-CETSA pipeline." authors = [ "Nick Schaub ", diff --git a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl index 149b3a3..ad3390f 100644 --- a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl +++ b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl @@ -36,7 +36,7 @@ requirements: envDef: WORKDIR: /opt/executables/ DockerRequirement: - dockerPull: polusai/rt-cetsa-analysis-simple-tool:0.1.0 + dockerPull: polusai/rt-cetsa-analysis-simple-tool:0.2.0-dev0 InitialWorkDirRequirement: listing: - entry: $(inputs.outDir) diff --git a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py index a4634cf..bb098b4 100644 --- a/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py +++ b/regression/rt-cetsa-analysis-tool/src/polus/tabular/regression/rt_cetsa_analysis/__init__.py @@ -1,3 +1,3 @@ """RT CETSA Analysis.""" -__version__ = "0.1.0" +__version__ = "0.2.0-dev0" From ec31dd9bd07e65677ede85a78c711d71102bc15b Mon Sep 17 00:00:00 2001 From: agerardin Date: Wed, 22 May 2024 04:30:42 -0400 Subject: [PATCH 14/18] chore: refactor code for dashboard integration. --- .../rt_cetsa_moltenprot/__init__.py | 34 ++++++++++++++++++- .../rt_cetsa_moltenprot/__main__.py | 19 ++--------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py index c9c9edf..e3454f1 100644 --- a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__init__.py @@ -16,7 +16,39 @@ POLUS_TAB_EXT = os.environ.get("POLUS_TAB_EXT", ".csv") -def fit_data(file_path: pathlib.Path) -> pandas.DataFrame: +def run_moltenprot_fit( + file_path: pathlib.Path, +) -> tuple[pandas.DataFrame, pandas.DataFrame]: + """Run moltenprot. + + Args: + file_path : path to intensities file. + + Returns: + tuple of dataframe containing the fit_params and the curve values. + """ + fit = fit_data(file_path) + + # sort fit_params by row/column + fit_params = fit.plate_results + fit_params["_index"] = fit_params.index + fit_params["letter"] = fit_params.apply(lambda row: row._index[:1], axis=1) + fit_params["number"] = fit_params.apply( + lambda row: row._index[1:], + axis=1, + ).astype(int) + fit_params = fit_params.drop(columns="_index") + fit_params = fit_params.sort_values(["letter", "number"]) + fit_params = fit_params.drop(columns=["letter", "number"]) + + # keep only 2 signicant digits for temperature index + fit_curves = fit.plate_raw_corr + fit_curves.index = fit_curves.index.map(lambda t: round(t, 2)) + + return fit_params, fit_curves + + +def fit_data(file_path: pathlib.Path) -> core.MoltenProtFit: """Fit data to a model using Moltprot.""" fit = core.MoltenProtFit( filename=file_path, diff --git a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py index 283e787..5cd9c49 100644 --- a/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py +++ b/regression/rt-cetsa-moltenprot-tool/src/polus/tabular/regression/rt_cetsa_moltenprot/__main__.py @@ -6,7 +6,7 @@ import pathlib import typer -from polus.tabular.regression.rt_cetsa_moltenprot import fit_data +from polus.tabular.regression.rt_cetsa_moltenprot import run_moltenprot_fit # Initialize the logger logging.basicConfig( @@ -82,27 +82,12 @@ def main( json.dump(out_json, f, indent=2) return - fit = fit_data(intensities_file) + fit_params, fit_curves = run_moltenprot_fit(intensities_file) fit_params_path = out_dir / ("params" + POLUS_TAB_EXT) fit_curves_path = out_dir / ("values" + POLUS_TAB_EXT) - # sort fit_params by row/column - fit_params = fit.plate_results - fit_params["_index"] = fit_params.index - fit_params["letter"] = fit_params.apply(lambda row: row._index[:1], axis=1) - fit_params["number"] = fit_params.apply( - lambda row: row._index[1:], - axis=1, - ).astype(int) - fit_params = fit_params.drop(columns="_index") - fit_params = fit_params.sort_values(["letter", "number"]) - fit_params = fit_params.drop(columns=["letter", "number"]) fit_params.to_csv(fit_params_path, index=True) - - # keep only 2 signicant digits for temperature index - fit_curves = fit.plate_raw_corr - fit_curves.index = fit_curves.index.map(lambda t: round(t, 2)) fit_curves.to_csv(fit_curves_path, index=True) From e17bb38b0af582bf2680e5f7cf933d07537420d4 Mon Sep 17 00:00:00 2001 From: agerardin Date: Wed, 22 May 2024 06:29:44 -0400 Subject: [PATCH 15/18] dev: add tests to satisfy git actions Please enter the commit message for your changes. Lines starting --- .../rt-cetsa-moltenprot-tool/tests/__init__.py | 1 + .../rt-cetsa-moltenprot-tool/tests/conftest.py | 13 +++++++++++++ .../tests/test_moltenprot.py | 17 +++++++++++++++++ 3 files changed, 31 insertions(+) create mode 100644 regression/rt-cetsa-moltenprot-tool/tests/__init__.py create mode 100644 regression/rt-cetsa-moltenprot-tool/tests/conftest.py create mode 100644 regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py diff --git a/regression/rt-cetsa-moltenprot-tool/tests/__init__.py b/regression/rt-cetsa-moltenprot-tool/tests/__init__.py new file mode 100644 index 0000000..d420712 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/tests/__init__.py @@ -0,0 +1 @@ +"""Tests.""" diff --git a/regression/rt-cetsa-moltenprot-tool/tests/conftest.py b/regression/rt-cetsa-moltenprot-tool/tests/conftest.py new file mode 100644 index 0000000..4983315 --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/tests/conftest.py @@ -0,0 +1,13 @@ +"""Set up.""" +import pytest + + +def pytest_addoption(parser: pytest.Parser) -> None: + """Add options to pytest.""" + parser.addoption( + "--slow", + action="store_true", + dest="slow", + default=False, + help="run slow tests", + ) diff --git a/regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py b/regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py new file mode 100644 index 0000000..80816cc --- /dev/null +++ b/regression/rt-cetsa-moltenprot-tool/tests/test_moltenprot.py @@ -0,0 +1,17 @@ +"""Tests.""" +from polus.tabular.regression.rt_cetsa_moltenprot import run_moltenprot_fit +from pathlib import Path + +import pytest + + +@pytest.mark.skipif("not config.getoption('slow')") +def test_moltenprot(): + path = Path(__file__).parent / "data" / "plate_(1-58).csv" + params, values = run_moltenprot_fit(path) + assert params is not None + assert values is not None + + +def test_dummy_test(): + pass From d54b950af2ad348adfc95e21b7fc69fe658b9376 Mon Sep 17 00:00:00 2001 From: agerardin Date: Wed, 22 May 2024 08:57:38 -0400 Subject: [PATCH 16/18] fix: values and params and not required. --- regression/rt-cetsa-analysis-tool/build-docker.sh | 2 +- regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl | 4 ++-- .../polus/tabular/regression/rt_cetsa_analysis/__main__.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/regression/rt-cetsa-analysis-tool/build-docker.sh b/regression/rt-cetsa-analysis-tool/build-docker.sh index aaecd7a..93d9b45 100755 --- a/regression/rt-cetsa-analysis-tool/build-docker.sh +++ b/regression/rt-cetsa-analysis-tool/build-docker.sh @@ -1,4 +1,4 @@ #!/bin/bash version=$( Date: Wed, 22 May 2024 08:58:09 -0400 Subject: [PATCH 17/18] fix: values and params and not required. --- regression/rt-cetsa-analysis-tool/ict.yml | 4 ++-- regression/rt-cetsa-analysis-tool/plugin.json | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/regression/rt-cetsa-analysis-tool/ict.yml b/regression/rt-cetsa-analysis-tool/ict.yml index 86dc427..1918ef8 100644 --- a/regression/rt-cetsa-analysis-tool/ict.yml +++ b/regression/rt-cetsa-analysis-tool/ict.yml @@ -11,13 +11,13 @@ inputs: format: - params name: params - required: true + required: false type: string - description: Baseline corrected csv files format: - values name: values - required: true + required: false type: string - description: platemap excel file format: diff --git a/regression/rt-cetsa-analysis-tool/plugin.json b/regression/rt-cetsa-analysis-tool/plugin.json index 7f25700..0bbb6b9 100644 --- a/regression/rt-cetsa-analysis-tool/plugin.json +++ b/regression/rt-cetsa-analysis-tool/plugin.json @@ -18,14 +18,12 @@ { "name": "params", "type": "genericData", - "description": "Fit params csv file", - "required": true + "description": "Fit params csv file" }, { "name": "values", "type": "genericData", - "description": "Baseline corrected csv files", - "required": true + "description": "Baseline corrected csv files" }, { "name": "platemap", From bff4e9d4df644f55c8daab15dc7022901d65413d Mon Sep 17 00:00:00 2001 From: agerardin Date: Wed, 22 May 2024 12:00:05 -0400 Subject: [PATCH 18/18] fix cwl. --- regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl index c8f4fc9..fed422f 100644 --- a/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl +++ b/regression/rt-cetsa-analysis-tool/rt_cetsa_analysis.cwl @@ -36,7 +36,7 @@ requirements: envDef: WORKDIR: /opt/executables/ DockerRequirement: - dockerPull: polusai/rt-cetsa-analysis-simple-tool:0.2.0-dev0 + dockerPull: polusai/rt-cetsa-analysis-tool:0.2.0-dev0 InitialWorkDirRequirement: listing: - entry: $(inputs.outDir)