barricklab · kevin99111 · May 15, 2025 · May 15, 2025 · May 15, 2025 · May 15, 2025
diff --git a/.github/workflows/package_and_test.yml b/.github/workflows/package_and_test.yml
@@ -28,10 +28,10 @@ jobs:
         with:
           python-version: 3.12
 
-      - name: install efmcalculator
+      - name: install efmcalculator2
         run: |
           pip install ./
 
-      - name: test efmcalculator
+      - name: test efmcalculator2
         run: |
           python -m unittest
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,5 @@
 # Auto-generated files
-efmcalculator/_version.py
+efmcalculator2/_version.py
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/README.md b/README.md
@@ -1,16 +1,16 @@
-[![Status](https://github.com/barricklab/efm-calculator2/actions/workflows/package_and_test.yml/badge.svg)](https://github.com/barricklab/efm-calculator2/actions/workflows/package_and_test.yml)
+[![Status](https://github.com/barricklab/efmcalculator2/actions/workflows/package_and_test.yml/badge.svg)](https://github.com/barricklab/efmcalculator2/actions/workflows/package_and_test.yml)
 
-`efmcalculator` is a Python package or web tool for detecting mutational hotspots. It predicts the mutation rates associated with each hotspot and combines them into a relative instability score. These hotspots include simple sequence repeats, repeat mediated deletions, and short repeat sequences. This code updates and improves upon the last version of the [EFM calculator](https://github.com/barricklab/efm-calculator).
+`efmcalculator2` is a Python package or web tool for detecting mutational hotspots. It predicts the mutation rates associated with each hotspot and combines them into a relative instability score. These hotspots include simple sequence repeats, repeat mediated deletions, and short repeat sequences. This code updates and improves upon the last version of the [EFM calculator](https://github.com/barricklab/efm-calculator).
 
-`efmcalculator` supports multifasta, genbank, or csv files as input and accepts parameters from the command line. It also supports the scanning of both linear and circular sequences. It defaults to a pairwise comparison strategy (all occurrences of a repeat are compared with all other occurrences), but it also contains an option for a linear comparison strategy (each occurrence of a repeat is only compared with the next occurrence in the sequence) to accelerate the analysis of large sequences.
+`efmcalculator2` supports multifasta, genbank, or csv files as input and accepts parameters from the command line. It also supports the scanning of both linear and circular sequences. It defaults to a pairwise comparison strategy (all occurrences of a repeat are compared with all other occurrences), but it also contains an option for a linear comparison strategy (each occurrence of a repeat is only compared with the next occurrence in the sequence) to accelerate the analysis of large sequences.
 
 
 # Installation
 The EFM Calculator can be accessed as a free web tool at efm2-beta.streamlit.app. It is limited to 50000 bases to ensure the app remains performant for other users.
 It can be installed and run locally below without such base restriction.
 
 ## From pip:
-`pip install efmcalculator` or clone this repository and `pip install ./` from the root of the repository.
+`pip install efmcalculator2` or clone this repository and `pip install ./` from the root of the repository.
 
 # Command Line Usage
 - -h: help
@@ -24,17 +24,17 @@ It can be installed and run locally below without such base restriction.
 - -v: verbose. 0 (silent), 1 (basic information), 2 (debug)
 - --summary: saves only aggrigate results, useful for very tall inputs
 
-Print efmcalculator help:
+Print efmcalculator2 help:
 ```
-efmcalculator -h
+efmcalculator2 -h
 ```
 
-Run efmcalculator on all sequences in a FASTA file using the pairwise strategy and print output to csv files within an output folder:
+Run efmcalculator2 on all sequences in a FASTA file using the pairwise strategy and print output to csv files within an output folder:
 ```
-efmcalculator -i “input.fasta” -o “output_folder”
+efmcalculator2 -i "input.fasta" -o "output_folder"
 ```
 
-Run efmcalculator on all sequences in a FASTA file, outputing to the folder output_folder, while treating the input as circular, searching with a linear pattern, and printing debug information:
+Run efmcalculator2 on all sequences in a FASTA file, outputting to the folder output_folder, while treating the input as circular, searching with a linear pattern, and printing debug information:
 ```
-efmcalculator -i “input.fasta” -o “output_folder” -c -s “linear” -v 2
+efmcalculator2 -i "input.fasta" -o "output_folder" -c -s "linear" -v 2
 ```
diff --git a/efmcalculator/StateMachine.py → efmcalculator2/StateMachine.py b/efmcalculator/StateMachine.py → efmcalculator2/StateMachine.py
@@ -10,6 +10,8 @@
 import multiprocessing as mp
 from .pipeline.mutation_rates import rip_score
 from .webapp.SequenceState import SequenceState
+from .utilities import sanitize_filename
+from copy import deepcopy
 
 class ThreadSafeBar(Bar):
     def __init__(self, *args, **kwargs):
@@ -53,10 +55,13 @@ def import_sequences(self, sequences, max_size=None, webapp = False):
         """Import newly uploaded sequences while retaining state of existing sequences"""
         # Import sequences without overwriting old ones
         new = {seq._originhash: seq for seq in sequences}
+        retained_states = {}
         for key in new:
             if key in self.user_sequences:
                 new[key] = self.user_sequences[key]
-        if new == self.user_sequences:
+                if webapp:
+                    retained_states[key] = deepcopy(self.sequencestates[key])
+        if new.keys() == self.user_sequences.keys():
             return
         self.user_sequences = new
 
@@ -65,7 +70,9 @@ def import_sequences(self, sequences, max_size=None, webapp = False):
 
         # Make webapp states
         if webapp:
-            self.sequencestates = {key: SequenceState(value) for key, value in self.user_sequences.items()}
+            self.sequencestates = {key: SequenceState(value) for key, value in self.user_sequences.items() if key not in retained_states.keys()}
+            self.sequencestates.update(retained_states)
+
 
         # Update sequence names
         self.named_sequences = {}
@@ -78,6 +85,7 @@ def import_sequences(self, sequences, max_size=None, webapp = False):
             self.named_sequences[sequence_name] = seqhash
 
     def predict_tall(self, outpath, strategy, filetype, threads, keepmem=False, summaryonly=False):
+        outpath = sanitize_filename(outpath)
         samples = []
         for seqname in self.named_sequences:
             seqhash = self.named_sequences[seqname]
@@ -117,6 +125,7 @@ def predict_tall(self, outpath, strategy, filetype, threads, keepmem=False, summ
                 summary_df.write_csv(summarypath)
 
     def save_results(self, folderpath, prediction_style = None, filetype = "parquet", summaryonly=False):
+        folderpath = sanitize_filename(folderpath)
         summary_df = pl.DataFrame([
             pl.Series("name", [], dtype=pl.String),
             pl.Series("ssr_sum", [], dtype=pl.Float64),
@@ -157,7 +166,7 @@ def save_results(self, folderpath, prediction_style = None, filetype = "parquet"
             srss = seqobj.srss.select(pl.exclude(["predid", "annotationobjects"]))
             rmds = seqobj.rmds.select(pl.exclude(["predid", "annotationobjects"]))
 
-            folder = os.path.join(folderpath, f"{seqname}")
+            folder = os.path.join(folderpath, sanitize_filename(f"{seqname}"))
             path = pathlib.Path(folder)
             path.mkdir(parents=True)
             if filetype == "parquet":

diff --git a/efmcalculator/__init__.py → efmcalculator2/__init__.py b/efmcalculator/__init__.py → efmcalculator2/__init__.py
diff --git a/efmcalculator/cli.py → efmcalculator2/cli.py b/efmcalculator/cli.py → efmcalculator2/cli.py
@@ -10,7 +10,6 @@
 from Bio.SeqRecord import SeqRecord
 
 from .utilities import (
-    is_path_creatable,
     is_pathname_valid,
 )
 from .ingest.EFMSequence import EFMSequence
@@ -160,7 +159,9 @@ def main():
     elif not is_pathname_valid(args.outpath):
         logger.error(f"File {args.outpath} is not a valid path.")
         exit(1)
-    elif not is_path_creatable(args.outpath):
+    try:
+        os.makedirs(args.outpath, exist_ok=True)
+    except:
         logger.error(f"Cannot write to {args.outpath}")
         exit(1)
 
@@ -201,6 +202,8 @@ def main():
 
     # Unpack sequences into list ---------
     sequences = list(sequences)
+    for seq in sequences:
+        seq.oneindex = True
 
     # Run EFM Calculator ----------------
     statemachine = StateMachine()
@@ -234,7 +237,7 @@ def main():
     t_min, t_sec = divmod(t_sec, 60)
     t_hour, t_min = divmod(t_min, 60)
     logger.info(
-        f"EFMCalculator completed in {t_hour:02d}h:{t_min:02d}m:{t_sec:02d}s:{t_msec:02d}ms"
+        f"EFMCalculator2 completed in {t_hour:02d}h:{t_min:02d}m:{t_sec:02d}s:{t_msec:02d}ms"
     )
 
 if __name__ == "__main__":

diff --git a/efmcalculator/constants.py → efmcalculator2/constants.py b/efmcalculator/constants.py → efmcalculator2/constants.py
diff --git a/efmcalculator/data/__init__.py → efmcalculator2/data/__init__.py b/efmcalculator/data/__init__.py → efmcalculator2/data/__init__.py
diff --git a/efmcalculator/data/gam_df.csv → efmcalculator2/data/gam_df.csv b/efmcalculator/data/gam_df.csv → efmcalculator2/data/gam_df.csv