Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion bin/mid-cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
help=(
"The approach to use when making a consensus if there are no reads "
"covering a site. A value of 'N' means to use an ambiguous N nucleotide "
"code, whereas a value fo 'reference' means to take the base from the "
"code, whereas a value of 'reference' means to take the base from the "
"reference sequence."
),
)
Expand All @@ -71,6 +71,27 @@
list(chain.from_iterable(args.referenceId)) if args.referenceId else None
)

# The logic of the below is a little hard to follow. There are three steps:
#
# 1. Make a read analysis with a 'run' method (which we don't call yet).
#
# 2. Set up a Cluster Analysis, giving it the read analysis (so the running cluster
#    analysis can get some parameters and produce reporting information when the
#    read analysis is run).
#
# 3. Call the read analysis 'run' method (see 1 above), passing it the function from
# the cluster analysis that can analyze a reference.
#
# Things are done in this (seemingly?) convoluted way because I implemented several
# methods for doing the main work, of which the cluster analysis is just one. They
# all needed to work in the same way, and so I separated the organizational things
# (like making output directories and collecting final / overall results) into the
# common read analysis class, keeping the code that does the actual analysis for
# each input alignment file / reference pair separate. If you look at the 'run' method of the
# ReadAnalysis class you'll see it's basically just a loop over alignment file /
# reference pairs, each with some setup and then calling the cluster analysis
# function. Then some final gathering of overall results.

analysis = ReadAnalysis(
args.sampleName,
list(chain.from_iterable(args.alignmentFile)),
Expand Down
4 changes: 2 additions & 2 deletions src/midtools/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from itertools import chain
from collections import defaultdict
from typing import Optional, Callable
from typing import Callable

from dark.dna import compareDNAReads
from dark.process import Executor
Expand Down Expand Up @@ -71,7 +71,7 @@ def __init__(
alignmentFiles: list[str],
referenceGenomeFiles: list[str],
outputDir: str,
referenceIds: Optional[list[str]] = None,
referenceIds: list[str] | None = None,
minReads: int = DEFAULT_MIN_READS,
homogeneousCutoff: float = DEFAULT_HOMOGENEOUS_CUTOFF,
plotSAM: bool = False,
Expand Down
26 changes: 14 additions & 12 deletions src/midtools/clusterAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from midtools.analysis import ReadAnalysis
from midtools.clusters import ReadCluster, ReadClusters
from midtools.match import matchToString
from midtools.offsets import analyzeOffets, findSignificantOffsets, OffsetBases
from midtools.offsets import analyzeOffsets, findSignificantOffsets, OffsetBases
from midtools.plotting import plotBaseFrequencies, plotConsistentComponents
from midtools.read import AlignedRead
from midtools.reference import Reference
Expand Down Expand Up @@ -349,9 +349,10 @@ def saveConsensuses(
" Saving component %d consensus info to %s"
% (count, consensusFilename, count, infoFilename)
)
with open(consensusFilename, "w") as consensusFp, open(
infoFilename, "w"
) as infoFp:
with (
open(consensusFilename, "w") as consensusFp,
open(infoFilename, "w") as infoFp,
):
# First write the reference sequence for this component.
(reference,) = list(
FastaReads(outputDir / ("reference-component-%d.fasta" % count))
Expand Down Expand Up @@ -481,7 +482,7 @@ def __init__(

def analyzeReference(self, reference: Reference) -> None:
"""
Analyze a reference.
Analyze a reference. This is called by the read analysis.

@param reference: A C{Reference} instance to analyze.
"""
Expand Down Expand Up @@ -517,10 +518,9 @@ def findConnectedComponents(self, reference: Reference) -> list[Component]:
Find all connected components.

@param reference: A C{Reference} instance to analyze.
@return: A C{list} of C{Component} instances,
sorted by component (the smallest offset is used for sorting
so this gives the components from left to right along the
reference genome.
@return: A C{list} of C{Component} instances, sorted by component. The smallest
offset is used for sorting, which arranges the components from left to right
along the reference genome.
"""
significantReads = set(
read for read in reference.alignedReads if read.significantOffsets
Expand Down Expand Up @@ -696,7 +696,7 @@ def scoreCcs(
return result

def partitionCcs(
scoredCcs: list[tuple[float, int, ConsistentComponent]]
scoredCcs: list[tuple[float, int, ConsistentComponent]],
) -> tuple[
set[tuple[int, ConsistentComponent]], set[tuple[int, ConsistentComponent]]
]:
Expand Down Expand Up @@ -809,8 +809,10 @@ def partitionCcs(

# Get the base counts at each offset, from the full set of reads minus those
# we're not using.
(wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = analyzeOffets(
referenceLength, set(reference.alignedReads) - unwantedReads
(wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = (
analyzeOffsets(
referenceLength, set(reference.alignedReads) - unwantedReads
)
)
remainingOffsets = sorted(set(range(referenceLength)) - offsetsDone)

Expand Down
59 changes: 22 additions & 37 deletions src/midtools/connectedComponentAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dark.sam import samfile

from midtools.analysis import ReadAnalysis
from midtools.offsets import analyzeOffets, findSignificantOffsets
from midtools.offsets import analyzeOffsets, findSignificantOffsets
from midtools.match import matchToString
from midtools.plotting import plotBaseFrequencies, plotConsistentComponents
from midtools.utils import (
Expand Down Expand Up @@ -107,8 +107,7 @@ def consensusSequence(self, componentOffsets, referenceSequence, infoFp):
self.nucleotides[offset],
referenceBase,
infoFp,
"WARNING: consensus draw at offset %d" % offset
+ " %(baseCounts)s.",
f"WARNING: consensus draw at offset {offset} %(baseCounts)s.",
)
else:
base = "-"
Expand Down Expand Up @@ -323,9 +322,10 @@ def saveConsensuses(self, outputDir, count, referenceSequence, verbose):
" Saving component %d consensus info to %s"
% (count, consensusFilename, count, infoFilename)
)
with open(consensusFilename, "w") as consensusFp, open(
infoFilename, "w"
) as infoFp:
with (
open(consensusFilename, "w") as consensusFp,
open(infoFilename, "w") as infoFp,
):
# First write the reference sequence for this component.
(reference,) = list(
FastaReads(join(outputDir, "reference-component-%d.fasta" % count))
Expand Down Expand Up @@ -804,8 +804,8 @@ def bestConsistentComponent(component, reference, fp):
bestCc.nucleotides[offset],
referenceBase,
infoFp,
(" WARNING: base count draw at offset %d " % offset)
+ " %(baseCounts)s.",
f" WARNING: base count draw at offset {offset} "
"%(baseCounts)s.",
)
consensus[offset] = base
offsetsDone.add(offset)
Expand Down Expand Up @@ -864,7 +864,7 @@ def bestConsistentComponent(component, reference, fp):
consensusReadCountAtOffset,
consensusWantedReadsBaseCountAtOffset,
_,
) = analyzeOffets(genomeLength, set(alignedReads) - unwantedCcReads)
) = analyzeOffsets(genomeLength, set(alignedReads) - unwantedCcReads)

depthFile = join(outputDir, "consensus-depth.txt")
self.report(" Writing consensus depth information to", depthFile)
Expand All @@ -891,11 +891,8 @@ def bestConsistentComponent(component, reference, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down Expand Up @@ -934,11 +931,8 @@ def bestConsistentComponent(component, reference, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down Expand Up @@ -1258,8 +1252,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
bestCc.nucleotides[offset],
referenceBase,
infoFp,
(" WARNING: base count draw at offset %d " % offset)
+ " %(baseCounts)s.",
f" WARNING: base count draw at offset {offset} "
"%(baseCounts)s.",
)
if base == referenceBase:
mismatch = ""
Expand All @@ -1268,11 +1262,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
baseCountAtOffset[offset],
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d " % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
mismatch = (
" (mismatch: reference has %s, all-read "
Expand Down Expand Up @@ -1313,7 +1304,7 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
# aligned reads minus the reads we don't want because they're
# in a consistent component that is not the best for this
# non-reference sequence.
consensusReadCountAtOffset, wantedReadBaseCountAtOffset, _ = analyzeOffets(
consensusReadCountAtOffset, wantedReadBaseCountAtOffset, _ = analyzeOffsets(
genomeLength, set(alignedReads) - unwantedCcReads
)

Expand Down Expand Up @@ -1344,11 +1335,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down Expand Up @@ -1387,11 +1375,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down
51 changes: 19 additions & 32 deletions src/midtools/offsets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from typing import Iterator, Optional

from midtools.read import AlignedRead
Expand Down Expand Up @@ -161,48 +160,36 @@ def highestFrequenciesMultiple(a: OffsetBases, b: OffsetBases) -> Optional[float
return orderedCounts[0][1] / orderedCounts[1][1]


def analyzeOffets(
def analyzeOffsets(
genomeLength: int, alignedReads: list[AlignedRead]
) -> tuple[list[int], list[Counter[str]], list[set[AlignedRead]]]:
"""
Analyze the aligned reads.

@param genomeLength: The C{int} length of the genome the reads were
aligned to.
@param genomeLength: The C{int} length of the genome the reads were aligned to.
@param alignedReads: A C{list} of C{AlignedRead} instances.
@return: A tuple of C{list}s (readCountAtOffset, baseCountAtOffset,
readsAtOffset), each indexed from zero to the genome length.
"""
readCountAtOffset = dict()
baseCountAtOffset = dict()
readsAtOffset = dict()

offsets = list(range(genomeLength))
with ProcessPoolExecutor() as executor:
for offset, (reads, counts) in zip(
offsets, executor.map(processOffset, alignedReads, offsets)
):
baseCountAtOffset[offset] = counts
readCountAtOffset[offset] = sum(counts.values())
readsAtOffset[offset] = reads

return (
[x for _, x in sorted(readCountAtOffset.items())],
[x for _, x in sorted(baseCountAtOffset.items())],
[x for _, x in sorted(readsAtOffset.items())],
)
readCountAtOffset = []
baseCountAtOffset = []
readsAtOffset = []


def processOffset(alignedReads, offset):
nucleotides = set("ACGT")
reads = set()
counts: Counter[str] = Counter({n: 0 for n in nucleotides})
for read in alignedReads:
base = read.base(offset)
if base in nucleotides:
counts[base] += 1
reads.add(read)
return reads, counts

for offset in range(genomeLength):
reads = set()
counts: Counter[str] = Counter()
for read in alignedReads:
base = read.base(offset)
if base in nucleotides:
counts[base] += 1
reads.add(read)
baseCountAtOffset.append(counts)
readCountAtOffset.append(sum(counts.values()))
readsAtOffset.append(reads)

return readCountAtOffset, baseCountAtOffset, readsAtOffset


def findSignificantOffsets(
Expand Down
6 changes: 3 additions & 3 deletions src/midtools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from dark.sam import SAMFilter, PaddedSAM

from midtools.offsets import analyzeOffets, findSignificantOffsets
from midtools.offsets import analyzeOffsets, findSignificantOffsets
from midtools.read import AlignedRead


Expand Down Expand Up @@ -120,7 +120,7 @@ def parseCommandLineOptions(
for alignedRead in executor.map(constructRead, queries):
alignedReads.append(alignedRead)

readCountAtOffset, baseCountAtOffset, readsAtOffset = analyzeOffets(
readCountAtOffset, baseCountAtOffset, readsAtOffset = analyzeOffsets(
genomeLength, alignedReads
)

Expand Down Expand Up @@ -227,5 +227,5 @@ def addAnalysisCommandLineOptions(parser: ArgumentParser) -> None:
"--verbose",
type=int,
default=0,
help=("The integer verbosity level (0 = no output, 1 = some output, etc)."),
help="The integer verbosity level (0 = no output, 1 = some output, etc).",
)
6 changes: 2 additions & 4 deletions src/midtools/read.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import Optional

from dark.reads import Read


Expand All @@ -16,7 +14,7 @@ class AlignedRead(Read):
"""

def __init__(
self, id_: str, sequence: str, alignment: Optional[dict[str, str | bool]] = None
self, id_: str, sequence: str, alignment: dict[str, str | bool] | None = None
) -> None:
self.significantOffsets: dict[int, str] = {}
self._originalLength = len(sequence)
Expand Down Expand Up @@ -119,7 +117,7 @@ def setSignificantOffsets(self, significantOffsets: list[int]) -> None:
newSignificantOffsets[offset] = base
self.significantOffsets = newSignificantOffsets

def base(self, n: int) -> Optional[str]:
def base(self, n: int) -> str | None:
"""
Get the nucleotide base at a given offset.

Expand Down
Loading