Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion bin/mid-cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
help=(
"The approach to use when making a consensus if there are no reads "
"covering a site. A value of 'N' means to use an ambiguous N nucleotide "
"code, whereas a value fo 'reference' means to take the base from the "
"code, whereas a value of 'reference' means to take the base from the "
"reference sequence."
),
)
Expand All @@ -71,6 +71,27 @@
list(chain.from_iterable(args.referenceId)) if args.referenceId else None
)

# The logic of the below is a little hard to follow. There are three steps:
#
# 1. Make a read analysis with a 'run' method (which we don't call yet).
#
# 2. Set up a Cluster Analysis, giving it the read analysis (so the running cluster
#    analysis can get some parameters and produce reporting information when the
#    read analysis is run).
#
# 3. Call the read analysis 'run' method (see 1 above), passing it the function from
# the cluster analysis that can analyze a reference.
#
# Things are done in this (seemingly?) convoluted way because I implemented several
# methods for doing the main work, of which the cluster analysis is just one. They
# all needed to work in the same way, and so I separated the organizational things
# (like making output directories and collecting final / overall results) into the
# common read analysis class, keeping the code that does the actual analysis for
# each input alignment file / reference pair separate. If you look at the 'run' method of the
# ReadAnalysis class you'll see it's basically just a loop over alignment file /
# reference pairs, each with some setup and then calling the cluster analysis
# function. Then some final gathering of overall results.

analysis = ReadAnalysis(
args.sampleName,
list(chain.from_iterable(args.alignmentFile)),
Expand Down
4 changes: 2 additions & 2 deletions src/midtools/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pathlib import Path
from itertools import chain
from collections import defaultdict
from typing import Optional, Callable
from typing import Callable

from dark.dna import compareDNAReads
from dark.process import Executor
Expand Down Expand Up @@ -71,7 +71,7 @@ def __init__(
alignmentFiles: list[str],
referenceGenomeFiles: list[str],
outputDir: str,
referenceIds: Optional[list[str]] = None,
referenceIds: list[str] | None = None,
minReads: int = DEFAULT_MIN_READS,
homogeneousCutoff: float = DEFAULT_HOMOGENEOUS_CUTOFF,
plotSAM: bool = False,
Expand Down
26 changes: 14 additions & 12 deletions src/midtools/clusterAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from midtools.analysis import ReadAnalysis
from midtools.clusters import ReadCluster, ReadClusters
from midtools.match import matchToString
from midtools.offsets import analyzeOffets, findSignificantOffsets, OffsetBases
from midtools.offsets import analyzeOffsets, findSignificantOffsets, OffsetBases
from midtools.plotting import plotBaseFrequencies, plotConsistentComponents
from midtools.read import AlignedRead
from midtools.reference import Reference
Expand Down Expand Up @@ -349,9 +349,10 @@ def saveConsensuses(
" Saving component %d consensus info to %s"
% (count, consensusFilename, count, infoFilename)
)
with open(consensusFilename, "w") as consensusFp, open(
infoFilename, "w"
) as infoFp:
with (
open(consensusFilename, "w") as consensusFp,
open(infoFilename, "w") as infoFp,
):
# First write the reference sequence for this component.
(reference,) = list(
FastaReads(outputDir / ("reference-component-%d.fasta" % count))
Expand Down Expand Up @@ -481,7 +482,7 @@ def __init__(

def analyzeReference(self, reference: Reference) -> None:
"""
Analyze a reference.
Analyze a reference. This is called by the read analysis.

@param reference: A C{Reference} instance to analyze.
"""
Expand Down Expand Up @@ -517,10 +518,9 @@ def findConnectedComponents(self, reference: Reference) -> list[Component]:
Find all connected components.

@param reference: A C{Reference} instance to analyze.
@return: A C{list} of C{Component} instances,
sorted by component (the smallest offset is used for sorting
so this gives the components from left to right along the
reference genome.
@return: A C{list} of C{Component} instances, sorted by component. The smallest
offset is used for sorting, which arranges the components from left to right
along the reference genome.
"""
significantReads = set(
read for read in reference.alignedReads if read.significantOffsets
Expand Down Expand Up @@ -696,7 +696,7 @@ def scoreCcs(
return result

def partitionCcs(
scoredCcs: list[tuple[float, int, ConsistentComponent]]
scoredCcs: list[tuple[float, int, ConsistentComponent]],
) -> tuple[
set[tuple[int, ConsistentComponent]], set[tuple[int, ConsistentComponent]]
]:
Expand Down Expand Up @@ -809,8 +809,10 @@ def partitionCcs(

# Get the base counts at each offset, from the full set of reads minus those
# we're not using.
(wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = analyzeOffets(
referenceLength, set(reference.alignedReads) - unwantedReads
(wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset, _) = (
analyzeOffsets(
referenceLength, set(reference.alignedReads) - unwantedReads
)
)
remainingOffsets = sorted(set(range(referenceLength)) - offsetsDone)

Expand Down
59 changes: 22 additions & 37 deletions src/midtools/connectedComponentAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from dark.sam import samfile

from midtools.analysis import ReadAnalysis
from midtools.offsets import analyzeOffets, findSignificantOffsets
from midtools.offsets import analyzeOffsets, findSignificantOffsets
from midtools.match import matchToString
from midtools.plotting import plotBaseFrequencies, plotConsistentComponents
from midtools.utils import (
Expand Down Expand Up @@ -107,8 +107,7 @@ def consensusSequence(self, componentOffsets, referenceSequence, infoFp):
self.nucleotides[offset],
referenceBase,
infoFp,
"WARNING: consensus draw at offset %d" % offset
+ " %(baseCounts)s.",
f"WARNING: consensus draw at offset {offset} %(baseCounts)s.",
)
else:
base = "-"
Expand Down Expand Up @@ -323,9 +322,10 @@ def saveConsensuses(self, outputDir, count, referenceSequence, verbose):
" Saving component %d consensus info to %s"
% (count, consensusFilename, count, infoFilename)
)
with open(consensusFilename, "w") as consensusFp, open(
infoFilename, "w"
) as infoFp:
with (
open(consensusFilename, "w") as consensusFp,
open(infoFilename, "w") as infoFp,
):
# First write the reference sequence for this component.
(reference,) = list(
FastaReads(join(outputDir, "reference-component-%d.fasta" % count))
Expand Down Expand Up @@ -804,8 +804,8 @@ def bestConsistentComponent(component, reference, fp):
bestCc.nucleotides[offset],
referenceBase,
infoFp,
(" WARNING: base count draw at offset %d " % offset)
+ " %(baseCounts)s.",
f" WARNING: base count draw at offset {offset} "
"%(baseCounts)s.",
)
consensus[offset] = base
offsetsDone.add(offset)
Expand Down Expand Up @@ -864,7 +864,7 @@ def bestConsistentComponent(component, reference, fp):
consensusReadCountAtOffset,
consensusWantedReadsBaseCountAtOffset,
_,
) = analyzeOffets(genomeLength, set(alignedReads) - unwantedCcReads)
) = analyzeOffsets(genomeLength, set(alignedReads) - unwantedCcReads)

depthFile = join(outputDir, "consensus-depth.txt")
self.report(" Writing consensus depth information to", depthFile)
Expand All @@ -891,11 +891,8 @@ def bestConsistentComponent(component, reference, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down Expand Up @@ -934,11 +931,8 @@ def bestConsistentComponent(component, reference, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down Expand Up @@ -1258,8 +1252,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
bestCc.nucleotides[offset],
referenceBase,
infoFp,
(" WARNING: base count draw at offset %d " % offset)
+ " %(baseCounts)s.",
f" WARNING: base count draw at offset {offset} "
"%(baseCounts)s.",
)
if base == referenceBase:
mismatch = ""
Expand All @@ -1268,11 +1262,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
baseCountAtOffset[offset],
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d " % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
mismatch = (
" (mismatch: reference has %s, all-read "
Expand Down Expand Up @@ -1313,7 +1304,7 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
# aligned reads minus the reads we don't want because they're
# in a consistent component that is not the best for this
# non-reference sequence.
consensusReadCountAtOffset, wantedReadBaseCountAtOffset, _ = analyzeOffets(
consensusReadCountAtOffset, wantedReadBaseCountAtOffset, _ = analyzeOffsets(
genomeLength, set(alignedReads) - unwantedCcReads
)

Expand Down Expand Up @@ -1344,11 +1335,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down Expand Up @@ -1387,11 +1375,8 @@ def bestConsistentComponent(component, referenceCcIndex, fp):
baseCount,
referenceBase,
infoFp,
(
" WARNING: consensus base count draw at "
"offset %d" % offset
)
+ " %(baseCounts)s.",
f" WARNING: consensus base count draw at offset {offset} "
"%(baseCounts)s.",
)
print(
" Offset %d: %s from nucleotides %s"
Expand Down
51 changes: 19 additions & 32 deletions src/midtools/offsets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from typing import Iterator, Optional

from midtools.read import AlignedRead
Expand Down Expand Up @@ -161,48 +160,36 @@ def highestFrequenciesMultiple(a: OffsetBases, b: OffsetBases) -> Optional[float
return orderedCounts[0][1] / orderedCounts[1][1]


def analyzeOffets(
def analyzeOffsets(
genomeLength: int, alignedReads: list[AlignedRead]
) -> tuple[list[int], list[Counter[str]], list[set[AlignedRead]]]:
"""
Analyze the aligned reads.

@param genomeLength: The C{int} length of the genome the reads were
aligned to.
@param genomeLength: The C{int} length of the genome the reads were aligned to.
@param alignedReads: A C{list} of C{AlignedRead} instances.
@return: A tuple of C{list}s (readCountAtOffset, baseCountAtOffset,
readsAtOffset), each indexed from zero to the genome length.
"""
readCountAtOffset = dict()
baseCountAtOffset = dict()
readsAtOffset = dict()

offsets = list(range(genomeLength))
with ProcessPoolExecutor() as executor:
for offset, (reads, counts) in zip(
offsets, executor.map(processOffset, alignedReads, offsets)
):
baseCountAtOffset[offset] = counts
readCountAtOffset[offset] = sum(counts.values())
readsAtOffset[offset] = reads

return (
[x for _, x in sorted(readCountAtOffset.items())],
[x for _, x in sorted(baseCountAtOffset.items())],
[x for _, x in sorted(readsAtOffset.items())],
)
readCountAtOffset = []
baseCountAtOffset = []
readsAtOffset = []


def processOffset(alignedReads, offset):
nucleotides = set("ACGT")
reads = set()
counts: Counter[str] = Counter({n: 0 for n in nucleotides})
for read in alignedReads:
base = read.base(offset)
if base in nucleotides:
counts[base] += 1
reads.add(read)
return reads, counts

for offset in range(genomeLength):
reads = set()
counts: Counter[str] = Counter()
for read in alignedReads:
base = read.base(offset)
if base in nucleotides:
counts[base] += 1
reads.add(read)
baseCountAtOffset.append(counts)
readCountAtOffset.append(sum(counts.values()))
readsAtOffset.append(reads)

return readCountAtOffset, baseCountAtOffset, readsAtOffset


def findSignificantOffsets(
Expand Down
6 changes: 3 additions & 3 deletions src/midtools/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from dark.sam import SAMFilter, PaddedSAM

from midtools.offsets import analyzeOffets, findSignificantOffsets
from midtools.offsets import analyzeOffsets, findSignificantOffsets
from midtools.read import AlignedRead


Expand Down Expand Up @@ -120,7 +120,7 @@ def parseCommandLineOptions(
for alignedRead in executor.map(constructRead, queries):
alignedReads.append(alignedRead)

readCountAtOffset, baseCountAtOffset, readsAtOffset = analyzeOffets(
readCountAtOffset, baseCountAtOffset, readsAtOffset = analyzeOffsets(
genomeLength, alignedReads
)

Expand Down Expand Up @@ -227,5 +227,5 @@ def addAnalysisCommandLineOptions(parser: ArgumentParser) -> None:
"--verbose",
type=int,
default=0,
help=("The integer verbosity level (0 = no output, 1 = some output, etc)."),
help="The integer verbosity level (0 = no output, 1 = some output, etc).",
)
6 changes: 2 additions & 4 deletions src/midtools/read.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from __future__ import annotations

from typing import Optional

from dark.reads import Read


Expand All @@ -16,7 +14,7 @@ class AlignedRead(Read):
"""

def __init__(
self, id_: str, sequence: str, alignment: Optional[dict[str, str | bool]] = None
self, id_: str, sequence: str, alignment: dict[str, str | bool] | None = None
) -> None:
self.significantOffsets: dict[int, str] = {}
self._originalLength = len(sequence)
Expand Down Expand Up @@ -119,7 +117,7 @@ def setSignificantOffsets(self, significantOffsets: list[int]) -> None:
newSignificantOffsets[offset] = base
self.significantOffsets = newSignificantOffsets

def base(self, n: int) -> Optional[str]:
def base(self, n: int) -> str | None:
"""
Get the nucleotide base at a given offset.

Expand Down
Loading