From 2bae8c921aa1a9c4f8df28e10bebeb4fca817025 Mon Sep 17 00:00:00 2001
From: Maximilian Stammnitz <MaximilianStammnitz@users.noreply.github.com>
Date: Sat, 11 Apr 2026 21:55:48 +0200
Subject: [PATCH 1/6] Update process_gatk.R

bug fix: remove any remainder indels
---
 .../dmsanalysis/process_gatk/templates/process_gatk.R | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R b/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R
index 6abc053..fc52f9e 100644
--- a/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R
+++ b/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R
@@ -115,6 +115,11 @@ filter_gatk_by_codon_library <- function(gatk_file_path, codon_library_path, out
     }) %>%
     ungroup()
 
+  # Remove any remainder indels  
+  if(length(grep("ins|del", as.character(unlist(filtered_gatk[,"pos_mut"])))) != 0){
+    filtered_gatk <- filtered_gatk[-grep("ins|del", as.character(unlist(filtered_gatk[,"pos_mut"]))),]
+  }
+    
   # Write the filtered GATK table to the output file path
   write.csv(filtered_gatk, file = output_file_path, row.names = FALSE)
 }
@@ -192,10 +197,10 @@ complete_prefiltered_gatk <- function(possible_nnk_path, prefiltered_gatk_path,
   # Merge both dataframes based on the codon_mut column (full join to include all)
   merged_data <- full_join(prefiltered_gatk, possible_nnk, by = "codon_mut")
 
-  # Fill missing values in counts_per_cov and counts with 0.0000001
+  # Fill missing values in counts_per_cov and counts with 0.00000001
   merged_data <- merged_data %>%
-    mutate(counts_per_cov = ifelse(is.na(counts_per_cov), 0.0000001, counts_per_cov),
-           counts = ifelse(is.na(counts), 0.000001, counts))
+    mutate(counts_per_cov = ifelse(is.na(counts_per_cov), 0.00000001, counts_per_cov),
+           counts = ifelse(is.na(counts), 0.00000001, counts))
 
   # Calculate Hamming distance (varying_bases) and mutation details (aa_mut, pos_mut)
   merged_data <- merged_data %>%

From 145a252559e3a6af01284a03452e4dc2f95f84fb Mon Sep 17 00:00:00 2001
From: Maximilian Stammnitz <MaximilianStammnitz@users.noreply.github.com>
Date: Sat, 11 Apr 2026 21:59:22 +0200
Subject: [PATCH 2/6] Update main.nf

---
 modules/local/bamprocessing/bam_filter/main.nf | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modules/local/bamprocessing/bam_filter/main.nf b/modules/local/bamprocessing/bam_filter/main.nf
index 6cef905..0d7f770 100644
--- a/modules/local/bamprocessing/bam_filter/main.nf
+++ b/modules/local/bamprocessing/bam_filter/main.nf
@@ -25,8 +25,6 @@ process BAMFILTER_DMS {
     samtools view -h -F 4 -F 256 -q 30 $bam | \
     samtools view -h | \
     awk '{if(\$6 !~ /I/ && \$6 !~ /D/ && \$6 !~ /N/) print \$0}' | \
-    samtools view -h | \
-    awk '{for(i=1;i<=NF;i++) if(\$i ~ /^NM:i:/ && \$i != "NM:i:0") {print \$0; next}} \$1 ~ /^@/' | \
     samtools view -bS > ${meta.id}_filtered.bam
 
     cat <<-END_VERSIONS > versions.yml

From e5b00c324df3b7acd1df527654af8d99d7cc4724 Mon Sep 17 00:00:00 2001
From: Maximilian Stammnitz <MaximilianStammnitz@users.noreply.github.com>
Date: Wed, 15 Apr 2026 16:26:36 +0200
Subject: [PATCH 3/6] Update main.nf

---
 modules/local/gatk/saturationmutagenesis/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/gatk/saturationmutagenesis/main.nf b/modules/local/gatk/saturationmutagenesis/main.nf
index 2616347..209cfc2 100644
--- a/modules/local/gatk/saturationmutagenesis/main.nf
+++ b/modules/local/gatk/saturationmutagenesis/main.nf
@@ -35,7 +35,7 @@ process GATK_SATURATIONMUTAGENESIS {
         -R $wt_seq \
         --orf \$start_stop_codon \
         --paired-mode false \
-        --min-q 30 \
+        --min-q 40 \
         --min-variant-obs $min_counts \
         -O gatk_output
 

From 04ba4b4597a39bf2df819f3a3cd3c32c71224a69 Mon Sep 17 00:00:00 2001
From: Maximilian Stammnitz <MaximilianStammnitz@users.noreply.github.com>
Date: Wed, 15 Apr 2026 16:36:28 +0200
Subject: [PATCH 4/6] Update main.nf

Added the --fastq_qmax and --fastq_qmaxout flags with their default values set to 41 (typical for Illumina NovaSeq); these should also be open parameters set in the nf-core pipeline config.

These parameters may be useful for tweaking when dealing with inputs from very high quality reads like AVITI's platforms.
---
 modules/local/bamprocessing/premerge/main.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/local/bamprocessing/premerge/main.nf b/modules/local/bamprocessing/premerge/main.nf
index 4a66f85..1081163 100644
--- a/modules/local/bamprocessing/premerge/main.nf
+++ b/modules/local/bamprocessing/premerge/main.nf
@@ -24,7 +24,7 @@ process PREMERGE {
     samtools fastq -1 forward_reads.fastq -2 reverse_reads.fastq -0 /dev/null -s /dev/null -n $bam
 
     # Merge paired reads
-    vsearch --fastq_mergepairs forward_reads.fastq --reverse reverse_reads.fastq --fastqout merged_reads.fastq --fastq_minovlen 10 --fastq_allowmergestagger
+    vsearch --fastq_mergepairs forward_reads.fastq --reverse reverse_reads.fastq --fastqout merged_reads.fastq --fastq_minovlen 10 --fastq_allowmergestagger --fastq_qmax 41 --fastq_qmaxout 41
 
     # Re-align merged reads
     bwa index $wt_seq

From be6655a35b2e7cc0971522ffff6121aa842b42aa Mon Sep 17 00:00:00 2001
From: Maximilian Stammnitz <MaximilianStammnitz@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:18:05 +0200
Subject: [PATCH 5/6] Update and rename
 false-doubles_based_seq_error_correction.R to
 false-doubles_MLE_based_seq_error_correction.R

---
 ... => false-doubles_MLE_based_seq_error_correction.R} | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
 rename modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/{false-doubles_based_seq_error_correction.R => false-doubles_MLE_based_seq_error_correction.R} (92%)

diff --git a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_based_seq_error_correction.R b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R
similarity index 92%
rename from modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_based_seq_error_correction.R
rename to modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R
index 35ea63b..65b1add 100644
--- a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_based_seq_error_correction.R
+++ b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R
@@ -1,13 +1,13 @@
 #!/usr/bin/env Rscript
 
-## false-doubles based sequencing error correction in nf-core/deepmutscan
+## false-doubles MLE based sequencing error correction in nf-core/deepmutscan
 ## 30.03.2026
 
 
 ## --- Helper functions ---
 
 # function to run the false-doubles based sequencing error correction (nucleotide level)
-seq_error_correct_by_false_doubles <- function(input_count_path_raw, input_count_path_processed, output_file_path){
+seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_count_path_processed, output_file_path){
   
   ## load data (nucleotide-level counts from GATK)
   
@@ -178,9 +178,9 @@ seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_pa
 }
 
 ## --- Main functions ---
-seq_error_correct_by_false_doubles(input_count_path_raw = "intermediate_files/gatk/input_1_pe/gatk_output.variantCounts",
-                                   input_count_path_processed = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library.csv",
-                                   output_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv")
+seq_error_correct_by_false_doubles_MLE(input_count_path_raw = "intermediate_files/gatk/input_1_pe/gatk_output.variantCounts",
+                                       input_count_path_processed = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library.csv",
+                                       output_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv")
 
 seq_error_correct_counts_for_heatmaps(gatk_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv",
                                       aa_seq_file_path = "intermediate_files/aa_seq.txt",

From 722a25b67c9fe6bc3484b185b3e2a4568e6fadff Mon Sep 17 00:00:00 2001
From: Maximilian Stammnitz <MaximilianStammnitz@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:41:30 +0200
Subject: [PATCH 6/6] Add files via upload

---
 ...se-doubles_EB_based_seq_error_correction.R | 449 ++++++++++++++++++
 ...e-doubles_MLE_based_seq_error_correction.R | 200 ++++++--
 2 files changed, 610 insertions(+), 39 deletions(-)
 create mode 100644 modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R

diff --git a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R
new file mode 100644
index 0000000..a0b7e1e
--- /dev/null
+++ b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R
@@ -0,0 +1,449 @@
+#!/usr/bin/env Rscript
+
+## false-double mutant codon, empirical bayes (EB)-based sequencing error correction in nf-core/deepmutscan
+## 24.06.2026
+
+
+## --- Helper functions ---
+
+# function to run the false-doubles based sequencing error correction (nucleotide level)
+seq_error_correct_by_false_doubles_EB <- function(wt_path, input_count_path_raw, input_count_path_processed, output_file_path, seq_error_rate_path, codon_window){
+  
+  ## load data (nucleotide-level counts from GATK)
+  
+  # WT sequence
+  wt.seq <- readDNAStringSet(wt_path)
+  
+  # for the seq error rates, build a vector to be filled
+  seq.error.names <- c()
+  for(i in 1:nchar(wt.seq)){
+    tmp.wt <- strsplit(as.character(wt.seq), "")[[1]][i]
+    seq.error.names <- c(seq.error.names, paste0(i, ":", tmp.wt, ">", c("A", "T", "G", "C"))[-match(tmp.wt, c("A", "T", "G", "C"))])
+  }
+  seq.error.rate <- matrix(0, nrow = length(seq.error.names), ncol = 2)
+  rownames(seq.error.rate) <- seq.error.names
+  colnames(seq.error.rate) <- c("1nt counts/coverage", "1nt false counts/coverage")
+  
+  # Set the column names
+  colnames <- c("counts", "cov", "mean_length_variant_reads", "varying_bases",
+                "base_mut", "varying_codons", "codon_mut", "aa_mut", "pos_mut")
+  input.counts.raw <- read.table(input_count_path_raw, sep = "\t", header = FALSE, fill = TRUE, col.names = colnames)
+  input.counts.processed <- read.csv(input_count_path_processed)
+  
+  ## append key columns
+  input.counts.processed$counts_corrected <- input.counts.processed$counts
+  input.counts.processed$counts_per_cov_corrected <- input.counts.processed$counts_per_cov
+  
+  ## calculate some things upfront
+  
+  ### highest accessible codon
+  max.codon <- max(as.numeric(str_split_fixed(input.counts.raw$codon_mut, ":", 2)[,1]), na.rm = T)
+  
+  ### total distribution of false double mutant coverage decay (%) vs. codon-codon distance
+  cat("Estimating the distance-dependent coverage decay of false double mutants...\n")
+  all.false.doubles <- input.counts.raw
+  all.false.doubles <- all.false.doubles[which(all.false.doubles[,"varying_bases"] >= 3 & all.false.doubles[,"varying_bases"] < 5 & all.false.doubles[,"varying_codons"] == 2),,drop = F]
+  all.false.doubles$codon_dist <- vapply(regmatches(all.false.doubles[,"codon_mut"], gregexpr("\\d+(?=:)", all.false.doubles[,"codon_mut"], perl = TRUE)), function(z) abs(diff(as.integer(z))), integer(1))
+  
+  all.false.doubles.codons <- str_split_fixed(all.false.doubles$codon_mut, ", ", 2)
+  all.false.doubles.codons[,1] <- as.integer(sub(":.*", "", all.false.doubles.codons[,1]))
+  all.false.doubles.codons[,2] <- as.integer(sub(":.*", "", all.false.doubles.codons[,2]))
+  median.high.conf.coverage.per.pos <- rep(NA, max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1])))
+  names(median.high.conf.coverage.per.pos) <- 1:max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]))
+  for(i in 1:length(median.high.conf.coverage.per.pos)){
+    median.high.conf.coverage.per.pos[i] <- median(input.counts.processed[which(names(median.high.conf.coverage.per.pos)[i] == str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]),"cov"])
+  }
+  all.false.doubles.codons[,1] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,1], names(median.high.conf.coverage.per.pos))]
+  all.false.doubles.codons[,2] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,2], names(median.high.conf.coverage.per.pos))]
+  all.false.doubles$max_cov_possible <- as.numeric(apply(all.false.doubles.codons, 1, min))
+  all.false.doubles$perc_cov_to_max <- 100 * all.false.doubles$cov / all.false.doubles$max_cov_possible
+
+  fit <- lm(log(all.false.doubles$perc_cov_to_max) ~ all.false.doubles$codon_dist + I(all.false.doubles$codon_dist^2))
+  all.false.doubles$pred_cov_perc <- exp(predict(fit))
+  
+  ### convert this into a look-up table: "% max. possible coverage" depending on codon-codon distance
+  dual.codon.coverage.estimate <- matrix(NA, nrow = max(all.false.doubles$codon_dist), ncol = 2)
+  colnames(dual.codon.coverage.estimate) <- c("Codon distance", "Estimate % max. possible coverage")
+  dual.codon.coverage.estimate[,"Codon distance"] <- 1:nrow(dual.codon.coverage.estimate)
+  dual.codon.coverage.estimate[,"Estimate % max. possible coverage"] <- all.false.doubles[match(dual.codon.coverage.estimate[,"Codon distance"], all.false.doubles$codon_dist),"pred_cov_perc"]
+  
+  ## Process the GATK file, only look at single nucleotide variants
+  cat("Sequencing error correction of GATK counts...\n")
+  out.summary <- matrix(NA, ncol = 7, nrow = length(grep("[,]", input.counts.processed[,"base_mut"], invert = T)))
+  colnames(out.summary) <- c("SNV", "Category", 
+                             "False_double_variant_count", "False_double_variant_coverage",
+                             "True_high_conf_variant_count", "True_high_conf_variant_coverage",
+                             "Exposure_m")
+  out.summary[,"SNV"] <- input.counts.processed[grep("[,]", input.counts.processed[,"base_mut"], invert = T),"base_mut"]
+  out.summary[,"Category"] <- str_split_fixed(out.summary[,"SNV"], ":", 2)[,2]
+  category.translate <- cbind(c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"),
+                              c("G>T", "G>C", "G>A", "A>T", "A>G", "A>C"))
+  for(i in 1:nrow(out.summary)){
+    if(out.summary[i,"Category"] %in% category.translate[,2]){
+      out.summary[i,"Category"] <- category.translate[match(out.summary[i,"Category"],category.translate[,2]),1]
+    }
+  }
+  
+  for (i in grep("[,]", input.counts.processed[,"base_mut"], invert = T)){
+    
+    ### algorithm: 
+    ### 1.) isolate 1nt variant of interest
+    ### 2.) look for 2/3nt hits in a pre-specified window of codons upstream or downstream 
+    ### -> if there are zero false double mutants: skip / set correction factor to zero
+    ### -> if there are any false double mutants: continue
+    ### 3.) generate a table for ALL possible 2/3nt variants in the selected window:
+    ### true high-conf. variant count | true high-conf. variant coverage | false double double variant count (often 0 or 1) | false double double variant coverage
+    ### 4.) Iterate of all positions
+    ### 5.) Calculate the empirical Bayes
+    ### 6.) Apply correction factor
+    
+    ## 1.) isolate 1nt variant of interest
+    tmp.single <- input.counts.processed[i,"base_mut"] 
+    seq.error.rate[tmp.single, "1nt counts/coverage"] <- input.counts.processed[i,"counts_per_cov"]
+    
+    ## 2.) look for 2/3nt hits in up to 40 codons (120 bp) upstream or downstream 
+    tmp.false.doubles <- input.counts.raw[grep(paste0("(?<!\\d)",input.counts.processed[i,"codon_mut"]), input.counts.raw[,"codon_mut"], perl = T),,drop = F]
+    
+    ## subset multi-codon variants
+    tmp.false.doubles <- tmp.false.doubles[which(tmp.false.doubles[,"varying_bases"] >= 3 & tmp.false.doubles[,"varying_bases"] < 5 & tmp.false.doubles[,"varying_codons"] == 2),,drop = F]
+    if(nrow(tmp.false.doubles) == 0){
+      next
+    }
+    
+    ## set and enforce distance threshold
+    tmp.ref.codon <- as.numeric(strsplit(input.counts.processed[i,"codon_mut"] , ":")[[1]][1])
+    tmp.min.from.ref <- max(c(1, tmp.ref.codon - c(codon_window - 1)))
+    tmp.max.from.ref <- min(c(max.codon, tmp.ref.codon + c(codon_window - 1)))
+    tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2)
+    tmp.double.codon.n1 <- as.integer(sub(":.*", "", tmp.double.codon[,1]))
+    tmp.double.codon.n2 <- as.integer(sub(":.*", "", tmp.double.codon[,2]))
+    tmp.double.codon.within.range <- (tmp.double.codon.n1 >= tmp.min.from.ref & tmp.double.codon.n1 <= tmp.max.from.ref) & (tmp.double.codon.n2 >= tmp.min.from.ref & tmp.double.codon.n2 <= tmp.max.from.ref)
+    tmp.false.doubles <- tmp.false.doubles[which(tmp.double.codon.within.range == T),,drop = F]
+    if(nrow(tmp.false.doubles) == 0){
+      next
+    }
+    
+    ## 3.) generate a table for ALL possible 2/3nt variants in the selected window
+    tmp.summary.table.entries <- input.counts.processed
+    tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) != tmp.ref.codon),]
+    tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) >= tmp.min.from.ref &
+                                                                   as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) <= tmp.max.from.ref),]
+    tmp.summary.table.entries <- tmp.summary.table.entries[which(tmp.summary.table.entries$varying_bases > 1),]
+    
+    ### fill the table
+    tmp.summary.table <- matrix(NA, ncol = 5, nrow = nrow(tmp.summary.table.entries))
+    colnames(tmp.summary.table) <- c("codon distance",
+                                     "true high-conf. variant count",
+                                     "true high-conf. variant coverage",
+                                     "false double double variant count",
+                                     "false double double variant coverage")
+    rownames(tmp.summary.table) <- paste0(c(tmp.summary.table.entries$codon_mut),", ", c(tmp.summary.table.entries$base_mut))
+    
+    tmp.summary.table[,"codon distance"] <- abs(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) - tmp.ref.codon)
+    tmp.summary.table[,"true high-conf. variant count"] <- tmp.summary.table.entries$counts
+    tmp.summary.table[,"true high-conf. variant coverage"] <- tmp.summary.table.entries$cov
+    tmp.summary.table[,"false double double variant count"] <- 0
+    
+    ## input the actual hits
+    tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2)
+    tmp.match1 <- match(tmp.double.codon[,1], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1])
+    tmp.match1[is.na(tmp.match1)] <- 0
+    tmp.match2 <- match(tmp.double.codon[,2], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1])
+    tmp.match2[is.na(tmp.match2)] <- 0
+    tmp.match <- c(tmp.match1 + tmp.match2)
+    
+    ### remove those false doubles for which there are no matched true high confidence calls
+    if(any(tmp.match == 0)){
+      tmp.false.doubles <- tmp.false.doubles[-which(tmp.match == 0),,drop = F]
+      tmp.match <- tmp.match[-which(tmp.match == 0)]
+      if(nrow(tmp.false.doubles) == 0){
+        next
+      }
+    }
+    
+    tmp.summary.table[tmp.match,"false double double variant count"] <- tmp.false.doubles[,"counts"]
+    tmp.summary.table[tmp.match,"false double double variant coverage"] <- tmp.false.doubles[,"cov"]
+    tmp.summary.table <- as.data.frame(tmp.summary.table)
+    
+    ## 4.) Iterate over codon positions
+    
+    ### summarise data by position
+    tmp.summary.table.pos <- matrix(NA, nrow = length(unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1])), ncol = 5)
+    colnames(tmp.summary.table.pos) <- colnames(tmp.summary.table)
+    rownames(tmp.summary.table.pos) <- unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1])
+    tmp.summary.table.pos[,"codon distance"] <- abs(as.numeric(str_split_fixed(rownames(tmp.summary.table.pos), ":", 2)[,1]) - as.numeric(str_split_fixed(input.counts.processed[i,"codon_mut"], ":", 2)[,1]))
+    for(j in 1:nrow(tmp.summary.table.pos)){
+      tmp.name <- rownames(tmp.summary.table.pos)[j]
+      tmp.summary.table.pos[j,"true high-conf. variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant count"])
+      tmp.summary.table.pos[j,"true high-conf. variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant coverage"])
+      tmp.summary.table.pos[j,"false double double variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant count"])
+      tmp.summary.table.pos[j,"false double double variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant coverage"], na.rm = T)
+      
+      ## infer a good proxy for the false double variant coverage with 0 counts (based estimated distance-dependent % coverage decay observed across all observed double mutants)
+      if(tmp.summary.table.pos[j,"false double double variant coverage"] == "-Inf"){
+        tmp.summary.table.pos[j,"false double double variant coverage"] <- c(dual.codon.coverage.estimate[abs(tmp.summary.table.pos[j,"codon distance"]),"Estimate % max. possible coverage"] / 100) * tmp.summary.table.pos[j,"true high-conf. variant coverage"]
+        tmp.summary.table.pos[j,"false double double variant coverage"] <- round(tmp.summary.table.pos[j,"false double double variant coverage"])
+      }
+    }
+    tmp.summary.table.pos <- as.data.frame(tmp.summary.table.pos)
+    
+    ### fill the master table
+    out.summary[match(tmp.single,out.summary[,"SNV"]),"False_double_variant_count"] <- sum(tmp.summary.table.pos$`false double double variant count`)
+    out.summary[match(tmp.single,out.summary[,"SNV"]),"False_double_variant_coverage"] <- sum(tmp.summary.table.pos$`false double double variant coverage`)
+    out.summary[match(tmp.single,out.summary[,"SNV"]),"True_high_conf_variant_count"] <- sum(tmp.summary.table.pos$`true high-conf. variant count`)
+    out.summary[match(tmp.single,out.summary[,"SNV"]),"True_high_conf_variant_coverage"] <- sum(tmp.summary.table.pos$`true high-conf. variant coverage`)
+    out.summary[match(tmp.single,out.summary[,"SNV"]),"Exposure_m"] <- sum(tmp.summary.table.pos$`true high-conf. variant count` * c(tmp.summary.table.pos$`false double double variant coverage` / tmp.summary.table.pos$`true high-conf. variant coverage`))
+    
+  }
+  
+  ## 5.) Calculate the empirical-Bayes sequencing error estimates
+  out.summary <- out.summary[-which(is.na(out.summary[,"False_double_variant_count"]) == T),]
+  out.summary <- as.data.frame(out.summary)
+  class(out.summary$False_double_variant_count) <- "numeric"
+  class(out.summary$False_double_variant_coverage) <- "numeric"
+  class(out.summary$True_high_conf_variant_count) <- "numeric"
+  class(out.summary$True_high_conf_variant_coverage) <- "numeric"
+  class(out.summary$Exposure_m) <- "numeric"
+  
+  ### calculate e_MLE across all
+  out.summary$e_MLE <- out.summary$False_double_variant_count / out.summary$Exposure_m
+  
+  ### calculate all the necessary rates, priors, etc.
+  
+  #### global background rate
+  mu_global <- sum(out.summary$False_double_variant_count) / sum(out.summary$Exposure_m)
+  mu_global <- max(mu_global, 1e-12) ## 
+  
+  ##### marginal log-likelihood under Gamma-Poisson with fixed mean mu (ChatGPT function, adapted)
+  ## prior: e ~ Gamma(shape = nu * mu, rate = nu)
+  ## mean = mu, variance = mu / nu
+  gp_loglik_nu <- function(log_nu, y, m, mu) {
+    nu <- exp(log_nu)
+    a <- max(nu * mu, 1e-12)
+    
+    # Negative-binomial marginal:
+    # y ~ NB(size = a, prob = nu / (nu + m))
+    ll <- lgamma(y + a) - lgamma(a) - lgamma(y + 1) +
+      a * log(nu / (nu + m)) +
+      y * log(m / (nu + m))
+    
+    -sum(ll)
+  }
+  opt_global <- optimize(f = gp_loglik_nu,
+                         interval = c(log(1e-6), log(1e6)),
+                         y = out.summary$False_double_variant_count,
+                         m = out.summary$Exposure_m,
+                         mu = mu_global)
+  nu_global <- exp(opt_global$minimum)
+  
+  #### category-level rates, shrunk to global rate
+  mu_cat <- vector(mode = "list", length = 6)
+  names(mu_cat) <- category.translate[,1]
+  mu_cat_raw <- mu_cat
+  for(i in 1:length(mu_cat)){
+    mu_cat[[i]] <- out.summary[which(out.summary$Category == names(mu_cat)[i]),]
+    mu_cat_raw[[i]] <- sum(mu_cat[[i]]$False_double_variant_count) / sum(mu_cat[[i]]$Exposure_m)
+    mu_cat_raw[[i]] <- pmax(mu_cat_raw[[i]], 1e-12)
+    mu_cat[[i]] <- c(sum(mu_cat[[i]]$False_double_variant_count) + nu_global * mu_global) / c(sum(mu_cat[[i]]$Exposure_m) + nu_global)
+    mu_cat[[i]] <- pmax(mu_cat[[i]], 1e-12)
+  }
+  mu_cat_raw <- do.call(c, mu_cat_raw)
+  mu_cat <- do.call(c, mu_cat)
+  
+  ##### category-specific prior strength nu_cat (ChatGPT function, adapted)
+  ## with fallback to global if category is too small
+  nu_cat <- rep(NA, 6)
+  names(nu_cat) <- names(mu_cat)
+  for (i in 1:length(mu_cat)) {
+    cat_name <- names(mu_cat)[i]
+    idx <- out.summary$Category == cat_name
+    
+    yk <- out.summary$False_double_variant_count[idx]
+    mk <- out.summary$Exposure_m[idx]
+    mu_k <- mu_cat[i]
+    
+    opt_k <- try(optimize(f = gp_loglik_nu,
+                          interval = c(log(1e-6), log(1e6)),
+                          y = yk,
+                          m = mk,
+                          mu = mu_k),
+                 silent = TRUE)
+    
+    if (inherits(opt_k, "try-error")) {
+      nu_cat[i] <- nu_global
+    } else {
+      nu_cat[i] <- exp(opt_k$minimum)
+    }
+  }
+  
+  #### posterior mean for each specific SNV
+  out.summary$mu_category <- rep(NA, nrow(out.summary))
+  out.summary$nu_category <- rep(NA, nrow(out.summary))
+  for(i in 1:6){
+    out.summary$mu_category[which(out.summary$Category == names(mu_cat)[i])] <- mu_cat[i]
+    out.summary$nu_category[which(out.summary$Category == names(nu_cat)[i])] <- nu_cat[i]
+  }
+  
+  ##### prior: e_j ~ Gamma(shape = nu_cat * mu_cat, rate = nu_cat)
+  out.summary$prior_shape <- out.summary$mu_category * out.summary$nu_category
+  out.summary$prior_rate <- out.summary$nu_category
+  
+  ##### posterior: e_j | data ~ Gamma(shape = prior_shape + F, rate = prior_rate [= nu_cat] + M)
+  out.summary$post_shape <- out.summary$prior_shape + out.summary$False_double_variant_count
+  out.summary$post_rate  <- out.summary$nu_category + out.summary$Exposure_m
+  
+  ##### posterior mean = shrunk EB estimate
+  out.summary$e_EB  <- out.summary$post_shape / out.summary$post_rate
+  
+  ##### other posterior intervals
+  # out.summary$e_EB_upper_1SD <- qgamma(0.68, shape = out.summary$post_shape, rate = out.summary$post_rate)
+  # out.summary$e_EB_upper_2SD <- qgamma(0.95, shape = out.summary$post_shape, rate = out.summary$post_rate)
+  # out.summary$e_EB_upper_3SD <- qgamma(0.997, shape = out.summary$post_shape, rate = out.summary$post_rate)
+
+  ## 6.) Apply correction factors systematically (use upper 95% percentile to correct, a.k.a. "conservative")
+  seq.error.rate[match(out.summary$SNV, rownames(seq.error.rate)),"1nt false counts/coverage"] <- out.summary$e_EB
+  input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_per_cov_corrected"] <- input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_per_cov"] - out.summary$e_EB
+  input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_corrected"] <- input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts"] - c(out.summary$e_EB * input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"cov"])
+  
+  ## round to nearest integer, do not allow for negative counts
+  input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_corrected"] <- round(input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_corrected"])
+  if(any(which(input.counts.processed[,"counts_corrected"] < 0))){
+    input.counts.processed[which(input.counts.processed[,"counts_corrected"] < 0),"counts_per_cov_corrected"] <- 0
+    input.counts.processed[which(input.counts.processed[,"counts_corrected"] < 0),"counts_corrected"] <- 0
+  }
+  
+  # Write the processed data
+  write.csv(seq.error.rate, file = seq_error_rate_path, row.names = T)
+  write.csv(input.counts.processed, file = output_file_path, row.names = F)
+  
+}
+
+# to re-make the AA level input table after WT sequencing error correction
+seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold) {
+  
+  # Load the raw GATK data
+  raw_gatk <- read.table(gatk_file_path, sep = ",", header = TRUE, stringsAsFactors = FALSE)
+  
+  # Read the wild-type amino acid sequence from the text file
+  wt_seq <- readLines(aa_seq_file_path)
+  wt_seq <- unlist(strsplit(wt_seq, ""))
+  
+  # Replace 'X' with '*', indicating the stop codon
+  wt_seq[wt_seq == "X"] <- "*"
+  
+  # Summarize counts for each unique pos_mut
+  aggregated_counts_per_cov <- aggregate(
+    counts_per_cov_corrected ~ pos_mut,
+    data = raw_gatk,
+    FUN = function(x) sum(x, na.rm = TRUE)
+  )
+  
+  aggregated_counts <- aggregate(
+    counts_corrected ~ pos_mut,
+    data = raw_gatk,
+    FUN = function(x) sum(x, na.rm = TRUE)
+  )
+  
+  # Merge the two aggregated tables
+  aggregated_data <- merge(
+    aggregated_counts_per_cov,
+    aggregated_counts,
+    by = "pos_mut",
+    all = TRUE
+  )
+  
+  # Rename columns to match original output
+  names(aggregated_data)[names(aggregated_data) == "counts_per_cov_corrected"] <- "total_counts_per_cov_corrected"
+  names(aggregated_data)[names(aggregated_data) == "counts_corrected"] <- "total_counts_corrected"
+  
+  # Extract wt_aa, position, and mut_aa from pos_mut
+  aggregated_data$wt_aa <- sub("(\\D)(\\d+)(\\D)", "\\1", aggregated_data$pos_mut)
+  aggregated_data$position <- as.numeric(sub("(\\D)(\\d+)(\\D)", "\\2", aggregated_data$pos_mut))
+  aggregated_data$mut_aa <- sub("(\\D)(\\d+)(\\D)", "\\3", aggregated_data$pos_mut)
+  
+  # Replace 'X' with '*'
+  aggregated_data$mut_aa[aggregated_data$mut_aa == "X"] <- "*"
+  
+  # Define all 20 standard amino acids and stop codon
+  all_amino_acids <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
+                       "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*")
+  
+  # Create all positions
+  all_positions <- seq_along(wt_seq)
+  
+  # Create complete grid of all possible position/mutation combinations
+  complete_data <- expand.grid(
+    mut_aa = all_amino_acids,
+    position = all_positions,
+    stringsAsFactors = FALSE
+  )
+  
+  # Merge aggregated data into complete grid
+  heatmap_data <- merge(
+    complete_data,
+    aggregated_data,
+    by = c("mut_aa", "position"),
+    all.x = TRUE,
+    sort = FALSE
+  )
+  
+  # Fill missing values
+  heatmap_data$total_counts_per_cov_corrected[is.na(heatmap_data$total_counts_per_cov_corrected)] <- 0
+  heatmap_data$wt_aa <- wt_seq[heatmap_data$position]
+  
+  # Apply threshold
+  low_count <- is.na(heatmap_data$total_counts_corrected) | heatmap_data$total_counts_corrected < threshold
+  heatmap_data$total_counts_per_cov_corrected[low_count] <- NA
+  heatmap_data$total_counts_corrected[low_count] <- NA
+  
+  # Fill missing pos_mut values
+  missing_pos_mut <- is.na(heatmap_data$pos_mut)
+  heatmap_data$pos_mut[missing_pos_mut] <- paste0(
+    heatmap_data$wt_aa[missing_pos_mut],
+    heatmap_data$position[missing_pos_mut],
+    heatmap_data$mut_aa[missing_pos_mut]
+  )
+  
+  # Re-order rows
+  out.order <- paste0(rep(1:max(heatmap_data$position), each = 21),
+                      rep(c("A", "C", "D", "E", "F", "G", "H",
+                            "I", "K", "L","M", "N", "P", "Q",
+                            "R", "S", "T", "V", "W", "Y", "*"),
+                          max(heatmap_data$position)))
+  heatmap_data <- heatmap_data[match(out.order, paste0(heatmap_data$position, heatmap_data$mut_aa)),]
+  rownames(heatmap_data) <- 1:nrow(heatmap_data)
+  
+  # Save output
+  write.csv(heatmap_data, file = output_csv_path, row.names = F)
+  print(paste("Aggregated data saved to:", output_csv_path))
+  
+}
+
+## --- Main functions ---
+seq_error_correct_by_false_doubles_EB(wt_path = "GID1A_bPCA.fasta", 
+                                      input_count_path_raw = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/gatk/GID1A_input_1_pe/gatk_output.variantCounts",
+                                      input_count_path_processed = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library.csv",
+                                      output_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_EB.csv",
+                                      seq_error_rate_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/seq_error_rate_EB.csv", 
+                                      codon_window = 40)
+
+seq_error_correct_counts_for_heatmaps(gatk_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_EB.csv",
+                                      aa_seq_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/aa_seq.txt",
+                                      output_csv_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_for_heatmaps_err_corrected_false_doubles_EB.csv",
+                                      threshold = 3)
+
+
+####
+# create versions.yml
+####
+r_version <- strsplit(version[['version.string']], ' ')[[1]][3]
+if (is.null(r_version)) r_version <- "unknown"
+f <- file("versions.yml", "w")
+writeLines(
+  c(
+    '"\${task.process}":',
+    paste('    r-base:', r_version),
+  ),
+  f
+)
+close(f)
diff --git a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R
index 65b1add..9ada64d 100644
--- a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R
+++ b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R
@@ -1,16 +1,29 @@
 #!/usr/bin/env Rscript
 
-## false-doubles MLE based sequencing error correction in nf-core/deepmutscan
-## 30.03.2026
+## false-double mutant codon, maximum likelihood estimation (MLE)-based sequencing error correction in nf-core/deepmutscan
+## 24.06.2026
 
 
 ## --- Helper functions ---
 
 # function to run the false-doubles based sequencing error correction (nucleotide level)
-seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_count_path_processed, output_file_path){
+seq_error_correct_by_false_doubles_MLE <- function(wt_path, input_count_path_raw, input_count_path_processed, output_file_path, seq_error_rate_path, codon_window){
   
   ## load data (nucleotide-level counts from GATK)
   
+  # WT sequence
+  wt.seq <- readDNAStringSet(wt_path)
+  
+  # for the seq error rates, build a vector to be filled
+  seq.error.names <- c()
+  for(i in 1:nchar(wt.seq)){
+    tmp.wt <- strsplit(as.character(wt.seq), "")[[1]][i]
+    seq.error.names <- c(seq.error.names, paste0(i, ":", tmp.wt, ">", c("A", "T", "G", "C"))[-match(tmp.wt, c("A", "T", "G", "C"))])
+  }
+  seq.error.rate <- matrix(0, nrow = length(seq.error.names), ncol = 2)
+  rownames(seq.error.rate) <- seq.error.names
+  colnames(seq.error.rate) <- c("1nt counts/coverage", "1nt false counts/coverage")
+  
   # Set the column names
   colnames <- c("counts", "cov", "mean_length_variant_reads", "varying_bases",
                 "base_mut", "varying_codons", "codon_mut", "aa_mut", "pos_mut")
@@ -21,46 +34,149 @@ seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_c
   input.counts.processed$counts_corrected <- input.counts.processed$counts
   input.counts.processed$counts_per_cov_corrected <- input.counts.processed$counts_per_cov
   
-  # Process the GATK file, error-correct single nucleotide variant counts
+  ## calculate some things upfront
+  
+  ### highest accessible codon
+  max.codon <- max(as.numeric(str_split_fixed(input.counts.raw$codon_mut, ":", 2)[,1]), na.rm = T)
+  
+  ### total distribution of false double mutant coverage decay (%) vs. codon-codon distance
+  cat("Estimating the distance-dependent coverage decay of false double mutants...\n")
+  all.false.doubles <- input.counts.raw
+  all.false.doubles <- all.false.doubles[which(all.false.doubles[,"varying_bases"] >= 3 & all.false.doubles[,"varying_bases"] < 5 & all.false.doubles[,"varying_codons"] == 2),,drop = F]
+  all.false.doubles$codon_dist <- vapply(regmatches(all.false.doubles[,"codon_mut"], gregexpr("\\d+(?=:)", all.false.doubles[,"codon_mut"], perl = TRUE)), function(z) abs(diff(as.integer(z))), integer(1))
+  
+  all.false.doubles.codons <- str_split_fixed(all.false.doubles$codon_mut, ", ", 2)
+  all.false.doubles.codons[,1] <- as.integer(sub(":.*", "", all.false.doubles.codons[,1]))
+  all.false.doubles.codons[,2] <- as.integer(sub(":.*", "", all.false.doubles.codons[,2]))
+  median.high.conf.coverage.per.pos <- rep(NA, max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1])))
+  names(median.high.conf.coverage.per.pos) <- 1:max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]))
+  for(i in 1:length(median.high.conf.coverage.per.pos)){
+    median.high.conf.coverage.per.pos[i] <- median(input.counts.processed[which(names(median.high.conf.coverage.per.pos)[i] == str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]),"cov"])
+  }
+  all.false.doubles.codons[,1] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,1], names(median.high.conf.coverage.per.pos))]
+  all.false.doubles.codons[,2] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,2], names(median.high.conf.coverage.per.pos))]
+  all.false.doubles$max_cov_possible <- as.numeric(apply(all.false.doubles.codons, 1, min))
+  all.false.doubles$perc_cov_to_max <- 100 * all.false.doubles$cov / all.false.doubles$max_cov_possible
+  
+  fit <- lm(log(all.false.doubles$perc_cov_to_max) ~ all.false.doubles$codon_dist + I(all.false.doubles$codon_dist^2))
+  all.false.doubles$pred_cov_perc <- exp(predict(fit))
+ 
+  ### convert this into a look-up table: "% max. possible coverage" depending on codon-codon distance
+  dual.codon.coverage.estimate <- matrix(NA, nrow = max(all.false.doubles$codon_dist), ncol = 2)
+  colnames(dual.codon.coverage.estimate) <- c("Codon distance", "Estimate % max. possible coverage")
+  dual.codon.coverage.estimate[,"Codon distance"] <- 1:nrow(dual.codon.coverage.estimate)
+  dual.codon.coverage.estimate[,"Estimate % max. possible coverage"] <- all.false.doubles[match(dual.codon.coverage.estimate[,"Codon distance"], all.false.doubles$codon_dist),"pred_cov_perc"]
+  
+  ## Process the GATK file, only look at single nucleotide variants
   cat("Sequencing error correction of GATK counts...\n")
   for (i in grep("[,]", input.counts.processed[,"base_mut"], invert = T)){
-        
+    
+    ### algorithm: 
+    ### 1.) isolate 1nt variant of interest
+    ### 2.) look for 2/3nt hits in a pre-specified window of codons upstream or downstream 
+    ### -> if there are zero false double mutants: skip / set correction factor to zero
+    ### -> if there are any false double mutants: continue
+    ### 3.) generate a table for ALL possible 2/3nt variants in the selected window:
+    ### true high-conf. variant count | true high-conf. variant coverage | false double double variant count (often 0 or 1) | false double double variant coverage
+    ### 4.) Maximum likelihood estimation (MLE) over all events
+    ### 5.) Apply correction factor
+    
+    ## 1.) isolate 1nt variant of interest
     tmp.single <- input.counts.processed[i,"base_mut"] 
+    seq.error.rate[tmp.single, "1nt counts/coverage"] <- input.counts.processed[i,"counts_per_cov"]
     
-    ## locate this variant across all multi-codon variants in the original count matrix
-    tmp.false.doubles <- input.counts.raw[grep(paste0("(?<!\\d)",input.counts.processed[i,"codon_mut"]), input.counts.raw[,"codon_mut"], perl = T),]
+    ## 2.) look for 2/3nt hits in up to 40 codons (120 bp) upstream or downstream 
+    tmp.false.doubles <- input.counts.raw[grep(paste0("(?<!\\d)",input.counts.processed[i,"codon_mut"]), input.counts.raw[,"codon_mut"], perl = T),,drop = F]
     
-    ## subset false multi-codon variants
-    tmp.false.doubles <- tmp.false.doubles[which(tmp.false.doubles[,"varying_bases"] >= 3 & tmp.false.doubles[,"varying_codons"] < 3),]
+    ## subset multi-codon variants
+    tmp.false.doubles <- tmp.false.doubles[which(tmp.false.doubles[,"varying_bases"] >= 3 & tmp.false.doubles[,"varying_bases"] < 5 & tmp.false.doubles[,"varying_codons"] == 2),,drop = F]
     if(nrow(tmp.false.doubles) == 0){
       next
     }
-
-    ## need to match with the corresponding correct single codon variant(s)
-    tmp.true.singles <- tmp.false.doubles$base_mut
-    tmp.true.singles <- strsplit(tmp.true.singles, ", ")
-    tmp.true.singles <- lapply(tmp.true.singles, function(x){x <- x[x != tmp.single]; x <- paste0(x, collapse = ", "); return(x)})
-    tmp.true.singles <- do.call(c, tmp.true.singles)
-    tmp.true.singles <- input.counts.processed[match(tmp.true.singles, input.counts.processed[,"base_mut"]),]
-    if(all(is.na(tmp.true.singles) == T)){
+    
+    ## set and enforce distance threshold
+    tmp.ref.codon <- as.numeric(strsplit(input.counts.processed[i,"codon_mut"] , ":")[[1]][1])
+    tmp.min.from.ref <- max(c(1, tmp.ref.codon - c(codon_window - 1)))
+    tmp.max.from.ref <- min(c(max.codon, tmp.ref.codon + c(codon_window - 1)))
+    tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2)
+    tmp.double.codon.n1 <- as.integer(sub(":.*", "", tmp.double.codon[,1]))
+    tmp.double.codon.n2 <- as.integer(sub(":.*", "", tmp.double.codon[,2]))
+    tmp.double.codon.within.range <- (tmp.double.codon.n1 >= tmp.min.from.ref & tmp.double.codon.n1 <= tmp.max.from.ref) & (tmp.double.codon.n2 >= tmp.min.from.ref & tmp.double.codon.n2 <= tmp.max.from.ref)
+    tmp.false.doubles <- tmp.false.doubles[which(tmp.double.codon.within.range == T),,drop = F]
+    if(nrow(tmp.false.doubles) == 0){
       next
     }
-          
-    ## make sure that we only count events for which there are both true single codon and false double codon variants
-    if(any(is.na(tmp.true.singles$counts))){
-      tmp.false.doubles <- tmp.false.doubles[-which(is.na(tmp.true.singles$counts)),]
-      tmp.true.singles <- tmp.true.singles[-which(is.na(tmp.true.singles$counts)),]
-    }
-    if(all(is.na(tmp.true.singles) == T)){
-      next
+    
+    ## 3.) generate a table for ALL possible 2/3nt variants in the selected window
+    tmp.summary.table.entries <- input.counts.processed
+    tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) != tmp.ref.codon),]
+    tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) >= tmp.min.from.ref &
+                                                                   as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) <= tmp.max.from.ref),]
+    tmp.summary.table.entries <- tmp.summary.table.entries[which(tmp.summary.table.entries$varying_bases > 1),]
+    
+    ### fill the table
+    tmp.summary.table <- matrix(NA, ncol = 5, nrow = nrow(tmp.summary.table.entries))
+    colnames(tmp.summary.table) <- c("codon distance",
+                                     "true high-conf. variant count",
+                                     "true high-conf. variant coverage",
+                                     "false double double variant count",
+                                     "false double double variant coverage")
+    rownames(tmp.summary.table) <- paste0(c(tmp.summary.table.entries$codon_mut),", ", c(tmp.summary.table.entries$base_mut))
+    
+    tmp.summary.table[,"codon distance"] <- abs(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) - tmp.ref.codon)
+    tmp.summary.table[,"true high-conf. variant count"] <- tmp.summary.table.entries$counts
+    tmp.summary.table[,"true high-conf. variant coverage"] <- tmp.summary.table.entries$cov
+    tmp.summary.table[,"false double double variant count"] <- 0
+    
+    ## input the actual hits
+    tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2)
+    tmp.match1 <- match(tmp.double.codon[,1], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1])
+    tmp.match1[is.na(tmp.match1)] <- 0
+    tmp.match2 <- match(tmp.double.codon[,2], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1])
+    tmp.match2[is.na(tmp.match2)] <- 0
+    tmp.match <- c(tmp.match1 + tmp.match2)
+    
+    ### remove those false doubles for which there are no matched true high confidence calls
+    if(any(tmp.match == 0)){
+      tmp.false.doubles <- tmp.false.doubles[-which(tmp.match == 0),,drop = F]
+      tmp.match <- tmp.match[-which(tmp.match == 0)]
+      if(nrow(tmp.false.doubles) == 0){
+        next
+      }
     }
+    
+    tmp.summary.table[tmp.match,"false double double variant count"] <- tmp.false.doubles[,"counts"]
+    tmp.summary.table[tmp.match,"false double double variant coverage"] <- tmp.false.doubles[,"cov"]
+    tmp.summary.table <- as.data.frame(tmp.summary.table)
+    
+    ## 4.) Maximum likelihood estimation (MLE) over all events, iterating over codon positions
+    
+    ### summarise data by position
+    tmp.summary.table.pos <- matrix(NA, nrow = length(unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1])), ncol = 5)
+    colnames(tmp.summary.table.pos) <- colnames(tmp.summary.table)
+    rownames(tmp.summary.table.pos) <- unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1])
+    tmp.summary.table.pos[,"codon distance"] <- abs(as.numeric(str_split_fixed(rownames(tmp.summary.table.pos), ":", 2)[,1]) - as.numeric(str_split_fixed(input.counts.processed[i,"codon_mut"], ":", 2)[,1]))
+    for(j in 1:nrow(tmp.summary.table.pos)){
+      tmp.name <- rownames(tmp.summary.table.pos)[j]
+      tmp.summary.table.pos[j,"true high-conf. variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant count"])
+      tmp.summary.table.pos[j,"true high-conf. variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant coverage"])
+      tmp.summary.table.pos[j,"false double double variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant count"])
+      tmp.summary.table.pos[j,"false double double variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant coverage"], na.rm = T)
       
-    ## calculate the expected false 1nt count probability (MLE)
-    e_MLE <- sum(tmp.false.doubles$counts) / sum(tmp.true.singles$counts * c(tmp.false.doubles$cov / tmp.true.singles$cov))
+      ## infer a good proxy for the false double variant coverage with 0 counts (based estimated distance-dependent % coverage decay observed across all observed double mutants)
+      if(tmp.summary.table.pos[j,"false double double variant coverage"] == "-Inf"){
+        tmp.summary.table.pos[j,"false double double variant coverage"] <- c(dual.codon.coverage.estimate[abs(tmp.summary.table.pos[j,"codon distance"]),"Estimate % max. possible coverage"] / 100) * tmp.summary.table.pos[j,"true high-conf. variant coverage"]
+        tmp.summary.table.pos[j,"false double double variant coverage"] <- round(tmp.summary.table.pos[j,"false double double variant coverage"])
+      }
+    }
+    tmp.summary.table.pos <- as.data.frame(tmp.summary.table.pos)
+    e_MLE <- sum(tmp.summary.table.pos$`false double double variant count`) /
+      sum(tmp.summary.table.pos$`true high-conf. variant count` * c(tmp.summary.table.pos$`false double double variant coverage` / tmp.summary.table.pos$`true high-conf. variant coverage`))
+    seq.error.rate[tmp.single,"1nt false counts/coverage"] <- e_MLE
     input.counts.processed[i,"counts_per_cov_corrected"] <- input.counts.processed[i,"counts_per_cov"] - e_MLE
-
-    ## based on this, also adjust the total_counts by cross-multiplication
-    input.counts.processed[i,"counts_corrected"] <- input.counts.processed[i,"counts"] * c(input.counts.processed[i,"counts_per_cov_corrected"] / input.counts.processed[i,"counts_per_cov"])
+    
+    ## 5.) Apply correction factor
+    input.counts.processed[i,"counts_corrected"] <- input.counts.processed[i,"counts"] - c(e_MLE * input.counts.processed[i,"cov"])
     
     ## round to nearest integer, do not allow for negative counts
     input.counts.processed[i,"counts_corrected"] <- round(input.counts.processed[i,"counts_corrected"])
@@ -72,12 +188,13 @@ seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_c
   }
   
   # Write the processed data
+  write.csv(seq.error.rate, file = seq_error_rate_path, row.names = T)
   write.csv(input.counts.processed, file = output_file_path, row.names = F)
   
 }
 
 # to re-make the AA level input table after WT sequencing error correction
-seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold = 3) {
+seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold) {
   
   # Load the raw GATK data
   raw_gatk <- read.table(gatk_file_path, sep = ",", header = TRUE, stringsAsFactors = FALSE)
@@ -172,19 +289,24 @@ seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_pa
   rownames(heatmap_data) <- 1:nrow(heatmap_data)
   
   # Save output
-  write.csv(heatmap_data, file = output_csv_path, row.names = FALSE)
+  write.csv(heatmap_data, file = output_csv_path, row.names = F)
   print(paste("Aggregated data saved to:", output_csv_path))
-
+  
 }
 
 ## --- Main functions ---
-seq_error_correct_by_false_doubles_MLE(input_count_path_raw = "intermediate_files/gatk/input_1_pe/gatk_output.variantCounts",
-                                       input_count_path_processed = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library.csv",
-                                       output_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv")
+seq_error_correct_by_false_doubles_MLE(wt_path = "GID1A_bPCA.fasta", 
+                                       input_count_path_raw = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/gatk/GID1A_input_1_pe/gatk_output.variantCounts",
+                                       input_count_path_processed = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library.csv",
+                                       output_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_MLE.csv",
+                                       seq_error_rate_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/seq_error_rate_MLE.csv", 
+                                       codon_window = 40)
+
+seq_error_correct_counts_for_heatmaps(gatk_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_MLE.csv",
+                                      aa_seq_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/aa_seq.txt",
+                                      output_csv_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_for_heatmaps_err_corrected_false_doubles_MLE.csv",
+                                      threshold = 3)
 
-seq_error_correct_counts_for_heatmaps(gatk_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv",
-                                      aa_seq_file_path = "intermediate_files/aa_seq.txt",
-                                      output_csv_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_for_heatmaps_err_corrected_false_doubles.csv")
 
 ####
 # create versions.yml