From 2bae8c921aa1a9c4f8df28e10bebeb4fca817025 Mon Sep 17 00:00:00 2001 From: Maximilian Stammnitz Date: Sat, 11 Apr 2026 21:55:48 +0200 Subject: [PATCH 1/6] Update process_gatk.R bug fix: remove any remainder indels --- .../dmsanalysis/process_gatk/templates/process_gatk.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R b/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R index 6abc053..fc52f9e 100644 --- a/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R +++ b/modules/local/dmsanalysis/process_gatk/templates/process_gatk.R @@ -115,6 +115,11 @@ filter_gatk_by_codon_library <- function(gatk_file_path, codon_library_path, out }) %>% ungroup() + # Remove any remainder indels + if(length(grep("ins|del", as.character(unlist(filtered_gatk[,"pos_mut"])))) != 0){ + filtered_gatk <- filtered_gatk[-grep("ins|del", as.character(unlist(filtered_gatk[,"pos_mut"]))),] + } + # Write the filtered GATK table to the output file path write.csv(filtered_gatk, file = output_file_path, row.names = FALSE) } @@ -192,10 +197,10 @@ complete_prefiltered_gatk <- function(possible_nnk_path, prefiltered_gatk_path, # Merge both dataframes based on the codon_mut column (full join to include all) merged_data <- full_join(prefiltered_gatk, possible_nnk, by = "codon_mut") - # Fill missing values in counts_per_cov and counts with 0.0000001 + # Fill missing values in counts_per_cov and counts with 0.00000001 merged_data <- merged_data %>% - mutate(counts_per_cov = ifelse(is.na(counts_per_cov), 0.0000001, counts_per_cov), - counts = ifelse(is.na(counts), 0.000001, counts)) + mutate(counts_per_cov = ifelse(is.na(counts_per_cov), 0.00000001, counts_per_cov), + counts = ifelse(is.na(counts), 0.00000001, counts)) # Calculate Hamming distance (varying_bases) and mutation details (aa_mut, pos_mut) merged_data <- merged_data %>% From 145a252559e3a6af01284a03452e4dc2f95f84fb Mon Sep 17 00:00:00 2001 From: Maximilian Stammnitz Date: Sat, 11 Apr 2026 21:59:22 +0200 Subject: [PATCH 2/6] Update main.nf --- modules/local/bamprocessing/bam_filter/main.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/modules/local/bamprocessing/bam_filter/main.nf b/modules/local/bamprocessing/bam_filter/main.nf index 6cef905..0d7f770 100644 --- a/modules/local/bamprocessing/bam_filter/main.nf +++ b/modules/local/bamprocessing/bam_filter/main.nf @@ -25,8 +25,6 @@ process BAMFILTER_DMS { samtools view -h -F 4 -F 256 -q 30 $bam | \ samtools view -h | \ awk '{if(\$6 !~ /I/ && \$6 !~ /D/ && \$6 !~ /N/) print \$0}' | \ - samtools view -h | \ - awk '{for(i=1;i<=NF;i++) if(\$i ~ /^NM:i:/ && \$i != "NM:i:0") {print \$0; next}} \$1 ~ /^@/' | \ samtools view -bS > ${meta.id}_filtered.bam cat <<-END_VERSIONS > versions.yml From e5b00c324df3b7acd1df527654af8d99d7cc4724 Mon Sep 17 00:00:00 2001 From: Maximilian Stammnitz Date: Wed, 15 Apr 2026 16:26:36 +0200 Subject: [PATCH 3/6] Update main.nf --- modules/local/gatk/saturationmutagenesis/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/gatk/saturationmutagenesis/main.nf b/modules/local/gatk/saturationmutagenesis/main.nf index 2616347..209cfc2 100644 --- a/modules/local/gatk/saturationmutagenesis/main.nf +++ b/modules/local/gatk/saturationmutagenesis/main.nf @@ -35,7 +35,7 @@ process GATK_SATURATIONMUTAGENESIS { -R $wt_seq \ --orf \$start_stop_codon \ --paired-mode false \ - --min-q 30 \ + --min-q 40 \ --min-variant-obs $min_counts \ -O gatk_output From 04ba4b4597a39bf2df819f3a3cd3c32c71224a69 Mon Sep 17 00:00:00 2001 From: Maximilian Stammnitz Date: Wed, 15 Apr 2026 16:36:28 +0200 Subject: [PATCH 4/6] Update main.nf Added the --fastq_qmax and --fastq_qmaxout flags with their default values set to 41 (typical for Illumina NovaSeq); these should also be open parameters set in the nf-core pipeline config. These parameters may be useful for tweaking when dealing with inputs from very high quality reads like AVITI's platforms. --- modules/local/bamprocessing/premerge/main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/bamprocessing/premerge/main.nf b/modules/local/bamprocessing/premerge/main.nf index 4a66f85..1081163 100644 --- a/modules/local/bamprocessing/premerge/main.nf +++ b/modules/local/bamprocessing/premerge/main.nf @@ -24,7 +24,7 @@ process PREMERGE { samtools fastq -1 forward_reads.fastq -2 reverse_reads.fastq -0 /dev/null -s /dev/null -n $bam # Merge paired reads - vsearch --fastq_mergepairs forward_reads.fastq --reverse reverse_reads.fastq --fastqout merged_reads.fastq --fastq_minovlen 10 --fastq_allowmergestagger + vsearch --fastq_mergepairs forward_reads.fastq --reverse reverse_reads.fastq --fastqout merged_reads.fastq --fastq_minovlen 10 --fastq_allowmergestagger --fastq_qmax 41 --fastq_qmaxout 41 # Re-align merged reads bwa index $wt_seq From be6655a35b2e7cc0971522ffff6121aa842b42aa Mon Sep 17 00:00:00 2001 From: Maximilian Stammnitz Date: Wed, 24 Jun 2026 15:18:05 +0200 Subject: [PATCH 5/6] Update and rename false-doubles_based_seq_error_correction.R to false-doubles_MLE_based_seq_error_correction.R --- ... => false-doubles_MLE_based_seq_error_correction.R} | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) rename modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/{false-doubles_based_seq_error_correction.R => false-doubles_MLE_based_seq_error_correction.R} (92%) diff --git a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_based_seq_error_correction.R b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R similarity index 92% rename from modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_based_seq_error_correction.R rename to modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R index 35ea63b..65b1add 100644 --- a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_based_seq_error_correction.R +++ b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R @@ -1,13 +1,13 @@ #!/usr/bin/env Rscript -## false-doubles based sequencing error correction in nf-core/deepmutscan +## false-doubles MLE based sequencing error correction in nf-core/deepmutscan ## 30.03.2026 ## --- Helper functions --- # function to run the false-doubles based sequencing error correction (nucleotide level) -seq_error_correct_by_false_doubles <- function(input_count_path_raw, input_count_path_processed, output_file_path){ +seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_count_path_processed, output_file_path){ ## load data (nucleotide-level counts from GATK) @@ -178,9 +178,9 @@ seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_pa } ## --- Main functions --- -seq_error_correct_by_false_doubles(input_count_path_raw = "intermediate_files/gatk/input_1_pe/gatk_output.variantCounts", - input_count_path_processed = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library.csv", - output_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv") +seq_error_correct_by_false_doubles_MLE(input_count_path_raw = "intermediate_files/gatk/input_1_pe/gatk_output.variantCounts", + input_count_path_processed = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library.csv", + output_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv") seq_error_correct_counts_for_heatmaps(gatk_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv", aa_seq_file_path = "intermediate_files/aa_seq.txt", From 722a25b67c9fe6bc3484b185b3e2a4568e6fadff Mon Sep 17 00:00:00 2001 From: Maximilian Stammnitz Date: Wed, 24 Jun 2026 15:41:30 +0200 Subject: [PATCH 6/6] Add files via upload --- ...se-doubles_EB_based_seq_error_correction.R | 449 ++++++++++++++++++ ...e-doubles_MLE_based_seq_error_correction.R | 200 ++++++-- 2 files changed, 610 insertions(+), 39 deletions(-) create mode 100644 modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R diff --git a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R new file mode 100644 index 0000000..a0b7e1e --- /dev/null +++ b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_EB_based_seq_error_correction.R @@ -0,0 +1,449 @@ +#!/usr/bin/env Rscript + +## false-double mutant codon, empirical bayes (EB)-based sequencing error correction in nf-core/deepmutscan +## 24.06.2026 + + +## --- Helper functions --- + +# function to run the false-doubles based sequencing error correction (nucleotide level) +seq_error_correct_by_false_doubles_EB <- function(wt_path, input_count_path_raw, input_count_path_processed, output_file_path, seq_error_rate_path, codon_window){ + + ## load data (nucleotide-level counts from GATK) + + # WT sequence + wt.seq <- readDNAStringSet(wt_path) + + # for the seq error rates, build a vector to be filled + seq.error.names <- c() + for(i in 1:nchar(wt.seq)){ + tmp.wt <- strsplit(as.character(wt.seq), "")[[1]][i] + seq.error.names <- c(seq.error.names, paste0(i, ":", tmp.wt, ">", c("A", "T", "G", "C"))[-match(tmp.wt, c("A", "T", "G", "C"))]) + } + seq.error.rate <- matrix(0, nrow = length(seq.error.names), ncol = 2) + rownames(seq.error.rate) <- seq.error.names + colnames(seq.error.rate) <- c("1nt counts/coverage", "1nt false counts/coverage") + + # Set the column names + colnames <- c("counts", "cov", "mean_length_variant_reads", "varying_bases", + "base_mut", "varying_codons", "codon_mut", "aa_mut", "pos_mut") + input.counts.raw <- read.table(input_count_path_raw, sep = "\t", header = FALSE, fill = TRUE, col.names = colnames) + input.counts.processed <- read.csv(input_count_path_processed) + + ## append key columns + input.counts.processed$counts_corrected <- input.counts.processed$counts + input.counts.processed$counts_per_cov_corrected <- input.counts.processed$counts_per_cov + + ## calculate some things upfront + + ### highest accessible codon + max.codon <- max(as.numeric(str_split_fixed(input.counts.raw$codon_mut, ":", 2)[,1]), na.rm = T) + + ### total distribution of false double mutant coverage decay (%) vs. codon-codon distance + cat("Estimating the distance-dependent coverage decay of false double mutants...\n") + all.false.doubles <- input.counts.raw + all.false.doubles <- all.false.doubles[which(all.false.doubles[,"varying_bases"] >= 3 & all.false.doubles[,"varying_bases"] < 5 & all.false.doubles[,"varying_codons"] == 2),,drop = F] + all.false.doubles$codon_dist <- vapply(regmatches(all.false.doubles[,"codon_mut"], gregexpr("\\d+(?=:)", all.false.doubles[,"codon_mut"], perl = TRUE)), function(z) abs(diff(as.integer(z))), integer(1)) + + all.false.doubles.codons <- str_split_fixed(all.false.doubles$codon_mut, ", ", 2) + all.false.doubles.codons[,1] <- as.integer(sub(":.*", "", all.false.doubles.codons[,1])) + all.false.doubles.codons[,2] <- as.integer(sub(":.*", "", all.false.doubles.codons[,2])) + median.high.conf.coverage.per.pos <- rep(NA, max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]))) + names(median.high.conf.coverage.per.pos) <- 1:max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1])) + for(i in 1:length(median.high.conf.coverage.per.pos)){ + median.high.conf.coverage.per.pos[i] <- median(input.counts.processed[which(names(median.high.conf.coverage.per.pos)[i] == str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]),"cov"]) + } + all.false.doubles.codons[,1] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,1], names(median.high.conf.coverage.per.pos))] + all.false.doubles.codons[,2] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,2], names(median.high.conf.coverage.per.pos))] + all.false.doubles$max_cov_possible <- as.numeric(apply(all.false.doubles.codons, 1, min)) + all.false.doubles$perc_cov_to_max <- 100 * all.false.doubles$cov / all.false.doubles$max_cov_possible + + fit <- lm(log(all.false.doubles$perc_cov_to_max) ~ all.false.doubles$codon_dist + I(all.false.doubles$codon_dist^2)) + all.false.doubles$pred_cov_perc <- exp(predict(fit)) + + ### convert this into a look-up table: "% max. possible coverage" depending on codon-codon distance + dual.codon.coverage.estimate <- matrix(NA, nrow = max(all.false.doubles$codon_dist), ncol = 2) + colnames(dual.codon.coverage.estimate) <- c("Codon distance", "Estimate % max. possible coverage") + dual.codon.coverage.estimate[,"Codon distance"] <- 1:nrow(dual.codon.coverage.estimate) + dual.codon.coverage.estimate[,"Estimate % max. possible coverage"] <- all.false.doubles[match(dual.codon.coverage.estimate[,"Codon distance"], all.false.doubles$codon_dist),"pred_cov_perc"] + + ## Process the GATK file, only look at single nucleotide variants + cat("Sequencing error correction of GATK counts...\n") + out.summary <- matrix(NA, ncol = 7, nrow = length(grep("[,]", input.counts.processed[,"base_mut"], invert = T))) + colnames(out.summary) <- c("SNV", "Category", + "False_double_variant_count", "False_double_variant_coverage", + "True_high_conf_variant_count", "True_high_conf_variant_coverage", + "Exposure_m") + out.summary[,"SNV"] <- input.counts.processed[grep("[,]", input.counts.processed[,"base_mut"], invert = T),"base_mut"] + out.summary[,"Category"] <- str_split_fixed(out.summary[,"SNV"], ":", 2)[,2] + category.translate <- cbind(c("C>A", "C>G", "C>T", "T>A", "T>C", "T>G"), + c("G>T", "G>C", "G>A", "A>T", "A>G", "A>C")) + for(i in 1:nrow(out.summary)){ + if(out.summary[i,"Category"] %in% category.translate[,2]){ + out.summary[i,"Category"] <- category.translate[match(out.summary[i,"Category"],category.translate[,2]),1] + } + } + + for (i in grep("[,]", input.counts.processed[,"base_mut"], invert = T)){ + + ### algorithm: + ### 1.) isolate 1nt variant of interest + ### 2.) look for 2/3nt hits in a pre-specified window of codons upstream or downstream + ### -> if there are zero false double mutants: skip / set correction factor to zero + ### -> if there are any false double mutants: continue + ### 3.) generate a table for ALL possible 2/3nt variants in the selected window: + ### true high-conf. variant count | true high-conf. variant coverage | false double double variant count (often 0 or 1) | false double double variant coverage + ### 4.) Iterate of all positions + ### 5.) Calculate the empirical Bayes + ### 6.) Apply correction factor + + ## 1.) isolate 1nt variant of interest + tmp.single <- input.counts.processed[i,"base_mut"] + seq.error.rate[tmp.single, "1nt counts/coverage"] <- input.counts.processed[i,"counts_per_cov"] + + ## 2.) look for 2/3nt hits in up to 40 codons (120 bp) upstream or downstream + tmp.false.doubles <- input.counts.raw[grep(paste0("(?= 3 & tmp.false.doubles[,"varying_bases"] < 5 & tmp.false.doubles[,"varying_codons"] == 2),,drop = F] + if(nrow(tmp.false.doubles) == 0){ + next + } + + ## set and enforce distance threshold + tmp.ref.codon <- as.numeric(strsplit(input.counts.processed[i,"codon_mut"] , ":")[[1]][1]) + tmp.min.from.ref <- max(c(1, tmp.ref.codon - c(codon_window - 1))) + tmp.max.from.ref <- min(c(max.codon, tmp.ref.codon + c(codon_window - 1))) + tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2) + tmp.double.codon.n1 <- as.integer(sub(":.*", "", tmp.double.codon[,1])) + tmp.double.codon.n2 <- as.integer(sub(":.*", "", tmp.double.codon[,2])) + tmp.double.codon.within.range <- (tmp.double.codon.n1 >= tmp.min.from.ref & tmp.double.codon.n1 <= tmp.max.from.ref) & (tmp.double.codon.n2 >= tmp.min.from.ref & tmp.double.codon.n2 <= tmp.max.from.ref) + tmp.false.doubles <- tmp.false.doubles[which(tmp.double.codon.within.range == T),,drop = F] + if(nrow(tmp.false.doubles) == 0){ + next + } + + ## 3.) generate a table for ALL possible 2/3nt variants in the selected window + tmp.summary.table.entries <- input.counts.processed + tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) != tmp.ref.codon),] + tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) >= tmp.min.from.ref & + as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) <= tmp.max.from.ref),] + tmp.summary.table.entries <- tmp.summary.table.entries[which(tmp.summary.table.entries$varying_bases > 1),] + + ### fill the table + tmp.summary.table <- matrix(NA, ncol = 5, nrow = nrow(tmp.summary.table.entries)) + colnames(tmp.summary.table) <- c("codon distance", + "true high-conf. variant count", + "true high-conf. variant coverage", + "false double double variant count", + "false double double variant coverage") + rownames(tmp.summary.table) <- paste0(c(tmp.summary.table.entries$codon_mut),", ", c(tmp.summary.table.entries$base_mut)) + + tmp.summary.table[,"codon distance"] <- abs(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) - tmp.ref.codon) + tmp.summary.table[,"true high-conf. variant count"] <- tmp.summary.table.entries$counts + tmp.summary.table[,"true high-conf. variant coverage"] <- tmp.summary.table.entries$cov + tmp.summary.table[,"false double double variant count"] <- 0 + + ## input the actual hits + tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2) + tmp.match1 <- match(tmp.double.codon[,1], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1]) + tmp.match1[is.na(tmp.match1)] <- 0 + tmp.match2 <- match(tmp.double.codon[,2], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1]) + tmp.match2[is.na(tmp.match2)] <- 0 + tmp.match <- c(tmp.match1 + tmp.match2) + + ### remove those false doubles for which there are no matched true high confidence calls + if(any(tmp.match == 0)){ + tmp.false.doubles <- tmp.false.doubles[-which(tmp.match == 0),,drop = F] + tmp.match <- tmp.match[-which(tmp.match == 0)] + if(nrow(tmp.false.doubles) == 0){ + next + } + } + + tmp.summary.table[tmp.match,"false double double variant count"] <- tmp.false.doubles[,"counts"] + tmp.summary.table[tmp.match,"false double double variant coverage"] <- tmp.false.doubles[,"cov"] + tmp.summary.table <- as.data.frame(tmp.summary.table) + + ## 4.) Iterate over codon positions + + ### summarise data by position + tmp.summary.table.pos <- matrix(NA, nrow = length(unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1])), ncol = 5) + colnames(tmp.summary.table.pos) <- colnames(tmp.summary.table) + rownames(tmp.summary.table.pos) <- unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1]) + tmp.summary.table.pos[,"codon distance"] <- abs(as.numeric(str_split_fixed(rownames(tmp.summary.table.pos), ":", 2)[,1]) - as.numeric(str_split_fixed(input.counts.processed[i,"codon_mut"], ":", 2)[,1])) + for(j in 1:nrow(tmp.summary.table.pos)){ + tmp.name <- rownames(tmp.summary.table.pos)[j] + tmp.summary.table.pos[j,"true high-conf. variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant count"]) + tmp.summary.table.pos[j,"true high-conf. variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant coverage"]) + tmp.summary.table.pos[j,"false double double variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant count"]) + tmp.summary.table.pos[j,"false double double variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant coverage"], na.rm = T) + + ## infer a good proxy for the false double variant coverage with 0 counts (based estimated distance-dependent % coverage decay observed across all observed double mutants) + if(tmp.summary.table.pos[j,"false double double variant coverage"] == "-Inf"){ + tmp.summary.table.pos[j,"false double double variant coverage"] <- c(dual.codon.coverage.estimate[abs(tmp.summary.table.pos[j,"codon distance"]),"Estimate % max. possible coverage"] / 100) * tmp.summary.table.pos[j,"true high-conf. variant coverage"] + tmp.summary.table.pos[j,"false double double variant coverage"] <- round(tmp.summary.table.pos[j,"false double double variant coverage"]) + } + } + tmp.summary.table.pos <- as.data.frame(tmp.summary.table.pos) + + ### fill the master table + out.summary[match(tmp.single,out.summary[,"SNV"]),"False_double_variant_count"] <- sum(tmp.summary.table.pos$`false double double variant count`) + out.summary[match(tmp.single,out.summary[,"SNV"]),"False_double_variant_coverage"] <- sum(tmp.summary.table.pos$`false double double variant coverage`) + out.summary[match(tmp.single,out.summary[,"SNV"]),"True_high_conf_variant_count"] <- sum(tmp.summary.table.pos$`true high-conf. variant count`) + out.summary[match(tmp.single,out.summary[,"SNV"]),"True_high_conf_variant_coverage"] <- sum(tmp.summary.table.pos$`true high-conf. variant coverage`) + out.summary[match(tmp.single,out.summary[,"SNV"]),"Exposure_m"] <- sum(tmp.summary.table.pos$`true high-conf. variant count` * c(tmp.summary.table.pos$`false double double variant coverage` / tmp.summary.table.pos$`true high-conf. variant coverage`)) + + } + + ## 5.) Calculate the empirical-Bayes sequencing error estimates + out.summary <- out.summary[-which(is.na(out.summary[,"False_double_variant_count"]) == T),] + out.summary <- as.data.frame(out.summary) + class(out.summary$False_double_variant_count) <- "numeric" + class(out.summary$False_double_variant_coverage) <- "numeric" + class(out.summary$True_high_conf_variant_count) <- "numeric" + class(out.summary$True_high_conf_variant_coverage) <- "numeric" + class(out.summary$Exposure_m) <- "numeric" + + ### calculate e_MLE across all + out.summary$e_MLE <- out.summary$False_double_variant_count / out.summary$Exposure_m + + ### calculate all the necessary rates, priors, etc. + + #### global background rate + mu_global <- sum(out.summary$False_double_variant_count) / sum(out.summary$Exposure_m) + mu_global <- max(mu_global, 1e-12) ## + + ##### marginal log-likelihood under Gamma-Poisson with fixed mean mu (ChatGPT function, adapted) + ## prior: e ~ Gamma(shape = nu * mu, rate = nu) + ## mean = mu, variance = mu / nu + gp_loglik_nu <- function(log_nu, y, m, mu) { + nu <- exp(log_nu) + a <- max(nu * mu, 1e-12) + + # Negative-binomial marginal: + # y ~ NB(size = a, prob = nu / (nu + m)) + ll <- lgamma(y + a) - lgamma(a) - lgamma(y + 1) + + a * log(nu / (nu + m)) + + y * log(m / (nu + m)) + + -sum(ll) + } + opt_global <- optimize(f = gp_loglik_nu, + interval = c(log(1e-6), log(1e6)), + y = out.summary$False_double_variant_count, + m = out.summary$Exposure_m, + mu = mu_global) + nu_global <- exp(opt_global$minimum) + + #### category-level rates, shrunk to global rate + mu_cat <- vector(mode = "list", length = 6) + names(mu_cat) <- category.translate[,1] + mu_cat_raw <- mu_cat + for(i in 1:length(mu_cat)){ + mu_cat[[i]] <- out.summary[which(out.summary$Category == names(mu_cat)[i]),] + mu_cat_raw[[i]] <- sum(mu_cat[[i]]$False_double_variant_count) / sum(mu_cat[[i]]$Exposure_m) + mu_cat_raw[[i]] <- pmax(mu_cat_raw[[i]], 1e-12) + mu_cat[[i]] <- c(sum(mu_cat[[i]]$False_double_variant_count) + nu_global * mu_global) / c(sum(mu_cat[[i]]$Exposure_m) + nu_global) + mu_cat[[i]] <- pmax(mu_cat[[i]], 1e-12) + } + mu_cat_raw <- do.call(c, mu_cat_raw) + mu_cat <- do.call(c, mu_cat) + + ##### category-specific prior strength nu_cat (ChatGPT function, adapted) + ## with fallback to global if category is too small + nu_cat <- rep(NA, 6) + names(nu_cat) <- names(mu_cat) + for (i in 1:length(mu_cat)) { + cat_name <- names(mu_cat)[i] + idx <- out.summary$Category == cat_name + + yk <- out.summary$False_double_variant_count[idx] + mk <- out.summary$Exposure_m[idx] + mu_k <- mu_cat[i] + + opt_k <- try(optimize(f = gp_loglik_nu, + interval = c(log(1e-6), log(1e6)), + y = yk, + m = mk, + mu = mu_k), + silent = TRUE) + + if (inherits(opt_k, "try-error")) { + nu_cat[i] <- nu_global + } else { + nu_cat[i] <- exp(opt_k$minimum) + } + } + + #### posterior mean for each specific SNV + out.summary$mu_category <- rep(NA, nrow(out.summary)) + out.summary$nu_category <- rep(NA, nrow(out.summary)) + for(i in 1:6){ + out.summary$mu_category[which(out.summary$Category == names(mu_cat)[i])] <- mu_cat[i] + out.summary$nu_category[which(out.summary$Category == names(nu_cat)[i])] <- nu_cat[i] + } + + ##### prior: e_j ~ Gamma(shape = nu_cat * mu_cat, rate = nu_cat) + out.summary$prior_shape <- out.summary$mu_category * out.summary$nu_category + out.summary$prior_rate <- out.summary$nu_category + + ##### posterior: e_j | data ~ Gamma(shape = prior_shape + F, rate = prior_rate [= nu_cat] + M) + out.summary$post_shape <- out.summary$prior_shape + out.summary$False_double_variant_count + out.summary$post_rate <- out.summary$nu_category + out.summary$Exposure_m + + ##### posterior mean = shrunk EB estimate + out.summary$e_EB <- out.summary$post_shape / out.summary$post_rate + + ##### other posterior intervals + # out.summary$e_EB_upper_1SD <- qgamma(0.68, shape = out.summary$post_shape, rate = out.summary$post_rate) + # out.summary$e_EB_upper_2SD <- qgamma(0.95, shape = out.summary$post_shape, rate = out.summary$post_rate) + # out.summary$e_EB_upper_3SD <- qgamma(0.997, shape = out.summary$post_shape, rate = out.summary$post_rate) + + ## 6.) Apply correction factors systematically (use upper 95% percentile to correct, a.k.a. "conservative") + seq.error.rate[match(out.summary$SNV, rownames(seq.error.rate)),"1nt false counts/coverage"] <- out.summary$e_EB + input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_per_cov_corrected"] <- input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_per_cov"] - out.summary$e_EB + input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_corrected"] <- input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts"] - c(out.summary$e_EB * input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"cov"]) + + ## round to nearest integer, do not allow for negative counts + input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_corrected"] <- round(input.counts.processed[match(out.summary$SNV, input.counts.processed$base_mut),"counts_corrected"]) + if(any(which(input.counts.processed[,"counts_corrected"] < 0))){ + input.counts.processed[which(input.counts.processed[,"counts_corrected"] < 0),"counts_per_cov_corrected"] <- 0 + input.counts.processed[which(input.counts.processed[,"counts_corrected"] < 0),"counts_corrected"] <- 0 + } + + # Write the processed data + write.csv(seq.error.rate, file = seq_error_rate_path, row.names = T) + write.csv(input.counts.processed, file = output_file_path, row.names = F) + +} + +# to re-make the AA level input table after WT sequencing error correction +seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold) { + + # Load the raw GATK data + raw_gatk <- read.table(gatk_file_path, sep = ",", header = TRUE, stringsAsFactors = FALSE) + + # Read the wild-type amino acid sequence from the text file + wt_seq <- readLines(aa_seq_file_path) + wt_seq <- unlist(strsplit(wt_seq, "")) + + # Replace 'X' with '*', indicating the stop codon + wt_seq[wt_seq == "X"] <- "*" + + # Summarize counts for each unique pos_mut + aggregated_counts_per_cov <- aggregate( + counts_per_cov_corrected ~ pos_mut, + data = raw_gatk, + FUN = function(x) sum(x, na.rm = TRUE) + ) + + aggregated_counts <- aggregate( + counts_corrected ~ pos_mut, + data = raw_gatk, + FUN = function(x) sum(x, na.rm = TRUE) + ) + + # Merge the two aggregated tables + aggregated_data <- merge( + aggregated_counts_per_cov, + aggregated_counts, + by = "pos_mut", + all = TRUE + ) + + # Rename columns to match original output + names(aggregated_data)[names(aggregated_data) == "counts_per_cov_corrected"] <- "total_counts_per_cov_corrected" + names(aggregated_data)[names(aggregated_data) == "counts_corrected"] <- "total_counts_corrected" + + # Extract wt_aa, position, and mut_aa from pos_mut + aggregated_data$wt_aa <- sub("(\\D)(\\d+)(\\D)", "\\1", aggregated_data$pos_mut) + aggregated_data$position <- as.numeric(sub("(\\D)(\\d+)(\\D)", "\\2", aggregated_data$pos_mut)) + aggregated_data$mut_aa <- sub("(\\D)(\\d+)(\\D)", "\\3", aggregated_data$pos_mut) + + # Replace 'X' with '*' + aggregated_data$mut_aa[aggregated_data$mut_aa == "X"] <- "*" + + # Define all 20 standard amino acids and stop codon + all_amino_acids <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", + "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y", "*") + + # Create all positions + all_positions <- seq_along(wt_seq) + + # Create complete grid of all possible position/mutation combinations + complete_data <- expand.grid( + mut_aa = all_amino_acids, + position = all_positions, + stringsAsFactors = FALSE + ) + + # Merge aggregated data into complete grid + heatmap_data <- merge( + complete_data, + aggregated_data, + by = c("mut_aa", "position"), + all.x = TRUE, + sort = FALSE + ) + + # Fill missing values + heatmap_data$total_counts_per_cov_corrected[is.na(heatmap_data$total_counts_per_cov_corrected)] <- 0 + heatmap_data$wt_aa <- wt_seq[heatmap_data$position] + + # Apply threshold + low_count <- is.na(heatmap_data$total_counts_corrected) | heatmap_data$total_counts_corrected < threshold + heatmap_data$total_counts_per_cov_corrected[low_count] <- NA + heatmap_data$total_counts_corrected[low_count] <- NA + + # Fill missing pos_mut values + missing_pos_mut <- is.na(heatmap_data$pos_mut) + heatmap_data$pos_mut[missing_pos_mut] <- paste0( + heatmap_data$wt_aa[missing_pos_mut], + heatmap_data$position[missing_pos_mut], + heatmap_data$mut_aa[missing_pos_mut] + ) + + # Re-order rows + out.order <- paste0(rep(1:max(heatmap_data$position), each = 21), + rep(c("A", "C", "D", "E", "F", "G", "H", + "I", "K", "L","M", "N", "P", "Q", + "R", "S", "T", "V", "W", "Y", "*"), + max(heatmap_data$position))) + heatmap_data <- heatmap_data[match(out.order, paste0(heatmap_data$position, heatmap_data$mut_aa)),] + rownames(heatmap_data) <- 1:nrow(heatmap_data) + + # Save output + write.csv(heatmap_data, file = output_csv_path, row.names = F) + print(paste("Aggregated data saved to:", output_csv_path)) + +} + +## --- Main functions --- +seq_error_correct_by_false_doubles_EB(wt_path = "GID1A_bPCA.fasta", + input_count_path_raw = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/gatk/GID1A_input_1_pe/gatk_output.variantCounts", + input_count_path_processed = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library.csv", + output_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_EB.csv", + seq_error_rate_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/seq_error_rate_EB.csv", + codon_window = 40) + +seq_error_correct_counts_for_heatmaps(gatk_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_EB.csv", + aa_seq_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/aa_seq.txt", + output_csv_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_for_heatmaps_err_corrected_false_doubles_EB.csv", + threshold = 3) + + +#### +# create versions.yml +#### +r_version <- strsplit(version[['version.string']], ' ')[[1]][3] +if (is.null(r_version)) r_version <- "unknown" +f <- file("versions.yml", "w") +writeLines( + c( + '"\${task.process}":', + paste(' r-base:', r_version), + ), + f +) +close(f) diff --git a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R index 65b1add..9ada64d 100644 --- a/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R +++ b/modules/local/dmsanalysis/false-doubles_based_seq_error_correction/templates/false-doubles_MLE_based_seq_error_correction.R @@ -1,16 +1,29 @@ #!/usr/bin/env Rscript -## false-doubles MLE based sequencing error correction in nf-core/deepmutscan -## 30.03.2026 +## false-double mutant codon, maximum likelihood estimation (MLE)-based sequencing error correction in nf-core/deepmutscan +## 24.06.2026 ## --- Helper functions --- # function to run the false-doubles based sequencing error correction (nucleotide level) -seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_count_path_processed, output_file_path){ +seq_error_correct_by_false_doubles_MLE <- function(wt_path, input_count_path_raw, input_count_path_processed, output_file_path, seq_error_rate_path, codon_window){ ## load data (nucleotide-level counts from GATK) + # WT sequence + wt.seq <- readDNAStringSet(wt_path) + + # for the seq error rates, build a vector to be filled + seq.error.names <- c() + for(i in 1:nchar(wt.seq)){ + tmp.wt <- strsplit(as.character(wt.seq), "")[[1]][i] + seq.error.names <- c(seq.error.names, paste0(i, ":", tmp.wt, ">", c("A", "T", "G", "C"))[-match(tmp.wt, c("A", "T", "G", "C"))]) + } + seq.error.rate <- matrix(0, nrow = length(seq.error.names), ncol = 2) + rownames(seq.error.rate) <- seq.error.names + colnames(seq.error.rate) <- c("1nt counts/coverage", "1nt false counts/coverage") + # Set the column names colnames <- c("counts", "cov", "mean_length_variant_reads", "varying_bases", "base_mut", "varying_codons", "codon_mut", "aa_mut", "pos_mut") @@ -21,46 +34,149 @@ seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_c input.counts.processed$counts_corrected <- input.counts.processed$counts input.counts.processed$counts_per_cov_corrected <- input.counts.processed$counts_per_cov - # Process the GATK file, error-correct single nucleotide variant counts + ## calculate some things upfront + + ### highest accessible codon + max.codon <- max(as.numeric(str_split_fixed(input.counts.raw$codon_mut, ":", 2)[,1]), na.rm = T) + + ### total distribution of false double mutant coverage decay (%) vs. codon-codon distance + cat("Estimating the distance-dependent coverage decay of false double mutants...\n") + all.false.doubles <- input.counts.raw + all.false.doubles <- all.false.doubles[which(all.false.doubles[,"varying_bases"] >= 3 & all.false.doubles[,"varying_bases"] < 5 & all.false.doubles[,"varying_codons"] == 2),,drop = F] + all.false.doubles$codon_dist <- vapply(regmatches(all.false.doubles[,"codon_mut"], gregexpr("\\d+(?=:)", all.false.doubles[,"codon_mut"], perl = TRUE)), function(z) abs(diff(as.integer(z))), integer(1)) + + all.false.doubles.codons <- str_split_fixed(all.false.doubles$codon_mut, ", ", 2) + all.false.doubles.codons[,1] <- as.integer(sub(":.*", "", all.false.doubles.codons[,1])) + all.false.doubles.codons[,2] <- as.integer(sub(":.*", "", all.false.doubles.codons[,2])) + median.high.conf.coverage.per.pos <- rep(NA, max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]))) + names(median.high.conf.coverage.per.pos) <- 1:max(as.numeric(str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1])) + for(i in 1:length(median.high.conf.coverage.per.pos)){ + median.high.conf.coverage.per.pos[i] <- median(input.counts.processed[which(names(median.high.conf.coverage.per.pos)[i] == str_split_fixed(input.counts.processed$codon_mut, ":", 2)[,1]),"cov"]) + } + all.false.doubles.codons[,1] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,1], names(median.high.conf.coverage.per.pos))] + all.false.doubles.codons[,2] <- median.high.conf.coverage.per.pos[match(all.false.doubles.codons[,2], names(median.high.conf.coverage.per.pos))] + all.false.doubles$max_cov_possible <- as.numeric(apply(all.false.doubles.codons, 1, min)) + all.false.doubles$perc_cov_to_max <- 100 * all.false.doubles$cov / all.false.doubles$max_cov_possible + + fit <- lm(log(all.false.doubles$perc_cov_to_max) ~ all.false.doubles$codon_dist + I(all.false.doubles$codon_dist^2)) + all.false.doubles$pred_cov_perc <- exp(predict(fit)) + + ### convert this into a look-up table: "% max. possible coverage" depending on codon-codon distance + dual.codon.coverage.estimate <- matrix(NA, nrow = max(all.false.doubles$codon_dist), ncol = 2) + colnames(dual.codon.coverage.estimate) <- c("Codon distance", "Estimate % max. possible coverage") + dual.codon.coverage.estimate[,"Codon distance"] <- 1:nrow(dual.codon.coverage.estimate) + dual.codon.coverage.estimate[,"Estimate % max. possible coverage"] <- all.false.doubles[match(dual.codon.coverage.estimate[,"Codon distance"], all.false.doubles$codon_dist),"pred_cov_perc"] + + ## Process the GATK file, only look at single nucleotide variants cat("Sequencing error correction of GATK counts...\n") for (i in grep("[,]", input.counts.processed[,"base_mut"], invert = T)){ - + + ### algorithm: + ### 1.) isolate 1nt variant of interest + ### 2.) look for 2/3nt hits in a pre-specified window of codons upstream or downstream + ### -> if there are zero false double mutants: skip / set correction factor to zero + ### -> if there are any false double mutants: continue + ### 3.) generate a table for ALL possible 2/3nt variants in the selected window: + ### true high-conf. variant count | true high-conf. variant coverage | false double double variant count (often 0 or 1) | false double double variant coverage + ### 4.) Maximum likelihood estimation (MLE) over all events + ### 5.) Apply correction factor + + ## 1.) isolate 1nt variant of interest tmp.single <- input.counts.processed[i,"base_mut"] + seq.error.rate[tmp.single, "1nt counts/coverage"] <- input.counts.processed[i,"counts_per_cov"] - ## locate this variant across all multi-codon variants in the original count matrix - tmp.false.doubles <- input.counts.raw[grep(paste0("(?= 3 & tmp.false.doubles[,"varying_codons"] < 3),] + ## subset multi-codon variants + tmp.false.doubles <- tmp.false.doubles[which(tmp.false.doubles[,"varying_bases"] >= 3 & tmp.false.doubles[,"varying_bases"] < 5 & tmp.false.doubles[,"varying_codons"] == 2),,drop = F] if(nrow(tmp.false.doubles) == 0){ next } - - ## need to match with the corresponding correct single codon variant(s) - tmp.true.singles <- tmp.false.doubles$base_mut - tmp.true.singles <- strsplit(tmp.true.singles, ", ") - tmp.true.singles <- lapply(tmp.true.singles, function(x){x <- x[x != tmp.single]; x <- paste0(x, collapse = ", "); return(x)}) - tmp.true.singles <- do.call(c, tmp.true.singles) - tmp.true.singles <- input.counts.processed[match(tmp.true.singles, input.counts.processed[,"base_mut"]),] - if(all(is.na(tmp.true.singles) == T)){ + + ## set and enforce distance threshold + tmp.ref.codon <- as.numeric(strsplit(input.counts.processed[i,"codon_mut"] , ":")[[1]][1]) + tmp.min.from.ref <- max(c(1, tmp.ref.codon - c(codon_window - 1))) + tmp.max.from.ref <- min(c(max.codon, tmp.ref.codon + c(codon_window - 1))) + tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2) + tmp.double.codon.n1 <- as.integer(sub(":.*", "", tmp.double.codon[,1])) + tmp.double.codon.n2 <- as.integer(sub(":.*", "", tmp.double.codon[,2])) + tmp.double.codon.within.range <- (tmp.double.codon.n1 >= tmp.min.from.ref & tmp.double.codon.n1 <= tmp.max.from.ref) & (tmp.double.codon.n2 >= tmp.min.from.ref & tmp.double.codon.n2 <= tmp.max.from.ref) + tmp.false.doubles <- tmp.false.doubles[which(tmp.double.codon.within.range == T),,drop = F] + if(nrow(tmp.false.doubles) == 0){ next } - - ## make sure that we only count events for which there are both true single codon and false double codon variants - if(any(is.na(tmp.true.singles$counts))){ - tmp.false.doubles <- tmp.false.doubles[-which(is.na(tmp.true.singles$counts)),] - tmp.true.singles <- tmp.true.singles[-which(is.na(tmp.true.singles$counts)),] - } - if(all(is.na(tmp.true.singles) == T)){ - next + + ## 3.) generate a table for ALL possible 2/3nt variants in the selected window + tmp.summary.table.entries <- input.counts.processed + tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) != tmp.ref.codon),] + tmp.summary.table.entries <- tmp.summary.table.entries[which(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) >= tmp.min.from.ref & + as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) <= tmp.max.from.ref),] + tmp.summary.table.entries <- tmp.summary.table.entries[which(tmp.summary.table.entries$varying_bases > 1),] + + ### fill the table + tmp.summary.table <- matrix(NA, ncol = 5, nrow = nrow(tmp.summary.table.entries)) + colnames(tmp.summary.table) <- c("codon distance", + "true high-conf. variant count", + "true high-conf. variant coverage", + "false double double variant count", + "false double double variant coverage") + rownames(tmp.summary.table) <- paste0(c(tmp.summary.table.entries$codon_mut),", ", c(tmp.summary.table.entries$base_mut)) + + tmp.summary.table[,"codon distance"] <- abs(as.numeric(str_split_fixed(tmp.summary.table.entries$codon_mut, ":", 2)[,1]) - tmp.ref.codon) + tmp.summary.table[,"true high-conf. variant count"] <- tmp.summary.table.entries$counts + tmp.summary.table[,"true high-conf. variant coverage"] <- tmp.summary.table.entries$cov + tmp.summary.table[,"false double double variant count"] <- 0 + + ## input the actual hits + tmp.double.codon <- str_split_fixed(tmp.false.doubles$codon_mut, ", ", 2) + tmp.match1 <- match(tmp.double.codon[,1], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1]) + tmp.match1[is.na(tmp.match1)] <- 0 + tmp.match2 <- match(tmp.double.codon[,2], str_split_fixed(rownames(tmp.summary.table), ", ", 2)[,1]) + tmp.match2[is.na(tmp.match2)] <- 0 + tmp.match <- c(tmp.match1 + tmp.match2) + + ### remove those false doubles for which there are no matched true high confidence calls + if(any(tmp.match == 0)){ + tmp.false.doubles <- tmp.false.doubles[-which(tmp.match == 0),,drop = F] + tmp.match <- tmp.match[-which(tmp.match == 0)] + if(nrow(tmp.false.doubles) == 0){ + next + } } + + tmp.summary.table[tmp.match,"false double double variant count"] <- tmp.false.doubles[,"counts"] + tmp.summary.table[tmp.match,"false double double variant coverage"] <- tmp.false.doubles[,"cov"] + tmp.summary.table <- as.data.frame(tmp.summary.table) + + ## 4.) Maximum likelihood estimation (MLE) over all events, iterating over codon positions + + ### summarise data by position + tmp.summary.table.pos <- matrix(NA, nrow = length(unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1])), ncol = 5) + colnames(tmp.summary.table.pos) <- colnames(tmp.summary.table) + rownames(tmp.summary.table.pos) <- unique(str_split_fixed(rownames(tmp.summary.table), ">", 2)[,1]) + tmp.summary.table.pos[,"codon distance"] <- abs(as.numeric(str_split_fixed(rownames(tmp.summary.table.pos), ":", 2)[,1]) - as.numeric(str_split_fixed(input.counts.processed[i,"codon_mut"], ":", 2)[,1])) + for(j in 1:nrow(tmp.summary.table.pos)){ + tmp.name <- rownames(tmp.summary.table.pos)[j] + tmp.summary.table.pos[j,"true high-conf. variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant count"]) + tmp.summary.table.pos[j,"true high-conf. variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"true high-conf. variant coverage"]) + tmp.summary.table.pos[j,"false double double variant count"] <- sum(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant count"]) + tmp.summary.table.pos[j,"false double double variant coverage"] <- max(tmp.summary.table[grep(paste0("^",tmp.name), rownames(tmp.summary.table)),"false double double variant coverage"], na.rm = T) - ## calculate the expected false 1nt count probability (MLE) - e_MLE <- sum(tmp.false.doubles$counts) / sum(tmp.true.singles$counts * c(tmp.false.doubles$cov / tmp.true.singles$cov)) + ## infer a good proxy for the false double variant coverage with 0 counts (based estimated distance-dependent % coverage decay observed across all observed double mutants) + if(tmp.summary.table.pos[j,"false double double variant coverage"] == "-Inf"){ + tmp.summary.table.pos[j,"false double double variant coverage"] <- c(dual.codon.coverage.estimate[abs(tmp.summary.table.pos[j,"codon distance"]),"Estimate % max. possible coverage"] / 100) * tmp.summary.table.pos[j,"true high-conf. variant coverage"] + tmp.summary.table.pos[j,"false double double variant coverage"] <- round(tmp.summary.table.pos[j,"false double double variant coverage"]) + } + } + tmp.summary.table.pos <- as.data.frame(tmp.summary.table.pos) + e_MLE <- sum(tmp.summary.table.pos$`false double double variant count`) / + sum(tmp.summary.table.pos$`true high-conf. variant count` * c(tmp.summary.table.pos$`false double double variant coverage` / tmp.summary.table.pos$`true high-conf. variant coverage`)) + seq.error.rate[tmp.single,"1nt false counts/coverage"] <- e_MLE input.counts.processed[i,"counts_per_cov_corrected"] <- input.counts.processed[i,"counts_per_cov"] - e_MLE - - ## based on this, also adjust the total_counts by cross-multiplication - input.counts.processed[i,"counts_corrected"] <- input.counts.processed[i,"counts"] * c(input.counts.processed[i,"counts_per_cov_corrected"] / input.counts.processed[i,"counts_per_cov"]) + + ## 5.) Apply correction factor + input.counts.processed[i,"counts_corrected"] <- input.counts.processed[i,"counts"] - c(e_MLE * input.counts.processed[i,"cov"]) ## round to nearest integer, do not allow for negative counts input.counts.processed[i,"counts_corrected"] <- round(input.counts.processed[i,"counts_corrected"]) @@ -72,12 +188,13 @@ seq_error_correct_by_false_doubles_MLE <- function(input_count_path_raw, input_c } # Write the processed data + write.csv(seq.error.rate, file = seq_error_rate_path, row.names = T) write.csv(input.counts.processed, file = output_file_path, row.names = F) } # to re-make the AA level input table after WT sequencing error correction -seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold = 3) { +seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_path, output_csv_path, threshold) { # Load the raw GATK data raw_gatk <- read.table(gatk_file_path, sep = ",", header = TRUE, stringsAsFactors = FALSE) @@ -172,19 +289,24 @@ seq_error_correct_counts_for_heatmaps <- function(gatk_file_path, aa_seq_file_pa rownames(heatmap_data) <- 1:nrow(heatmap_data) # Save output - write.csv(heatmap_data, file = output_csv_path, row.names = FALSE) + write.csv(heatmap_data, file = output_csv_path, row.names = F) print(paste("Aggregated data saved to:", output_csv_path)) - + } ## --- Main functions --- -seq_error_correct_by_false_doubles_MLE(input_count_path_raw = "intermediate_files/gatk/input_1_pe/gatk_output.variantCounts", - input_count_path_processed = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library.csv", - output_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv") +seq_error_correct_by_false_doubles_MLE(wt_path = "GID1A_bPCA.fasta", + input_count_path_raw = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/gatk/GID1A_input_1_pe/gatk_output.variantCounts", + input_count_path_processed = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library.csv", + output_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_MLE.csv", + seq_error_rate_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/seq_error_rate_MLE.csv", + codon_window = 40) + +seq_error_correct_counts_for_heatmaps(gatk_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles_MLE.csv", + aa_seq_file_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/aa_seq.txt", + output_csv_path = "GID1A_bPCA_conc1_out_min1_noWT_removal_Q40/intermediate_files/processed_gatk_files/GID1A_input_1_pe/variantCounts_for_heatmaps_err_corrected_false_doubles_MLE.csv", + threshold = 3) -seq_error_correct_counts_for_heatmaps(gatk_file_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_filtered_by_library_err_corrected_false_doubles.csv", - aa_seq_file_path = "intermediate_files/aa_seq.txt", - output_csv_path = "intermediate_files/processed_gatk_files/input_1_pe/variantCounts_for_heatmaps_err_corrected_false_doubles.csv") #### # create versions.yml