From a4fd538181b2527e19a4b9baf554ca031de9dd3d Mon Sep 17 00:00:00 2001 From: jvrakor Date: Tue, 26 Jun 2018 15:41:30 -0500 Subject: [PATCH 1/2] Make Aquas ChIP-seq script work for paired-end reads --- CHANGELOG.rst | 4 ++++ chipseq_aquas_pipeline.R | 45 +++++++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index aef3151..383b927 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -8,3 +8,7 @@ All notable changes to this project are documented in this file. Added ----- - Add initial Aquas ChIP-Seq R script + +Fixed +----- +- Make Aquas ChIP-Seq R script work for paired-end reads diff --git a/chipseq_aquas_pipeline.R b/chipseq_aquas_pipeline.R index b40d7e1..01ed3c7 100755 --- a/chipseq_aquas_pipeline.R +++ b/chipseq_aquas_pipeline.R @@ -56,7 +56,8 @@ pushAquasAlignQC <- function(project.dir, dt.location, chip.type = "histone", } else if(endedness == "-se"){ input.file.cmd <- sprintf("-fastq1 %s", fastq.files[1]) } else { - input.file.cmd <- sprintf("-fastq1_1:%s -fastq1_2:%s", fastq.files[1], fastq.files[2]) + + input.file.cmd <- sprintf("-fastq1_1 %s -fastq1_2 %s", fastq.files[1], fastq.files[2]) } ##Check if everything needs to be done for true replicates @@ -236,9 +237,19 @@ mvBams <- function(project.dir, dt.location){ file.id <- as.character(subset(df.sample.info, NAME == sample.name)$UNIQUE_ID) + ##location of the fastq file + fastq.file.location <- as.character(subset(df.sample.info, NAME == sample.name)$FASTQ_FILE) + fastq.files <- unlist(strsplit(split = "::", fastq.file.location)) + ##Move the bam files - all.bam.files <- list.files(path = aquas.dir, pattern = "*.nodup.bam$", - full.names = T, recursive = T) + if(length(fastq.files) == 2){ + all.bam.files <- list.files(path = aquas.dir, pattern = "*.PE2SE.nodup.bam$", + full.names = T, recursive = T) + } else if (length(fastq.files) == 1){ + all.bam.files <- list.files(path = aquas.dir, pattern = "*.nodup.bam$", + full.names = T, recursive = T) + } + bam.file.logical <- grepl(pattern = file.id, x = all.bam.files) bam.file.location <- file.path(all.bam.files[bam.file.logical]) @@ -263,9 +274,19 @@ mvBams <- function(project.dir, dt.location){ file.id <- as.character(subset(df.sample.info, NAME == sample.name)$UNIQUE_ID) + ##location of the fastq file + fastq.file.location <- as.character(subset(df.sample.info, NAME == sample.name)$FASTQ_FILE) + fastq.files <- unlist(strsplit(split = "::", fastq.file.location)) + ##Move the tagAlign files - all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.nodup.tagAlign.gz$", - full.names = T, recursive = T) + if(length(fastq.files) == 2){ + all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.PE2SE.nodup.tagAlign.gz$", + full.names = T, recursive = T) + } else if (length(fastq.files) == 1){ + all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.nodup.tagAlign.gz$", + full.names = T, recursive = T) + } + tagAlign.file.logical <- grepl(pattern = file.id, x = all.tagAlign.files) tagAlign.file.location <- file.path(all.tagAlign.files[tagAlign.file.logical]) @@ -466,9 +487,19 @@ getCustomQC <- function(project.dir, dt.location, peaks.dirname){ stop("Experiment bam file not found") } + ##location of the fastq file + fastq.file.location <- as.character(subset(df.sample.info, NAME == sample.name)$FASTQ_FILE) + fastq.files <- unlist(strsplit(split = "::", fastq.file.location)) + ##get estimated fragment length from cross-corr. analysis log - cc.qc.log.files <- list.files(path = aquas.dir, pattern = "*.cc.qc$", full.names = T, - recursive = T) + if(length(fastq.files) == 2){ + cc.qc.log.files <- list.files(path = aquas.dir, pattern = "*.R1.cc.qc$", full.names = T, + recursive = T) + } else if (length(fastq.files) == 1){ + cc.qc.log.files <- list.files(path = aquas.dir, pattern = "*.cc.qc$", full.names = T, + recursive = T) + } + expt.cc.file.logical <- grepl(pattern = expt.file.id, x = cc.qc.log.files) expt.cc.file.location <- cc.qc.log.files[expt.cc.file.logical] fraglen <- read.delim(expt.cc.file.location, header = F)[1, 3] From 5baf817a1d3b8361550c2342d99eb8241d84420e Mon Sep 17 00:00:00 2001 From: jvrakor Date: Thu, 2 Aug 2018 14:41:53 -0500 Subject: [PATCH 2/2] Disable paired-end reads initial trimming --- CHANGELOG.rst | 4 ++++ chipseq_aquas_pipeline.R | 16 +++++++++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 383b927..c20b725 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,10 @@ Added ----- - Add initial Aquas ChIP-Seq R script +Changed +------- +- Disable paired-end reads initial trimming + Fixed ----- - Make Aquas ChIP-Seq R script work for paired-end reads diff --git a/chipseq_aquas_pipeline.R b/chipseq_aquas_pipeline.R index 01ed3c7..6ef96ff 100755 --- a/chipseq_aquas_pipeline.R +++ b/chipseq_aquas_pipeline.R @@ -1,6 +1,7 @@ suppressMessages(library(GenomicFeatures)) suppressMessages(library(GenomicAlignments)) +# TODO: remove absolute paths and turn them into inputs scripts.dir <- "/storage/cylin/bin/aquas/chip" fn.dir <- "/storage/cylin/anaconda3/envs/aquas_chipseq/bin/" @@ -56,7 +57,6 @@ pushAquasAlignQC <- function(project.dir, dt.location, chip.type = "histone", } else if(endedness == "-se"){ input.file.cmd <- sprintf("-fastq1 %s", fastq.files[1]) } else { - input.file.cmd <- sprintf("-fastq1_1 %s -fastq1_2 %s", fastq.files[1], fastq.files[2]) } @@ -69,6 +69,9 @@ pushAquasAlignQC <- function(project.dir, dt.location, chip.type = "histone", screen.name <- sprintf("align%d%d", sample(1:1000000, size = 1), sample(1:1000000, size = 1)) aquas.cmd <- sprintf("python %s/chipseq.py -screen %s", scripts.dir, screen.name) aquas.cmd <- sprintf("%s -type %s -final_stage xcor", aquas.cmd, chip.type) + if(endedness == "-pe"){ + aquas.cmd <- sprintf("%s -pe_no_trim_fastq", aquas.cmd) + } aquas.cmd <- sprintf("%s -out_dir %s %s -species %s", aquas.cmd, output.dir, endedness, genome.name) aquas.cmd <- sprintf("%s -system slurm -nth %d %s", aquas.cmd, ncores, input.file.cmd) @@ -249,7 +252,7 @@ mvBams <- function(project.dir, dt.location){ all.bam.files <- list.files(path = aquas.dir, pattern = "*.nodup.bam$", full.names = T, recursive = T) } - + bam.file.logical <- grepl(pattern = file.id, x = all.bam.files) bam.file.location <- file.path(all.bam.files[bam.file.logical]) @@ -280,10 +283,10 @@ mvBams <- function(project.dir, dt.location){ ##Move the tagAlign files if(length(fastq.files) == 2){ - all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.PE2SE.nodup.tagAlign.gz$", + all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.PE2SE.nodup.tagAlign.gz$", full.names = T, recursive = T) } else if (length(fastq.files) == 1){ - all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.nodup.tagAlign.gz$", + all.tagAlign.files <- list.files(path = aquas.dir, pattern = "*.nodup.tagAlign.gz$", full.names = T, recursive = T) } @@ -424,8 +427,9 @@ createSignalTrack <- function(project.dir, dt.location){ #bw.output.file <- sprintf("%s%s", bdg.output.file, "_treat_pileup.bw") #wig.output.file <- sprintf("%s%s", bdg.output.file, "_treat_pileup.wig") - signal.cmd <- sprintf("%s callpeak -t %s -c %s -B --nomodel --extsize %d --trackline --SPMR -g %s -n %s", + signal.cmd <- sprintf("%s callpeak -t %s -c %s -B --nomodel --extsize %d --trackline --SPMR -g %s -n %s", macs2.location, expt.bam.file.location, bkg.bam.file.location, fraglen, genome.abbrv, bdg.output.file) + # TODO: make it work if there is no background track.name.cmd <- sprintf("sed -i '/name/ s/treatment pileup/%s/' %s_treat_pileup.bdg", sample.name ,bdg.output.file) gzip.cmd <- sprintf("gzip %s_treat_pileup.bdg", bdg.output.file) @@ -462,7 +466,6 @@ getCustomQC <- function(project.dir, dt.location, peaks.dirname){ if(!dir.exists(aquas.dir)){ stop("No aquas output") } - df.sample.info <- read.delim(dt.location, sep = "\t", header = T) sample.names <- as.character(subset(df.sample.info, ENRICHED_MACS != "NONE")$NAME) df <- do.call(rbind, mclapply(sample.names, function(sample.name){ @@ -499,7 +502,6 @@ getCustomQC <- function(project.dir, dt.location, peaks.dirname){ cc.qc.log.files <- list.files(path = aquas.dir, pattern = "*.cc.qc$", full.names = T, recursive = T) } - expt.cc.file.logical <- grepl(pattern = expt.file.id, x = cc.qc.log.files) expt.cc.file.location <- cc.qc.log.files[expt.cc.file.logical] fraglen <- read.delim(expt.cc.file.location, header = F)[1, 3]