diff --git a/2passtools.yml b/2passtools.yml index cdea752..0904899 100755 --- a/2passtools.yml +++ b/2passtools.yml @@ -3,6 +3,8 @@ channels: - conda-forge - bioconda dependencies: + - minimap2 + - samtools - python=3.6 - pip - numpy diff --git a/lib2pass/main.py b/lib2pass/main.py index bab753c..97e13a4 100755 --- a/lib2pass/main.py +++ b/lib2pass/main.py @@ -3,6 +3,7 @@ ''' import os import logging +import tempfile import click import click_log @@ -282,10 +283,128 @@ def filter(bed_fn, output_bed_fn, exprs): @main.command() +@click.argument('fastq-fn', required=True, nargs=1) +@click.option('-b', '--output-bam-prefix', required=True, help='Output bam prefix') +@_common_options(SCORE_MERGE_COMMON_OPTIONS) +@click.option('--stranded/--unstranded', default=True, + help=('Whether input data is stranded or unstranded. ' + 'direct RNA is stranded, cDNA often isn\'t')) +@click.option('--exprs', required=False, default="decision_tree_2_pred") +@click.option('-mk', '--mm2-k', default=14, type=int, help='Minimap2 minimizer kmer size') +@click.option('-mw', '--mm2-w', default=5, type=int, help='Minimap2 minimizer window size') +@click.option('--mm2-splice-flank/--mm2-no-splice-flank', default=True, + help='Assume the next base to a GU donor site tends to be A/G') +@click.option('-mc', '--mm2-noncanon-splicing-pen', default=9, type=int, + help='Penalty for non-canonical (non GU/AG) splicing') +@click.option('--mm2-junc-bonus', default=12, type=int, + help='Score bonus for a splice donor or acceptor found in guide junctions') +@click.option('-mG', '--mm2-max-intron-size', default=100_000, type=int, help='Maximum intron size') +@click.option('--mm2-end-seed-pen', default=12, type=int, help='helps avoid tiny terminal exons') +@click.option('-p', '--processes', default=1) +@click.option('-s', '--random-seed', default=None, type=int) @click_log.simple_verbosity_option(log) -def mm2pass(): - raise NotImplementedError('TODO: implement convience tool to wrap ' - 'minimap2 and run two pass alignment') +def mm2pass(fastq_fn, output_bam_prefix, + output_bed_fn, ref_fasta_fn, annot_bed_fn, + jad_size_threshold, + primary_splice_local_dist, canonical_motifs, + lr_window_size, lr_kfold, + lr_low_confidence_threshold, lr_high_confidence_threshold, + classifier_type, keep_all_annot, stranded, exprs, + mm2_k, mm2_w, mm2_splice_flank, mm2_noncanon_splicing_pen, + mm2_junc_bonus, mm2_max_intron_size, mm2_end_seed_pen, + processes, random_seed): + ''' + 2passtools mm2pass: Wrapper which performs two pass alignment using + minimap2 and 2passtools score/filter. + ''' + if random_seed is not None: + np.random.seed(random_seed) + + from .minimap2 import map_with_minimap2 + + ## FIRST PASS + first_pass_fn = f'{output_bam_prefix}.1pass.bam' + log.info(f'First pass alignment will be output to {first_pass_fn}') + if annot_bed_fn is not None: + log.info(f'Using guide junctions from {annot_bed_fn}') + map_with_minimap2( + fastq_fn, + ref_fasta_fn, + output_fn=first_pass_fn, + threads=processes, + k=mm2_k, w=mm2_w, + splice_flank=mm2_splice_flank, + noncanon_pen=mm2_noncanon_splicing_pen, + canon_splice_strand='f' if stranded else 'b', + junc_bed=annot_bed_fn, + junc_bonus=mm2_junc_bonus, + max_intron_size=mm2_max_intron_size, + end_seed_pen=mm2_end_seed_pen, + ) + + ## SCORING + log.info(f'Parsing BAM file: {first_pass_fn}') + (introns, motifs, lengths, + counts, jad_labels, + is_primary_donor, is_primary_acceptor) = parse_introns( + first_pass_fn, + primary_splice_local_dist, + stranded, + 1_000_000, processes + ) + res = zip(*_all_predictions( + introns, motifs, lengths, counts, jad_labels, + is_primary_donor, is_primary_acceptor, + ref_fasta_fn, annot_bed_fn, + canonical_motifs, jad_size_threshold, + lr_window_size, lr_kfold, + lr_low_confidence_threshold, + lr_high_confidence_threshold, + classifier_type, keep_all_annot, processes + )) + + log.info(f'Writing results to {output_bed_fn}') + with open(output_bed_fn, 'w') as bed: + for i, motif, _, c, jad, pd, pa, d1, lrd, lra, d2 in res: + chrom, start, end, strand = i + bed.write( + f'{chrom:s}\t{start:d}\t{end:d}\t{motif:s}\t{c:d}\t{strand:s}\t' + f'{jad:d}\t{pd:d}\t{pa:d}\t{d1:d}\t' + f'{lrd:.3f}\t{lra:.3f}\t{d2:d}\n' + ) + + ## FILTERING + log.info('Getting filtered guide junctions') + b_handle, filtered_bed_fn = tempfile.mkstemp(suffix='.bed') + with open(filtered_bed_fn, 'w') as bed: + for chrom, start, end, strand, decision in apply_eval_expression(output_bed_fn, exprs): + if decision: + record = f'{chrom}\t{start}\t{end}\tintron\t0\t{strand}\n' + bed.write(record) + + ## SECOND PASS + second_pass_fn = f'{output_bam_prefix}.2pass.bam' + log.info(f'Second pass alignment will be output to {second_pass_fn}') + map_with_minimap2( + fastq_fn, + ref_fasta_fn, + output_fn=second_pass_fn, + threads=processes, + k=mm2_k, w=mm2_w, + splice_flank=mm2_splice_flank, + noncanon_pen=mm2_noncanon_splicing_pen, + canon_splice_strand='f' if stranded else 'b', + junc_bed=filtered_bed_fn, + junc_bonus=mm2_junc_bonus, + max_intron_size=mm2_max_intron_size, + end_seed_pen=mm2_end_seed_pen, + ) + os.close(b_handle) + os.remove(filtered_bed_fn) + log.info('Complete!') + + + if __name__ == '__main__': diff --git a/lib2pass/minimap2.py b/lib2pass/minimap2.py index 7bcef3e..ab6907f 100755 --- a/lib2pass/minimap2.py +++ b/lib2pass/minimap2.py @@ -1,11 +1,20 @@ import os import subprocess import tempfile +import logging -MINIMAP2 = os.path.abspath( - os.path.split(__file__)[0] + - '/../external/minimap2/minimap2' -) +log = logging.getLogger('2passtools') + +MINIMAP2 = os.environ.get('MINIMAP2_PATH', 'minimap2') +SAMTOOLS = os.environ.get('SAMTOOLS_PATH', 'samtools') + +for program, prg_name in zip([MINIMAP2, SAMTOOLS], ['minimap2', 'samtools']): + try: + subprocess.check_call([program, '--help'], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + except FileNotFoundError: + raise OSError(f'{prg_name} not found at location "{program}"') def subprocess_command(cmd, stdout_fn): @@ -19,50 +28,58 @@ def subprocess_command(cmd, stdout_fn): if proc.returncode: raise subprocess.CalledProcessError(stderr.decode()) else: - return stderr.decode() + return stderr.decode().strip() -def map_with_minimap2(fastq_fn, reference_fn, output_fn, threads=1, - use_canon=False, noncanon_pen=9, - junc_bed=None, junc_bonus=9): +def map_with_minimap2(fastq_fn, reference_fn, output_fn=None, threads=1, + k=14, w=5, splice_flank=True, noncanon_pen=9, + canon_splice_strand='f', + junc_bed=None, junc_bonus=12, + max_intron_size=100000, + end_seed_pen=12): if not os.path.exists(fastq_fn): raise OSError('fastq_fn not found') elif not os.path.exists(reference_fn): raise OSError('reference_fn not found') - splice_flank = 'yes' if use_canon else 'no' - noncanon_pen = noncanon_pen if use_canon else 0 - use_canon = 'f' if use_canon else 'n' + splice_flank = 'yes' if splice_flank else 'no' s_handle, sam_fn = tempfile.mkstemp(suffix='.sam') b_handle, bam_fn = tempfile.mkstemp(suffix='.bam') # run minimap minimap2_cmd = [ - MINIMAP2, f'-t{threads}', '-k14', '-w5', '--splice', - '-g2000', '-G10000', '-A1', '-B2', '-O2,32', '-E1,0', - f'-C{noncanon_pen}', f'--splice-flank={splice_flank}', f'-u{use_canon}', - '-z200', '-L', '--cs=long', '-a' + MINIMAP2, f'-t{threads}', '-a', '-x', 'splice', + '-L', '--cs=long', f'-G{max_intron_size}', + f'-C{noncanon_pen}', f'--splice-flank={splice_flank}', + f'-u{canon_splice_strand}', f'--end-seed-pen={end_seed_pen}', ] if junc_bed is not None: minimap2_cmd += ['--junc-bed', junc_bed, f'--junc-bonus={junc_bonus}'] minimap2_cmd += [reference_fn, fastq_fn] + + log.info('Running minimap2') + log.debug('cmd: ' + ' '.join(minimap2_cmd)) minimap2_stderr = subprocess_command(minimap2_cmd, sam_fn) + log.info('Mapping complete') + log.debug(f'minimap2 stderr:\n{minimap2_stderr}') # run samtools view - samtools_view_cmd = ['samtools', 'view', '-bS', sam_fn] + samtools_view_cmd = [SAMTOOLS, 'view', '-bS', sam_fn] samtools_view_stderr = subprocess_command(samtools_view_cmd, bam_fn) + # clean up minimap2 output os.close(s_handle) os.remove(sam_fn) # run samtools sort - samtools_sort_cmd = ['samtools', 'sort', '-@', str(threads), '-o', '-', bam_fn] + samtools_sort_cmd = [SAMTOOLS, 'sort', '-@', str(threads), '-o', '-', bam_fn] samtools_sort_stderr = subprocess_command(samtools_sort_cmd, output_fn) + log.debug(f'samtools sort stderr:\n{samtools_sort_stderr}') # clean up samtools view output os.close(b_handle) os.remove(bam_fn) # run samtools index - samtools_index_cmd = ['samtools', 'index', output_fn] + samtools_index_cmd = [SAMTOOLS, 'index', output_fn] subprocess.check_call(samtools_index_cmd) \ No newline at end of file diff --git a/setup.py b/setup.py index e7af1e4..a9767c4 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name='2passtools', - version='0.3.1', + version='0.4.1', description=( 'two pass alignment of long noisy reads' ),