From 705c04c92bba352d6e3e05d79b7a9cf460aefb65 Mon Sep 17 00:00:00 2001 From: Cecile Date: Tue, 5 Dec 2017 09:59:13 +0100 Subject: [PATCH 1/4] Add blast.ml to lib/bioinfo --- lib/bioinfo/blast.ml | 72 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 lib/bioinfo/blast.ml diff --git a/lib/bioinfo/blast.ml b/lib/bioinfo/blast.ml new file mode 100644 index 00000000..31574766 --- /dev/null +++ b/lib/bioinfo/blast.ml @@ -0,0 +1,72 @@ +open Core +open Bistro.Std +open Bistro.EDSL +open Bistro_bioinfo.Std + +type db = [`blast_db] directory +let env = docker_image ~account:"pveber" ~name:"ncbi-blast" ~tag:"2.4.0" () + +let db_name = "db" + +let fastadb fa dbtype = + workflow ~descr:"blast.makedb" [ + mkdir_p dest ; + cmd ~env "makeblastdb" [ + opt "-in" dep fa ; + opt "-dbtype" ident dbtype ; + opt "-out" ident (dest // db_name) ; + ] ; + ] + +(* Basic blastn*) + +let blastn ?evalue ?word_size ?task ?gapopen ?gapextend ?penalty +?reward ?outfmt ?perc_identity ?qcov_hsp_perc ?max_hsps ?max_target_seqs ?(threads = 4) db query out_name = (*See blastn documentation to know what options are*) + workflow ~descr:"blastn" ~np:threads [ + mkdir_p dest ; + cmd "blastn" ~env [ + opt "-db" ident (dep db // db_name) ; + opt "-query" dep query ; + opt "-out" ident (dest // out_name) ; + option (opt "-evalue" float) evalue ; + option (opt "-word_size" int) word_size ; + option (opt "-task" string) task ; + option (opt "-gapopen" int) gapopen ; + option (opt "-gapextend" int) gapextend ; + option (opt "-penalty" int) penalty ; + option (opt "-reward" int) reward ; + option (opt "-outfmt" string) outfmt ; + option (opt "-perc_identity" float) perc_identity ; + option (opt "-qcov_hsp_perc" float) qcov_hsp_perc ; + option (opt "-max_hsps" int) max_hsps ; + option (opt "-max_target_seqs" int) max_target_seqs ; + opt "-num_threads" ident np ; + ] + ] + +let blastp ?evalue ?word_size ?task ?gapopen ?gapextend ?penalty +?reward ?outfmt ?perc_identity ?qcov_hsp_perc ?max_hsps ?max_target_seqs ?(threads = 4) db query out_name = (*See blastn documentation to know what options are*) + workflow ~descr:"blastp" ~np:threads [ + mkdir_p dest ; + cmd "blastp" ~env [ + opt "-db" ident (dep db // db_name) ; + opt "-query" dep query ; + opt "-out" ident (dest // out_name) ; + option (opt "-evalue" float) evalue ; + option (opt "-word_size" int) word_size ; + option (opt "-task" string) task ; + option (opt "-gapopen" int) gapopen ; + option (opt "-gapextend" int) gapextend ; + option (opt "-penalty" int) penalty ; + option (opt "-reward" int) reward ; + option (opt "-outfmt" string) outfmt ; + option (opt "-perc_identity" float) perc_identity ; + option (opt "-qcov_hsp_perc" float) qcov_hsp_perc ; + option (opt "-max_hsps" int) max_hsps ; + option (opt "-max_target_seqs" int) max_target_seqs ; + opt "-num_threads" ident np ; + ] + ] + + + From 1f01944900ff2fd04a351e091769cf1482c9fcee Mon Sep 17 00:00:00 2001 From: Cecile Date: Tue, 5 Dec 2017 13:44:40 +0100 Subject: [PATCH 2/4] Add selectors to re-use prokka outputs + Modify command line to have an unique prefix name for results and recover them more easily --- lib/bioinfo/prokka.ml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/lib/bioinfo/prokka.ml b/lib/bioinfo/prokka.ml index 3a4414dc..3d2e5928 100644 --- a/lib/bioinfo/prokka.ml +++ b/lib/bioinfo/prokka.ml @@ -8,15 +8,14 @@ let gram_expr = function | `Plus -> string "+" | `Minus -> string "-" -let run ?prefix ?addgenes ?locustag ?increment ?gffver ?compliant +let run ?addgenes ?locustag ?increment ?gffver ?compliant ?centre ?genus ?species ?strain ?plasmid ?kingdom ?gcode ?gram ?usegenus ?proteins ?hmms ?metagenome ?rawproduct ?fast ?(threads = 1) ?mincontiglen ?evalue ?rfam ?norrna ?notrna ?rnammer fa = workflow ~descr:"prokka" ~np:threads ~mem:(3 * 1024) [ mkdir_p dest ; cmd "prokka" ~env [ - string "--force" ; - option (opt "--prefix" string) prefix ; + string "--force --prefix prokka_res" ; option (flag string "--addgenes") addgenes ; option (opt "--locustag" string) locustag ; option (opt "--increment" int) increment ; @@ -47,3 +46,9 @@ let run ?prefix ?addgenes ?locustag ?increment ?gffver ?compliant dep fa ; ] ; ] + + +let transcripts = selector ["prokka_res.ffn"] +let proteins = selector ["prokka_res.faa"] +let gff_annotation = selector [ "prokka_res.gff" ] +let gbk_annotation = selector [ "prokka_res.gbk" ] From a4c22425f1bf379c4e6dc5bf2cea69787925245a Mon Sep 17 00:00:00 2001 From: Cecile Date: Wed, 6 Dec 2017 18:01:16 +0100 Subject: [PATCH 3/4] Revert "Add selectors to re-use prokka outputs + Modify command line to have an unique prefix name for results and recover them more easily" This reverts commit 1f01944900ff2fd04a351e091769cf1482c9fcee. --- lib/bioinfo/prokka.ml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/lib/bioinfo/prokka.ml b/lib/bioinfo/prokka.ml index 3d2e5928..3a4414dc 100644 --- a/lib/bioinfo/prokka.ml +++ b/lib/bioinfo/prokka.ml @@ -8,14 +8,15 @@ let gram_expr = function | `Plus -> string "+" | `Minus -> string "-" -let run ?addgenes ?locustag ?increment ?gffver ?compliant +let run ?prefix ?addgenes ?locustag ?increment ?gffver ?compliant ?centre ?genus ?species ?strain ?plasmid ?kingdom ?gcode ?gram ?usegenus ?proteins ?hmms ?metagenome ?rawproduct ?fast ?(threads = 1) ?mincontiglen ?evalue ?rfam ?norrna ?notrna ?rnammer fa = workflow ~descr:"prokka" ~np:threads ~mem:(3 * 1024) [ mkdir_p dest ; cmd "prokka" ~env [ - string "--force --prefix prokka_res" ; + string "--force" ; + option (opt "--prefix" string) prefix ; option (flag string "--addgenes") addgenes ; option (opt "--locustag" string) locustag ; option (opt "--increment" int) increment ; @@ -46,9 +47,3 @@ let run ?addgenes ?locustag ?increment ?gffver ?compliant dep fa ; ] ; ] - - -let transcripts = selector ["prokka_res.ffn"] -let proteins = selector ["prokka_res.faa"] -let gff_annotation = selector [ "prokka_res.gff" ] -let gbk_annotation = selector [ "prokka_res.gbk" ] From f487d3bc9e388b24e02c3d2460c8f1234c320a67 Mon Sep 17 00:00:00 2001 From: Cecile Date: Wed, 6 Dec 2017 18:08:40 +0100 Subject: [PATCH 4/4] Try correction blast.ml --- lib/bioinfo/blast.ml | 74 ++++++++++++-------------------------------- 1 file changed, 20 insertions(+), 54 deletions(-) diff --git a/lib/bioinfo/blast.ml b/lib/bioinfo/blast.ml index 31574766..7db1ee75 100644 --- a/lib/bioinfo/blast.ml +++ b/lib/bioinfo/blast.ml @@ -8,65 +8,31 @@ let env = docker_image ~account:"pveber" ~name:"ncbi-blast" ~tag:"2.4.0" () let db_name = "db" -let fastadb fa dbtype = +let makedb ~dbtype:dbtype fa = + let args = match dbtype with + | `Nucl -> string "-dbtype nucl" + | `Prot -> string "-dbtype prot" + in workflow ~descr:"blast.makedb" [ - mkdir_p dest ; cmd ~env "makeblastdb" [ opt "-in" dep fa ; - opt "-dbtype" ident dbtype ; - opt "-out" ident (dest // db_name) ; + opt "-out" ident (dest // db_name); + args ; ] ; ] (* Basic blastn*) -let blastn ?evalue ?word_size ?task ?gapopen ?gapextend ?penalty -?reward ?outfmt ?perc_identity ?qcov_hsp_perc ?max_hsps ?max_target_seqs ?(threads = 4) db query out_name = (*See blastn documentation to know what options are*) - workflow ~descr:"blastn" ~np:threads [ - mkdir_p dest ; - cmd "blastn" ~env [ - opt "-db" ident (dep db // db_name) ; - opt "-query" dep query ; - opt "-out" ident (dest // out_name) ; - option (opt "-evalue" float) evalue ; - option (opt "-word_size" int) word_size ; - option (opt "-task" string) task ; - option (opt "-gapopen" int) gapopen ; - option (opt "-gapextend" int) gapextend ; - option (opt "-penalty" int) penalty ; - option (opt "-reward" int) reward ; - option (opt "-outfmt" string) outfmt ; - option (opt "-perc_identity" float) perc_identity ; - option (opt "-qcov_hsp_perc" float) qcov_hsp_perc ; - option (opt "-max_hsps" int) max_hsps ; - option (opt "-max_target_seqs" int) max_target_seqs ; - opt "-num_threads" ident np ; - ] - ] - -let blastp ?evalue ?word_size ?task ?gapopen ?gapextend ?penalty -?reward ?outfmt ?perc_identity ?qcov_hsp_perc ?max_hsps ?max_target_seqs ?(threads = 4) db query out_name = (*See blastn documentation to know what options are*) - workflow ~descr:"blastp" ~np:threads [ - mkdir_p dest ; - cmd "blastp" ~env [ - opt "-db" ident (dep db // db_name) ; - opt "-query" dep query ; - opt "-out" ident (dest // out_name) ; - option (opt "-evalue" float) evalue ; - option (opt "-word_size" int) word_size ; - option (opt "-task" string) task ; - option (opt "-gapopen" int) gapopen ; - option (opt "-gapextend" int) gapextend ; - option (opt "-penalty" int) penalty ; - option (opt "-reward" int) reward ; - option (opt "-outfmt" string) outfmt ; - option (opt "-perc_identity" float) perc_identity ; - option (opt "-qcov_hsp_perc" float) qcov_hsp_perc ; - option (opt "-max_hsps" int) max_hsps ; - option (opt "-max_target_seqs" int) max_target_seqs ; - opt "-num_threads" ident np ; - ] - ] - - - +let results = "results.blast" +let blastp ?evalue ?(threads = 4) ?outfmt db query = workflow ~descr:"blastp_xml" ~np:threads [ + mkdir_p dest ; + cmd "blastp" ~env [ + opt "-db" dep db // db_name ; + opt "-query" dep query ; + opt "-out" ident (dest // results) ; + option (opt "-evalue" float) evalue ; + option (opt "-outfmt" string) outfmt ; + ] + ] + +let blast_align = selector ["results.blast"] \ No newline at end of file