From 007f0e67a6b296b638b0c424399c1652318510c8 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 6 Dec 2022 15:43:08 -0800 Subject: [PATCH 1/5] Do everything --- segfault.wdl | 34 ++++++++++++++++++++++++++++++++++ segment_scatter.json | 4 ++++ segment_scatter.wdl | 43 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100644 segfault.wdl create mode 100644 segment_scatter.json create mode 100644 segment_scatter.wdl diff --git a/segfault.wdl b/segfault.wdl new file mode 100644 index 0000000..d2751f7 --- /dev/null +++ b/segfault.wdl @@ -0,0 +1,34 @@ +version 1.0 + +# Divide a bunch of file (or rather strings indicating their URLs/URIs) inputs +# into "segments." This is useful for controlling how many times you want to +# scatter a task. + +task segfault { + input { + Array[String] inputs + Int n_segments + } + + command <<< + python3 << CODE + import numpy as np + files = ["~{sep='","' inputs}"] + print(f"There's a total of {len(files)} inputs to segment.") + with open("segments.tsv", "w") as file: + for array in np.array_split(files, ~{n_segments}): + print(array) + file.writelines("\t".join(array) + "\n") + CODE + >>> + + runtime { + docker: "ashedpotatoes/sranwrp:1.1.0" + memory: "4 GB" + } + + output { + Array[Array[String]] segments = read_tsv("segments.tsv") + File segments_files = "segments.tsv" + } +} diff --git a/segment_scatter.json b/segment_scatter.json new file mode 100644 index 0000000..8f12503 --- /dev/null +++ b/segment_scatter.json @@ -0,0 +1,4 @@ +{ + "Segment_Scatter.input_files": ["segment_scatter.wdl", "segfault.wdl", "big_cheese.wdl", "enumouse.wdl", "metamouse.wdl", "metamouse.json", "metamouse-local.wdl"], + "Segment_Scatter.n_segments": 3 +} \ No newline at end of file diff --git a/segment_scatter.wdl b/segment_scatter.wdl new file mode 100644 index 0000000..a16daae --- /dev/null +++ b/segment_scatter.wdl @@ -0,0 +1,43 @@ +version 1.0 + +import "./segfault.wdl" + +workflow Segment_Scatter { + input { + # if you input 10 files and n_segments = 5, each segment gets 2 files + Array[File] input_files + Int n_segments + } + + call segfault.segfault { + input: + inputs = input_files, + n_segments = n_segments + } + + scatter(segment in segfault.segments) { + call echo_files { + input: + files_to_echo = segment + } + } +} + +task echo_files { + input { + Array[File] files_to_echo + } + + command <<< + python3 << CODE + files = ["~{sep='","' files_to_echo}"] + for file in files: + print(file) + CODE + >>> + + runtime { + docker: "ashedpotatoes/sranwrp:1.1.0" + memory: "4 GB" + } +} From 3147c4777a98d09ad6398a4eaf1c135e84276813 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 6 Dec 2022 15:50:28 -0800 Subject: [PATCH 2/5] Explain segfault/segment_scatter in README --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 923b4dd..afe506d 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,9 @@ Type `enum` exists in CWL, but not WDL. This task very roughly mimics the `enum` *Depreciated: Use https://github.com/dockstore/checker-WDL-templates instead* Checks if two files are equivalent, as opposed to arraycheck_* iterating through two arrays. Includes the same Rdata equivalence checker of arraycheck_rdata (disabled by default). +### segfault +Divide an array of inputs into "segments" for scattering. For example, 10 input files can make 2 segments of 5, and you can scatter 2x on those segments. + ## workflow-level ### dot_product_scatter @@ -31,5 +34,8 @@ Example of how to use the pair variable type along with zip() to do a dot produc ### metamouse Checker/Debugger for Stuart tasks. The test files are derived from the [WDL translation](https://github.com/DataBiosphere/analysis_pipeline_WDL) of the [UWGAC TOPMed Pipeline](https://github.com/UW-GAC/analysis_pipeline). +### segment_scatter +Workflow implementation/example of segfault. + ### whiskertail Template for enforcing order in a WDL workspace. This could be useful if you want a workflow to fail early, rather than waste time/money on other steps. From b02ad1c135e061687b81c0cc52248923ac2eb110 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 6 Dec 2022 15:51:18 -0800 Subject: [PATCH 3/5] Pull from URL --- segment_scatter.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/segment_scatter.wdl b/segment_scatter.wdl index a16daae..22c1870 100644 --- a/segment_scatter.wdl +++ b/segment_scatter.wdl @@ -1,6 +1,6 @@ version 1.0 -import "./segfault.wdl" +import "https://raw.githubusercontent.com/aofarrel/Stuart-WDL/segment_scatter/segment_scatter.wdl" workflow Segment_Scatter { input { From 2e32811e8f4706187807cb064b6fff028867c8ef Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 6 Dec 2022 16:02:31 -0800 Subject: [PATCH 4/5] Workaround for Terra wf upload bug --- .dockstore.yml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .dockstore.yml diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..7b9d757 --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,7 @@ +version: 1.2 +workflows: + - name: segment_scatter + subclass: WDL + primaryDescriptorPath: /segment_scatter.wdl + authors: + - orcid: 0000-0003-4896-1858 \ No newline at end of file From 05ad575c97fdc152f5094b816e9bf4ba0907c289 Mon Sep 17 00:00:00 2001 From: aofarrel Date: Tue, 6 Dec 2022 17:39:20 -0800 Subject: [PATCH 5/5] fix circular import --- segment_scatter.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/segment_scatter.wdl b/segment_scatter.wdl index 22c1870..9de4033 100644 --- a/segment_scatter.wdl +++ b/segment_scatter.wdl @@ -1,6 +1,6 @@ version 1.0 -import "https://raw.githubusercontent.com/aofarrel/Stuart-WDL/segment_scatter/segment_scatter.wdl" +import "https://raw.githubusercontent.com/aofarrel/Stuart-WDL/segment_scatter/segfault.wdl" workflow Segment_Scatter { input { @@ -40,4 +40,4 @@ task echo_files { docker: "ashedpotatoes/sranwrp:1.1.0" memory: "4 GB" } -} +} \ No newline at end of file