diff --git a/.dockstore.yml b/.dockstore.yml new file mode 100644 index 0000000..7b9d757 --- /dev/null +++ b/.dockstore.yml @@ -0,0 +1,7 @@ +version: 1.2 +workflows: + - name: segment_scatter + subclass: WDL + primaryDescriptorPath: /segment_scatter.wdl + authors: + - orcid: 0000-0003-4896-1858 \ No newline at end of file diff --git a/README.md b/README.md index 923b4dd..afe506d 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,9 @@ Type `enum` exists in CWL, but not WDL. This task very roughly mimics the `enum` *Depreciated: Use https://github.com/dockstore/checker-WDL-templates instead* Checks if two files are equivalent, as opposed to arraycheck_* iterating through two arrays. Includes the same Rdata equivalence checker of arraycheck_rdata (disabled by default). +### segfault +Divide an array of inputs into "segments" for scattering. For example, 10 input files can make 2 segments of 5, and you can scatter 2x on those segments. + ## workflow-level ### dot_product_scatter @@ -31,5 +34,8 @@ Example of how to use the pair variable type along with zip() to do a dot produc ### metamouse Checker/Debugger for Stuart tasks. The test files are derived from the [WDL translation](https://github.com/DataBiosphere/analysis_pipeline_WDL) of the [UWGAC TOPMed Pipeline](https://github.com/UW-GAC/analysis_pipeline). +### segment_scatter +Workflow implementation/example of segfault. + ### whiskertail Template for enforcing order in a WDL workspace. This could be useful if you want a workflow to fail early, rather than waste time/money on other steps. diff --git a/segfault.wdl b/segfault.wdl new file mode 100644 index 0000000..d2751f7 --- /dev/null +++ b/segfault.wdl @@ -0,0 +1,34 @@ +version 1.0 + +# Divide an array of file inputs (or rather, strings indicating their URLs/URIs) +# into "segments." This is useful for controlling how many times you want to +# scatter a task. 
+
+task segfault {
+    input {
+        Array[String] inputs  # entries to divide up, passed as plain strings
+        Int n_segments        # requested number of segments
+    }
+    # Writes segments.tsv: one segment per row, members separated by tabs.
+    command <<<
+    python3 << CODE
+    import numpy as np
+    files = ["~{sep='","' inputs}"]
+    print(f"There's a total of {len(files)} inputs to segment.")
+    with open("segments.tsv", "w") as file:
+        # Cap the count at len(files); surplus segments would be blank rows.
+        for array in np.array_split(files, min(~{n_segments}, len(files))):
+            print(array)
+            file.write("\t".join(array) + "\n")
+    CODE
+    >>>
+
+    runtime {
+        docker: "ashedpotatoes/sranwrp:1.1.0"
+        memory: "4 GB"
+    }
+    output {
+        Array[Array[String]] segments = read_tsv("segments.tsv")  # one inner array per row
+        File segments_files = "segments.tsv"  # the same table, as a file
+    }
+}
diff --git a/segment_scatter.json b/segment_scatter.json
new file mode 100644
index 0000000..8f12503
--- /dev/null
+++ b/segment_scatter.json
@@ -0,0 +1,4 @@
+{
+  "Segment_Scatter.input_files": ["segment_scatter.wdl", "segfault.wdl", "big_cheese.wdl", "enumouse.wdl", "metamouse.wdl", "metamouse.json", "metamouse-local.wdl"],
+  "Segment_Scatter.n_segments": 3
+}
\ No newline at end of file
diff --git a/segment_scatter.wdl b/segment_scatter.wdl
new file mode 100644
index 0000000..9de4033
--- /dev/null
+++ b/segment_scatter.wdl
@@ -0,0 +1,49 @@
+version 1.0
+
+# NOTE(review): this URL points at the segment_scatter branch -- confirm it
+# stays reachable once that branch is merged or deleted.
+import "https://raw.githubusercontent.com/aofarrel/Stuart-WDL/segment_scatter/segfault.wdl"
+
+# Example workflow: divide input_files into segments with segfault, then run
+# echo_files once per segment.
+workflow Segment_Scatter {
+    input {
+        # if you input 10 files and n_segments = 5, each segment gets 2 files
+        Array[File] input_files
+        Int n_segments
+    }
+
+    # input_files is handed to segfault's Array[String] `inputs`
+    call segfault.segfault {
+        input:
+            inputs = input_files,
+            n_segments = n_segments
+    }
+
+    # one shard per row of the segments table
+    scatter(segment in segfault.segments) {
+        call echo_files {
+            input:
+                files_to_echo = segment
+        }
+    }
+}
+
+# Print each entry of files_to_echo on its own line.
+task echo_files {
+    input {
+        Array[File] files_to_echo
+    }
+
+    command <<<
+    python3 << CODE
+    files = ["~{sep='","' files_to_echo}"]
+    for file in files:
+        print(file)
+    CODE
+    >>>
+
+    runtime {
+        docker: "ashedpotatoes/sranwrp:1.1.0"
+        memory: "4 GB"
+    }
+}
\ No newline at end of file