Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .dockstore.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Dockstore registration file: lists the workflows in this repository that
# should be published. NOTE(review): 1.2 is presumably the .dockstore.yml
# schema version - confirm against the Dockstore documentation.
version: 1.2
workflows:
# Single entry for the segment_scatter example workflow.
- name: segment_scatter
# Descriptor language of the workflow.
subclass: WDL
# Path to the main descriptor, given from the repository root.
primaryDescriptorPath: /segment_scatter.wdl
# Workflow authors, identified by ORCID iD.
authors:
- orcid: 0000-0003-4896-1858
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ Type `enum` exists in CWL, but not WDL. This task very roughly mimics the `enum`
*Deprecated: Use https://github.com/dockstore/checker-WDL-templates instead*
Checks if two files are equivalent, as opposed to arraycheck_* iterating through two arrays. Includes the same Rdata equivalence checker of arraycheck_rdata (disabled by default).

### segfault
Divide an array of inputs into "segments" for scattering. For example, 10 input files can be divided into 2 segments of 5 files each, so a downstream task scatters 2 times (once per segment) instead of 10.

## workflow-level

### dot_product_scatter
Expand All @@ -31,5 +34,8 @@ Example of how to use the pair variable type along with zip() to do a dot produc
### metamouse
Checker/Debugger for Stuart tasks. The test files are derived from the [WDL translation](https://github.com/DataBiosphere/analysis_pipeline_WDL) of the [UWGAC TOPMed Pipeline](https://github.com/UW-GAC/analysis_pipeline).

### segment_scatter
Workflow implementation/example of segfault.

### whiskertail
Template for enforcing order in a WDL workspace. This could be useful if you want a workflow to fail early, rather than waste time/money on other steps.
34 changes: 34 additions & 0 deletions segfault.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
version 1.0

# Divide a list of file inputs (or rather, strings holding their URLs/URIs)
# into "segments." This is useful for controlling how many times you want to
# scatter a task.

# Split an array of strings into at most n_segments roughly equal "segments",
# so that a downstream scatter runs once per segment instead of once per input.
#
# Inputs:
#   inputs     - strings to divide (declared as String, so this task treats
#                them purely as text)
#   n_segments - how many segments to request from numpy's array_split
#
# Outputs:
#   segments       - one inner array per non-empty segment
#   segments_files - the tab-separated file the segments were read from
task segfault {
    input {
        Array[String] inputs
        Int n_segments
    }

    command <<<
        python3 << CODE
        import numpy as np
        # The inputs are read from a file written by write_lines() instead of
        # being spliced into a Python list literal, so values containing quotes
        # or backslashes cannot break this script, and an empty input array
        # yields an empty list rather than a single "" entry.
        with open("~{write_lines(inputs)}") as listing:
            files = [line.rstrip("\n") for line in listing]
        files = [entry for entry in files if entry]
        print(f"There's a total of {len(files)} inputs to segment.")
        with open("segments.tsv", "w") as file:
            for array in np.array_split(files, ~{n_segments}):
                # array_split pads with empty chunks when n_segments exceeds
                # the number of inputs; writing those would create blank TSV
                # rows that the downstream scatter would receive as segments.
                if len(array) == 0:
                    continue
                print(array)
                file.write("\t".join(array) + "\n")
        CODE
    >>>

    runtime {
        docker: "ashedpotatoes/sranwrp:1.1.0"
        memory: "4 GB"
    }

    output {
        Array[Array[String]] segments = read_tsv("segments.tsv")
        File segments_files = "segments.tsv"
    }
}
4 changes: 4 additions & 0 deletions segment_scatter.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"Segment_Scatter.input_files": ["segment_scatter.wdl", "segfault.wdl", "big_cheese.wdl", "enumouse.wdl", "metamouse.wdl", "metamouse.json", "metamouse-local.wdl"],
"Segment_Scatter.n_segments": 3
}
43 changes: 43 additions & 0 deletions segment_scatter.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
version 1.0

import "https://raw.githubusercontent.com/aofarrel/Stuart-WDL/segment_scatter/segfault.wdl"

# Example workflow: group the input files into segments with the segfault
# task, then scatter over the segments so echo_files runs once per segment.
workflow Segment_Scatter {
    input {
        # Files to distribute across the scattered calls.
        Array[File] input_files
        # Number of segments to create. For example, 10 input files with
        # n_segments = 5 puts 2 files in each segment.
        Int n_segments
    }

    # Build the segments; each inner array of segfault.segments is one segment.
    call segfault.segfault {
        input:
            n_segments = n_segments,
            inputs = input_files
    }

    # One echo_files call per segment.
    scatter(chunk in segfault.segments) {
        call echo_files {
            input:
                files_to_echo = chunk
        }
    }
}

# Print the path of every file in files_to_echo to stdout, one per line.
#
# Inputs:
#   files_to_echo - the files whose paths should be printed
task echo_files {
    input {
        Array[File] files_to_echo
    }

    command <<<
        python3 << CODE
        # The paths are read from a file written by write_lines() rather than
        # being pasted into a Python list literal, so a path containing a quote
        # or backslash cannot break this script, and an empty array prints
        # nothing instead of one empty line.
        with open("~{write_lines(files_to_echo)}") as listing:
            for line in listing:
                path = line.rstrip("\n")
                if path:
                    print(path)
        CODE
    >>>

    runtime {
        docker: "ashedpotatoes/sranwrp:1.1.0"
        memory: "4 GB"
    }
}