aofarrel · aofarrel · Dec 6, 2022 · Dec 6, 2022 · Dec 6, 2022 · Dec 7, 2022
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -0,0 +1,7 @@
+version: 1.2
+workflows:
+  - name: segment_scatter
+    subclass: WDL
+    primaryDescriptorPath: /segment_scatter.wdl
+    authors:
+      - orcid: 0000-0003-4896-1858
diff --git a/README.md b/README.md
@@ -23,6 +23,9 @@ Type `enum` exists in CWL, but not WDL. This task very roughly mimics the `enum`
 *Depreciated: Use https://github.com/dockstore/checker-WDL-templates instead*  
 Checks if two files are equivalent, as opposed to arraycheck_* iterating through two arrays. Includes the same Rdata equivalence checker of arraycheck_rdata (disabled by default).
 
+### segfault
+Divide an array of inputs into "segments" for scattering. For example, 10 input files can make 2 segments of 5, and you can scatter 2x on those segments.
+
 ## workflow-level
 
 ### dot_product_scatter
@@ -31,5 +34,8 @@ Example of how to use the pair variable type along with zip() to do a dot produc
 ### metamouse
 Checker/Debugger for Stuart tasks. The test files are derived from the [WDL translation](https://github.com/DataBiosphere/analysis_pipeline_WDL) of the [UWGAC TOPMed Pipeline](https://github.com/UW-GAC/analysis_pipeline).
 
+### segment_scatter
+Workflow implementation/example of segfault.
+
 ### whiskertail
 Template for enforcing order in a WDL workspace. This could be useful if you want a workflow to fail early, rather than waste time/money on other steps.
diff --git a/segfault.wdl b/segfault.wdl
@@ -0,0 +1,34 @@
+version 1.0
+
+# Divide a bunch of input files (or rather, strings indicating their URLs/URIs)
+# into "segments." This is useful for controlling how many times you want to
+# scatter a task.
+
+task segfault {
+	input {
+		Array[String] inputs  # paths/URIs to divide up; handled purely as strings
+		Int n_segments  # requested number of segments; capped at the number of inputs
+	}
+
+	command <<<
+	python3 << CODE
+	import numpy as np
+	files = ["~{sep='","' inputs}"]  # WDL array spliced in as a Python list literal
+	print(f"There's a total of {len(files)} inputs to segment.")
+	with open("segments.tsv", "w") as tsv:  # one tab-separated line per segment
+		for segment in np.array_split(files, min(~{n_segments}, len(files))):  # cap: extra segments would be empty (blank rows)
+			print(segment)
+			tsv.write("\t".join(segment) + "\n")
+	CODE
+	>>>
+
+	runtime {
+		docker: "ashedpotatoes/sranwrp:1.1.0"
+		memory: "4 GB"
+	}
+
+	output {
+		Array[Array[String]] segments = read_tsv("segments.tsv")  # one inner array per segment
+		File segments_files = "segments.tsv"  # the raw TSV the segments were read from
+	}
+}
diff --git a/segment_scatter.json b/segment_scatter.json
@@ -0,0 +1,4 @@
+{
+   "Segment_Scatter.input_files": ["segment_scatter.wdl", "segfault.wdl", "big_cheese.wdl", "enumouse.wdl", "metamouse.wdl", "metamouse.json", "metamouse-local.wdl"],
+   "Segment_Scatter.n_segments": 3 
+}
diff --git a/segment_scatter.wdl b/segment_scatter.wdl
@@ -0,0 +1,43 @@
+version 1.0
+
+import "https://raw.githubusercontent.com/aofarrel/Stuart-WDL/segment_scatter/segfault.wdl"
+
+workflow Segment_Scatter {
+	input {
+		# if you input 10 files and n_segments = 5, each segment gets 2 files
+		Array[File] input_files  # files to divide among the segments
+		Int n_segments  # requested number of segments
+	}
+	# Step 1: the imported segfault task groups input_files into segments.
+	call segfault.segfault {
+		input:
+			inputs = input_files,
+			n_segments = n_segments
+	}
+	# Step 2: scatter over the segments, calling echo_files once per segment.
+	scatter(segment in segfault.segments) {
+		call echo_files {
+			input:
+				files_to_echo = segment
+		}
+	}
+}
+
+task echo_files {
+	input {
+		Array[File] files_to_echo  # files whose paths are printed to stdout, one per line
+	}
+
+	command <<<
+	python3 << CODE
+	# the WDL array is spliced in below as a Python list literal of quoted paths
+	paths = ["~{sep='","' files_to_echo}"]
+	print("\n".join(paths))
+	CODE
+	>>>
+
+	runtime {
+		memory: "4 GB"
+		docker: "ashedpotatoes/sranwrp:1.1.0"
+	}
+}