From 45c7cd0de9302229ca38ec61d75148639913c268 Mon Sep 17 00:00:00 2001
From: bloodearnest <simon.davy@thedatalab.org>
Date: Wed, 16 Mar 2022 08:30:15 +0000
Subject: [PATCH] experimenting

---
 analysis/example.do | 13 +++++++++++++
 analysis/example.py | 10 ++++++++++
 analysis/example.r  |  7 +++++++
 project.yaml        | 21 +++++++++++++++++++++
 4 files changed, 51 insertions(+)
 create mode 100644 analysis/example.do
 create mode 100644 analysis/example.py
 create mode 100644 analysis/example.r

diff --git a/analysis/example.do b/analysis/example.do
new file mode 100644
index 0000000..25b4c95
--- /dev/null
+++ b/analysis/example.do
@@ -0,0 +1,13 @@
+// Stata cannot read gzip-compressed csv files directly, so decompress the upstream dataset first.
+!gunzip output/dataset.csv.gz
+
+// gunzip leaves the plain csv next to the archive (minus the .gz suffix); import that file
+import delimited using output/dataset.csv
+
+
+// your analysis code goes here
+
+
+// all dta file outputs should be saved using `gzsave` and a .dta.gz extension
+// In subsequent actions, use `gzuse` to load them.
+gzsave output/stata.dta.gz
diff --git a/analysis/example.py b/analysis/example.py
new file mode 100644
index 0000000..5d2141e
--- /dev/null
+++ b/analysis/example.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pyarrow.feather
+
+# read the compressed csv that project.yaml declares as the upstream action's output
+df = pd.read_csv("output/dataset.csv.gz")
+
+# feather files are compressed by default in python; the pyarrow calls below pin the codec explicitly
+df.to_feather("output/python.feather.lz4")
+pyarrow.feather.write_feather(df, "output/python.feather.raw", compression="uncompressed")
+pyarrow.feather.write_feather(df, "output/python.feather.zstd", compression="zstd")
diff --git a/analysis/example.r b/analysis/example.r
new file mode 100644
index 0000000..773b1b3
--- /dev/null
+++ b/analysis/example.r
@@ -0,0 +1,7 @@
+# read the compressed .csv that project.yaml declares as the upstream output
+dataset <- readr::read_csv("output/dataset.csv.gz")
+
+# write .feather outputs: arrow's default settings, then explicit codecs
+arrow::write_feather(dataset, "output/r.feather.lz4")
+arrow::write_feather(dataset, "output/r.feather.raw", compression = "uncompressed")
+arrow::write_feather(dataset, "output/r.feather.zstd", compression = "zstd")
diff --git a/project.yaml b/project.yaml
index b941ab8..ad54c8d 100644
--- a/project.yaml
+++ b/project.yaml
@@ -6,3 +6,24 @@ actions:
     outputs:
       highly_sensitive:
         dataset: output/dataset.csv.gz
+
+  python_example:  # runs analysis/example.py in the python image
+    run: python:latest analysis/example.py
+    needs: [generate_study_population]  # NOTE(review): upstream action name is not visible in this hunk - confirm it exists
+    outputs:
+      highly_sensitive:
+        cohort: output/python.feather*  # glob covers the .lz4, .raw and .zstd files the script writes
+
+  stata_example:  # runs analysis/example.do in the stata-mp image
+    run: stata-mp:latest analysis/example.do
+    needs: [generate_study_population]  # NOTE(review): same upstream name as above - confirm it exists
+    outputs:
+      highly_sensitive:
+        cohort: output/stata.dta.gz  # the single file the script saves with gzsave
+
+  r_example:  # runs analysis/example.r in the r image
+    run: r:latest analysis/example.r
+    needs: [generate_study_population]  # NOTE(review): same upstream name as above - confirm it exists
+    outputs:
+      highly_sensitive:
+        cohort: output/r.feather*  # glob covers the .lz4, .raw and .zstd files the script writes