From 45c7cd0de9302229ca38ec61d75148639913c268 Mon Sep 17 00:00:00 2001
From: bloodearnest
Date: Wed, 16 Mar 2022 08:30:15 +0000
Subject: [PATCH] experimenting

---
Review notes (free-form commentary; text between the three-dash line and
the first file diff is not part of the change):

 * Adds three example analysis scripts: Stata (analysis/example.do),
   Python (analysis/example.py) and R (analysis/example.r). Each one loads
   output/input.csv.gz and writes its results under output/.
 * Adds three project.yaml actions (python_example, stata_example,
   r_example) that run those scripts. Each declares
   needs: [generate_study_population] and lists its output under
   highly_sensitive.
 * The Python and R scripts each write three feather files: one with the
   writer's default settings (.lz4 name), one with
   compression "uncompressed" (.raw) and one with compression "zstd"
   (.zstd). The feather* globs in project.yaml match all three.
 * NOTE(review): the scripts read output/input.csv.gz, while the unchanged
   project.yaml context below lists output/dataset.csv.gz. Confirm which
   file generate_study_population actually produces.
 * NOTE(review): gunzip by default replaces input.csv.gz with input.csv.
   Confirm nothing later in the same job still needs the .gz file.
 * NOTE(review): the YAML indentation in the last hunk is assumed to be
   conventional 2-space nesting. TODO confirm against project.yaml.

 analysis/example.do | 13 +++++++++++++
 analysis/example.py | 10 ++++++++++
 analysis/example.r  |  7 +++++++
 project.yaml        | 21 +++++++++++++++++++++
 4 files changed, 51 insertions(+)
 create mode 100644 analysis/example.do
 create mode 100644 analysis/example.py
 create mode 100644 analysis/example.r

diff --git a/analysis/example.do b/analysis/example.do
new file mode 100644
index 0000000..25b4c95
--- /dev/null
+++ b/analysis/example.do
@@ -0,0 +1,13 @@
+// stata cannot handle compressed csv files directly, so unzip first to a plain csv file
+!gunzip output/input.csv.gz
+
+// now import the uncompressed csv using delimited
+import delimited using output/input.csv
+
+
+// your analysis code goes here
+
+
+// all dta file outputs should be saved using `gzsave` and a .dta.gz extension
+// In subsequent actions, use `gzuse` to load them.
+gzsave output/stata.dta.gz
diff --git a/analysis/example.py b/analysis/example.py
new file mode 100644
index 0000000..5d2141e
--- /dev/null
+++ b/analysis/example.py
@@ -0,0 +1,10 @@
+import pandas as pd
+import pyarrow.feather
+
+df = pd.read_csv("output/input.csv.gz")
+
+
+# feather files are compressed by default in python
+df.to_feather("output/python.feather.lz4")
+pyarrow.feather.write_feather(df, "output/python.feather.raw", compression="uncompressed")
+pyarrow.feather.write_feather(df, "output/python.feather.zstd", compression="zstd")
diff --git a/analysis/example.r b/analysis/example.r
new file mode 100644
index 0000000..773b1b3
--- /dev/null
+++ b/analysis/example.r
@@ -0,0 +1,7 @@
+# read compressed .csv file
+df <- readr::read_csv("output/input.csv.gz")
+
+# write a .feather file output
+arrow::write_feather(df, "output/r.feather.lz4")
+arrow::write_feather(df, "output/r.feather.raw", compression = "uncompressed")
+arrow::write_feather(df, "output/r.feather.zstd", compression = "zstd")
diff --git a/project.yaml b/project.yaml
index b941ab8..ad54c8d 100644
--- a/project.yaml
+++ b/project.yaml
@@ -6,3 +6,24 @@ actions:
     outputs:
       highly_sensitive:
         dataset: output/dataset.csv.gz
+
+  python_example:
+    run: python:latest analysis/example.py
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/python.feather*
+
+  stata_example:
+    run: stata-mp:latest analysis/example.do
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/stata.dta.gz
+
+  r_example:
+    run: r:latest analysis/example.r
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/r.feather*