diff --git a/analysis/example.do b/analysis/example.do
new file mode 100644
index 0000000..25b4c95
--- /dev/null
+++ b/analysis/example.do
@@ -0,0 +1,14 @@
+// stata cannot handle compressed csv files directly, so unzip first to a plain csv file
+// note: gunzip replaces output/dataset.csv.gz with the uncompressed output/dataset.csv
+!gunzip output/dataset.csv.gz
+
+// now import the uncompressed csv using delimited
+import delimited using output/dataset.csv
+
+
+// your analysis code goes here
+
+
+// all dta file outputs should be saved using `gzsave` and a .dta.gz extension
+// In subsequent actions, use `gzuse` to load them.
+gzsave output/stata.dta.gz
diff --git a/analysis/example.py b/analysis/example.py
new file mode 100644
index 0000000..5d2141e
--- /dev/null
+++ b/analysis/example.py
@@ -0,0 +1,11 @@
+import pandas as pd
+import pyarrow.feather
+
+# read the compressed .csv file declared as the output of generate_study_population
+df = pd.read_csv("output/dataset.csv.gz")
+
+
+# feather files are compressed by default in python (lz4 when available)
+df.to_feather("output/python.feather.lz4")
+pyarrow.feather.write_feather(df, "output/python.feather.raw", compression="uncompressed")
+pyarrow.feather.write_feather(df, "output/python.feather.zstd", compression="zstd")
diff --git a/analysis/example.r b/analysis/example.r
new file mode 100644
index 0000000..773b1b3
--- /dev/null
+++ b/analysis/example.r
@@ -0,0 +1,7 @@
+# read compressed .csv file
+df <- readr::read_csv("output/dataset.csv.gz")
+
+# write a .feather file output
+arrow::write_feather(df, "output/r.feather.lz4")
+arrow::write_feather(df, "output/r.feather.raw", compression = "uncompressed")
+arrow::write_feather(df, "output/r.feather.zstd", compression = "zstd")
diff --git a/project.yaml b/project.yaml
index b941ab8..ad54c8d 100644
--- a/project.yaml
+++ b/project.yaml
@@ -6,3 +6,24 @@ actions:
     outputs:
       highly_sensitive:
         dataset: output/dataset.csv.gz
+
+  python_example:
+    run: python:latest analysis/example.py
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/python.feather*
+
+  stata_example:
+    run: stata-mp:latest analysis/example.do
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/stata.dta.gz
+
+  r_example:
+    run: r:latest analysis/example.r
+    needs: [generate_study_population]
+    outputs:
+      highly_sensitive:
+        cohort: output/r.feather*