Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v6
with:
submodules: recursive
- name: Install uv
uses: astral-sh/setup-uv@v7
- name: Download public datasets
run: ./scripts/download-public-datasets.sh --all
- uses: mlugg/setup-zig@v2
Expand Down
50 changes: 0 additions & 50 deletions .github/workflows/claude.yml

This file was deleted.

60 changes: 60 additions & 0 deletions scripts/download-public-datasets.sh
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,65 @@ download_clickbench() {
fi
}

# =============================================================================
# TPC-H SF1 Dataset
# Generated using DuckDB's TPC-H extension
# =============================================================================
#######################################
# Generate TPC-H scale-factor-1 tables as Parquet files in
# "$DEST_DIR/tpch-sf1", skipping any table whose file already exists.
# Globals:
#   DEST_DIR (read) - root directory that holds the datasets
# Arguments:
#   $1 - mode: "all" requires the small and the large tables; any other
#        value requires only the small ones (nation, region, supplier)
# Outputs:
#   Progress messages on stdout; <table>.parquet files in the destination
# Returns:
#   0 when every required file already exists, otherwise the exit status
#   of the uvx/DuckDB run
#######################################
download_tpch() {
  local mode="$1"
  local dest="$DEST_DIR/tpch-sf1"
  mkdir -p "$dest"

  echo "=== TPC-H SF1 Dataset ==="

  local small_tables=("nation" "region" "supplier")
  local big_tables=("lineitem" "orders" "customer" "part" "partsupp")

  # Single source of truth for the tables this mode needs; the same list
  # drives both the existence check and the generator below.
  local tables=("${small_tables[@]}")
  if [[ "$mode" == "all" ]]; then
    tables+=("${big_tables[@]}")
  fi

  local need_generate=false
  local table # keep the loop variable out of the caller's scope
  for table in "${tables[@]}"; do
    if [[ ! -f "$dest/$table.parquet" ]]; then
      need_generate=true
      break
    fi
  done

  if [[ "$need_generate" == "false" ]]; then
    echo " All required TPC-H files already exist"
    return
  fi

  echo " Generating TPC-H SF1 data via DuckDB..."
  # The Python program is a single-quoted literal: nothing from the shell is
  # spliced into its source. The destination directory and the table names
  # travel as argv instead, so paths containing quotes or other special
  # characters cannot break (or inject into) the program. For the same
  # reason the Python code below must not contain a single-quote character.
  uvx --from "duckdb" --with pyarrow python -c '
import os
import sys

import duckdb

dest_dir = sys.argv[1]
tables = sys.argv[2:]
quote = chr(39)  # single-quote character, spelled indirectly (see shell note)

con = duckdb.connect()
con.execute("INSTALL tpch; LOAD tpch; CALL dbgen(sf=1)")
for table in tables:
    dest = dest_dir + "/" + table + ".parquet"
    if os.path.exists(dest):
        print(f" Already exists: {table}.parquet")
        continue
    print(f" Generating: {table}.parquet")
    # Write to a side file and rename it afterwards, so an interrupted run
    # never leaves a truncated .parquet that later looks complete.
    partial = dest + ".partial"
    # Double embedded single quotes so the path stays one SQL string literal.
    sql_path = partial.replace(quote, quote * 2)
    con.execute(f"COPY {table} TO {quote}{sql_path}{quote} (FORMAT PARQUET)")
    os.replace(partial, dest)
' "$dest" "${tables[@]}"
}

# =============================================================================
# Add more datasets here following the same pattern
# =============================================================================
Expand Down Expand Up @@ -116,6 +175,7 @@ mkdir -p "$DEST_DIR"

# Run each dataset step in turn; every step receives the same $MODE value.
download_nyc_taxi "$MODE"
download_clickbench "$MODE"
download_tpch "$MODE"

# Blank line, then a final marker printed after the dataset steps above.
echo ""
echo "Done!"
Loading
Loading