Add TPC-H SF1 public-dataset tests; generate the data with DuckDB through uv.

* .github/workflows/ci.yaml: bump actions/checkout to v6 and install uv before
  the public datasets are downloaded.
* .github/workflows/claude.yml: delete the Claude Code workflow.
* scripts/download-public-datasets.sh: add download_tpch, which generates the
  TPC-H SF1 tables as Parquet files (three small tables by default, all eight
  in "all" mode) and skips files that already exist.
* src/public_datasets_testing.zig: add one test per TPC-H table; the five large
  tables are skipped unless ci_tests is set.
* testdata/public-datasets/README.md: document the tpch-sf1 files.

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 25cf613..b023f79 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -11,9 +11,11 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
         with:
           submodules: recursive
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
       - name: Download public datasets
         run: ./scripts/download-public-datasets.sh --all
       - uses: mlugg/setup-zig@v2
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
deleted file mode 100644
index d300267..0000000
--- a/.github/workflows/claude.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Claude Code
-
-on:
-  issue_comment:
-    types: [created]
-  pull_request_review_comment:
-    types: [created]
-  issues:
-    types: [opened, assigned]
-  pull_request_review:
-    types: [submitted]
-
-jobs:
-  claude:
-    if: |
-      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
-      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
-      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
-      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: read
-      issues: read
-      id-token: write
-      actions: read # Required for Claude to read CI results on PRs
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: Run Claude Code
-        id: claude
-        uses: anthropics/claude-code-action@v1
-        with:
-          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-
-          # This is an optional setting that allows Claude to read CI results on PRs
-          additional_permissions: |
-            actions: read
-
-          # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it.
-          # prompt: 'Update the pull request description to include a summary of changes.'
-
-          # Optional: Add claude_args to customize behavior and configuration
-          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
-          # or https://code.claude.com/docs/en/cli-reference for available options
-          # claude_args: '--allowed-tools Bash(gh pr:*)'
-
diff --git a/scripts/download-public-datasets.sh b/scripts/download-public-datasets.sh
index e1220ae..acf2791 100755
--- a/scripts/download-public-datasets.sh
+++ b/scripts/download-public-datasets.sh
@@ -75,6 +75,77 @@ download_clickbench() {
     fi
 }
 
+# =============================================================================
+# TPC-H SF1 Dataset
+# Generated using DuckDB's TPC-H extension
+# =============================================================================
+download_tpch() {
+    local mode="$1"
+    local dest="$DEST_DIR/tpch-sf1"
+    mkdir -p "$dest"
+
+    echo "=== TPC-H SF1 Dataset ==="
+
+    local small_tables=("nation" "region" "supplier")
+    local big_tables=("lineitem" "orders" "customer" "part" "partsupp")
+
+    # Small tables are always required; the big ones only in "all" mode.
+    local tables=("${small_tables[@]}")
+    if [[ "$mode" == "all" ]]; then
+        tables+=("${big_tables[@]}")
+    fi
+
+    local table
+    local need_generate=false
+    for table in "${tables[@]}"; do
+        if [[ ! -f "$dest/$table.parquet" ]]; then
+            need_generate=true
+            break
+        fi
+    done
+
+    if [[ "$need_generate" == "false" ]]; then
+        echo " All required TPC-H files already exist"
+        return
+    fi
+
+    if ! command -v uvx >/dev/null 2>&1; then
+        echo " Error: uvx not found; install uv to generate TPC-H data" >&2
+        return 1
+    fi
+
+    echo " Generating TPC-H SF1 data via DuckDB..."
+    # Destination and table names travel as argv (the quoted heredoc is not
+    # expanded by the shell), so a path containing quotes cannot alter the
+    # Python program.
+    uvx --from "duckdb" python - "$dest" "${tables[@]}" <<'PY'
+import os
+import sys
+
+import duckdb
+
+dest_dir, tables = sys.argv[1], sys.argv[2:]
+con = duckdb.connect()
+con.execute("INSTALL tpch; LOAD tpch; CALL dbgen(sf=1)")
+for table in tables:
+    dest = os.path.join(dest_dir, table + ".parquet")
+    if os.path.exists(dest):
+        print(f" Already exists: {table}.parquet")
+        continue
+    print(f" Generating: {table}.parquet")
+    # Write under a temporary name and rename on success, so an interrupted
+    # run never leaves a truncated file that later runs would skip.
+    tmp = dest + ".tmp"
+    tmp_sql = tmp.replace("'", "''")
+    # ROW_GROUP_SIZE is pinned to DuckDB's documented default because the
+    # tests assert exact row-group counts.
+    con.execute(
+        f"COPY {table} TO '{tmp_sql}' (FORMAT PARQUET, ROW_GROUP_SIZE 122880)"
+    )
+    os.replace(tmp, dest)
+PY
+}
+
 # =============================================================================
 # Add more datasets here following the same pattern
 # =============================================================================
@@ -116,6 +187,7 @@ mkdir -p "$DEST_DIR"
 
 download_nyc_taxi "$MODE"
 download_clickbench "$MODE"
+download_tpch "$MODE"
 
 echo ""
 echo "Done!"
diff --git a/src/public_datasets_testing.zig b/src/public_datasets_testing.zig
index c0e81ea..64b93f7 100644
--- a/src/public_datasets_testing.zig
+++ b/src/public_datasets_testing.zig
@@ -116,6 +116,275 @@ test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" {
     // try readAllRowGroups(&file);
 }
 
+// =============================================================================
+// TPC-H SF1 Dataset
+// Generated using DuckDB's TPC-H extension
+// 8 tables, diverse types including DECIMAL and DATE
+// =============================================================================
+
+test "tpch sf1: region" {
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/region.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(5, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    const keys = try rg.readColumn(i32, 0);
+    try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys);
+
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "AFRICA"), names[0]);
+    try testing.expectEqualDeep(@as([]const u8, "AMERICA"), names[1]);
+    try testing.expectEqualDeep(@as([]const u8, "ASIA"), names[2]);
+    try testing.expectEqualDeep(@as([]const u8, "EUROPE"), names[3]);
+    try testing.expectEqualDeep(@as([]const u8, "MIDDLE EAST"), names[4]);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: nation" {
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/nation.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(25, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    const keys = try rg.readColumn(i32, 0);
+    try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys[0..5]);
+
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "ALGERIA"), names[0]);
+    try testing.expectEqualDeep(@as([]const u8, "ARGENTINA"), names[1]);
+    try testing.expectEqualDeep(@as([]const u8, "BRAZIL"), names[2]);
+    try testing.expectEqualDeep(@as([]const u8, "CANADA"), names[3]);
+    try testing.expectEqualDeep(@as([]const u8, "EGYPT"), names[4]);
+    try testing.expectEqualDeep(@as([]const u8, "UNITED STATES"), names[24]);
+
+    const region_keys = try rg.readColumn(i32, 2);
+    try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 1, 1, 4 }, region_keys[0..5]);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: supplier" {
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/supplier.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(10000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // s_suppkey (i64)
+    const suppkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, suppkeys[0..3]);
+
+    // s_name (string)
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "Supplier#000000001"), names[0]);
+    try testing.expectEqualDeep(@as([]const u8, "Supplier#000000002"), names[1]);
+
+    // s_nationkey (i32)
+    const nationkeys = try rg.readColumn(i32, 3);
+    try testing.expectEqualSlices(i32, &[_]i32{ 17, 5, 1 }, nationkeys[0..3]);
+
+    // s_acctbal (decimal(15,2))
+    const acctbals = try rg.readColumn(Decimal, 5);
+    try testing.expectApproxEqAbs(5755.94, @as(f64, @floatCast(acctbals[0].value)), 0.01);
+    try testing.expectApproxEqAbs(4032.68, @as(f64, @floatCast(acctbals[1].value)), 0.01);
+    try testing.expectApproxEqAbs(4192.40, @as(f64, @floatCast(acctbals[2].value)), 0.01);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: lineitem (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Date = parzig.parquet.Date;
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/lineitem.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(49, file.metadata.row_groups.len);
+    try testing.expectEqual(6001215, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // l_orderkey (i64)
+    const orderkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, orderkeys[0..3]);
+
+    // l_quantity (decimal(15,2))
+    const quantities = try rg.readColumn(Decimal, 4);
+    try testing.expectApproxEqAbs(17.0, @as(f64, @floatCast(quantities[0].value)), 0.01);
+    try testing.expectApproxEqAbs(36.0, @as(f64, @floatCast(quantities[1].value)), 0.01);
+    try testing.expectApproxEqAbs(8.0, @as(f64, @floatCast(quantities[2].value)), 0.01);
+
+    // l_returnflag (string)
+    const returnflags = try rg.readColumn([]const u8, 8);
+    try testing.expectEqualDeep(@as([]const u8, "N"), returnflags[0]);
+
+    // l_shipdate (date)
+    const shipdates = try rg.readColumn(Date, 10);
+    try testing.expectEqual(9568, shipdates[0].days_since_epoch); // 1996-03-13
+    try testing.expectEqual(9598, shipdates[1].days_since_epoch); // 1996-04-12
+    try testing.expectEqual(9524, shipdates[2].days_since_epoch); // 1996-01-29
+
+    // l_shipmode (string)
+    const shipmodes = try rg.readColumn([]const u8, 14);
+    try testing.expectEqualDeep(@as([]const u8, "TRUCK"), shipmodes[0]);
+    try testing.expectEqualDeep(@as([]const u8, "MAIL"), shipmodes[1]);
+    try testing.expectEqualDeep(@as([]const u8, "REG AIR"), shipmodes[2]);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: orders (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/orders.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(13, file.metadata.row_groups.len);
+    try testing.expectEqual(1500000, file.metadata.num_rows);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: part (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/part.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(2, file.metadata.row_groups.len);
+    try testing.expectEqual(200000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // p_partkey (i64)
+    const partkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, partkeys[0..3]);
+
+    // p_name (string)
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "goldenrod lavender spring chocolate lace"), names[0]);
+
+    // p_brand (string)
+    const brands = try rg.readColumn([]const u8, 3);
+    try testing.expectEqualDeep(@as([]const u8, "Brand#13"), brands[0]);
+
+    // p_size (i32)
+    const sizes = try rg.readColumn(i32, 5);
+    try testing.expectEqualSlices(i32, &[_]i32{ 7, 1, 21 }, sizes[0..3]);
+
+    // p_retailprice (decimal(15,2))
+    const prices = try rg.readColumn(Decimal, 7);
+    try testing.expectApproxEqAbs(901.0, @as(f64, @floatCast(prices[0].value)), 0.01);
+    try testing.expectApproxEqAbs(902.0, @as(f64, @floatCast(prices[1].value)), 0.01);
+    try testing.expectApproxEqAbs(903.0, @as(f64, @floatCast(prices[2].value)), 0.01);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: partsupp (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/partsupp.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(7, file.metadata.row_groups.len);
+    try testing.expectEqual(800000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // ps_partkey (i64)
+    const partkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, partkeys[0..3]);
+
+    // ps_suppkey (i64)
+    const suppkeys = try rg.readColumn(i64, 1);
+    try testing.expectEqualSlices(i64, &[_]i64{ 2, 2502, 5002 }, suppkeys[0..3]);
+
+    // ps_availqty (i64)
+    const qtys = try rg.readColumn(i64, 2);
+    try testing.expectEqualSlices(i64, &[_]i64{ 3325, 8076, 3956 }, qtys[0..3]);
+
+    // ps_supplycost (decimal(15,2))
+    const costs = try rg.readColumn(Decimal, 3);
+    try testing.expectApproxEqAbs(771.64, @as(f64, @floatCast(costs[0].value)), 0.01);
+    try testing.expectApproxEqAbs(993.49, @as(f64, @floatCast(costs[1].value)), 0.01);
+    try testing.expectApproxEqAbs(337.09, @as(f64, @floatCast(costs[2].value)), 0.01);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: customer (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/customer.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(2, file.metadata.row_groups.len);
+    try testing.expectEqual(150000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // c_custkey (i64)
+    const custkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, custkeys[0..3]);
+
+    // c_name (string)
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "Customer#000000001"), names[0]);
+
+    // c_nationkey (i32)
+    const nationkeys = try rg.readColumn(i32, 3);
+    try testing.expectEqualSlices(i32, &[_]i32{ 15, 13, 1 }, nationkeys[0..3]);
+
+    // c_acctbal (decimal(15,2))
+    const acctbals = try rg.readColumn(Decimal, 5);
+    try testing.expectApproxEqAbs(711.56, @as(f64, @floatCast(acctbals[0].value)), 0.01);
+    try testing.expectApproxEqAbs(121.65, @as(f64, @floatCast(acctbals[1].value)), 0.01);
+    try testing.expectApproxEqAbs(7498.12, @as(f64, @floatCast(acctbals[2].value)), 0.01);
+
+    // c_mktsegment (string)
+    const segments = try rg.readColumn([]const u8, 6);
+    try testing.expectEqualDeep(@as([]const u8, "BUILDING"), segments[0]);
+    try testing.expectEqualDeep(@as([]const u8, "AUTOMOBILE"), segments[1]);
+
+    try readAllRowGroups(&file);
+}
+
 // =============================================================================
 // ClickBench Dataset - CI only
 // Source: https://github.com/ClickHouse/ClickBench
diff --git a/testdata/public-datasets/README.md b/testdata/public-datasets/README.md
index 7de34c0..8d949b5 100644
--- a/testdata/public-datasets/README.md
+++ b/testdata/public-datasets/README.md
@@ -12,6 +12,15 @@ public-datasets/
 │   ├── fhv_tripdata_2025-10.parquet
 │   ├── yellow_tripdata_2025-10.parquet (CI only)
 │   └── fhvhv_tripdata_2025-10.parquet (CI only)
+├── tpch-sf1/            # TPC-H benchmark data (SF1)
+│   ├── nation.parquet
+│   ├── region.parquet
+│   ├── supplier.parquet
+│   ├── lineitem.parquet (CI only)
+│   ├── orders.parquet (CI only)
+│   ├── customer.parquet (CI only)
+│   ├── part.parquet (CI only)
+│   └── partsupp.parquet (CI only)
 ├── clickbench/          # ClickBench web analytics data (CI only)
 │   ├── hits_0.parquet
 │   ├── hits_1.parquet
@@ -32,6 +41,28 @@ Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-recor
 | `yellow_tripdata_2025-10.parquet` | ~50MB | Yes |
 | `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes |
 
+### TPC-H SF1
+
+Source: [TPC-H Benchmark](https://www.tpc.org/tpch/) (generated via DuckDB's TPC-H extension)
+
+The files are generated locally by `scripts/download-public-datasets.sh`, which runs
+DuckDB through `uvx`, so [uv](https://docs.astral.sh/uv/) must be installed.
+
+The de facto standard benchmark for analytical database systems. 8 tables with a
+normalized supply chain schema. Provides diverse types including DECIMAL(15,2) and
+DATE columns, plus 49 row groups in the lineitem table.
+
+| File | Size | CI Only |
+|------|------|---------|
+| `nation.parquet` | ~2KB | No |
+| `region.parquet` | ~1KB | No |
+| `supplier.parquet` | ~771KB | No |
+| `lineitem.parquet` | ~197MB | Yes |
+| `orders.parquet` | ~53MB | Yes |
+| `customer.parquet` | ~12MB | Yes |
+| `part.parquet` | ~6MB | Yes |
+| `partsupp.parquet` | ~40MB | Yes |
+
 ### ClickBench
 
 Source: [ClickHouse/ClickBench](https://github.com/ClickHouse/ClickBench)