From ddaf2670f31b66f0c929d193a2fe34515b31e7d8 Mon Sep 17 00:00:00 2001
From: Burak Varli
Date: Sat, 7 Feb 2026 19:18:39 +0000
Subject: [PATCH] Add TPC-H SF1 public dataset tests

Add all 8 TPC-H SF1 tables as a new public dataset for testing. The
TPC-H benchmark is the de facto standard for analytical database
systems and provides type coverage for DECIMAL(15,2) and DATE columns
not exercised by existing tests.

Local tests (region, nation, supplier) verify i32, i64, string, and
decimal columns. CI-only tests (lineitem, orders, part, partsupp,
customer) cover large files with up to 49 row groups and 6M rows,
testing date, decimal, and string columns across the full TPC-H schema.

The download script generates the data with DuckDB via uvx, so CI now
installs uv. The .github/workflows/claude.yml workflow is also removed.

Co-authored-by: amp[bot]
---
 .github/workflows/ci.yaml           |   4 +-
 .github/workflows/claude.yml        |  50 ------
 scripts/download-public-datasets.sh |  60 +++++++
 src/public_datasets_testing.zig     | 269 ++++++++++++++++++++++++++++
 testdata/public-datasets/README.md  |  28 +++
 5 files changed, 360 insertions(+), 51 deletions(-)
 delete mode 100644 .github/workflows/claude.yml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 25cf613..b023f79 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -11,9 +11,11 @@ jobs:
   test:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
        with:
          submodules: recursive
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
      - name: Download public datasets
        run: ./scripts/download-public-datasets.sh --all
      - uses: mlugg/setup-zig@v2
diff --git a/.github/workflows/claude.yml b/.github/workflows/claude.yml
deleted file mode 100644
index d300267..0000000
--- a/.github/workflows/claude.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Claude Code
-
-on:
-  issue_comment:
-    types: [created]
-  pull_request_review_comment:
-    types: [created]
-  issues:
-    types: [opened, assigned]
-  pull_request_review:
-    types: [submitted]
-
-jobs:
-  claude:
-    if: |
-      (github.event_name == 'issue_comment' && contains(github.event.comment.body, '@claude')) ||
-      (github.event_name == 'pull_request_review_comment' && contains(github.event.comment.body, '@claude')) ||
-      (github.event_name == 'pull_request_review' && contains(github.event.review.body, '@claude')) ||
-      (github.event_name == 'issues' && (contains(github.event.issue.body, '@claude') || contains(github.event.issue.title, '@claude')))
-    runs-on: ubuntu-latest
-    permissions:
-      contents: read
-      pull-requests: read
-      issues: read
-      id-token: write
-      actions: read # Required for Claude to read CI results on PRs
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 1
-
-      - name: Run Claude Code
-        id: claude
-        uses: anthropics/claude-code-action@v1
-        with:
-          claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
-
-          # This is an optional setting that allows Claude to read CI results on PRs
-          additional_permissions: |
-            actions: read
-
-          # Optional: Give a custom prompt to Claude. If this is not specified, Claude will perform the instructions specified in the comment that tagged it.
-          # prompt: 'Update the pull request description to include a summary of changes.'
-
-          # Optional: Add claude_args to customize behavior and configuration
-          # See https://github.com/anthropics/claude-code-action/blob/main/docs/usage.md
-          # or https://code.claude.com/docs/en/cli-reference for available options
-          # claude_args: '--allowed-tools Bash(gh pr:*)'
-
diff --git a/scripts/download-public-datasets.sh b/scripts/download-public-datasets.sh
index e1220ae..acf2791 100755
--- a/scripts/download-public-datasets.sh
+++ b/scripts/download-public-datasets.sh
@@ -75,6 +75,65 @@ download_clickbench() {
   fi
 }
 
+# =============================================================================
+# TPC-H SF1 Dataset
+# Generated using DuckDB's TPC-H extension
+# =============================================================================
+download_tpch() {
+  local mode="$1"
+  local dest="$DEST_DIR/tpch-sf1"
+  mkdir -p "$dest"
+
+  echo "=== TPC-H SF1 Dataset ==="
+
+  local small_tables=("nation" "region" "supplier")
+  local big_tables=("lineitem" "orders" "customer" "part" "partsupp")
+
+  local need_generate=false
+  for table in "${small_tables[@]}"; do
+    if [[ ! -f "$dest/$table.parquet" ]]; then
+      need_generate=true
+      break
+    fi
+  done
+
+  if [[ "$mode" == "all" ]]; then
+    for table in "${big_tables[@]}"; do
+      if [[ ! -f "$dest/$table.parquet" ]]; then
+        need_generate=true
+        break
+      fi
+    done
+  fi
+
+  if [[ "$need_generate" == "false" ]]; then
+    echo "  All required TPC-H files already exist"
+    return
+  fi
+
+  local tables_to_generate=""
+  if [[ "$mode" == "all" ]]; then
+    tables_to_generate="nation region supplier lineitem orders customer part partsupp"
+  else
+    tables_to_generate="nation region supplier"
+  fi
+
+  echo "  Generating TPC-H SF1 data via DuckDB..."
+  uvx --from "duckdb" --with pyarrow python -c "
+import duckdb
+import os
+con = duckdb.connect()
+con.execute('INSTALL tpch; LOAD tpch; CALL dbgen(sf=1)')
+for table in '${tables_to_generate}'.split():
+    dest = '${dest}/' + table + '.parquet'
+    if not os.path.exists(dest):
+        print(f'  Generating: {table}.parquet')
+        con.execute(f\"COPY {table} TO '{dest}' (FORMAT PARQUET)\")
+    else:
+        print(f'  Already exists: {table}.parquet')
+"
+}
+
 # =============================================================================
 # Add more datasets here following the same pattern
 # =============================================================================
@@ -116,6 +175,7 @@ mkdir -p "$DEST_DIR"
 
 download_nyc_taxi "$MODE"
 download_clickbench "$MODE"
+download_tpch "$MODE"
 
 echo ""
 echo "Done!"
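For a quick sanity check of the generated files, the same DuckDB toolchain the
script uses can query them directly. A minimal sketch (a hypothetical one-off;
the expected row counts are the ones asserted by the tests that follow):

uvx --from duckdb python -c "
import duckdb
# TPC-H SF1 row counts, matching the expectations in the Zig tests below.
expected = {'region': 5, 'nation': 25, 'supplier': 10000,
            'part': 200000, 'partsupp': 800000, 'customer': 150000,
            'orders': 1500000, 'lineitem': 6001215}
for table, rows in expected.items():
    path = f'testdata/public-datasets/tpch-sf1/{table}.parquet'
    n = duckdb.sql(f\"SELECT count(*) FROM '{path}'\").fetchone()[0]
    assert n == rows, f'{table}: {n} != {rows}'
print('row counts match TPC-H SF1')
"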
diff --git a/src/public_datasets_testing.zig b/src/public_datasets_testing.zig index c0e81ea..64b93f7 100644 --- a/src/public_datasets_testing.zig +++ b/src/public_datasets_testing.zig @@ -116,6 +116,275 @@ test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" { // try readAllRowGroups(&file); } +// ============================================================================= +// TPC-H SF1 Dataset +// Generated using DuckDB's TPC-H extension +// 8 tables, diverse types including DECIMAL and DATE +// ============================================================================= + +test "tpch sf1: region" { + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/region.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(1, file.metadata.row_groups.len); + try testing.expectEqual(5, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + const keys = try rg.readColumn(i32, 0); + try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys); + + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "AFRICA"), names[0]); + try testing.expectEqualDeep(@as([]const u8, "AMERICA"), names[1]); + try testing.expectEqualDeep(@as([]const u8, "ASIA"), names[2]); + try testing.expectEqualDeep(@as([]const u8, "EUROPE"), names[3]); + try testing.expectEqualDeep(@as([]const u8, "MIDDLE EAST"), names[4]); + + try readAllRowGroups(&file); +} + +test "tpch sf1: nation" { + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/nation.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(1, file.metadata.row_groups.len); + try testing.expectEqual(25, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + const keys = try rg.readColumn(i32, 0); + try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys[0..5]); + + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "ALGERIA"), names[0]); + try testing.expectEqualDeep(@as([]const u8, "ARGENTINA"), names[1]); + try testing.expectEqualDeep(@as([]const u8, "BRAZIL"), names[2]); + try testing.expectEqualDeep(@as([]const u8, "CANADA"), names[3]); + try testing.expectEqualDeep(@as([]const u8, "EGYPT"), names[4]); + try testing.expectEqualDeep(@as([]const u8, "UNITED STATES"), names[24]); + + const region_keys = try rg.readColumn(i32, 2); + try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 1, 1, 4 }, region_keys[0..5]); + + try readAllRowGroups(&file); +} + +test "tpch sf1: supplier" { + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/supplier.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(1, file.metadata.row_groups.len); + try testing.expectEqual(10000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // s_suppkey (i64) + const suppkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, suppkeys[0..3]); + + // s_name (string) + const names = try rg.readColumn([]const u8, 1); + try 
testing.expectEqualDeep(@as([]const u8, "Supplier#000000001"), names[0]); + try testing.expectEqualDeep(@as([]const u8, "Supplier#000000002"), names[1]); + + // s_nationkey (i32) + const nationkeys = try rg.readColumn(i32, 3); + try testing.expectEqualSlices(i32, &[_]i32{ 17, 5, 1 }, nationkeys[0..3]); + + // s_acctbal (decimal(15,2)) + const acctbals = try rg.readColumn(Decimal, 5); + try testing.expectApproxEqAbs(5755.94, @as(f64, @floatCast(acctbals[0].value)), 0.01); + try testing.expectApproxEqAbs(4032.68, @as(f64, @floatCast(acctbals[1].value)), 0.01); + try testing.expectApproxEqAbs(4192.40, @as(f64, @floatCast(acctbals[2].value)), 0.01); + + try readAllRowGroups(&file); +} + +test "tpch sf1: lineitem (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Date = parzig.parquet.Date; + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/lineitem.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(49, file.metadata.row_groups.len); + try testing.expectEqual(6001215, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // l_orderkey (i64) + const orderkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, orderkeys[0..3]); + + // l_quantity (decimal(15,2)) + const quantities = try rg.readColumn(Decimal, 4); + try testing.expectApproxEqAbs(17.0, @as(f64, @floatCast(quantities[0].value)), 0.01); + try testing.expectApproxEqAbs(36.0, @as(f64, @floatCast(quantities[1].value)), 0.01); + try testing.expectApproxEqAbs(8.0, @as(f64, @floatCast(quantities[2].value)), 0.01); + + // l_returnflag (string) + const returnflags = try rg.readColumn([]const u8, 8); + try testing.expectEqualDeep(@as([]const u8, "N"), returnflags[0]); + + // l_shipdate (date) + const shipdates = try rg.readColumn(Date, 10); + try testing.expectEqual(9568, shipdates[0].days_since_epoch); // 1996-03-13 + try testing.expectEqual(9598, shipdates[1].days_since_epoch); // 1996-04-12 + try testing.expectEqual(9524, shipdates[2].days_since_epoch); // 1996-01-29 + + // l_shipmode (string) + const shipmodes = try rg.readColumn([]const u8, 14); + try testing.expectEqualDeep(@as([]const u8, "TRUCK"), shipmodes[0]); + try testing.expectEqualDeep(@as([]const u8, "MAIL"), shipmodes[1]); + try testing.expectEqualDeep(@as([]const u8, "REG AIR"), shipmodes[2]); + + try readAllRowGroups(&file); +} + +test "tpch sf1: orders (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/orders.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(13, file.metadata.row_groups.len); + try testing.expectEqual(1500000, file.metadata.num_rows); + + try readAllRowGroups(&file); +} + +test "tpch sf1: part (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/part.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(2, 
file.metadata.row_groups.len); + try testing.expectEqual(200000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // p_partkey (i64) + const partkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, partkeys[0..3]); + + // p_name (string) + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "goldenrod lavender spring chocolate lace"), names[0]); + + // p_brand (string) + const brands = try rg.readColumn([]const u8, 3); + try testing.expectEqualDeep(@as([]const u8, "Brand#13"), brands[0]); + + // p_size (i32) + const sizes = try rg.readColumn(i32, 5); + try testing.expectEqualSlices(i32, &[_]i32{ 7, 1, 21 }, sizes[0..3]); + + // p_retailprice (decimal(15,2)) + const prices = try rg.readColumn(Decimal, 7); + try testing.expectApproxEqAbs(901.0, @as(f64, @floatCast(prices[0].value)), 0.01); + try testing.expectApproxEqAbs(902.0, @as(f64, @floatCast(prices[1].value)), 0.01); + try testing.expectApproxEqAbs(903.0, @as(f64, @floatCast(prices[2].value)), 0.01); + + try readAllRowGroups(&file); +} + +test "tpch sf1: partsupp (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/partsupp.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(7, file.metadata.row_groups.len); + try testing.expectEqual(800000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // ps_partkey (i64) + const partkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, partkeys[0..3]); + + // ps_suppkey (i64) + const suppkeys = try rg.readColumn(i64, 1); + try testing.expectEqualSlices(i64, &[_]i64{ 2, 2502, 5002 }, suppkeys[0..3]); + + // ps_availqty (i64) + const qtys = try rg.readColumn(i64, 2); + try testing.expectEqualSlices(i64, &[_]i64{ 3325, 8076, 3956 }, qtys[0..3]); + + // ps_supplycost (decimal(15,2)) + const costs = try rg.readColumn(Decimal, 3); + try testing.expectApproxEqAbs(771.64, @as(f64, @floatCast(costs[0].value)), 0.01); + try testing.expectApproxEqAbs(993.49, @as(f64, @floatCast(costs[1].value)), 0.01); + try testing.expectApproxEqAbs(337.09, @as(f64, @floatCast(costs[2].value)), 0.01); + + try readAllRowGroups(&file); +} + +test "tpch sf1: customer (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/customer.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(2, file.metadata.row_groups.len); + try testing.expectEqual(150000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // c_custkey (i64) + const custkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, custkeys[0..3]); + + // c_name (string) + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "Customer#000000001"), names[0]); + + // c_nationkey (i32) + const nationkeys = try rg.readColumn(i32, 3); + try testing.expectEqualSlices(i32, &[_]i32{ 15, 13, 1 }, nationkeys[0..3]); + + // c_acctbal (decimal(15,2)) + const acctbals = try 
rg.readColumn(Decimal, 5);
+    try testing.expectApproxEqAbs(711.56, @as(f64, @floatCast(acctbals[0].value)), 0.01);
+    try testing.expectApproxEqAbs(121.65, @as(f64, @floatCast(acctbals[1].value)), 0.01);
+    try testing.expectApproxEqAbs(7498.12, @as(f64, @floatCast(acctbals[2].value)), 0.01);
+
+    // c_mktsegment (string)
+    const segments = try rg.readColumn([]const u8, 6);
+    try testing.expectEqualDeep(@as([]const u8, "BUILDING"), segments[0]);
+    try testing.expectEqualDeep(@as([]const u8, "AUTOMOBILE"), segments[1]);
+
+    try readAllRowGroups(&file);
+}
+
 // =============================================================================
 // ClickBench Dataset - CI only
 // Source: https://github.com/ClickHouse/ClickBench
diff --git a/testdata/public-datasets/README.md b/testdata/public-datasets/README.md
index 7de34c0..8d949b5 100644
--- a/testdata/public-datasets/README.md
+++ b/testdata/public-datasets/README.md
@@ -12,6 +12,15 @@ public-datasets/
 │   ├── fhv_tripdata_2025-10.parquet
 │   ├── yellow_tripdata_2025-10.parquet (CI only)
 │   └── fhvhv_tripdata_2025-10.parquet (CI only)
+├── tpch-sf1/                  # TPC-H benchmark data (SF1)
+│   ├── nation.parquet
+│   ├── region.parquet
+│   ├── supplier.parquet
+│   ├── lineitem.parquet (CI only)
+│   ├── orders.parquet (CI only)
+│   ├── customer.parquet (CI only)
+│   ├── part.parquet (CI only)
+│   └── partsupp.parquet (CI only)
 ├── clickbench/                # ClickBench web analytics data (CI only)
 │   ├── hits_0.parquet
 │   ├── hits_1.parquet
@@ -32,6 +41,25 @@ Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-recor
 | `yellow_tripdata_2025-10.parquet` | ~50MB | Yes |
 | `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes |
 
+### TPC-H SF1
+
+Source: [TPC-H Benchmark](https://www.tpc.org/tpch/) (generated via DuckDB's TPC-H extension)
+
+TPC-H is the de facto standard benchmark for analytical database systems: 8 tables
+in a normalized supply-chain schema. It provides diverse types, including
+DECIMAL(15,2) and DATE columns, plus 49 row groups in the lineitem table.
+
+| File | Size | CI Only |
+|------|------|---------|
+| `nation.parquet` | ~2KB | No |
+| `region.parquet` | ~1KB | No |
+| `supplier.parquet` | ~771KB | No |
+| `lineitem.parquet` | ~197MB | Yes |
+| `orders.parquet` | ~53MB | Yes |
+| `customer.parquet` | ~12MB | Yes |
+| `part.parquet` | ~6MB | Yes |
+| `partsupp.parquet` | ~40MB | Yes |
+
 ### ClickBench
 
 Source: [ClickHouse/ClickBench](https://github.com/ClickHouse/ClickBench)
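A note on the hardcoded day numbers in the lineitem test: the Parquet DATE
logical type counts days since the Unix epoch (1970-01-01), which is presumably
what parzig's Date.days_since_epoch holds given the field name, so the expected
values can be re-derived from the calendar dates in the test's comments. A
minimal sketch, Python stdlib only:

from datetime import date

# Dates and expected epoch-day values from "tpch sf1: lineitem (ci only)".
EPOCH = date(1970, 1, 1)
for d, expected in [
    (date(1996, 3, 13), 9568),
    (date(1996, 4, 12), 9598),
    (date(1996, 1, 29), 9524),
]:
    assert (d - EPOCH).days == expected, d
print('shipdate day numbers check out')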