From ed0a6c411c84d1e5d6e100489e6ae26d3c72f9bd Mon Sep 17 00:00:00 2001 From: Burak Varli Date: Sat, 7 Feb 2026 18:49:52 +0000 Subject: [PATCH 1/3] Migrate from Claude commands to Amp ship-it skill --- .agents/skills/ship-it/SKILL.md | 31 +++++++++++++++++++++++++++++++ .claude/commands/create-pr.md | 30 ------------------------------ .claude/settings.json | 15 --------------- CLAUDE.md | 1 - 4 files changed, 31 insertions(+), 46 deletions(-) create mode 100644 .agents/skills/ship-it/SKILL.md delete mode 100644 .claude/commands/create-pr.md delete mode 100644 .claude/settings.json delete mode 120000 CLAUDE.md diff --git a/.agents/skills/ship-it/SKILL.md b/.agents/skills/ship-it/SKILL.md new file mode 100644 index 0000000..fa80e2b --- /dev/null +++ b/.agents/skills/ship-it/SKILL.md @@ -0,0 +1,31 @@ +--- +name: ship-it +description: "Creates a GitHub PR using jj and gh. Use when asked to ship, create a PR, or push changes." +--- + +# Ship It + +Reviews and ships the current changes as a GitHub PR. + +## Workflow + +1. Gather context: + - Run `jj status` to see current status + - Run `jj diff -f main` to see the diff from main + - Run `jj log -r main::` to see commits from main + +2. **Review the changes** before proceeding: + - Code quality and best practices + - Potential bugs or issues + - Performance considerations + - Memory safety (leaks, double-frees, proper use of `errdefer`) + - Test coverage + + Use the repository's AGENTS.md for guidance on style and conventions. If you find any issues, stop and report them to the user instead of creating the PR. + +3. If the commit description is empty, add one with an appropriate message using `jj describe -m "Commit description"` +4. Create a new empty change using `jj new` +5. Push the change to upstream using `jj git push -c @-` +6. Get the autogenerated bookmark name (should be something like `push-*`) using `jj bookmark list -r 'main+::'` +7. Create a PR using the GitHub CLI (`gh pr create`) +8. Enable auto-merge with `gh pr merge --auto --squash` diff --git a/.claude/commands/create-pr.md b/.claude/commands/create-pr.md deleted file mode 100644 index e902131..0000000 --- a/.claude/commands/create-pr.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -allowed-tools: Bash(gh pr create:*), Bash(jj:*) -description: Create a GitHub PR using jj and gh ---- - -## Context - -- Current status: !`jj status` -- Current diff from `main`: !`jj diff -f main` -- Current commits from `main`: !`jj log -r main::` - -## Your task - -Based on the above changes: - -1. **Review the changes** before proceeding: - - Code quality and best practices - - Potential bugs or issues - - Performance considerations - - Memory safety (leaks, double-frees, proper use of `errdefer`) - - Test coverage - - Use the repository's AGENTS.md for guidance on style and conventions. If you find any issues, stop and report them to the user instead of creating the PR. - -2. Add commit description if empty with appropriate commit message using `jj describe -m "Commit description"` -3. Create a new empty change using `jj new` -4. Push the change to upstream using `jj git push -c @-` -5. Get autogenerated bookmark name (should be something like `push-*`) using `jj bookmark list -r 'main+::'` -6. Create a PR using the GitHub CLI -7.
Enable auto-merge with `gh pr merge --auto --squash` diff --git a/.claude/settings.json b/.claude/settings.json deleted file mode 100644 index 7f45fcc..0000000 --- a/.claude/settings.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "hooks": { - "SessionStart": [ - { - "matcher": "startup", - "hooks": [ - { - "type": "command", - "command": "\"$CLAUDE_PROJECT_DIR\"/scripts/install_pkgs_on_remote.sh" - } - ] - } - ] - } -} diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 120000 index 47dc3e3..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1 +0,0 @@ -AGENTS.md \ No newline at end of file From b0fcb5143095c388650d2260b0574f14539a0dde Mon Sep 17 00:00:00 2001 From: Burak Varli Date: Sat, 7 Feb 2026 18:59:14 +0000 Subject: [PATCH 2/3] Add schema inspection documentation to README --- README.md | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/README.md b/README.md index e6bb3a9..e7125e7 100644 --- a/README.md +++ b/README.md @@ -107,3 +107,53 @@ Supported logical types: - **Temporal**: `Date`, `TimeMillis`, `TimeMicros`, `TimeNanos`, `TimestampMillis`, `TimestampMicros`, `TimestampNanos` - **Numeric**: `Int8`, `UInt8`, `Int16`, `UInt16`, `UInt32`, `UInt64`, `Float16`, `Decimal` - **Other**: `UUID`, `String`, `Enum`, `Json`, `Bson` + +### Schema Inspection + +After reading a file, you can inspect its schema through `file.metadata.schema`, which is an array of `SchemaElement` entries. The first element is always the root; the rest describe individual fields. + +**Listing all fields and their types:** +```zig +for (file.metadata.schema[1..]) |elem| { + std.debug.print("name={s} type={any} repetition={any} logical={any}\n", .{ + elem.name, + elem.type, + elem.repetition_type, + elem.logicalType, + }); +} +``` + +Each `SchemaElement` exposes: +- `name` — field name +- `type` — physical type (`BOOLEAN`, `INT32`, `INT64`, `INT96`, `FLOAT`, `DOUBLE`, `BYTE_ARRAY`, `FIXED_LEN_BYTE_ARRAY`) +- `repetition_type` — `REQUIRED`, `OPTIONAL`, or `REPEATED` +- `logicalType` — logical type (`STRING`, `DATE`, `TIMESTAMP`, `DECIMAL`, `UUID`, `MAP`, `LIST`, etc.) +- `converted_type` — legacy converted type +- `num_children` — non-null for group (struct/nested) elements +- `type_length` — byte width for `FIXED_LEN_BYTE_ARRAY` +- `scale` / `precision` — for decimal types + +**Looking up a column by name:** +```zig +const info = file.findSchemaElement(&.{"fare_amount"}).?; +// info.column_index — index to pass to readColumn / readColumnDynamic +// info.max_definition_level — for nullable columns +// info.max_repetition_level — for repeated (list) columns +// info.elem — the SchemaElement with full type info +``` + +For nested schemas, pass the full path: +```zig +const nested = file.findSchemaElement(&.{ "address", "city" }).?; +``` + +**File-level metadata** is also available: +```zig +std.debug.print("version: {d}\n", .{file.metadata.version}); +std.debug.print("num_rows: {d}\n", .{file.metadata.num_rows}); +std.debug.print("row_groups: {d}\n", .{file.metadata.row_groups.len}); +if (file.metadata.created_by) |created_by| { + std.debug.print("created_by: {s}\n", .{created_by}); +} +``` From f249aa732612156b67f7a3e7fd1763b0a1861716 Mon Sep 17 00:00:00 2001 From: Burak Varli Date: Sat, 7 Feb 2026 19:18:39 +0000 Subject: [PATCH 3/3] Add TPC-H SF1 public dataset tests Add all 8 TPC-H SF1 tables as a new public dataset for testing. 
The TPC-H benchmark is the de facto standard for analytical database systems and provides type coverage for DECIMAL(15,2) and DATE columns not exercised by existing tests. Local tests (region, nation, supplier) verify i32, i64, string, and decimal columns. CI-only tests (lineitem, orders, part, partsupp, customer) cover large files with up to 49 row groups and 6M rows, testing date, decimal, and string columns across the full TPC-H schema. Co-authored-by: amp[bot] --- scripts/download-public-datasets.sh | 60 +++++++ src/public_datasets_testing.zig | 269 ++++++++++++++++++++++++++++ testdata/public-datasets/README.md | 28 +++ 3 files changed, 357 insertions(+) diff --git a/scripts/download-public-datasets.sh b/scripts/download-public-datasets.sh index e1220ae..acf2791 100755 --- a/scripts/download-public-datasets.sh +++ b/scripts/download-public-datasets.sh @@ -75,6 +75,65 @@ download_clickbench() { fi } +# ============================================================================= +# TPC-H SF1 Dataset +# Generated using DuckDB's TPC-H extension +# ============================================================================= +download_tpch() { + local mode="$1" + local dest="$DEST_DIR/tpch-sf1" + mkdir -p "$dest" + + echo "=== TPC-H SF1 Dataset ===" + + local small_tables=("nation" "region" "supplier") + local big_tables=("lineitem" "orders" "customer" "part" "partsupp") + + local need_generate=false + for table in "${small_tables[@]}"; do + if [[ ! -f "$dest/$table.parquet" ]]; then + need_generate=true + break + fi + done + + if [[ "$mode" == "all" ]]; then + for table in "${big_tables[@]}"; do + if [[ ! -f "$dest/$table.parquet" ]]; then + need_generate=true + break + fi + done + fi + + if [[ "$need_generate" == "false" ]]; then + echo " All required TPC-H files already exist" + return + fi + + local tables_to_generate="" + if [[ "$mode" == "all" ]]; then + tables_to_generate="nation region supplier lineitem orders customer part partsupp" + else + tables_to_generate="nation region supplier" + fi + + echo " Generating TPC-H SF1 data via DuckDB..." + uvx --from "duckdb" --with pyarrow python -c " +import duckdb +con = duckdb.connect() +con.execute('INSTALL tpch; LOAD tpch; CALL dbgen(sf=1)') +for table in '${tables_to_generate}'.split(): + dest = '${dest}/' + table + '.parquet' + import os + if not os.path.exists(dest): + print(f' Generating: {table}.parquet') + con.execute(f\"COPY {table} TO '{dest}' (FORMAT PARQUET)\") + else: + print(f' Already exists: {table}.parquet') +" +} + # ============================================================================= # Add more datasets here following the same pattern # ============================================================================= @@ -116,6 +175,7 @@ mkdir -p "$DEST_DIR" download_nyc_taxi "$MODE" download_clickbench "$MODE" +download_tpch "$MODE" echo "" echo "Done!" 
diff --git a/src/public_datasets_testing.zig b/src/public_datasets_testing.zig index c0e81ea..64b93f7 100644 --- a/src/public_datasets_testing.zig +++ b/src/public_datasets_testing.zig @@ -116,6 +116,275 @@ test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" { // try readAllRowGroups(&file); } +// ============================================================================= +// TPC-H SF1 Dataset +// Generated using DuckDB's TPC-H extension +// 8 tables, diverse types including DECIMAL and DATE +// ============================================================================= + +test "tpch sf1: region" { + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/region.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(1, file.metadata.row_groups.len); + try testing.expectEqual(5, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + const keys = try rg.readColumn(i32, 0); + try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys); + + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "AFRICA"), names[0]); + try testing.expectEqualDeep(@as([]const u8, "AMERICA"), names[1]); + try testing.expectEqualDeep(@as([]const u8, "ASIA"), names[2]); + try testing.expectEqualDeep(@as([]const u8, "EUROPE"), names[3]); + try testing.expectEqualDeep(@as([]const u8, "MIDDLE EAST"), names[4]); + + try readAllRowGroups(&file); +} + +test "tpch sf1: nation" { + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/nation.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(1, file.metadata.row_groups.len); + try testing.expectEqual(25, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + const keys = try rg.readColumn(i32, 0); + try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys[0..5]); + + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "ALGERIA"), names[0]); + try testing.expectEqualDeep(@as([]const u8, "ARGENTINA"), names[1]); + try testing.expectEqualDeep(@as([]const u8, "BRAZIL"), names[2]); + try testing.expectEqualDeep(@as([]const u8, "CANADA"), names[3]); + try testing.expectEqualDeep(@as([]const u8, "EGYPT"), names[4]); + try testing.expectEqualDeep(@as([]const u8, "UNITED STATES"), names[24]); + + const region_keys = try rg.readColumn(i32, 2); + try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 1, 1, 4 }, region_keys[0..5]); + + try readAllRowGroups(&file); +} + +test "tpch sf1: supplier" { + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/supplier.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(1, file.metadata.row_groups.len); + try testing.expectEqual(10000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // s_suppkey (i64) + const suppkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, suppkeys[0..3]); + + // s_name (string) + const names = try rg.readColumn([]const u8, 1); + try 
testing.expectEqualDeep(@as([]const u8, "Supplier#000000001"), names[0]); + try testing.expectEqualDeep(@as([]const u8, "Supplier#000000002"), names[1]); + + // s_nationkey (i32) + const nationkeys = try rg.readColumn(i32, 3); + try testing.expectEqualSlices(i32, &[_]i32{ 17, 5, 1 }, nationkeys[0..3]); + + // s_acctbal (decimal(15,2)) + const acctbals = try rg.readColumn(Decimal, 5); + try testing.expectApproxEqAbs(5755.94, @as(f64, @floatCast(acctbals[0].value)), 0.01); + try testing.expectApproxEqAbs(4032.68, @as(f64, @floatCast(acctbals[1].value)), 0.01); + try testing.expectApproxEqAbs(4192.40, @as(f64, @floatCast(acctbals[2].value)), 0.01); + + try readAllRowGroups(&file); +} + +test "tpch sf1: lineitem (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Date = parzig.parquet.Date; + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/lineitem.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(49, file.metadata.row_groups.len); + try testing.expectEqual(6001215, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // l_orderkey (i64) + const orderkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, orderkeys[0..3]); + + // l_quantity (decimal(15,2)) + const quantities = try rg.readColumn(Decimal, 4); + try testing.expectApproxEqAbs(17.0, @as(f64, @floatCast(quantities[0].value)), 0.01); + try testing.expectApproxEqAbs(36.0, @as(f64, @floatCast(quantities[1].value)), 0.01); + try testing.expectApproxEqAbs(8.0, @as(f64, @floatCast(quantities[2].value)), 0.01); + + // l_returnflag (string) + const returnflags = try rg.readColumn([]const u8, 8); + try testing.expectEqualDeep(@as([]const u8, "N"), returnflags[0]); + + // l_shipdate (date) + const shipdates = try rg.readColumn(Date, 10); + try testing.expectEqual(9568, shipdates[0].days_since_epoch); // 1996-03-13 + try testing.expectEqual(9598, shipdates[1].days_since_epoch); // 1996-04-12 + try testing.expectEqual(9524, shipdates[2].days_since_epoch); // 1996-01-29 + + // l_shipmode (string) + const shipmodes = try rg.readColumn([]const u8, 14); + try testing.expectEqualDeep(@as([]const u8, "TRUCK"), shipmodes[0]); + try testing.expectEqualDeep(@as([]const u8, "MAIL"), shipmodes[1]); + try testing.expectEqualDeep(@as([]const u8, "REG AIR"), shipmodes[2]); + + try readAllRowGroups(&file); +} + +test "tpch sf1: orders (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/orders.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(13, file.metadata.row_groups.len); + try testing.expectEqual(1500000, file.metadata.num_rows); + + try readAllRowGroups(&file); +} + +test "tpch sf1: part (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/part.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(2, 
file.metadata.row_groups.len); + try testing.expectEqual(200000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // p_partkey (i64) + const partkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, partkeys[0..3]); + + // p_name (string) + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "goldenrod lavender spring chocolate lace"), names[0]); + + // p_brand (string) + const brands = try rg.readColumn([]const u8, 3); + try testing.expectEqualDeep(@as([]const u8, "Brand#13"), brands[0]); + + // p_size (i32) + const sizes = try rg.readColumn(i32, 5); + try testing.expectEqualSlices(i32, &[_]i32{ 7, 1, 21 }, sizes[0..3]); + + // p_retailprice (decimal(15,2)) + const prices = try rg.readColumn(Decimal, 7); + try testing.expectApproxEqAbs(901.0, @as(f64, @floatCast(prices[0].value)), 0.01); + try testing.expectApproxEqAbs(902.0, @as(f64, @floatCast(prices[1].value)), 0.01); + try testing.expectApproxEqAbs(903.0, @as(f64, @floatCast(prices[2].value)), 0.01); + + try readAllRowGroups(&file); +} + +test "tpch sf1: partsupp (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/partsupp.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(7, file.metadata.row_groups.len); + try testing.expectEqual(800000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // ps_partkey (i64) + const partkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, partkeys[0..3]); + + // ps_suppkey (i64) + const suppkeys = try rg.readColumn(i64, 1); + try testing.expectEqualSlices(i64, &[_]i64{ 2, 2502, 5002 }, suppkeys[0..3]); + + // ps_availqty (i64) + const qtys = try rg.readColumn(i64, 2); + try testing.expectEqualSlices(i64, &[_]i64{ 3325, 8076, 3956 }, qtys[0..3]); + + // ps_supplycost (decimal(15,2)) + const costs = try rg.readColumn(Decimal, 3); + try testing.expectApproxEqAbs(771.64, @as(f64, @floatCast(costs[0].value)), 0.01); + try testing.expectApproxEqAbs(993.49, @as(f64, @floatCast(costs[1].value)), 0.01); + try testing.expectApproxEqAbs(337.09, @as(f64, @floatCast(costs[2].value)), 0.01); + + try readAllRowGroups(&file); +} + +test "tpch sf1: customer (ci only)" { + if (!ci_tests) return error.SkipZigTest; + + const Decimal = parzig.parquet.Decimal; + + var reader_buf: [4096]u8 = undefined; + var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/customer.parquet", .{ .mode = .read_only })).reader(io, &reader_buf); + var file = try File.read(testing.allocator, &file_reader); + defer file.deinit(); + + try testing.expectEqual(2, file.metadata.row_groups.len); + try testing.expectEqual(150000, file.metadata.num_rows); + + var rg = file.rowGroup(0); + + // c_custkey (i64) + const custkeys = try rg.readColumn(i64, 0); + try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, custkeys[0..3]); + + // c_name (string) + const names = try rg.readColumn([]const u8, 1); + try testing.expectEqualDeep(@as([]const u8, "Customer#000000001"), names[0]); + + // c_nationkey (i32) + const nationkeys = try rg.readColumn(i32, 3); + try testing.expectEqualSlices(i32, &[_]i32{ 15, 13, 1 }, nationkeys[0..3]); + + // c_acctbal (decimal(15,2)) + const acctbals = try 
rg.readColumn(Decimal, 5); + try testing.expectApproxEqAbs(711.56, @as(f64, @floatCast(acctbals[0].value)), 0.01); + try testing.expectApproxEqAbs(121.65, @as(f64, @floatCast(acctbals[1].value)), 0.01); + try testing.expectApproxEqAbs(7498.12, @as(f64, @floatCast(acctbals[2].value)), 0.01); + + // c_mktsegment (string) + const segments = try rg.readColumn([]const u8, 6); + try testing.expectEqualDeep(@as([]const u8, "BUILDING"), segments[0]); + try testing.expectEqualDeep(@as([]const u8, "AUTOMOBILE"), segments[1]); + + try readAllRowGroups(&file); +} + // ============================================================================= // ClickBench Dataset - CI only // Source: https://github.com/ClickHouse/ClickBench diff --git a/testdata/public-datasets/README.md b/testdata/public-datasets/README.md index 7de34c0..8d949b5 100644 --- a/testdata/public-datasets/README.md +++ b/testdata/public-datasets/README.md @@ -12,6 +12,15 @@ public-datasets/ │ ├── fhv_tripdata_2025-10.parquet │ ├── yellow_tripdata_2025-10.parquet (CI only) │ └── fhvhv_tripdata_2025-10.parquet (CI only) +├── tpch-sf1/ # TPC-H benchmark data (SF1) +│ ├── nation.parquet +│ ├── region.parquet +│ ├── supplier.parquet +│ ├── lineitem.parquet (CI only) +│ ├── orders.parquet (CI only) +│ ├── customer.parquet (CI only) +│ ├── part.parquet (CI only) +│ └── partsupp.parquet (CI only) ├── clickbench/ # ClickBench web analytics data (CI only) │ ├── hits_0.parquet │ ├── hits_1.parquet @@ -32,6 +41,25 @@ Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-recor | `yellow_tripdata_2025-10.parquet` | ~50MB | Yes | | `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes | +### TPC-H SF1 + +Source: [TPC-H Benchmark](https://www.tpc.org/tpch/) (generated via DuckDB's TPC-H extension) + +The de facto standard benchmark for analytical database systems. 8 tables with a +normalized supply chain schema. Provides diverse types including DECIMAL(15,2) and +DATE columns, plus 49 row groups in the lineitem table. + +| File | Size | CI Only | +|------|------|---------| +| `nation.parquet` | ~2KB | No | +| `region.parquet` | ~1KB | No | +| `supplier.parquet` | ~771KB | No | +| `lineitem.parquet` | ~197MB | Yes | +| `orders.parquet` | ~53MB | Yes | +| `customer.parquet` | ~12MB | Yes | +| `part.parquet` | ~6MB | Yes | +| `partsupp.parquet` | ~40MB | Yes | + ### ClickBench Source: [ClickHouse/ClickBench](https://github.com/ClickHouse/ClickBench)
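Reading any of the files listed above follows the same pattern as the tests in `src/public_datasets_testing.zig`. A minimal sketch for the small `region.parquet` table — assuming `io`, an `allocator`, and the `File`/`Io`/`std` imports are set up as in that test file, and that the dataset has already been fetched with `scripts/download-public-datasets.sh`:

```zig
// Sketch only — mirrors the "tpch sf1: region" test above; `io` and `allocator`
// are assumed to be initialized the same way as in src/public_datasets_testing.zig.
var reader_buf: [4096]u8 = undefined;
var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/region.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
var file = try File.read(allocator, &file_reader);
defer file.deinit();

// region.parquet holds a single row group with 5 rows.
var rg = file.rowGroup(0);
const keys = try rg.readColumn(i32, 0); // r_regionkey: 0..4
const names = try rg.readColumn([]const u8, 1); // r_name: "AFRICA", "AMERICA", ...
std.debug.print("{d} -> {s}\n", .{ keys[0], names[0] }); // prints: 0 -> AFRICA
```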