diff --git a/.agents/skills/ship-it/SKILL.md b/.agents/skills/ship-it/SKILL.md
new file mode 100644
index 0000000..fa80e2b
--- /dev/null
+++ b/.agents/skills/ship-it/SKILL.md
@@ -0,0 +1,43 @@
+---
+name: ship-it
+description: "Creates a GitHub PR using jj and gh. Use when asked to ship, create a PR, or push changes."
+---
+
+# Ship It
+
+Reviews and ships the current changes as a GitHub PR.
+
+## Workflow
+
+1. Gather context:
+   - Run `jj status` to see the current status
+   - Run `jj diff -f main` to see the diff from main
+   - Run `jj log -r main::` to see commits from main
+
+2. **Review the changes** before proceeding:
+   - Code quality and best practices
+   - Potential bugs or issues
+   - Performance considerations
+   - Memory safety (leaks, double-frees, proper use of `errdefer`)
+   - Test coverage
+
+   Use the repository's AGENTS.md for guidance on style and conventions. If you find any issues, stop and report them to the user instead of creating the PR.
+
+3. If the commit description is empty, add an appropriate commit message using `jj describe -m "Commit description"`
+4. Create a new empty change using `jj new`
+5. Push the change to upstream using `jj git push -c @-`
+6. Get the autogenerated bookmark name (should be something like `push-*`) using `jj bookmark list -r 'main+::'`
+7. Create a PR using the GitHub CLI (`gh pr create`)
+8. Enable auto-merge with `gh pr merge --auto --squash`
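+
+A typical end-to-end run might look like this (illustrative only: the `push-*`
+bookmark name is autogenerated, and the title and body are placeholders):
+
+```sh
+jj describe -m "Short summary of the change"  # only if the description was empty
+jj new                                        # start a fresh empty change on top
+jj git push -c @-                             # push the described change
+jj bookmark list -r 'main+::'                 # note the generated push-* bookmark
+gh pr create --head push-xyz --title "Short summary" --body "Details..."  # push-xyz is a placeholder
+gh pr merge --auto --squash
+```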
diff --git a/.claude/commands/create-pr.md b/.claude/commands/create-pr.md
deleted file mode 100644
index e902131..0000000
--- a/.claude/commands/create-pr.md
+++ /dev/null
@@ -1,30 +0,0 @@
----
-allowed-tools: Bash(gh pr create:*), Bash(jj:*)
-description: Create a GitHub PR using jj and gh
----
-
-## Context
-
-- Current status: !`jj status`
-- Current diff from `main`: !`jj diff -f main`
-- Current commits from `main`: !`jj log -r main::`
-
-## Your task
-
-Based on the above changes:
-
-1. **Review the changes** before proceeding:
-   - Code quality and best practices
-   - Potential bugs or issues
-   - Performance considerations
-   - Memory safety (leaks, double-frees, proper use of `errdefer`)
-   - Test coverage
-
-   Use the repository's AGENTS.md for guidance on style and conventions. If you find any issues, stop and report them to the user instead of creating the PR.
-
-2. Add commit description if empty with appropriate commit message using `jj describe -m "Commit description"`
-3. Create a new empty change using `jj new`
-4. Push the change to upstream using `jj git push -c @-`
-5. Get autogenerated bookmark name (should be something like `push-*`) using `jj bookmark list -r 'main+::'`
-6. Create a PR using the GitHub CLI
-7. Enable auto-merge with `gh pr merge --auto --squash`
diff --git a/.claude/settings.json b/.claude/settings.json
deleted file mode 100644
index 7f45fcc..0000000
--- a/.claude/settings.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "hooks": {
-    "SessionStart": [
-      {
-        "matcher": "startup",
-        "hooks": [
-          {
-            "type": "command",
-            "command": "\"$CLAUDE_PROJECT_DIR\"/scripts/install_pkgs_on_remote.sh"
-          }
-        ]
-      }
-    ]
-  }
-}
diff --git a/CLAUDE.md b/CLAUDE.md
deleted file mode 120000
index 47dc3e3..0000000
--- a/CLAUDE.md
+++ /dev/null
@@ -1 +0,0 @@
-AGENTS.md
\ No newline at end of file
diff --git a/README.md b/README.md
index e6bb3a9..e7125e7 100644
--- a/README.md
+++ b/README.md
@@ -107,3 +107,78 @@ Supported logical types:
 - **Temporal**: `Date`, `TimeMillis`, `TimeMicros`, `TimeNanos`, `TimestampMillis`, `TimestampMicros`, `TimestampNanos`
 - **Numeric**: `Int8`, `UInt8`, `Int16`, `UInt16`, `UInt32`, `UInt64`, `Float16`, `Decimal`
 - **Other**: `UUID`, `String`, `Enum`, `Json`, `Bson`
+
+### Schema Inspection
+
+After reading a file, you can inspect its schema through `file.metadata.schema`, which is an array of `SchemaElement` entries. The first element is always the root; the rest describe individual fields.
+
+**Listing all fields and their types:**
+```zig
+for (file.metadata.schema[1..]) |elem| {
+    std.debug.print("name={s} type={any} repetition={any} logical={any}\n", .{
+        elem.name,
+        elem.type,
+        elem.repetition_type,
+        elem.logicalType,
+    });
+}
+```
+
+Each `SchemaElement` exposes:
+- `name` — field name
+- `type` — physical type (`BOOLEAN`, `INT32`, `INT64`, `INT96`, `FLOAT`, `DOUBLE`, `BYTE_ARRAY`, `FIXED_LEN_BYTE_ARRAY`)
+- `repetition_type` — `REQUIRED`, `OPTIONAL`, or `REPEATED`
+- `logicalType` — logical type (`STRING`, `DATE`, `TIMESTAMP`, `DECIMAL`, `UUID`, `MAP`, `LIST`, etc.)
+- `converted_type` — legacy converted type
+- `num_children` — non-null for group (struct/nested) elements
+- `type_length` — byte width for `FIXED_LEN_BYTE_ARRAY`
+- `scale` / `precision` — for decimal types
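+
+Since `num_children` is non-null only for group elements, it can be used to tell
+nested groups apart from leaf columns when walking the schema (a minimal sketch,
+assuming `num_children` is an optional integer as described above):
+```zig
+for (file.metadata.schema[1..]) |elem| {
+    // assumes num_children is an optional integer (non-null only for groups)
+    if (elem.num_children) |n| {
+        std.debug.print("group {s} ({d} children)\n", .{ elem.name, n });
+    } else {
+        std.debug.print("leaf {s}\n", .{elem.name});
+    }
+}
+```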
+
+**Looking up a column by name:**
+```zig
+const info = file.findSchemaElement(&.{"fare_amount"}).?;
+// info.column_index — index to pass to readColumn / readColumnDynamic
+// info.max_definition_level — for nullable columns
+// info.max_repetition_level — for repeated (list) columns
+// info.elem — the SchemaElement with full type info
+```
+
+For nested schemas, pass the full path:
+```zig
+const nested = file.findSchemaElement(&.{ "address", "city" }).?;
+```
+
+**File-level metadata** is also available:
+```zig
+std.debug.print("version: {d}\n", .{file.metadata.version});
+std.debug.print("num_rows: {d}\n", .{file.metadata.num_rows});
+std.debug.print("row_groups: {d}\n", .{file.metadata.row_groups.len});
+if (file.metadata.created_by) |created_by| {
+    std.debug.print("created_by: {s}\n", .{created_by});
+}
+```
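+
+Putting it together, a `DECIMAL` column can be read through the `Decimal` value
+type (a minimal sketch; the column name below is a placeholder for your schema):
+```zig
+const Decimal = parzig.parquet.Decimal;
+// "fare_amount" is a placeholder; use a decimal column from your own file
+const info = file.findSchemaElement(&.{"fare_amount"}).?;
+var rg = file.rowGroup(0);
+const amounts = try rg.readColumn(Decimal, info.column_index);
+std.debug.print("first value: {d}\n", .{amounts[0].value});
+```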
diff --git a/scripts/download-public-datasets.sh b/scripts/download-public-datasets.sh
index e1220ae..acf2791 100755
--- a/scripts/download-public-datasets.sh
+++ b/scripts/download-public-datasets.sh
@@ -75,6 +75,65 @@ download_clickbench() {
   fi
 }
 
+# =============================================================================
+# TPC-H SF1 Dataset
+# Generated using DuckDB's TPC-H extension
+# =============================================================================
+download_tpch() {
+  local mode="$1"
+  local dest="$DEST_DIR/tpch-sf1"
+  mkdir -p "$dest"
+
+  echo "=== TPC-H SF1 Dataset ==="
+
+  local small_tables=("nation" "region" "supplier")
+  local big_tables=("lineitem" "orders" "customer" "part" "partsupp")
+
+  local need_generate=false
+  for table in "${small_tables[@]}"; do
+    if [[ ! -f "$dest/$table.parquet" ]]; then
+      need_generate=true
+      break
+    fi
+  done
+
+  if [[ "$mode" == "all" ]]; then
+    for table in "${big_tables[@]}"; do
+      if [[ ! -f "$dest/$table.parquet" ]]; then
+        need_generate=true
+        break
+      fi
+    done
+  fi
+
+  if [[ "$need_generate" == "false" ]]; then
+    echo "  All required TPC-H files already exist"
+    return
+  fi
+
+  local tables_to_generate=""
+  if [[ "$mode" == "all" ]]; then
+    tables_to_generate="nation region supplier lineitem orders customer part partsupp"
+  else
+    tables_to_generate="nation region supplier"
+  fi
+
+  echo "  Generating TPC-H SF1 data via DuckDB..."
+  uvx --from "duckdb" --with pyarrow python -c "
+import duckdb
+import os
+con = duckdb.connect()
+con.execute('INSTALL tpch; LOAD tpch; CALL dbgen(sf=1)')
+for table in '${tables_to_generate}'.split():
+    dest = '${dest}/' + table + '.parquet'
+    if not os.path.exists(dest):
+        print(f'  Generating: {table}.parquet')
+        con.execute(f\"COPY {table} TO '{dest}' (FORMAT PARQUET)\")
+    else:
+        print(f'  Already exists: {table}.parquet')
+"
+}
+
 # =============================================================================
 # Add more datasets here following the same pattern
 # =============================================================================
@@ -116,6 +175,7 @@ mkdir -p "$DEST_DIR"
 
 download_nyc_taxi "$MODE"
 download_clickbench "$MODE"
+download_tpch "$MODE"
 
 echo ""
 echo "Done!"
diff --git a/src/public_datasets_testing.zig b/src/public_datasets_testing.zig
index c0e81ea..64b93f7 100644
--- a/src/public_datasets_testing.zig
+++ b/src/public_datasets_testing.zig
@@ -116,6 +116,275 @@ test "nyc taxi: fhvhv tripdata 2025-10 (ci only)" {
     // try readAllRowGroups(&file);
 }
 
+// =============================================================================
+// TPC-H SF1 Dataset
+// Generated using DuckDB's TPC-H extension
+// 8 tables, diverse types including DECIMAL and DATE
+// =============================================================================
+
+test "tpch sf1: region" {
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/region.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(5, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    const keys = try rg.readColumn(i32, 0);
+    try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys);
+
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "AFRICA"), names[0]);
+    try testing.expectEqualDeep(@as([]const u8, "AMERICA"), names[1]);
+    try testing.expectEqualDeep(@as([]const u8, "ASIA"), names[2]);
+    try testing.expectEqualDeep(@as([]const u8, "EUROPE"), names[3]);
+    try testing.expectEqualDeep(@as([]const u8, "MIDDLE EAST"), names[4]);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: nation" {
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/nation.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(25, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    const keys = try rg.readColumn(i32, 0);
+    try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 2, 3, 4 }, keys[0..5]);
+
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "ALGERIA"), names[0]);
+    try testing.expectEqualDeep(@as([]const u8, "ARGENTINA"), names[1]);
+    try testing.expectEqualDeep(@as([]const u8, "BRAZIL"), names[2]);
+    try testing.expectEqualDeep(@as([]const u8, "CANADA"), names[3]);
+    try testing.expectEqualDeep(@as([]const u8, "EGYPT"), names[4]);
+    try testing.expectEqualDeep(@as([]const u8, "UNITED STATES"), names[24]);
+
+    const region_keys = try rg.readColumn(i32, 2);
+    try testing.expectEqualSlices(i32, &[_]i32{ 0, 1, 1, 1, 4 }, region_keys[0..5]);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: supplier" {
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/supplier.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(1, file.metadata.row_groups.len);
+    try testing.expectEqual(10000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // s_suppkey (i64)
+    const suppkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, suppkeys[0..3]);
+
+    // s_name (string)
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "Supplier#000000001"), names[0]);
+    try testing.expectEqualDeep(@as([]const u8, "Supplier#000000002"), names[1]);
+
+    // s_nationkey (i32)
+    const nationkeys = try rg.readColumn(i32, 3);
+    try testing.expectEqualSlices(i32, &[_]i32{ 17, 5, 1 }, nationkeys[0..3]);
+
+    // s_acctbal (decimal(15,2))
+    const acctbals = try rg.readColumn(Decimal, 5);
+    try testing.expectApproxEqAbs(5755.94, @as(f64, @floatCast(acctbals[0].value)), 0.01);
+    try testing.expectApproxEqAbs(4032.68, @as(f64, @floatCast(acctbals[1].value)), 0.01);
+    try testing.expectApproxEqAbs(4192.40, @as(f64, @floatCast(acctbals[2].value)), 0.01);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: lineitem (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Date = parzig.parquet.Date;
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/lineitem.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(49, file.metadata.row_groups.len);
+    try testing.expectEqual(6001215, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // l_orderkey (i64)
+    const orderkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, orderkeys[0..3]);
+
+    // l_quantity (decimal(15,2))
+    const quantities = try rg.readColumn(Decimal, 4);
+    try testing.expectApproxEqAbs(17.0, @as(f64, @floatCast(quantities[0].value)), 0.01);
+    try testing.expectApproxEqAbs(36.0, @as(f64, @floatCast(quantities[1].value)), 0.01);
+    try testing.expectApproxEqAbs(8.0, @as(f64, @floatCast(quantities[2].value)), 0.01);
+
+    // l_returnflag (string)
+    const returnflags = try rg.readColumn([]const u8, 8);
+    try testing.expectEqualDeep(@as([]const u8, "N"), returnflags[0]);
+
+    // l_shipdate (date)
+    const shipdates = try rg.readColumn(Date, 10);
+    try testing.expectEqual(9568, shipdates[0].days_since_epoch); // 1996-03-13
+    try testing.expectEqual(9598, shipdates[1].days_since_epoch); // 1996-04-12
+    try testing.expectEqual(9524, shipdates[2].days_since_epoch); // 1996-01-29
+
+    // l_shipmode (string)
+    const shipmodes = try rg.readColumn([]const u8, 14);
+    try testing.expectEqualDeep(@as([]const u8, "TRUCK"), shipmodes[0]);
+    try testing.expectEqualDeep(@as([]const u8, "MAIL"), shipmodes[1]);
+    try testing.expectEqualDeep(@as([]const u8, "REG AIR"), shipmodes[2]);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: orders (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/orders.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(13, file.metadata.row_groups.len);
+    try testing.expectEqual(1500000, file.metadata.num_rows);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: part (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/part.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(2, file.metadata.row_groups.len);
+    try testing.expectEqual(200000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // p_partkey (i64)
+    const partkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, partkeys[0..3]);
+
+    // p_name (string)
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "goldenrod lavender spring chocolate lace"), names[0]);
+
+    // p_brand (string)
+    const brands = try rg.readColumn([]const u8, 3);
+    try testing.expectEqualDeep(@as([]const u8, "Brand#13"), brands[0]);
+
+    // p_size (i32)
+    const sizes = try rg.readColumn(i32, 5);
+    try testing.expectEqualSlices(i32, &[_]i32{ 7, 1, 21 }, sizes[0..3]);
+
+    // p_retailprice (decimal(15,2))
+    const prices = try rg.readColumn(Decimal, 7);
+    try testing.expectApproxEqAbs(901.0, @as(f64, @floatCast(prices[0].value)), 0.01);
+    try testing.expectApproxEqAbs(902.0, @as(f64, @floatCast(prices[1].value)), 0.01);
+    try testing.expectApproxEqAbs(903.0, @as(f64, @floatCast(prices[2].value)), 0.01);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: partsupp (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/partsupp.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(7, file.metadata.row_groups.len);
+    try testing.expectEqual(800000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // ps_partkey (i64)
+    const partkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 1, 1 }, partkeys[0..3]);
+
+    // ps_suppkey (i64)
+    const suppkeys = try rg.readColumn(i64, 1);
+    try testing.expectEqualSlices(i64, &[_]i64{ 2, 2502, 5002 }, suppkeys[0..3]);
+
+    // ps_availqty (i64)
+    const qtys = try rg.readColumn(i64, 2);
+    try testing.expectEqualSlices(i64, &[_]i64{ 3325, 8076, 3956 }, qtys[0..3]);
+
+    // ps_supplycost (decimal(15,2))
+    const costs = try rg.readColumn(Decimal, 3);
+    try testing.expectApproxEqAbs(771.64, @as(f64, @floatCast(costs[0].value)), 0.01);
+    try testing.expectApproxEqAbs(993.49, @as(f64, @floatCast(costs[1].value)), 0.01);
+    try testing.expectApproxEqAbs(337.09, @as(f64, @floatCast(costs[2].value)), 0.01);
+
+    try readAllRowGroups(&file);
+}
+
+test "tpch sf1: customer (ci only)" {
+    if (!ci_tests) return error.SkipZigTest;
+
+    const Decimal = parzig.parquet.Decimal;
+
+    var reader_buf: [4096]u8 = undefined;
+    var file_reader = (try Io.Dir.cwd().openFile(io, "testdata/public-datasets/tpch-sf1/customer.parquet", .{ .mode = .read_only })).reader(io, &reader_buf);
+    var file = try File.read(testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try testing.expectEqual(2, file.metadata.row_groups.len);
+    try testing.expectEqual(150000, file.metadata.num_rows);
+
+    var rg = file.rowGroup(0);
+
+    // c_custkey (i64)
+    const custkeys = try rg.readColumn(i64, 0);
+    try testing.expectEqualSlices(i64, &[_]i64{ 1, 2, 3 }, custkeys[0..3]);
+
+    // c_name (string)
+    const names = try rg.readColumn([]const u8, 1);
+    try testing.expectEqualDeep(@as([]const u8, "Customer#000000001"), names[0]);
+
+    // c_nationkey (i32)
+    const nationkeys = try rg.readColumn(i32, 3);
+    try testing.expectEqualSlices(i32, &[_]i32{ 15, 13, 1 }, nationkeys[0..3]);
+
+    // c_acctbal (decimal(15,2))
+    const acctbals = try rg.readColumn(Decimal, 5);
+    try testing.expectApproxEqAbs(711.56, @as(f64, @floatCast(acctbals[0].value)), 0.01);
+    try testing.expectApproxEqAbs(121.65, @as(f64, @floatCast(acctbals[1].value)), 0.01);
+    try testing.expectApproxEqAbs(7498.12, @as(f64, @floatCast(acctbals[2].value)), 0.01);
+
+    // c_mktsegment (string)
+    const segments = try rg.readColumn([]const u8, 6);
+    try testing.expectEqualDeep(@as([]const u8, "BUILDING"), segments[0]);
+    try testing.expectEqualDeep(@as([]const u8, "AUTOMOBILE"), segments[1]);
+
+    try readAllRowGroups(&file);
+}
+
 // =============================================================================
 // ClickBench Dataset - CI only
 // Source: https://github.com/ClickHouse/ClickBench
diff --git a/testdata/public-datasets/README.md b/testdata/public-datasets/README.md
index 7de34c0..8d949b5 100644
--- a/testdata/public-datasets/README.md
+++ b/testdata/public-datasets/README.md
@@ -12,6 +12,15 @@ public-datasets/
 │   ├── fhv_tripdata_2025-10.parquet
 │   ├── yellow_tripdata_2025-10.parquet (CI only)
 │   └── fhvhv_tripdata_2025-10.parquet (CI only)
+├── tpch-sf1/            # TPC-H benchmark data (SF1)
+│   ├── nation.parquet
+│   ├── region.parquet
+│   ├── supplier.parquet
+│   ├── lineitem.parquet (CI only)
+│   ├── orders.parquet (CI only)
+│   ├── customer.parquet (CI only)
+│   ├── part.parquet (CI only)
+│   └── partsupp.parquet (CI only)
 ├── clickbench/          # ClickBench web analytics data (CI only)
 │   ├── hits_0.parquet
 │   ├── hits_1.parquet
@@ -32,6 +41,25 @@ Source: [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
 | `yellow_tripdata_2025-10.parquet` | ~50MB | Yes |
 | `fhvhv_tripdata_2025-10.parquet` | ~400MB | Yes |
 
+### TPC-H SF1
+
+Source: [TPC-H Benchmark](https://www.tpc.org/tpch/) (generated via DuckDB's TPC-H extension)
+
+TPC-H is the de facto standard benchmark for analytical database systems. Its 8
+tables form a normalized supply-chain schema with diverse column types, including
+DECIMAL(15,2) and DATE, and the lineitem table alone spans 49 row groups.
+
+| File | Size | CI Only |
+|------|------|---------|
+| `nation.parquet` | ~2KB | No |
+| `region.parquet` | ~1KB | No |
+| `supplier.parquet` | ~771KB | No |
+| `lineitem.parquet` | ~197MB | Yes |
+| `orders.parquet` | ~53MB | Yes |
+| `customer.parquet` | ~12MB | Yes |
+| `part.parquet` | ~6MB | Yes |
+| `partsupp.parquet` | ~40MB | Yes |
+
 ### ClickBench
 
 Source: [ClickHouse/ClickBench](https://github.com/ClickHouse/ClickBench)