From a2fd1eb0ec62fb48946a56158f1393e0b9a820b5 Mon Sep 17 00:00:00 2001 From: Burak Varli Date: Sun, 1 Feb 2026 12:50:32 +0000 Subject: [PATCH] Improve README with NYC taxi example and add examples directory --- README.md | 76 +++++++++++++++++++++++++++++++++++++++---- build.zig | 15 +++++++++ examples/nyc_taxi.zig | 50 ++++++++++++++++++++++++++++ src/main.zig | 12 +++---- src/parquet/File.zig | 2 +- 5 files changed, 140 insertions(+), 15 deletions(-) create mode 100644 examples/nyc_taxi.zig diff --git a/README.md b/README.md index 8263d84..0c2da60 100644 --- a/README.md +++ b/README.md @@ -4,27 +4,91 @@ A Parquet parser written in Zig using only the standard library. ## Usage +Here's an example analyzing [NYC taxi trip data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page): + ```zig +const std = @import("std"); const parzig = @import("parzig"); -var file = try parzig.parquet.File.read(allocator, &file_reader); -defer file.deinit(); +const Io = std.Io; +const File = parzig.parquet.File; + +pub fn main(init: std.process.Init) !void { + const allocator = init.gpa; + const io = init.io; + + var reader_buf: [4096]u8 = undefined; + const f = try Io.Dir.cwd().openFile(io, "green_tripdata_2025-10.parquet", .{ .mode = .read_only }); + var file_reader = f.reader(io, &reader_buf); + + var file = try parzig.parquet.File.read(allocator, &file_reader); + defer file.deinit(); + + var rg = file.rowGroup(0); + + // Static typing: specify column type at compile time + const fares = try rg.readColumn(f64, columnIndex(&file, "fare_amount")); + const tips = try rg.readColumn(f64, columnIndex(&file, "tip_amount")); + const passengers = try rg.readColumn(?i64, columnIndex(&file, "passenger_count")); // nullable + + var total_fare: f64 = 0; + var total_tips: f64 = 0; + var total_passengers: i64 = 0; + + for (fares, tips, passengers) |fare, tip, passenger| { + total_fare += fare; + total_tips += tip; + if (passenger) |p| total_passengers += p; + } + + std.debug.print("Total rides: {}\n", .{file.metadata.num_rows}); + std.debug.print("Total fares: ${d:.2}\n", .{total_fare}); + std.debug.print("Total tips: ${d:.2}\n", .{total_tips}); + std.debug.print("Total passengers: {}\n", .{total_passengers}); + + // Dynamic typing: type determined at runtime + const dynamic = try rg.readColumnDynamic(columnIndex(&file, "fare_amount")); + switch (dynamic) { + .double => |values| std.debug.print("First fare: ${d:.2}\n", .{values[0].?}), + else => unreachable, + } +} + +fn columnIndex(file: *File, name: []const u8) usize { + return file.findSchemaElement(&.{name}).?.index - 1; +} +``` + +Output: +``` +Total rides: 49416 +Total fares: $898727.45 +Total tips: $136046.83 +Total passengers: 57441 +First fare: $5.80 +``` -var rg = file.rowGroup(0); +### Column Access -// Static typing: specify type at compile time +**Static typing** - specify the column type at compile time: +```zig const values = try rg.readColumn(i64, 0); const nullable = try rg.readColumn(?i64, 1); +``` -// Dynamic typing: type determined at runtime +**Dynamic typing** - type determined at runtime: +```zig const dynamic = try rg.readColumnDynamic(0); switch (dynamic) { .int64 => |data| // ... + .double => |data| // ... .byte_array => |data| // ... // ... } +``` -// Nested types +**Nested types** - lists and maps: +```zig const list = try rg.readListColumn(i32, 0); const map = try rg.readMapColumn([]const u8, i64, 0, 1); ``` diff --git a/build.zig b/build.zig index 7fbd9e1..8b7a1ea 100644 --- a/build.zig +++ b/build.zig @@ -46,6 +46,21 @@ pub fn build(b: *std.Build) void { const generate_step = b.step("generate", "Generate Zig file from parquet.thrift"); generate_step.dependOn(&generate_cmd.step); + const nyc_taxi_example = b.addExecutable(.{ + .name = "nyc_taxi", + .root_module = b.createModule(.{ + .root_source_file = b.path("examples/nyc_taxi.zig"), + .target = target, + .optimize = optimize, + }), + }); + nyc_taxi_example.root_module.addImport("parzig", lib); + b.installArtifact(nyc_taxi_example); + + const nyc_taxi_cmd = b.addRunArtifact(nyc_taxi_example); + const nyc_taxi_step = b.step("nyc-taxi", "Run NYC taxi example"); + nyc_taxi_step.dependOn(&nyc_taxi_cmd.step); + const lib_unit_tests = b.addTest(.{ .root_module = b.createModule(.{ .root_source_file = b.path("src/parzig.zig"), diff --git a/examples/nyc_taxi.zig b/examples/nyc_taxi.zig new file mode 100644 index 0000000..4f26698 --- /dev/null +++ b/examples/nyc_taxi.zig @@ -0,0 +1,50 @@ +const std = @import("std"); +const parzig = @import("parzig"); + +const Io = std.Io; +const File = parzig.parquet.File; + +pub fn main(init: std.process.Init) !void { + const allocator = init.gpa; + const io = init.io; + + var reader_buf: [4096]u8 = undefined; + const f = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/green_tripdata_2025-10.parquet", .{ .mode = .read_only }); + var file_reader = f.reader(io, &reader_buf); + + var file = try parzig.parquet.File.read(allocator, &file_reader); + defer file.deinit(); + + var rg = file.rowGroup(0); + + // Static typing: specify column type at compile time + const fares = try rg.readColumn(f64, columnIndex(&file, "fare_amount")); + const tips = try rg.readColumn(f64, columnIndex(&file, "tip_amount")); + const passengers = try rg.readColumn(?i64, columnIndex(&file, "passenger_count")); // nullable + + var total_fare: f64 = 0; + var total_tips: f64 = 0; + var total_passengers: i64 = 0; + + for (fares, tips, passengers) |fare, tip, passenger| { + total_fare += fare; + total_tips += tip; + if (passenger) |p| total_passengers += p; + } + + std.debug.print("Total rides: {}\n", .{file.metadata.num_rows}); + std.debug.print("Total fares: ${d:.2}\n", .{total_fare}); + std.debug.print("Total tips: ${d:.2}\n", .{total_tips}); + std.debug.print("Total passengers: {}\n", .{total_passengers}); + + // Dynamic typing: type determined at runtime + const dynamic = try rg.readColumnDynamic(columnIndex(&file, "fare_amount")); + switch (dynamic) { + .double => |values| std.debug.print("First fare: ${d:.2}\n", .{values[0].?}), + else => unreachable, + } +} + +fn columnIndex(file: *File, name: []const u8) usize { + return file.findSchemaElement(&.{name}).?.index - 1; +} diff --git a/src/main.zig b/src/main.zig index 50ebfaf..685aa0b 100644 --- a/src/main.zig +++ b/src/main.zig @@ -3,19 +3,15 @@ const parzig = @import("parzig"); const Io = std.Io; -pub fn main(init: std.process.Init.Minimal) !void { - var gpa: std.heap.DebugAllocator(.{}) = .init; - defer { - std.debug.assert(gpa.deinit() == .ok); - } - const allocator = gpa.allocator(); +pub fn main(init: std.process.Init) !void { + const allocator = init.gpa; - var args = std.process.Args.Iterator.init(init.args); + var args = std.process.Args.Iterator.init(init.minimal.args); _ = args.skip(); // program name const path = args.next() orelse return error.MissingArgument; std.debug.print("Parsing {s}\n", .{path}); - var threaded: Io.Threaded = .init(allocator, .{ .environ = init.environ }); + var threaded: Io.Threaded = .init(allocator, .{ .environ = init.minimal.environ }); defer threaded.deinit(); const io = threaded.io(); diff --git a/src/parquet/File.zig b/src/parquet/File.zig index 0d5a0ba..f3c28fa 100644 --- a/src/parquet/File.zig +++ b/src/parquet/File.zig @@ -94,7 +94,7 @@ pub fn deinit(self: *File) void { /// Find a schema element by path and return its index, definition level, repetition level, and the element itself. /// Returns null if the path is not found or invalid. -pub fn findSchemaElement(self: *File, path: [][]const u8) ?struct { index: usize, max_definition_level: u8, max_repetition_level: u8, elem: parquet_schema.SchemaElement } { +pub fn findSchemaElement(self: *File, path: []const []const u8) ?struct { index: usize, max_definition_level: u8, max_repetition_level: u8, elem: parquet_schema.SchemaElement } { if (path.len == 0 or self.metadata.schema.len < 2) { return null; }