Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 70 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,91 @@ A Parquet parser written in Zig using only the standard library.

## Usage

Here's an example analyzing [NYC taxi trip data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page):

```zig
const std = @import("std");
const parzig = @import("parzig");

var file = try parzig.parquet.File.read(allocator, &file_reader);
defer file.deinit();
const Io = std.Io;
const File = parzig.parquet.File;

pub fn main(init: std.process.Init) !void {
const allocator = init.gpa;
const io = init.io;

var reader_buf: [4096]u8 = undefined;
const f = try Io.Dir.cwd().openFile(io, "green_tripdata_2025-10.parquet", .{ .mode = .read_only });
var file_reader = f.reader(io, &reader_buf);

var file = try parzig.parquet.File.read(allocator, &file_reader);
defer file.deinit();

var rg = file.rowGroup(0);

// Static typing: specify column type at compile time
const fares = try rg.readColumn(f64, columnIndex(&file, "fare_amount"));
const tips = try rg.readColumn(f64, columnIndex(&file, "tip_amount"));
const passengers = try rg.readColumn(?i64, columnIndex(&file, "passenger_count")); // nullable

var total_fare: f64 = 0;
var total_tips: f64 = 0;
var total_passengers: i64 = 0;

for (fares, tips, passengers) |fare, tip, passenger| {
total_fare += fare;
total_tips += tip;
if (passenger) |p| total_passengers += p;
}

std.debug.print("Total rides: {}\n", .{file.metadata.num_rows});
std.debug.print("Total fares: ${d:.2}\n", .{total_fare});
std.debug.print("Total tips: ${d:.2}\n", .{total_tips});
std.debug.print("Total passengers: {}\n", .{total_passengers});

// Dynamic typing: type determined at runtime
const dynamic = try rg.readColumnDynamic(columnIndex(&file, "fare_amount"));
switch (dynamic) {
.double => |values| std.debug.print("First fare: ${d:.2}\n", .{values[0].?}),
else => unreachable,
}
}

fn columnIndex(file: *File, name: []const u8) usize {
return file.findSchemaElement(&.{name}).?.index - 1;
}
```

Output:
```
Total rides: 49416
Total fares: $898727.45
Total tips: $136046.83
Total passengers: 57441
First fare: $5.80
```

var rg = file.rowGroup(0);
### Column Access

// Static typing: specify type at compile time
**Static typing** - specify the column type at compile time:
```zig
const values = try rg.readColumn(i64, 0);
const nullable = try rg.readColumn(?i64, 1);
```

// Dynamic typing: type determined at runtime
**Dynamic typing** - type determined at runtime:
```zig
const dynamic = try rg.readColumnDynamic(0);
switch (dynamic) {
.int64 => |data| // ...
.double => |data| // ...
.byte_array => |data| // ...
// ...
}
```

// Nested types
**Nested types** - lists and maps:
```zig
const list = try rg.readListColumn(i32, 0);
const map = try rg.readMapColumn([]const u8, i64, 0, 1);
```
15 changes: 15 additions & 0 deletions build.zig
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,21 @@ pub fn build(b: *std.Build) void {
const generate_step = b.step("generate", "Generate Zig file from parquet.thrift");
generate_step.dependOn(&generate_cmd.step);

const nyc_taxi_example = b.addExecutable(.{
.name = "nyc_taxi",
.root_module = b.createModule(.{
.root_source_file = b.path("examples/nyc_taxi.zig"),
.target = target,
.optimize = optimize,
}),
});
nyc_taxi_example.root_module.addImport("parzig", lib);
b.installArtifact(nyc_taxi_example);

const nyc_taxi_cmd = b.addRunArtifact(nyc_taxi_example);
const nyc_taxi_step = b.step("nyc-taxi", "Run NYC taxi example");
nyc_taxi_step.dependOn(&nyc_taxi_cmd.step);

const lib_unit_tests = b.addTest(.{
.root_module = b.createModule(.{
.root_source_file = b.path("src/parzig.zig"),
Expand Down
50 changes: 50 additions & 0 deletions examples/nyc_taxi.zig
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
const std = @import("std");
const parzig = @import("parzig");

const Io = std.Io;
const File = parzig.parquet.File;

pub fn main(init: std.process.Init) !void {
const allocator = init.gpa;
const io = init.io;

var reader_buf: [4096]u8 = undefined;
const f = try Io.Dir.cwd().openFile(io, "testdata/public-datasets/nyc-taxi/green_tripdata_2025-10.parquet", .{ .mode = .read_only });
var file_reader = f.reader(io, &reader_buf);

var file = try parzig.parquet.File.read(allocator, &file_reader);
defer file.deinit();

var rg = file.rowGroup(0);

// Static typing: specify column type at compile time
const fares = try rg.readColumn(f64, columnIndex(&file, "fare_amount"));
const tips = try rg.readColumn(f64, columnIndex(&file, "tip_amount"));
const passengers = try rg.readColumn(?i64, columnIndex(&file, "passenger_count")); // nullable

var total_fare: f64 = 0;
var total_tips: f64 = 0;
var total_passengers: i64 = 0;

for (fares, tips, passengers) |fare, tip, passenger| {
total_fare += fare;
total_tips += tip;
if (passenger) |p| total_passengers += p;
}

std.debug.print("Total rides: {}\n", .{file.metadata.num_rows});
std.debug.print("Total fares: ${d:.2}\n", .{total_fare});
std.debug.print("Total tips: ${d:.2}\n", .{total_tips});
std.debug.print("Total passengers: {}\n", .{total_passengers});

// Dynamic typing: type determined at runtime
const dynamic = try rg.readColumnDynamic(columnIndex(&file, "fare_amount"));
switch (dynamic) {
.double => |values| std.debug.print("First fare: ${d:.2}\n", .{values[0].?}),
else => unreachable,
}
}

fn columnIndex(file: *File, name: []const u8) usize {
return file.findSchemaElement(&.{name}).?.index - 1;
}
12 changes: 4 additions & 8 deletions src/main.zig
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,15 @@ const parzig = @import("parzig");

const Io = std.Io;

pub fn main(init: std.process.Init.Minimal) !void {
var gpa: std.heap.DebugAllocator(.{}) = .init;
defer {
std.debug.assert(gpa.deinit() == .ok);
}
const allocator = gpa.allocator();
pub fn main(init: std.process.Init) !void {
const allocator = init.gpa;

var args = std.process.Args.Iterator.init(init.args);
var args = std.process.Args.Iterator.init(init.minimal.args);
_ = args.skip(); // program name
const path = args.next() orelse return error.MissingArgument;
std.debug.print("Parsing {s}\n", .{path});

var threaded: Io.Threaded = .init(allocator, .{ .environ = init.environ });
var threaded: Io.Threaded = .init(allocator, .{ .environ = init.minimal.environ });
defer threaded.deinit();
const io = threaded.io();

Expand Down
2 changes: 1 addition & 1 deletion src/parquet/File.zig
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ pub fn deinit(self: *File) void {

/// Find a schema element by path and return its index, definition level, repetition level, and the element itself.
/// Returns null if the path is not found or invalid.
pub fn findSchemaElement(self: *File, path: [][]const u8) ?struct { index: usize, max_definition_level: u8, max_repetition_level: u8, elem: parquet_schema.SchemaElement } {
pub fn findSchemaElement(self: *File, path: []const []const u8) ?struct { index: usize, max_definition_level: u8, max_repetition_level: u8, elem: parquet_schema.SchemaElement } {
if (path.len == 0 or self.metadata.schema.len < 2) {
return null;
}
Expand Down