From e6a5032fa3179fb5d745d131507b8b782ffd2cbc Mon Sep 17 00:00:00 2001
From: Burak Varli <unexge@gmail.com>
Date: Sun, 1 Feb 2026 13:39:44 +0000
Subject: [PATCH] Refactor findSchemaElement to use named struct and add
 column_index

- Define SchemaInfo struct instead of anonymous struct
- Add column_index field that corresponds to the index in row group columns array
- Track column_index by counting only leaf elements (elements without children)
- Update NYC taxi example to use column_index directly instead of index - 1
- Update README example to use column_index
- Add tests for findSchemaElement covering flat and nested schemas
---
 README.md             |  2 +-
 examples/nyc_taxi.zig |  2 +-
 src/parquet/File.zig  | 90 ++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 86 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 0c2da60..c06c54d 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ pub fn main(init: std.process.Init) !void {
 }
 
 fn columnIndex(file: *File, name: []const u8) usize {
-    return file.findSchemaElement(&.{name}).?.index - 1;
+    return file.findSchemaElement(&.{name}).?.column_index;
 }
 ```
 
diff --git a/examples/nyc_taxi.zig b/examples/nyc_taxi.zig
index 4f26698..7946f23 100644
--- a/examples/nyc_taxi.zig
+++ b/examples/nyc_taxi.zig
@@ -46,5 +46,5 @@ pub fn main(init: std.process.Init) !void {
 }
 
 fn columnIndex(file: *File, name: []const u8) usize {
-    return file.findSchemaElement(&.{name}).?.index - 1;
+    return file.findSchemaElement(&.{name}).?.column_index;
 }
diff --git a/src/parquet/File.zig b/src/parquet/File.zig
index f3c28fa..c538d9e 100644
--- a/src/parquet/File.zig
+++ b/src/parquet/File.zig
@@ -13,6 +13,13 @@ const nestedReader = @import("./nestedReader.zig");
 
 const File = @This();
 
+pub const SchemaInfo = struct {
+    column_index: usize,
+    max_definition_level: u8,
+    max_repetition_level: u8,
+    elem: parquet_schema.SchemaElement,
+};
+
 const MAGIC = "PAR1";
 const METADATA_LENGTH_SIZE = 4;
 const FOOTER_SIZE = MAGIC.len + METADATA_LENGTH_SIZE;
@@ -92,14 +99,16 @@ pub fn deinit(self: *File) void {
     self.arena.deinit();
 }
 
-/// Find a schema element by path and return its index, definition level, repetition level, and the element itself.
+/// Find a schema element by path and return its column index, definition level, repetition level, and the element itself.
+/// The column_index corresponds to the index in the row group's columns array.
 /// Returns null if the path is not found or invalid.
-pub fn findSchemaElement(self: *File, path: []const []const u8) ?struct { index: usize, max_definition_level: u8, max_repetition_level: u8, elem: parquet_schema.SchemaElement } {
+pub fn findSchemaElement(self: *File, path: []const []const u8) ?SchemaInfo {
     if (path.len == 0 or self.metadata.schema.len < 2) {
         return null;
     }
 
     var current_idx: usize = 1; // Skip the root element
+    var column_index: usize = 0;
     var max_definition_level: u8 = 0;
     var max_repetition_level: u8 = 0;
     var elem: parquet_schema.SchemaElement = undefined;
@@ -113,13 +122,16 @@ pub fn findSchemaElement(self: *File, path: []const []const u8) ?struct { index:
                     break :blk current_idx;
                 }
 
-                // Skip this element and all its children
+                // Skip this element and all its children, counting leaf columns
                 var skip_idx = current_idx;
-                var nodes_to_skip: i32 = 1; // Need to skip the current one at least
+                var nodes_to_skip: i32 = 1;
 
                 while (nodes_to_skip > 0 and skip_idx < self.metadata.schema.len) {
-                    if (self.metadata.schema[skip_idx].num_children) |num_children| {
+                    const skip_elem = self.metadata.schema[skip_idx];
+                    if (skip_elem.num_children) |num_children| {
                         nodes_to_skip += @as(i32, @intCast(num_children));
+                    } else {
+                        column_index += 1;
                     }
 
                     nodes_to_skip -= 1;
@@ -150,7 +162,7 @@ pub fn findSchemaElement(self: *File, path: []const []const u8) ?struct { index:
         current_idx = found_idx + 1;
     }
 
-    return .{ .index = current_idx - 1, .max_definition_level = max_definition_level, .max_repetition_level = max_repetition_level, .elem = elem };
+    return .{ .column_index = column_index, .max_definition_level = max_definition_level, .max_repetition_level = max_repetition_level, .elem = elem };
 }
 
 pub fn readLevelDataV1(_: *File, reader: *Reader, encoding: parquet_schema.Encoding, bit_width: u8, dest: []u16) !void {
@@ -319,3 +331,69 @@ test "reading simple file with nulls with dynamic types" {
     try std.testing.expectEqualSlices(?i64, &[_]?i64{ 1, 2, null, 4 }, (try rg.readColumnDynamic(0)).int64);
     try std.testing.expectEqualDeep(&[_]?[]const u8{ null, "foo", "bar", null }, (try rg.readColumnDynamic(1)).byte_array);
 }
+
+test "findSchemaElement returns correct column_index for flat schema" {
+    var reader_buf: [1024]u8 = undefined;
+    var file_reader = (try std.Io.Dir.cwd().openFile(std.testing.io, "testdata/simple.parquet", .{ .mode = .read_only })).reader(std.testing.io, &reader_buf);
+    var file = try File.read(std.testing.allocator, &file_reader);
+    defer file.deinit();
+
+    // Schema: root, foo, bar, ham (3 columns, indices 0, 1, 2)
+    const foo = file.findSchemaElement(&.{"foo"}).?;
+    try std.testing.expectEqual(0, foo.column_index);
+    try std.testing.expectEqualStrings("foo", foo.elem.name);
+
+    const bar = file.findSchemaElement(&.{"bar"}).?;
+    try std.testing.expectEqual(1, bar.column_index);
+    try std.testing.expectEqualStrings("bar", bar.elem.name);
+
+    const ham = file.findSchemaElement(&.{"ham"}).?;
+    try std.testing.expectEqual(2, ham.column_index);
+    try std.testing.expectEqualStrings("ham", ham.elem.name);
+}
+
+test "findSchemaElement returns null for non-existent path" {
+    var reader_buf: [1024]u8 = undefined;
+    var file_reader = (try std.Io.Dir.cwd().openFile(std.testing.io, "testdata/simple.parquet", .{ .mode = .read_only })).reader(std.testing.io, &reader_buf);
+    var file = try File.read(std.testing.allocator, &file_reader);
+    defer file.deinit();
+
+    try std.testing.expect(file.findSchemaElement(&.{"nonexistent"}) == null);
+    try std.testing.expect(file.findSchemaElement(&.{ "foo", "nested" }) == null);
+    try std.testing.expect(file.findSchemaElement(&.{}) == null);
+}
+
+test "findSchemaElement returns correct levels for optional columns" {
+    var reader_buf: [1024]u8 = undefined;
+    var file_reader = (try std.Io.Dir.cwd().openFile(std.testing.io, "testdata/simple_with_nulls.parquet", .{ .mode = .read_only })).reader(std.testing.io, &reader_buf);
+    var file = try File.read(std.testing.allocator, &file_reader);
+    defer file.deinit();
+
+    const col0 = file.findSchemaElement(&.{"a"}).?;
+    try std.testing.expectEqual(1, col0.max_definition_level);
+    try std.testing.expectEqual(0, col0.max_repetition_level);
+
+    const col1 = file.findSchemaElement(&.{"b"}).?;
+    try std.testing.expectEqual(1, col1.max_definition_level);
+    try std.testing.expectEqual(0, col1.max_repetition_level);
+}
+
+test "findSchemaElement returns correct column_index for nested schema" {
+    var reader_buf: [1024]u8 = undefined;
+    var file_reader = (try std.Io.Dir.cwd().openFile(std.testing.io, "testdata/parquet-testing/data/nested_structs.rust.parquet", .{ .mode = .read_only })).reader(std.testing.io, &reader_buf);
+    var file = try File.read(std.testing.allocator, &file_reader);
+    defer file.deinit();
+
+    // First struct: roll_num with 6 nested fields (min, max, mean, count, sum, variance)
+    // These are columns 0-5
+    const roll_num_min = file.findSchemaElement(&.{ "roll_num", "min" }).?;
+    try std.testing.expectEqual(0, roll_num_min.column_index);
+
+    const roll_num_variance = file.findSchemaElement(&.{ "roll_num", "variance" }).?;
+    try std.testing.expectEqual(5, roll_num_variance.column_index);
+
+    // Second struct: PC_CUR with 6 nested fields
+    // These are columns 6-11
+    const pc_cur_min = file.findSchemaElement(&.{ "PC_CUR", "min" }).?;
+    try std.testing.expectEqual(6, pc_cur_min.column_index);
+}