This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new a886a572 fix(parquet/pqarrow): selective column reading of complex map column (#668)
a886a572 is described below
commit a886a5722b872823c2a8f46445901908a7ce5254
Author: Matt Topol <[email protected]>
AuthorDate: Thu Feb 19 20:50:23 2026 -0500
fix(parquet/pqarrow): selective column reading of complex map column (#668)
### Rationale for this change
Upstream fix for the issue identified in
https://github.com/apache/iceberg-go/issues/737. When reading maps with
nested values using column indices for selective column reading, a problem
arose if the child fields of the map weren't all in the list of indices
(a minimal reproduction sketch follows this list):
- Maps are represented in Parquet as a list of key-value structs
(`list<struct<key, value>>`)
- The struct *MUST* have exactly 2 fields (key and value) to be
converted into a proper Arrow typed Map column
- When applying the column filtering, if only the key *OR* value field
(but not both) was in the list of columns, the resulting child struct
would only have 1 field
- As a result, the `Map.validateData()` method would fail with a panic
of `arrow/array: map array child array should have two fields`.
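For illustration only (not part of this commit), here is a minimal, self-contained sketch of the access pattern that previously panicked. It assumes the same schema shape as the new test below, so the leaf columns come out as 0=id, 1=properties.key, 2=properties.value; selecting only the key leaf is what used to yield a one-field key-value struct:

```go
// Hypothetical reproduction sketch: write a small file with an id column and a
// string->int32 map, then read it back selecting only the map's key leaf.
package main

import (
	"bytes"
	"context"
	"fmt"
	"log"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/apache/arrow-go/v18/parquet/file"
	"github.com/apache/arrow-go/v18/parquet/pqarrow"
)

func main() {
	mem := memory.DefaultAllocator
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "id", Type: arrow.PrimitiveTypes.Int64},
		{Name: "properties", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32)},
	}, nil)

	// Build a single row: id=1, properties={"key1": 100}.
	b := array.NewRecordBuilder(mem, schema)
	defer b.Release()
	b.Field(0).(*array.Int64Builder).AppendValues([]int64{1}, nil)
	mb := b.Field(1).(*array.MapBuilder)
	mb.Append(true)
	mb.KeyBuilder().(*array.StringBuilder).Append("key1")
	mb.ItemBuilder().(*array.Int32Builder).Append(100)
	rec := b.NewRecordBatch()
	defer rec.Release()

	// Write the record batch to an in-memory Parquet file.
	var buf bytes.Buffer
	w, err := pqarrow.NewFileWriter(schema, &buf, nil, pqarrow.DefaultWriterProps())
	if err != nil {
		log.Fatal(err)
	}
	if err := w.Write(rec); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	if err != nil {
		log.Fatal(err)
	}
	defer pf.Close()

	fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
	if err != nil {
		log.Fatal(err)
	}

	// Leaf columns are 0=id, 1=properties.key, 2=properties.value.
	// Selecting only the key leaf (index 1) is the case that used to panic with
	// "arrow/array: map array child array should have two fields".
	rr, err := fr.GetRecordReader(context.Background(), []int{0, 1}, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer rr.Release()

	for rr.Next() {
		fmt.Println(rr.RecordBatch())
	}
}
```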
### What changes are included in this PR?
In pqarrow/file_reader.go, leaf filtering is disabled when reading a
map's key-value struct. This ensures both the key and value columns
are always read together, maintaining the required 2-field structure for
the map array.
### Are these changes tested?
Yes, a test case is added for the change.
### Are there any user-facing changes?
This only affects reading map types with column filter selection, ensuring
correctness. The only user-facing change is that a failure mode has been eliminated.
---
parquet/pqarrow/file_reader.go | 17 ++++-
parquet/pqarrow/file_reader_test.go | 124 ++++++++++++++++++++++++++++++++++++
2 files changed, 140 insertions(+), 1 deletion(-)
diff --git a/parquet/pqarrow/file_reader.go b/parquet/pqarrow/file_reader.go
index dbda55ab..6ca3c2fd 100644
--- a/parquet/pqarrow/file_reader.go
+++ b/parquet/pqarrow/file_reader.go
@@ -606,7 +606,22 @@ func (fr *FileReader) getReader(ctx context.Context, field *SchemaField, arrowFi
out = newStructReader(&rctx, &filtered, field.LevelInfo, childReaders, fr.Props)
case arrow.LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
child := field.Children[0]
- childReader, err := fr.getReader(ctx, &child, *child.Field)
+
+ // For maps, we must read BOTH key and value columns, regardless of column selection.
+ // Map arrays require a complete key-value struct with exactly 2 fields.
+ // Disable leaf filtering when reading a map's key-value struct.
+ childCtx := ctx
+ if _, isMap := arrowField.Type.(*arrow.MapType); isMap {
+ childCtx = context.WithValue(ctx, rdrCtxKey{}, readerCtx{
+ rdr: rctx.rdr,
+ mem: rctx.mem,
+ colFactory: rctx.colFactory,
+ filterLeaves: false, // Don't filter leaves for map key-value struct
+ includedLeaves: rctx.includedLeaves,
+ })
+ }
+
+ childReader, err := fr.getReader(childCtx, &child, *child.Field)
if err != nil {
return nil, err
}
diff --git a/parquet/pqarrow/file_reader_test.go b/parquet/pqarrow/file_reader_test.go
index 104fab54..aeb32086 100644
--- a/parquet/pqarrow/file_reader_test.go
+++ b/parquet/pqarrow/file_reader_test.go
@@ -652,3 +652,127 @@ func TestPartialStructColumnRead(t *testing.T) {
cArr := arr.Field(1).(*array.Float64)
require.Equal(t, 3.0, cArr.Value(0))
}
+
+// TestMapColumnWithFilters tests that map columns can be read correctly when
+// using column filtering. This is a regression test for a bug where reading
+// a map column with filters would fail because the code tried to filter the
+// individual key and value columns of the map's internal key-value struct.
+// Maps require both key and value columns to be read together, so leaf filtering
+// must be disabled for map types.
+func TestMapColumnWithFilters(t *testing.T) {
+ mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(t, 0)
+
+ // Create schema with a map column and other columns
+ schema := arrow.NewSchema([]arrow.Field{
+ {Name: "id", Type: arrow.PrimitiveTypes.Int64},
+ {Name: "properties", Type:
arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32)},
+ {Name: "value", Type: arrow.PrimitiveTypes.Float64},
+ }, nil)
+
+ // Build test data
+ b := array.NewRecordBuilder(mem, schema)
+ defer b.Release()
+
+ // Build ID column
+ idBuilder := b.Field(0).(*array.Int64Builder)
+ idBuilder.AppendValues([]int64{1, 2}, nil)
+
+ // Build map column
+ mapBuilder := b.Field(1).(*array.MapBuilder)
+ kb := mapBuilder.KeyBuilder().(*array.StringBuilder)
+ vb := mapBuilder.ItemBuilder().(*array.Int32Builder)
+
+ // First map: {"key1": 100, "key2": 200}
+ mapBuilder.Append(true)
+ kb.AppendValues([]string{"key1", "key2"}, nil)
+ vb.AppendValues([]int32{100, 200}, nil)
+
+ // Second map: {"key3": 300}
+ mapBuilder.Append(true)
+ kb.AppendValues([]string{"key3"}, nil)
+ vb.AppendValues([]int32{300}, nil)
+
+ // Build value column
+ valueBuilder := b.Field(2).(*array.Float64Builder)
+ valueBuilder.AppendValues([]float64{1.5, 2.5}, nil)
+
+ rec := b.NewRecordBatch()
+ defer rec.Release()
+
+ // Write to parquet
+ buf := new(bytes.Buffer)
+ writer, err := pqarrow.NewFileWriter(schema, buf, nil, pqarrow.DefaultWriterProps())
+ require.NoError(t, err)
+ require.NoError(t, writer.Write(rec))
+ require.NoError(t, writer.Close())
+
+ // Read back with column filtering (only read id and properties, skip value)
+ pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
+ require.NoError(t, err)
+ defer pf.Close()
+
+ fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
+ require.NoError(t, err)
+
+ // Read only columns for the first two fields (id and map)
+ // Column 0 = id
+ // Column 1 = map.key
+ // Column 2 = map.value
+ // Column 3 = value (skipped)
+ // This exercises the code path where maps need both key and value columns
+ // even when column filtering is active
+ ctx := context.Background()
+ colIndices := []int{0, 1, 2} // id, map.key, map.value
+ rr, err := fr.GetRecordReader(ctx, colIndices, nil)
+ require.NoError(t, err)
+ require.NotNil(t, rr)
+ defer rr.Release()
+
+ // Read the record batch
+ require.True(t, rr.Next())
+ result := rr.RecordBatch()
+ // Note: Don't release result manually - the record reader owns it
+
+ // Verify schema - should have only 2 fields (id and properties)
+ require.Equal(t, 2, int(result.NumCols()))
+ require.Equal(t, "id", result.Schema().Field(0).Name)
+ require.Equal(t, "properties", result.Schema().Field(1).Name)
+
+ // Verify ID column
+ idCol := result.Column(0).(*array.Int64)
+ require.Equal(t, int64(1), idCol.Value(0))
+ require.Equal(t, int64(2), idCol.Value(1))
+
+ // Verify map column - this is the critical test for the fix
+ // The key test is that reading succeeds without panic
+ mapCol := result.Column(1).(*array.Map)
+ require.Equal(t, 2, mapCol.Len())
+
+ // Verify the map has the correct structure (keys and items arrays exist)
+ keys := mapCol.Keys().(*array.String)
+ vals := mapCol.Items().(*array.Int32)
+ require.NotNil(t, keys)
+ require.NotNil(t, vals)
+
+ // Verify total number of key-value pairs across all maps
+ require.Equal(t, 3, keys.Len()) // Total: 2 from first map + 1 from second map
+ require.Equal(t, 3, vals.Len())
+
+ // Verify the map offsets are correct
+ start0, end0 := mapCol.ValueOffsets(0)
+ require.Equal(t, int64(0), start0)
+ require.Equal(t, int64(2), end0) // First map has entries from 0 to 2 (2 entries)
+
+ start1, end1 := mapCol.ValueOffsets(1)
+ require.Equal(t, int64(2), start1)
+ require.Equal(t, int64(3), end1) // Second map has entries from 2 to 3 (1 entry)
+
+ // Verify key-value pairs
+ require.Equal(t, "key1", keys.Value(0))
+ require.Equal(t, int32(100), vals.Value(0))
+ require.Equal(t, "key2", keys.Value(1))
+ require.Equal(t, int32(200), vals.Value(1))
+ require.Equal(t, "key3", keys.Value(2))
+ require.Equal(t, int32(300), vals.Value(2))
+}