This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new a886a572 fix(parquet/pqarrow): selective column reading of complex map column (#668)
a886a572 is described below
commit a886a5722b872823c2a8f46445901908a7ce5254
Author: Matt Topol <[email protected]>
AuthorDate: Thu Feb 19 20:50:23 2026 -0500
fix(parquet/pqarrow): selective column reading of complex map column (#668)
### Rationale for this change
Upstream fix for the issue identified in
https://github.com/apache/iceberg-go/issues/737. When reading maps with
nested values using column indices for selective column reading, a problem
arose if the child fields of the map weren't all in the list of indices
(a minimal reproduction sketch follows this list):
- Maps are represented in Parquet as a list of key-value structs
(`list<struct<key, value>>`)
- The struct *MUST* have exactly 2 fields (key and value) to be
converted into a proper Arrow typed Map column
- When applying the column filtering, if only the key *OR* value field
(but not both) was in the list of columns, the resulting child struct
would only have 1 field
- As a result, the `Map.validateData()` method would fail with a panic
of `arrow/array: map array child array should have two fields`.
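For illustration only (not part of this commit), here is a minimal, self-contained sketch of the access pattern that previously panicked. It assumes the same schema shape as the new test below, so the leaf columns come out as 0=id, 1=properties.key, 2=properties.value; selecting only the key leaf is what used to yield a one-field key-value struct:

```go
// Hypothetical reproduction sketch: write a small file with an id column and a
// string->int32 map, then read it back selecting only the map's key leaf.
package main

import (
	"bytes"
	"context"
	"fmt"
	"log"

	"github.com/apache/arrow-go/v18/arrow"
	"github.com/apache/arrow-go/v18/arrow/array"
	"github.com/apache/arrow-go/v18/arrow/memory"
	"github.com/apache/arrow-go/v18/parquet/file"
	"github.com/apache/arrow-go/v18/parquet/pqarrow"
)

func main() {
	mem := memory.DefaultAllocator
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "id", Type: arrow.PrimitiveTypes.Int64},
		{Name: "properties", Type: arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32)},
	}, nil)

	// Build a single row: id=1, properties={"key1": 100}.
	b := array.NewRecordBuilder(mem, schema)
	defer b.Release()
	b.Field(0).(*array.Int64Builder).AppendValues([]int64{1}, nil)
	mb := b.Field(1).(*array.MapBuilder)
	mb.Append(true)
	mb.KeyBuilder().(*array.StringBuilder).Append("key1")
	mb.ItemBuilder().(*array.Int32Builder).Append(100)
	rec := b.NewRecordBatch()
	defer rec.Release()

	// Write the record batch to an in-memory Parquet file.
	var buf bytes.Buffer
	w, err := pqarrow.NewFileWriter(schema, &buf, nil, pqarrow.DefaultWriterProps())
	if err != nil {
		log.Fatal(err)
	}
	if err := w.Write(rec); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil {
		log.Fatal(err)
	}

	pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
	if err != nil {
		log.Fatal(err)
	}
	defer pf.Close()

	fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
	if err != nil {
		log.Fatal(err)
	}

	// Leaf columns are 0=id, 1=properties.key, 2=properties.value.
	// Selecting only the key leaf (index 1) is the case that used to panic with
	// "arrow/array: map array child array should have two fields".
	rr, err := fr.GetRecordReader(context.Background(), []int{0, 1}, nil)
	if err != nil {
		log.Fatal(err)
	}
	defer rr.Release()

	for rr.Next() {
		fmt.Println(rr.RecordBatch())
	}
}
```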
### What changes are included in this PR?
In pqarrow/file_reader.go, leaf filtering is disabled when reading a
map's key-value struct. This ensures both the key and value columns
are always read together, maintaining the required 2-field structure for
the map array.
### Are these changes tested?
Yes, a test case is added for the change.
### Are there any user-facing changes?
This only affects reading map types with column filter selection, ensuring
correctness. The only user-facing change is that a failure mode has been eliminated.
---
parquet/pqarrow/file_reader.go | 17 ++++-
parquet/pqarrow/file_reader_test.go | 124 ++++++++++++++++++++++++++++++++++++
2 files changed, 140 insertions(+), 1 deletion(-)
diff --git a/parquet/pqarrow/file_reader.go b/parquet/pqarrow/file_reader.go
index dbda55ab..6ca3c2fd 100644
--- a/parquet/pqarrow/file_reader.go
+++ b/parquet/pqarrow/file_reader.go
@@ -606,7 +606,22 @@ func (fr *FileReader) getReader(ctx context.Context, field *SchemaField, arrowFi
out = newStructReader(&rctx, &filtered, field.LevelInfo, childReaders, fr.Props)
case arrow.LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
child := field.Children[0]
- childReader, err := fr.getReader(ctx, &child, *child.Field)
+
+ // For maps, we must read BOTH key and value columns, regardless of column selection.
+ // Map arrays require a complete key-value struct with exactly 2 fields.
+ // Disable leaf filtering when reading a map's key-value struct.
+ childCtx := ctx
+ if _, isMap := arrowField.Type.(*arrow.MapType); isMap {
+ childCtx = context.WithValue(ctx, rdrCtxKey{}, readerCtx{
+ rdr: rctx.rdr,
+ mem: rctx.mem,
+ colFactory: rctx.colFactory,
+ filterLeaves: false, // Don't filter leaves for map key-value struct
+ includedLeaves: rctx.includedLeaves,
+ })
+ }
+
+ childReader, err := fr.getReader(childCtx, &child, *child.Field)
if err != nil {
return nil, err
}
diff --git a/parquet/pqarrow/file_reader_test.go b/parquet/pqarrow/file_reader_test.go
index 104fab54..aeb32086 100644
--- a/parquet/pqarrow/file_reader_test.go
+++ b/parquet/pqarrow/file_reader_test.go
@@ -652,3 +652,127 @@ func TestPartialStructColumnRead(t *testing.T) {
cArr := arr.Field(1).(*array.Float64)
require.Equal(t, 3.0, cArr.Value(0))
}
+
+// TestMapColumnWithFilters tests that map columns can be read correctly when
+// using column filtering. This is a regression test for a bug where reading
+// a map column with filters would fail because the code tried to filter the
+// individual key and value columns of the map's internal key-value struct.
+// Maps require both key and value columns to be read together, so leaf filtering
+// must be disabled for map types.
+func TestMapColumnWithFilters(t *testing.T) {
+ mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(t, 0)
+
+ // Create schema with a map column and other columns
+ schema := arrow.NewSchema([]arrow.Field{
+ {Name: "id", Type: arrow.PrimitiveTypes.Int64},
+ {Name: "properties", Type:
arrow.MapOf(arrow.BinaryTypes.String, arrow.PrimitiveTypes.Int32)},
+ {Name: "value", Type: arrow.PrimitiveTypes.Float64},
+ }, nil)
+
+ // Build test data
+ b := array.NewRecordBuilder(mem, schema)
+ defer b.Release()
+
+ // Build ID column
+ idBuilder := b.Field(0).(*array.Int64Builder)
+ idBuilder.AppendValues([]int64{1, 2}, nil)
+
+ // Build map column
+ mapBuilder := b.Field(1).(*array.MapBuilder)
+ kb := mapBuilder.KeyBuilder().(*array.StringBuilder)
+ vb := mapBuilder.ItemBuilder().(*array.Int32Builder)
+
+ // First map: {"key1": 100, "key2": 200}
+ mapBuilder.Append(true)
+ kb.AppendValues([]string{"key1", "key2"}, nil)
+ vb.AppendValues([]int32{100, 200}, nil)
+
+ // Second map: {"key3": 300}
+ mapBuilder.Append(true)
+ kb.AppendValues([]string{"key3"}, nil)
+ vb.AppendValues([]int32{300}, nil)
+
+ // Build value column
+ valueBuilder := b.Field(2).(*array.Float64Builder)
+ valueBuilder.AppendValues([]float64{1.5, 2.5}, nil)
+
+ rec := b.NewRecordBatch()
+ defer rec.Release()
+
+ // Write to parquet
+ buf := new(bytes.Buffer)
+ writer, err := pqarrow.NewFileWriter(schema, buf, nil, pqarrow.DefaultWriterProps())
+ require.NoError(t, err)
+ require.NoError(t, writer.Write(rec))
+ require.NoError(t, writer.Close())
+
+ // Read back with column filtering (only read id and properties, skip value)
+ pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
+ require.NoError(t, err)
+ defer pf.Close()
+
+ fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
+ require.NoError(t, err)
+
+ // Read only columns for the first two fields (id and map)
+ // Column 0 = id
+ // Column 1 = map.key
+ // Column 2 = map.value
+ // Column 3 = value (skipped)
+ // This exercises the code path where maps need both key and value columns
+ // even when column filtering is active
+ ctx := context.Background()
+ colIndices := []int{0, 1, 2} // id, map.key, map.value
+ rr, err := fr.GetRecordReader(ctx, colIndices, nil)
+ require.NoError(t, err)
+ require.NotNil(t, rr)
+ defer rr.Release()
+
+ // Read the record batch
+ require.True(t, rr.Next())
+ result := rr.RecordBatch()
+ // Note: Don't release result manually - the record reader owns it
+
+ // Verify schema - should have only 2 fields (id and properties)
+ require.Equal(t, 2, int(result.NumCols()))
+ require.Equal(t, "id", result.Schema().Field(0).Name)
+ require.Equal(t, "properties", result.Schema().Field(1).Name)
+
+ // Verify ID column
+ idCol := result.Column(0).(*array.Int64)
+ require.Equal(t, int64(1), idCol.Value(0))
+ require.Equal(t, int64(2), idCol.Value(1))
+
+ // Verify map column - this is the critical test for the fix
+ // The key test is that reading succeeds without panic
+ mapCol := result.Column(1).(*array.Map)
+ require.Equal(t, 2, mapCol.Len())
+
+ // Verify the map has the correct structure (keys and items arrays exist)
+ keys := mapCol.Keys().(*array.String)
+ vals := mapCol.Items().(*array.Int32)
+ require.NotNil(t, keys)
+ require.NotNil(t, vals)
+
+ // Verify total number of key-value pairs across all maps
+ require.Equal(t, 3, keys.Len()) // Total: 2 from first map + 1 from second map
+ require.Equal(t, 3, vals.Len())
+
+ // Verify the map offsets are correct
+ start0, end0 := mapCol.ValueOffsets(0)
+ require.Equal(t, int64(0), start0)
+ require.Equal(t, int64(2), end0) // First map has entries from 0 to 2 (2 entries)
+
+ start1, end1 := mapCol.ValueOffsets(1)
+ require.Equal(t, int64(2), start1)
+ require.Equal(t, int64(3), end1) // Second map has entries from 2 to 3 (1 entry)
+
+ // Verify key-value pairs
+ require.Equal(t, "key1", keys.Value(0))
+ require.Equal(t, int32(100), vals.Value(0))
+ require.Equal(t, "key2", keys.Value(1))
+ require.Equal(t, int32(200), vals.Value(1))
+ require.Equal(t, "key3", keys.Value(2))
+ require.Equal(t, int32(300), vals.Value(2))
+}