This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new ef9e9859 feat(parquet/pqarrow): support writing LARGE_LIST types (#838)
ef9e9859 is described below

commit ef9e9859a2050d6625bf8ea347813b072fce66b5
Author: David Li <[email protected]>
AuthorDate: Wed Jun 10 12:57:48 2026 -0700

    feat(parquet/pqarrow): support writing LARGE_LIST types (#838)
    
    ### Rationale for this change
    
    pqarrow currently cannot write Arrow LARGE_LIST columns to a Parquet file.
    
    ### What changes are included in this PR?
    
    Implement support in pqarrow for writing LARGE_LIST columns and for reading
    them back with 64-bit offsets.
    
    ### Are these changes tested?
    
    Yes, with new round-trip, stored-schema, and path-builder regression tests.
    
    ### Are there any user-facing changes?
    
    No
    
    Assisted-by: Claude Opus 4.6 <[email protected]>
---
 parquet/file/level_conversion.go     | 11 ++--
 parquet/pqarrow/column_readers.go    | 31 +++++++++---
 parquet/pqarrow/encode_arrow_test.go | 97 +++++++++++++++++++++++++++++++++++-
 parquet/pqarrow/file_reader.go       |  4 +-
 parquet/pqarrow/file_writer.go       |  7 +++
 parquet/pqarrow/path_builder.go      | 14 ++++++
 parquet/pqarrow/path_builder_test.go | 34 +++++++++++++
 parquet/pqarrow/schema.go            |  4 ++
 8 files changed, 188 insertions(+), 14 deletions(-)

diff --git a/parquet/file/level_conversion.go b/parquet/file/level_conversion.go
index 5e15dd97..8082f774 100644
--- a/parquet/file/level_conversion.go
+++ b/parquet/file/level_conversion.go
@@ -19,7 +19,6 @@ package file
 import (
        "errors"
        "fmt"
-       "math"
        "math/bits"
        "unsafe"
 
@@ -186,7 +185,9 @@ func DefLevelsToBitmap(defLevels []int16, info LevelInfo, 
out *ValidityBitmapInp
 
 // DefRepLevelsToListInfo takes in the definition and repetition levels in 
order to populate the validity bitmap
 // and properly handle nested lists and update the offsets for them.
-func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out 
*ValidityBitmapInputOutput, offsets []int32) error {
+func DefRepLevelsToListInfo[OffsetType int32 | int64](defLevels, repLevels 
[]int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []OffsetType) 
error {
+       bits := unsafe.Sizeof(OffsetType(0)) * 8
+       maxOffset := OffsetType(uint64(1)<<(bits-1) - 1)
        var wr utils.BitmapWriter
        if out.ValidBits != nil {
                wr = utils.NewFirstTimeBitmapWriter(out.ValidBits, 
out.ValidBitsOffset, out.ReadUpperBound)
@@ -203,7 +204,7 @@ func DefRepLevelsToListInfo(defLevels, repLevels []int16, 
info LevelInfo, out *V
                        // continuation of an existing list.
                        // offsets can be null for structs with repeated 
children
                        if offsetPos < len(offsets) {
-                               if offsets[offsetPos] == math.MaxInt32 {
+                               if offsets[offsetPos] == maxOffset {
                                        return errors.New("list index overflow")
                                }
                                offsets[offsetPos]++
@@ -223,7 +224,7 @@ func DefRepLevelsToListInfo(defLevels, repLevels []int16, 
info LevelInfo, out *V
                                // cumulative and subtract when validating 
fixed size lists
                                offsets[offsetPos] = offsets[offsetPos-1]
                                if defLevels[idx] >= info.DefLevel {
-                                       if offsets[offsetPos] == math.MaxInt32 {
+                                       if offsets[offsetPos] == maxOffset {
                                                return errors.New("list index 
overflow")
                                        }
                                        offsets[offsetPos]++
@@ -261,5 +262,5 @@ func DefRepLevelsToListInfo(defLevels, repLevels []int16, 
info LevelInfo, out *V
 func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out 
*ValidityBitmapInputOutput) error {
        info.RepLevel++
        info.DefLevel++
-       return DefRepLevelsToListInfo(defLevels, repLevels, info, out, nil)
+       return DefRepLevelsToListInfo[int32](defLevels, repLevels, info, out, 
nil)
 }
diff --git a/parquet/pqarrow/column_readers.go 
b/parquet/pqarrow/column_readers.go
index 1add79f6..2d571038 100644
--- a/parquet/pqarrow/column_readers.go
+++ b/parquet/pqarrow/column_readers.go
@@ -452,12 +452,26 @@ func (lr *listReader) BuildArray(lenBound int64) 
(*arrow.Chunked, error) {
                validityIO.ValidBits = validityBuffer.Bytes()
        }
        offsetsBuffer := memory.NewResizableBuffer(lr.rctx.mem)
-       offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
+       if lr.field.Type.ID() == arrow.LARGE_LIST {
+               
offsetsBuffer.Resize(arrow.Int64Traits.BytesRequired(int(lenBound) + 1))
+       } else {
+               
offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
+       }
        defer offsetsBuffer.Release()
 
-       offsetData := arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
-       if err = file.DefRepLevelsToListInfo(defLevels, repLevels, lr.info, 
&validityIO, offsetData); err != nil {
-               return nil, err
+       var boundedLen int64
+       if lr.field.Type.ID() == arrow.LARGE_LIST {
+               offsetData := 
arrow.Int64Traits.CastFromBytes(offsetsBuffer.Bytes())
+               if err = file.DefRepLevelsToListInfo(defLevels, repLevels, 
lr.info, &validityIO, offsetData); err != nil {
+                       return nil, err
+               }
+               boundedLen = offsetData[int(validityIO.Read)]
+       } else {
+               offsetData := 
arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
+               if err = file.DefRepLevelsToListInfo(defLevels, repLevels, 
lr.info, &validityIO, offsetData); err != nil {
+                       return nil, err
+               }
+               boundedLen = int64(offsetData[int(validityIO.Read)])
        }
 
        // if the parent (itemRdr) has nulls and is a nested type like list
@@ -465,14 +479,18 @@ func (lr *listReader) BuildArray(lenBound int64) 
(*arrow.Chunked, error) {
        // definition levels when building out the bitmap. So the upper bound
        // to make sure we have the space for is the worst case scenario,
        // the upper bound is the value of the last offset + the nullcount
-       arr, err := 
lr.itemRdr.BuildArray(int64(offsetData[int(validityIO.Read)]) + 
validityIO.NullCount)
+       arr, err := lr.itemRdr.BuildArray(boundedLen)
        if err != nil {
                return nil, err
        }
        defer arr.Release()
 
        // resize to actual number of elems returned
-       
offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
+       if lr.field.Type.ID() == arrow.LARGE_LIST {
+               
offsetsBuffer.Resize(arrow.Int64Traits.BytesRequired(int(validityIO.Read) + 1))
+       } else {
+               
offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
+       }
        if validityBuffer != nil {
                
validityBuffer.Resize(int(bitutil.BytesForBits(validityIO.Read)))
        }
@@ -492,6 +510,7 @@ func (lr *listReader) BuildArray(lenBound int64) 
(*arrow.Chunked, error) {
        defer data.Release()
        if lr.field.Type.ID() == arrow.FIXED_SIZE_LIST {
                defer data.Buffers()[1].Release()
+               offsetData := 
arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
                listSize := lr.field.Type.(*arrow.FixedSizeListType).Len()
                for x := 1; x < data.Len(); x++ {
                        size := offsetData[x] - offsetData[x-1]
diff --git a/parquet/pqarrow/encode_arrow_test.go 
b/parquet/pqarrow/encode_arrow_test.go
index 10ff5074..95a5b3e5 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -1994,6 +1994,43 @@ func (ps *ParquetIOTestSuite) 
TestFixedSizeListNullableElements() {
        ps.roundTripTable(mem, tbl, true)
 }
 
+// Regression test for https://github.com/apache/arrow-go/issues/834
+func (ps *ParquetIOTestSuite) TestLargeListRoundTrip() {
+       mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+       defer mem.AssertSize(ps.T(), 0)
+
+       elemField := arrow.Field{
+               Name:     "element",
+               Type:     arrow.PrimitiveTypes.Int32,
+               Nullable: true,
+               Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, 
[]string{"-1"}),
+       }
+       bldr := array.NewLargeListBuilderWithField(mem, elemField)
+       defer bldr.Release()
+
+       vb := bldr.ValueBuilder().(*array.Int32Builder)
+       bldr.Append(true)
+       vb.AppendValues([]int32{1, 2, 3}, nil)
+       bldr.AppendNull()
+       bldr.Append(true) // empty list
+       bldr.Append(true)
+       vb.AppendValues([]int32{4, 5}, nil)
+       arr := bldr.NewLargeListArray()
+       defer arr.Release()
+
+       field := arrow.Field{
+               Name:     "x",
+               Type:     arr.DataType(),
+               Nullable: true,
+               Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, 
[]string{"-1"}),
+       }
+
+       tbl := array.NewTableFromSlice(arrow.NewSchema([]arrow.Field{field}, 
nil), [][]arrow.Array{{arr}})
+       defer tbl.Release()
+
+       ps.roundTripTable(mem, tbl, true)
+}
+
 func (ps *ParquetIOTestSuite) TestNull() {
        mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
        defer mem.AssertSize(ps.T(), 0)
@@ -2208,6 +2245,64 @@ func TestListOfStructWithEmptyListStoreSchema(t 
*testing.T) {
                "ARROW:schema element name must match the Parquet column path 
segment")
 }
 
+// Regression test for https://github.com/apache/arrow-go/issues/834
+func TestLargeListStoreSchema(t *testing.T) {
+       mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+       defer mem.AssertSize(t, 0)
+
+       sc := arrow.NewSchema([]arrow.Field{
+               {Name: "x", Type: 
arrow.LargeListOf(arrow.PrimitiveTypes.Int32), Nullable: true},
+       }, nil)
+
+       b := array.NewRecordBuilder(mem, sc)
+       defer b.Release()
+
+       lb := b.Field(0).(*array.LargeListBuilder)
+       vb := lb.ValueBuilder().(*array.Int32Builder)
+       lb.Append(true)
+       vb.AppendValues([]int32{1, 2, 3}, nil)
+       lb.AppendNull()
+       lb.Append(true) // empty
+       lb.Append(true)
+       vb.AppendValues([]int32{4, 5}, nil)
+       rec := b.NewRecordBatch()
+       defer rec.Release()
+
+       var buf bytes.Buffer
+       props := parquet.NewWriterProperties()
+       arrowProps := 
pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema())
+
+       pw, err := pqarrow.NewFileWriter(sc, &buf, props, arrowProps)
+       require.NoError(t, err)
+       require.NoError(t, pw.Write(rec))
+       require.NoError(t, pw.Close())
+
+       pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
+       require.NoError(t, err)
+       defer pf.Close()
+
+       fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
+       require.NoError(t, err)
+
+       tbl, err := fr.ReadTable(context.Background())
+       require.NoError(t, err)
+       defer tbl.Release()
+
+       require.EqualValues(t, 4, tbl.NumRows())
+       require.Equal(t, arrow.LARGE_LIST, tbl.Schema().Field(0).Type.ID())
+       require.Equal(t, arrow.Field{
+               Name: "x",
+               Type: arrow.LargeListOfField(arrow.Field{
+                       Name:     "element",
+                       Type:     arrow.PrimitiveTypes.Int32,
+                       Nullable: true,
+                       Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}),
+               }),
+               Nullable: true,
+               Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"}, 
[]string{"-1"}),
+       }, tbl.Schema().Field(0))
+}
+
 func TestParquetArrowIO(t *testing.T) {
        suite.Run(t, new(ParquetIOTestSuite))
 }
@@ -2648,7 +2743,7 @@ func TestReadWriteShreddedVariant(t *testing.T) {
                        {"event_type": "text", "event_ts": "1970-01-21 
00:29:54.954163Z"},
                        {"event_type": "list", "event_ts": "1970-01-21 
00:29:54.240241Z"},
                        "text",
-                       {"event_type": "object", "event_ts": "1970-01-21 
00:29:54.146402Z"},                    
+                       {"event_type": "object", "event_ts": "1970-01-21 
00:29:54.146402Z"},
                        null
                ]`
 
diff --git a/parquet/pqarrow/file_reader.go b/parquet/pqarrow/file_reader.go
index b4e46008..ed6a2998 100644
--- a/parquet/pqarrow/file_reader.go
+++ b/parquet/pqarrow/file_reader.go
@@ -606,7 +606,7 @@ func (fr *FileReader) getReader(ctx context.Context, field 
*SchemaField, arrowFi
                        Metadata: arrowField.Metadata, Type: 
arrow.StructOf(childFields...),
                }
                out = newStructReader(&rctx, &filtered, field.LevelInfo, 
childReaders, fr.Props)
-       case arrow.LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
+       case arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
                child := field.Children[0]
 
                // For maps, we must read BOTH key and value columns, 
regardless of column selection.
@@ -638,7 +638,7 @@ func (fr *FileReader) getReader(ctx context.Context, field 
*SchemaField, arrowFi
                                arrowField.Type = 
arrow.ListOf(childReader.Field().Type)
                        }
                        out = newListReader(&rctx, &arrowField, 
field.LevelInfo, childReader, fr.Props)
-               case *arrow.ListType:
+               case *arrow.ListType, *arrow.LargeListType:
                        out = newListReader(&rctx, &arrowField, 
field.LevelInfo, childReader, fr.Props)
                case *arrow.FixedSizeListType:
                        out = newFixedSizeListReader(&rctx, &arrowField, 
field.LevelInfo, childReader, fr.Props)
diff --git a/parquet/pqarrow/file_writer.go b/parquet/pqarrow/file_writer.go
index 442ec8b4..827befaa 100644
--- a/parquet/pqarrow/file_writer.go
+++ b/parquet/pqarrow/file_writer.go
@@ -47,6 +47,13 @@ func normalizeFieldForParquet(f arrow.Field) (arrow.Field, 
error) {
                }
                elem.Name = "element"
                return arrow.Field{Name: f.Name, Type: arrow.ListOfField(elem), 
Nullable: f.Nullable, Metadata: f.Metadata}, nil
+       case *arrow.LargeListType:
+               elem, err := normalizeFieldForParquet(dt.ElemField())
+               if err != nil {
+                       return arrow.Field{}, err
+               }
+               elem.Name = "element"
+               return arrow.Field{Name: f.Name, Type: 
arrow.LargeListOfField(elem), Nullable: f.Nullable, Metadata: f.Metadata}, nil
        case *arrow.FixedSizeListType:
                elem, err := normalizeFieldForParquet(dt.ElemField())
                if err != nil {
diff --git a/parquet/pqarrow/path_builder.go b/parquet/pqarrow/path_builder.go
index a03e3187..991710b3 100644
--- a/parquet/pqarrow/path_builder.go
+++ b/parquet/pqarrow/path_builder.go
@@ -57,6 +57,14 @@ func (v varRangeSelector) GetRange(idx int64) elemRange {
        return elemRange{int64(v.offsets[idx]), int64(v.offsets[idx+1])}
 }
 
+type largeVarRangeSelector struct {
+       offsets []int64
+}
+
+func (v largeVarRangeSelector) GetRange(idx int64) elemRange {
+       return elemRange{v.offsets[idx], v.offsets[idx+1]}
+}
+
 type fixedSizeRangeSelector struct {
        listSize int32
 }
@@ -405,6 +413,12 @@ func (p *pathBuilder) Visit(arr arrow.Array) error {
                        varRangeSelector{larr.Offsets()[larr.Data().Offset():]},
                        -1, // defLevelIfEmpty = maxDefLevel - 1 (after all 
increments)
                        larr.ListValues())
+       case arrow.LARGE_LIST:
+               larr := arr.(*array.LargeList)
+               return p.visitListLike(arr,
+                       
largeVarRangeSelector{larr.Offsets()[larr.Data().Offset():]},
+                       -1, // defLevelIfEmpty = maxDefLevel - 1 (after all 
increments)
+                       larr.ListValues())
        case arrow.FIXED_SIZE_LIST:
                larr := arr.(*array.FixedSizeList)
                listSize := larr.DataType().(*arrow.FixedSizeListType).Len()
diff --git a/parquet/pqarrow/path_builder_test.go 
b/parquet/pqarrow/path_builder_test.go
index df548bcd..d736ee90 100644
--- a/parquet/pqarrow/path_builder_test.go
+++ b/parquet/pqarrow/path_builder_test.go
@@ -675,3 +675,37 @@ func TestPrimitiveNonNullable(t *testing.T) {
        assert.EqualValues(t, 0, result.postListVisitedElems[0].start)
        assert.EqualValues(t, 4, result.postListVisitedElems[0].end)
 }
+
+// Regression test for https://github.com/apache/arrow-go/issues/834
+func TestNullableLargeListSomeNullEntriesSomeNullLists(t *testing.T) {
+       bldr := array.NewLargeListBuilder(memory.DefaultAllocator, 
arrow.PrimitiveTypes.Int64)
+       defer bldr.Release()
+
+       vb := bldr.ValueBuilder().(*array.Int64Builder)
+
+       bldr.AppendNull()
+       bldr.Append(true)
+       vb.AppendValues([]int64{1, 2, 3}, nil)
+       bldr.Append(true)
+       bldr.Append(true)
+       bldr.AppendNull()
+       bldr.AppendNull()
+       bldr.Append(true)
+       vb.AppendValues([]int64{4, 5}, nil)
+       bldr.Append(true)
+       vb.AppendNull()
+
+       arr := bldr.NewLargeListArray()
+       defer arr.Release()
+
+       mp, err := newMultipathLevelBuilder(arr, true)
+       require.NoError(t, err)
+       defer mp.Release()
+
+       ctx := arrowCtxFromContext(NewArrowWriteContext(context.Background(), 
nil))
+       result, err := mp.write(0, ctx)
+       require.NoError(t, err)
+
+       assert.Equal(t, []int16{0, 3, 3, 3, 1, 1, 0, 0, 3, 3, 2}, 
result.defLevels)
+       assert.Equal(t, []int16{0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0}, 
result.repLevels)
+}
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index cfa0574b..66db50e1 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -1058,6 +1058,10 @@ func getNestedFactory(origin, inferred arrow.DataType) 
func(fieldList []arrow.Fi
                        return func(list []arrow.Field) arrow.DataType {
                                return arrow.FixedSizeListOfField(sz, list[0])
                        }
+               case arrow.LARGE_LIST:
+                       return func(list []arrow.Field) arrow.DataType {
+                               return arrow.LargeListOfField(list[0])
+                       }
                }
        case arrow.MAP:
                if origin.ID() == arrow.MAP {

Reply via email to