This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new ef9e9859 feat(parquet/pqarrow): support writing LARGE_LIST types (#838)
ef9e9859 is described below
commit ef9e9859a2050d6625bf8ea347813b072fce66b5
Author: David Li <[email protected]>
AuthorDate: Wed Jun 10 12:57:48 2026 -0700
feat(parquet/pqarrow): support writing LARGE_LIST types (#838)
### Rationale for this change
Arrow LARGE_LIST arrays (lists with 64-bit offsets) cannot currently be written to a Parquet file.
### What changes are included in this PR?
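Implement LARGE_LIST support in pqarrow: the write path (path builder and field normalization), the read path (list reader with 64-bit offsets), and schema reconstruction from the stored Arrow schema.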
### Are these changes tested?
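Yes, with round-trip tests in `encode_arrow_test.go` and a definition/repetition-level test in `path_builder_test.go`.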
### Are there any user-facing changes?
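No changes to existing behavior. Note that the exported `file.DefRepLevelsToListInfo` is now generic over the offset type (`int32 | int64`): calls that pass a typed offsets slice compile unchanged, but calls that pass a bare `nil` must instantiate it explicitly, e.g. `DefRepLevelsToListInfo[int32](...)`.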
Assisted-by: Claude Opus 4.6 <[email protected]>
---
parquet/file/level_conversion.go | 11 ++--
parquet/pqarrow/column_readers.go | 31 +++++++++---
parquet/pqarrow/encode_arrow_test.go | 97 +++++++++++++++++++++++++++++++++++-
parquet/pqarrow/file_reader.go | 4 +-
parquet/pqarrow/file_writer.go | 7 +++
parquet/pqarrow/path_builder.go | 14 ++++++
parquet/pqarrow/path_builder_test.go | 34 +++++++++++++
parquet/pqarrow/schema.go | 4 ++
8 files changed, 188 insertions(+), 14 deletions(-)
diff --git a/parquet/file/level_conversion.go b/parquet/file/level_conversion.go
index 5e15dd97..8082f774 100644
--- a/parquet/file/level_conversion.go
+++ b/parquet/file/level_conversion.go
@@ -19,7 +19,6 @@ package file
import (
"errors"
"fmt"
- "math"
"math/bits"
"unsafe"
@@ -186,7 +185,9 @@ func DefLevelsToBitmap(defLevels []int16, info LevelInfo,
out *ValidityBitmapInp
// DefRepLevelsToListInfo takes in the definition and repetition levels in
order to populate the validity bitmap
// and properly handle nested lists and update the offsets for them.
-func DefRepLevelsToListInfo(defLevels, repLevels []int16, info LevelInfo, out
*ValidityBitmapInputOutput, offsets []int32) error {
+func DefRepLevelsToListInfo[OffsetType int32 | int64](defLevels, repLevels
[]int16, info LevelInfo, out *ValidityBitmapInputOutput, offsets []OffsetType)
error {
+ bits := unsafe.Sizeof(OffsetType(0)) * 8
+ maxOffset := OffsetType(uint64(1)<<(bits-1) - 1)
var wr utils.BitmapWriter
if out.ValidBits != nil {
wr = utils.NewFirstTimeBitmapWriter(out.ValidBits,
out.ValidBitsOffset, out.ReadUpperBound)
@@ -203,7 +204,7 @@ func DefRepLevelsToListInfo(defLevels, repLevels []int16,
info LevelInfo, out *V
// continuation of an existing list.
// offsets can be null for structs with repeated
children
if offsetPos < len(offsets) {
- if offsets[offsetPos] == math.MaxInt32 {
+ if offsets[offsetPos] == maxOffset {
return errors.New("list index overflow")
}
offsets[offsetPos]++
@@ -223,7 +224,7 @@ func DefRepLevelsToListInfo(defLevels, repLevels []int16,
info LevelInfo, out *V
// cumulative and subtract when validating
fixed size lists
offsets[offsetPos] = offsets[offsetPos-1]
if defLevels[idx] >= info.DefLevel {
- if offsets[offsetPos] == math.MaxInt32 {
+ if offsets[offsetPos] == maxOffset {
return errors.New("list index
overflow")
}
offsets[offsetPos]++
@@ -261,5 +262,5 @@ func DefRepLevelsToListInfo(defLevels, repLevels []int16,
info LevelInfo, out *V
func DefRepLevelsToBitmap(defLevels, repLevels []int16, info LevelInfo, out
*ValidityBitmapInputOutput) error {
info.RepLevel++
info.DefLevel++
- return DefRepLevelsToListInfo(defLevels, repLevels, info, out, nil)
+ return DefRepLevelsToListInfo[int32](defLevels, repLevels, info, out,
nil)
}
diff --git a/parquet/pqarrow/column_readers.go
b/parquet/pqarrow/column_readers.go
index 1add79f6..2d571038 100644
--- a/parquet/pqarrow/column_readers.go
+++ b/parquet/pqarrow/column_readers.go
@@ -452,12 +452,26 @@ func (lr *listReader) BuildArray(lenBound int64)
(*arrow.Chunked, error) {
validityIO.ValidBits = validityBuffer.Bytes()
}
offsetsBuffer := memory.NewResizableBuffer(lr.rctx.mem)
- offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
+ if lr.field.Type.ID() == arrow.LARGE_LIST {
+
offsetsBuffer.Resize(arrow.Int64Traits.BytesRequired(int(lenBound) + 1))
+ } else {
+
offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(lenBound) + 1))
+ }
defer offsetsBuffer.Release()
- offsetData := arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
- if err = file.DefRepLevelsToListInfo(defLevels, repLevels, lr.info,
&validityIO, offsetData); err != nil {
- return nil, err
+ var boundedLen int64
+ if lr.field.Type.ID() == arrow.LARGE_LIST {
+ offsetData :=
arrow.Int64Traits.CastFromBytes(offsetsBuffer.Bytes())
+ if err = file.DefRepLevelsToListInfo(defLevels, repLevels,
lr.info, &validityIO, offsetData); err != nil {
+ return nil, err
+ }
+ boundedLen = offsetData[int(validityIO.Read)]
+ } else {
+ offsetData :=
arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
+ if err = file.DefRepLevelsToListInfo(defLevels, repLevels,
lr.info, &validityIO, offsetData); err != nil {
+ return nil, err
+ }
+ boundedLen = int64(offsetData[int(validityIO.Read)])
}
// if the parent (itemRdr) has nulls and is a nested type like list
@@ -465,14 +479,18 @@ func (lr *listReader) BuildArray(lenBound int64)
(*arrow.Chunked, error) {
// definition levels when building out the bitmap. So the upper bound
// to make sure we have the space for is the worst case scenario,
// the upper bound is the value of the last offset + the nullcount
- arr, err :=
lr.itemRdr.BuildArray(int64(offsetData[int(validityIO.Read)]) +
validityIO.NullCount)
+ arr, err := lr.itemRdr.BuildArray(boundedLen)
if err != nil {
return nil, err
}
defer arr.Release()
// resize to actual number of elems returned
-
offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
+ if lr.field.Type.ID() == arrow.LARGE_LIST {
+
offsetsBuffer.Resize(arrow.Int64Traits.BytesRequired(int(validityIO.Read) + 1))
+ } else {
+
offsetsBuffer.Resize(arrow.Int32Traits.BytesRequired(int(validityIO.Read) + 1))
+ }
if validityBuffer != nil {
validityBuffer.Resize(int(bitutil.BytesForBits(validityIO.Read)))
}
@@ -492,6 +510,7 @@ func (lr *listReader) BuildArray(lenBound int64)
(*arrow.Chunked, error) {
defer data.Release()
if lr.field.Type.ID() == arrow.FIXED_SIZE_LIST {
defer data.Buffers()[1].Release()
+ offsetData :=
arrow.Int32Traits.CastFromBytes(offsetsBuffer.Bytes())
listSize := lr.field.Type.(*arrow.FixedSizeListType).Len()
for x := 1; x < data.Len(); x++ {
size := offsetData[x] - offsetData[x-1]
diff --git a/parquet/pqarrow/encode_arrow_test.go
b/parquet/pqarrow/encode_arrow_test.go
index 10ff5074..95a5b3e5 100644
--- a/parquet/pqarrow/encode_arrow_test.go
+++ b/parquet/pqarrow/encode_arrow_test.go
@@ -1994,6 +1994,43 @@ func (ps *ParquetIOTestSuite)
TestFixedSizeListNullableElements() {
ps.roundTripTable(mem, tbl, true)
}
+// Regression test for https://github.com/apache/arrow-go/issues/834
+func (ps *ParquetIOTestSuite) TestLargeListRoundTrip() {
+ mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(ps.T(), 0)
+
+ elemField := arrow.Field{
+ Name: "element",
+ Type: arrow.PrimitiveTypes.Int32,
+ Nullable: true,
+ Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"},
[]string{"-1"}),
+ }
+ bldr := array.NewLargeListBuilderWithField(mem, elemField)
+ defer bldr.Release()
+
+ vb := bldr.ValueBuilder().(*array.Int32Builder)
+ bldr.Append(true)
+ vb.AppendValues([]int32{1, 2, 3}, nil)
+ bldr.AppendNull()
+ bldr.Append(true) // empty list
+ bldr.Append(true)
+ vb.AppendValues([]int32{4, 5}, nil)
+ arr := bldr.NewLargeListArray()
+ defer arr.Release()
+
+ field := arrow.Field{
+ Name: "x",
+ Type: arr.DataType(),
+ Nullable: true,
+ Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"},
[]string{"-1"}),
+ }
+
+ tbl := array.NewTableFromSlice(arrow.NewSchema([]arrow.Field{field},
nil), [][]arrow.Array{{arr}})
+ defer tbl.Release()
+
+ ps.roundTripTable(mem, tbl, true)
+}
+
func (ps *ParquetIOTestSuite) TestNull() {
mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
defer mem.AssertSize(ps.T(), 0)
@@ -2208,6 +2245,64 @@ func TestListOfStructWithEmptyListStoreSchema(t
*testing.T) {
"ARROW:schema element name must match the Parquet column path
segment")
}
+// Regression test for https://github.com/apache/arrow-go/issues/834
+func TestLargeListStoreSchema(t *testing.T) {
+ mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+ defer mem.AssertSize(t, 0)
+
+ sc := arrow.NewSchema([]arrow.Field{
+ {Name: "x", Type:
arrow.LargeListOf(arrow.PrimitiveTypes.Int32), Nullable: true},
+ }, nil)
+
+ b := array.NewRecordBuilder(mem, sc)
+ defer b.Release()
+
+ lb := b.Field(0).(*array.LargeListBuilder)
+ vb := lb.ValueBuilder().(*array.Int32Builder)
+ lb.Append(true)
+ vb.AppendValues([]int32{1, 2, 3}, nil)
+ lb.AppendNull()
+ lb.Append(true) // empty
+ lb.Append(true)
+ vb.AppendValues([]int32{4, 5}, nil)
+ rec := b.NewRecordBatch()
+ defer rec.Release()
+
+ var buf bytes.Buffer
+ props := parquet.NewWriterProperties()
+ arrowProps :=
pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema())
+
+ pw, err := pqarrow.NewFileWriter(sc, &buf, props, arrowProps)
+ require.NoError(t, err)
+ require.NoError(t, pw.Write(rec))
+ require.NoError(t, pw.Close())
+
+ pf, err := file.NewParquetReader(bytes.NewReader(buf.Bytes()))
+ require.NoError(t, err)
+ defer pf.Close()
+
+ fr, err := pqarrow.NewFileReader(pf, pqarrow.ArrowReadProperties{}, mem)
+ require.NoError(t, err)
+
+ tbl, err := fr.ReadTable(context.Background())
+ require.NoError(t, err)
+ defer tbl.Release()
+
+ require.EqualValues(t, 4, tbl.NumRows())
+ require.Equal(t, arrow.LARGE_LIST, tbl.Schema().Field(0).Type.ID())
+ require.Equal(t, arrow.Field{
+ Name: "x",
+ Type: arrow.LargeListOfField(arrow.Field{
+ Name: "element",
+ Type: arrow.PrimitiveTypes.Int32,
+ Nullable: true,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"}),
+ }),
+ Nullable: true,
+ Metadata: arrow.NewMetadata([]string{"PARQUET:field_id"},
[]string{"-1"}),
+ }, tbl.Schema().Field(0))
+}
+
func TestParquetArrowIO(t *testing.T) {
suite.Run(t, new(ParquetIOTestSuite))
}
@@ -2648,7 +2743,7 @@ func TestReadWriteShreddedVariant(t *testing.T) {
{"event_type": "text", "event_ts": "1970-01-21
00:29:54.954163Z"},
{"event_type": "list", "event_ts": "1970-01-21
00:29:54.240241Z"},
"text",
- {"event_type": "object", "event_ts": "1970-01-21
00:29:54.146402Z"},
+ {"event_type": "object", "event_ts": "1970-01-21
00:29:54.146402Z"},
null
]`
diff --git a/parquet/pqarrow/file_reader.go b/parquet/pqarrow/file_reader.go
index b4e46008..ed6a2998 100644
--- a/parquet/pqarrow/file_reader.go
+++ b/parquet/pqarrow/file_reader.go
@@ -606,7 +606,7 @@ func (fr *FileReader) getReader(ctx context.Context, field
*SchemaField, arrowFi
Metadata: arrowField.Metadata, Type:
arrow.StructOf(childFields...),
}
out = newStructReader(&rctx, &filtered, field.LevelInfo,
childReaders, fr.Props)
- case arrow.LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
+ case arrow.LIST, arrow.LARGE_LIST, arrow.FIXED_SIZE_LIST, arrow.MAP:
child := field.Children[0]
// For maps, we must read BOTH key and value columns,
regardless of column selection.
@@ -638,7 +638,7 @@ func (fr *FileReader) getReader(ctx context.Context, field
*SchemaField, arrowFi
arrowField.Type =
arrow.ListOf(childReader.Field().Type)
}
out = newListReader(&rctx, &arrowField,
field.LevelInfo, childReader, fr.Props)
- case *arrow.ListType:
+ case *arrow.ListType, *arrow.LargeListType:
out = newListReader(&rctx, &arrowField,
field.LevelInfo, childReader, fr.Props)
case *arrow.FixedSizeListType:
out = newFixedSizeListReader(&rctx, &arrowField,
field.LevelInfo, childReader, fr.Props)
diff --git a/parquet/pqarrow/file_writer.go b/parquet/pqarrow/file_writer.go
index 442ec8b4..827befaa 100644
--- a/parquet/pqarrow/file_writer.go
+++ b/parquet/pqarrow/file_writer.go
@@ -47,6 +47,13 @@ func normalizeFieldForParquet(f arrow.Field) (arrow.Field,
error) {
}
elem.Name = "element"
return arrow.Field{Name: f.Name, Type: arrow.ListOfField(elem),
Nullable: f.Nullable, Metadata: f.Metadata}, nil
+ case *arrow.LargeListType:
+ elem, err := normalizeFieldForParquet(dt.ElemField())
+ if err != nil {
+ return arrow.Field{}, err
+ }
+ elem.Name = "element"
+ return arrow.Field{Name: f.Name, Type:
arrow.LargeListOfField(elem), Nullable: f.Nullable, Metadata: f.Metadata}, nil
case *arrow.FixedSizeListType:
elem, err := normalizeFieldForParquet(dt.ElemField())
if err != nil {
diff --git a/parquet/pqarrow/path_builder.go b/parquet/pqarrow/path_builder.go
index a03e3187..991710b3 100644
--- a/parquet/pqarrow/path_builder.go
+++ b/parquet/pqarrow/path_builder.go
@@ -57,6 +57,14 @@ func (v varRangeSelector) GetRange(idx int64) elemRange {
return elemRange{int64(v.offsets[idx]), int64(v.offsets[idx+1])}
}
+type largeVarRangeSelector struct {
+ offsets []int64
+}
+
+func (v largeVarRangeSelector) GetRange(idx int64) elemRange {
+ return elemRange{v.offsets[idx], v.offsets[idx+1]}
+}
+
type fixedSizeRangeSelector struct {
listSize int32
}
@@ -405,6 +413,12 @@ func (p *pathBuilder) Visit(arr arrow.Array) error {
varRangeSelector{larr.Offsets()[larr.Data().Offset():]},
-1, // defLevelIfEmpty = maxDefLevel - 1 (after all
increments)
larr.ListValues())
+ case arrow.LARGE_LIST:
+ larr := arr.(*array.LargeList)
+ return p.visitListLike(arr,
+
largeVarRangeSelector{larr.Offsets()[larr.Data().Offset():]},
+ -1, // defLevelIfEmpty = maxDefLevel - 1 (after all
increments)
+ larr.ListValues())
case arrow.FIXED_SIZE_LIST:
larr := arr.(*array.FixedSizeList)
listSize := larr.DataType().(*arrow.FixedSizeListType).Len()
diff --git a/parquet/pqarrow/path_builder_test.go
b/parquet/pqarrow/path_builder_test.go
index df548bcd..d736ee90 100644
--- a/parquet/pqarrow/path_builder_test.go
+++ b/parquet/pqarrow/path_builder_test.go
@@ -675,3 +675,37 @@ func TestPrimitiveNonNullable(t *testing.T) {
assert.EqualValues(t, 0, result.postListVisitedElems[0].start)
assert.EqualValues(t, 4, result.postListVisitedElems[0].end)
}
+
+// Regression test for https://github.com/apache/arrow-go/issues/834
+func TestNullableLargeListSomeNullEntriesSomeNullLists(t *testing.T) {
+ bldr := array.NewLargeListBuilder(memory.DefaultAllocator,
arrow.PrimitiveTypes.Int64)
+ defer bldr.Release()
+
+ vb := bldr.ValueBuilder().(*array.Int64Builder)
+
+ bldr.AppendNull()
+ bldr.Append(true)
+ vb.AppendValues([]int64{1, 2, 3}, nil)
+ bldr.Append(true)
+ bldr.Append(true)
+ bldr.AppendNull()
+ bldr.AppendNull()
+ bldr.Append(true)
+ vb.AppendValues([]int64{4, 5}, nil)
+ bldr.Append(true)
+ vb.AppendNull()
+
+ arr := bldr.NewLargeListArray()
+ defer arr.Release()
+
+ mp, err := newMultipathLevelBuilder(arr, true)
+ require.NoError(t, err)
+ defer mp.Release()
+
+ ctx := arrowCtxFromContext(NewArrowWriteContext(context.Background(),
nil))
+ result, err := mp.write(0, ctx)
+ require.NoError(t, err)
+
+ assert.Equal(t, []int16{0, 3, 3, 3, 1, 1, 0, 0, 3, 3, 2},
result.defLevels)
+ assert.Equal(t, []int16{0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0},
result.repLevels)
+}
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index cfa0574b..66db50e1 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -1058,6 +1058,10 @@ func getNestedFactory(origin, inferred arrow.DataType)
func(fieldList []arrow.Fi
return func(list []arrow.Field) arrow.DataType {
return arrow.FixedSizeListOfField(sz, list[0])
}
+ case arrow.LARGE_LIST:
+ return func(list []arrow.Field) arrow.DataType {
+ return arrow.LargeListOfField(list[0])
+ }
}
case arrow.MAP:
if origin.ID() == arrow.MAP {