This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 44534c1 fix(pqarrow): respect list element nullability during
conversion (#311)
44534c1 is described below
commit 44534c126ec274cd84b6ff7ae399ff710984ad5d
Author: Matt Topol <[email protected]>
AuthorDate: Wed Mar 12 10:30:35 2025 -0400
fix(pqarrow): respect list element nullability during conversion (#311)
### Rationale for this change
While working on iceberg-go I discovered a bug in how we converted Arrow
<--> Parquet schemas when dealing with the nullability of the element
for a list field.
### What changes are included in this PR?
Fixing the proper handling for nullability instead of blanket setting it
to nullable for elements of lists. Thus correctly propagating the
nullability.
### Are these changes tested?
Yes, unit tests are added.
### Are there any user-facing changes?
Parquet schemas that are being created from Arrow Schemas will now
properly respect the required/optional for elements of lists.
---
parquet/pqarrow/schema.go | 19 +++++--------------
parquet/pqarrow/schema_test.go | 23 ++++++++++++++++++++++-
2 files changed, 27 insertions(+), 15 deletions(-)
diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 8642a2e..416d59f 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -269,19 +269,14 @@ func fieldToNode(name string, field arrow.Field, props
*parquet.WriterProperties
case arrow.STRUCT:
return structToNode(field.Type.(*arrow.StructType), field.Name,
field.Nullable, props, arrprops)
case arrow.FIXED_SIZE_LIST, arrow.LIST:
- var elem arrow.DataType
- if lt, ok := field.Type.(*arrow.ListType); ok {
- elem = lt.Elem()
- } else {
- elem = field.Type.(*arrow.FixedSizeListType).Elem()
- }
+ elemField := field.Type.(arrow.ListLikeType).ElemField()
- child, err := fieldToNode(name, arrow.Field{Name: name, Type:
elem, Nullable: true}, props, arrprops)
+ child, err := fieldToNode(name, elemField, props, arrprops)
if err != nil {
return nil, err
}
- return schema.ListOf(child, repFromNullable(field.Nullable), -1)
+ return schema.ListOfWithName(name, child,
repFromNullable(field.Nullable), -1)
case arrow.DICTIONARY:
// parquet has no dictionary type, dictionary is encoding, not
schema level
dictType := field.Type.(*arrow.DictionaryType)
@@ -730,11 +725,7 @@ func listToSchemaField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *s
populateLeaf(colIndex, &itemField, currentLevels, ctx, out,
&out.Children[0])
}
- out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
- arrow.Field{Name: listNode.Name(),
- Type: out.Children[0].Field.Type,
- Metadata: out.Children[0].Field.Metadata,
- Nullable: true}),
+ out.Field = &arrow.Field{Name: n.Name(), Type:
arrow.ListOfField(*out.Children[0].Field),
Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
Metadata: createFieldMeta(int(n.FieldID()))}
out.LevelInfo = currentLevels
@@ -756,7 +747,7 @@ func groupToStructField(n *schema.GroupNode, currentLevels
file.LevelInfo, ctx *
}
out.Field = &arrow.Field{Name: n.Name(), Type:
arrow.StructOf(arrowFields...),
- Nullable: n.RepetitionType() == parquet.Repetitions.Optional,
Metadata: createFieldMeta(int(n.FieldID()))}
+ Nullable: n.RepetitionType() != parquet.Repetitions.Required,
Metadata: createFieldMeta(int(n.FieldID()))}
out.LevelInfo = currentLevels
return nil
}
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index 598386d..f075b46 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -423,7 +423,7 @@ func TestListStructBackwardCompatible(t *testing.T) {
arrsc, err := pqarrow.FromParquet(pqSchema, nil,
metadata.KeyValueMetadata{})
assert.NoError(t, err)
- assert.True(t, arrowSchema.Equal(arrsc))
+ assert.True(t, arrowSchema.Equal(arrsc), arrowSchema.String(),
arrsc.String())
}
// TestUnsupportedTypes tests the error message for unsupported types. This
test should be updated
@@ -451,3 +451,24 @@ func TestUnsupportedTypes(t *testing.T) {
})
}
}
+
+func TestProperListElementNullability(t *testing.T) {
+ arrSchema := arrow.NewSchema([]arrow.Field{
+ {Name: "qux", Type: arrow.ListOfField(
+ arrow.Field{
+ Name: "element",
+ Type: arrow.BinaryTypes.String,
+ Nullable: false,
+ Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}),
+ Nullable: false, Metadata:
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})},
+ }, nil)
+
+ pqSchema, err := pqarrow.ToParquet(arrSchema, nil,
pqarrow.DefaultWriterProps())
+ require.NoError(t, err)
+ assert.EqualValues(t, 1, pqSchema.Column(0).MaxDefinitionLevel())
+ assert.Equal(t, parquet.Repetitions.Required,
pqSchema.Column(0).SchemaNode().RepetitionType())
+
+ outSchema, err := pqarrow.FromParquet(pqSchema, nil,
metadata.KeyValueMetadata{})
+ require.NoError(t, err)
+ assert.True(t, arrSchema.Equal(outSchema), "expected: %s, got: %s",
arrSchema, outSchema)
+}