This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 44534c1  fix(pqarrow): respect list element nullability during 
conversion (#311)
44534c1 is described below

commit 44534c126ec274cd84b6ff7ae399ff710984ad5d
Author: Matt Topol <[email protected]>
AuthorDate: Wed Mar 12 10:30:35 2025 -0400

    fix(pqarrow): respect list element nullability during conversion (#311)
    
    ### Rationale for this change
    While working on iceberg-go I discovered a bug in how we converted Arrow
    <--> Parquet schemas when dealing with the nullability of the element
    for a list field.
    
    ### What changes are included in this PR?
    Fixing the proper handling for nullability instead of blanket setting it
    to nullable for elements of lists. Thus correctly propagating the
    nullability.
    
    ### Are these changes tested?
    Yes, unit tests are added.
    
    ### Are there any user-facing changes?
    Parquet schemas that are being created from Arrow Schemas will now
    properly respect the required/optional for elements of lists.
---
 parquet/pqarrow/schema.go      | 19 +++++--------------
 parquet/pqarrow/schema_test.go | 23 ++++++++++++++++++++++-
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/parquet/pqarrow/schema.go b/parquet/pqarrow/schema.go
index 8642a2e..416d59f 100644
--- a/parquet/pqarrow/schema.go
+++ b/parquet/pqarrow/schema.go
@@ -269,19 +269,14 @@ func fieldToNode(name string, field arrow.Field, props 
*parquet.WriterProperties
        case arrow.STRUCT:
                return structToNode(field.Type.(*arrow.StructType), field.Name, 
field.Nullable, props, arrprops)
        case arrow.FIXED_SIZE_LIST, arrow.LIST:
-               var elem arrow.DataType
-               if lt, ok := field.Type.(*arrow.ListType); ok {
-                       elem = lt.Elem()
-               } else {
-                       elem = field.Type.(*arrow.FixedSizeListType).Elem()
-               }
+               elemField := field.Type.(arrow.ListLikeType).ElemField()
 
-               child, err := fieldToNode(name, arrow.Field{Name: name, Type: 
elem, Nullable: true}, props, arrprops)
+               child, err := fieldToNode(name, elemField, props, arrprops)
                if err != nil {
                        return nil, err
                }
 
-               return schema.ListOf(child, repFromNullable(field.Nullable), -1)
+               return schema.ListOfWithName(name, child, 
repFromNullable(field.Nullable), -1)
        case arrow.DICTIONARY:
                // parquet has no dictionary type, dictionary is encoding, not 
schema level
                dictType := field.Type.(*arrow.DictionaryType)
@@ -730,11 +725,7 @@ func listToSchemaField(n *schema.GroupNode, currentLevels 
file.LevelInfo, ctx *s
                populateLeaf(colIndex, &itemField, currentLevels, ctx, out, 
&out.Children[0])
        }
 
-       out.Field = &arrow.Field{Name: n.Name(), Type: arrow.ListOfField(
-               arrow.Field{Name: listNode.Name(),
-                       Type:     out.Children[0].Field.Type,
-                       Metadata: out.Children[0].Field.Metadata,
-                       Nullable: true}),
+       out.Field = &arrow.Field{Name: n.Name(), Type: 
arrow.ListOfField(*out.Children[0].Field),
                Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 
Metadata: createFieldMeta(int(n.FieldID()))}
 
        out.LevelInfo = currentLevels
@@ -756,7 +747,7 @@ func groupToStructField(n *schema.GroupNode, currentLevels 
file.LevelInfo, ctx *
        }
 
        out.Field = &arrow.Field{Name: n.Name(), Type: 
arrow.StructOf(arrowFields...),
-               Nullable: n.RepetitionType() == parquet.Repetitions.Optional, 
Metadata: createFieldMeta(int(n.FieldID()))}
+               Nullable: n.RepetitionType() != parquet.Repetitions.Required, 
Metadata: createFieldMeta(int(n.FieldID()))}
        out.LevelInfo = currentLevels
        return nil
 }
diff --git a/parquet/pqarrow/schema_test.go b/parquet/pqarrow/schema_test.go
index 598386d..f075b46 100644
--- a/parquet/pqarrow/schema_test.go
+++ b/parquet/pqarrow/schema_test.go
@@ -423,7 +423,7 @@ func TestListStructBackwardCompatible(t *testing.T) {
 
        arrsc, err := pqarrow.FromParquet(pqSchema, nil, 
metadata.KeyValueMetadata{})
        assert.NoError(t, err)
-       assert.True(t, arrowSchema.Equal(arrsc))
+       assert.True(t, arrowSchema.Equal(arrsc), arrowSchema.String(), 
arrsc.String())
 }
 
 // TestUnsupportedTypes tests the error message for unsupported types. This 
test should be updated
@@ -451,3 +451,24 @@ func TestUnsupportedTypes(t *testing.T) {
                })
        }
 }
+
+func TestProperListElementNullability(t *testing.T) {
+       arrSchema := arrow.NewSchema([]arrow.Field{
+               {Name: "qux", Type: arrow.ListOfField(
+                       arrow.Field{
+                               Name:     "element",
+                               Type:     arrow.BinaryTypes.String,
+                               Nullable: false,
+                               Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})}),
+                       Nullable: false, Metadata: 
arrow.NewMetadata([]string{"PARQUET:field_id"}, []string{"-1"})},
+       }, nil)
+
+       pqSchema, err := pqarrow.ToParquet(arrSchema, nil, 
pqarrow.DefaultWriterProps())
+       require.NoError(t, err)
+       assert.EqualValues(t, 1, pqSchema.Column(0).MaxDefinitionLevel())
+       assert.Equal(t, parquet.Repetitions.Required, 
pqSchema.Column(0).SchemaNode().RepetitionType())
+
+       outSchema, err := pqarrow.FromParquet(pqSchema, nil, 
metadata.KeyValueMetadata{})
+       require.NoError(t, err)
+       assert.True(t, arrSchema.Equal(outSchema), "expected: %s, got: %s", 
arrSchema, outSchema)
+}

Reply via email to