This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new d41e0b50 fix(parquet): strip repetition_type from root SchemaElement during serialization (#723)
d41e0b50 is described below

commit d41e0b501985802c1c62443487a8488129fce29b
Author: Harrison Crosse <[email protected]>
AuthorDate: Thu Mar 19 14:18:13 2026 -0400

    fix(parquet): strip repetition_type from root SchemaElement during serialization (#723)
    
    ### Rationale
    
    The Parquet spec says the root of the schema doesn't have a
    `repetition_type`. arrow-go writes `REPEATED` for the root
    `SchemaElement` in the Thrift footer, which breaks interop with
    consumers like Snowflake. #722 has the full writeup and
    cross-implementation comparison.
    
    ### What changes are included in this PR?
    
    `ToThrift()` now nils out `RepetitionType` on the root element before
    returning, stripping it from the serialized output. This matches how
    parquet-java and arrow-rs handle the root.
    
    The in-memory representation and `WithRootRepetition` API are
    unaffected. `FromParquet` [already tolerates a nil root repetition
    type](https://github.com/apache/arrow-go/blob/main/parquet/schema/schema.go#L78-L79),
    so this is backwards-compatible for both readers and writers.
    
    ### Are these changes tested?
    
    Updated the existing `TestNestedExample` and added
    `TestToThriftRootRepetitionStripped` which checks that the root's
    `repetition_type` is stripped for all three repetition variants and that
    non-root elements keep theirs.
    
    Closes #722
---
 parquet/pqarrow/file_writer_test.go   |  4 ++--
 parquet/schema/schema.go              |  1 +
 parquet/schema/schema_flatten_test.go | 21 ++++++++++++++++++++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/parquet/pqarrow/file_writer_test.go 
b/parquet/pqarrow/file_writer_test.go
index d8503ccb..32713cd2 100644
--- a/parquet/pqarrow/file_writer_test.go
+++ b/parquet/pqarrow/file_writer_test.go
@@ -172,7 +172,7 @@ func TestFileWriterTotalBytes(t *testing.T) {
 
        // Verify total bytes & compressed bytes are correct
        assert.Equal(t, int64(408), writer.TotalCompressedBytes())
-       assert.Equal(t, int64(912), writer.TotalBytesWritten())
+       assert.Equal(t, int64(910), writer.TotalBytesWritten())
 }
 
 func TestFileWriterTotalBytesBuffered(t *testing.T) {
@@ -206,5 +206,5 @@ func TestFileWriterTotalBytesBuffered(t *testing.T) {
 
        // Verify total bytes & compressed bytes are correct
        assert.Equal(t, int64(596), writer.TotalCompressedBytes())
-       assert.Equal(t, int64(1308), writer.TotalBytesWritten())
+       assert.Equal(t, int64(1306), writer.TotalBytesWritten())
 }
diff --git a/parquet/schema/schema.go b/parquet/schema/schema.go
index 6d124eb1..3ff37689 100644
--- a/parquet/schema/schema.go
+++ b/parquet/schema/schema.go
@@ -272,6 +272,7 @@ func (t *toThriftVisitor) VisitPost(Node) {}
 func ToThrift(schema *GroupNode) []*format.SchemaElement {
        t := &toThriftVisitor{make([]*format.SchemaElement, 0)}
        schema.Visit(t)
+       t.elements[0].RepetitionType = nil
        return t.elements
 }
 
diff --git a/parquet/schema/schema_flatten_test.go 
b/parquet/schema/schema_flatten_test.go
index ecbb431c..a3939161 100644
--- a/parquet/schema/schema_flatten_test.go
+++ b/parquet/schema/schema_flatten_test.go
@@ -92,8 +92,10 @@ func (s *SchemaFlattenSuite) TestDecimalMetadata() {
 
 func (s *SchemaFlattenSuite) TestNestedExample() {
        elements := make([]*format.SchemaElement, 0)
+       root := NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* 
numChildren */, 0 /* fieldID */)
+       root.RepetitionType = nil
        elements = append(elements,
-               NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /* 
numChildren */, 0 /* fieldID */),
+               root,
                NewPrimitive("a" /* name */, 
format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */),
                NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL, 
1 /* numChildren */, 2 /* fieldID */))
 
@@ -120,6 +122,23 @@ func TestSchemaFlatten(t *testing.T) {
        suite.Run(t, new(SchemaFlattenSuite))
 }
 
+func TestToThriftRootRepetitionStripped(t *testing.T) {
+       for _, rep := range []parquet.Repetition{
+               parquet.Repetitions.Repeated,
+               parquet.Repetitions.Required,
+               parquet.Repetitions.Optional,
+       } {
+               group := MustGroup(NewGroupNode("schema", rep, FieldList{
+                       NewInt32Node("a", parquet.Repetitions.Required, -1),
+               }, -1))
+               elements := ToThrift(group)
+               assert.False(t, elements[0].IsSetRepetitionType(),
+                       "root element should not have repetition_type set (was 
%v)", rep)
+               assert.True(t, elements[1].IsSetRepetitionType(),
+                       "non-root element must have repetition_type set")
+       }
+}
+
 func TestInvalidConvertedTypeInDeserialize(t *testing.T) {
        n := MustPrimitive(NewPrimitiveNodeLogical("string" /* name */, 
parquet.Repetitions.Required, StringLogicalType{},
                parquet.Types.ByteArray, -1 /* type len */, -1 /* fieldID */))

Reply via email to