This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new d41e0b50 fix(parquet): strip repetition_type from root SchemaElement
during serialization (#723)
d41e0b50 is described below
commit d41e0b501985802c1c62443487a8488129fce29b
Author: Harrison Crosse <[email protected]>
AuthorDate: Thu Mar 19 14:18:13 2026 -0400
fix(parquet): strip repetition_type from root SchemaElement during
serialization (#723)
### Rationale
The Parquet spec says the root of the schema doesn't have a
`repetition_type`. arrow-go writes `REPEATED` for the root
`SchemaElement` in the Thrift footer, which breaks interop with
consumers like Snowflake. #722 has the full writeup and
cross-implementation comparison.
### What changes are included in this PR?
`ToThrift()` now sets `RepetitionType` to nil on the root element before
returning, so the field is omitted from the serialized output. This matches
how parquet-java and arrow-rs handle the root.
The in-memory representation and `WithRootRepetition` API are
unaffected. `FromParquet` [already tolerates a nil root repetition
type](https://github.com/apache/arrow-go/blob/main/parquet/schema/schema.go#L78-L79),
so this is backwards-compatible for both readers and writers.
### Are these changes tested?
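Updated the existing `TestNestedExample` and added
`TestToThriftRootRepetitionStripped`, which checks that the root's
`repetition_type` is stripped for all three repetition variants and that
non-root elements keep theirs. The expected `TotalBytesWritten` values in
`file_writer_test.go` drop by 2 bytes to account for the omitted field.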
Closes #722
---
parquet/pqarrow/file_writer_test.go | 4 ++--
parquet/schema/schema.go | 1 +
parquet/schema/schema_flatten_test.go | 21 ++++++++++++++++++++-
3 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/parquet/pqarrow/file_writer_test.go
b/parquet/pqarrow/file_writer_test.go
index d8503ccb..32713cd2 100644
--- a/parquet/pqarrow/file_writer_test.go
+++ b/parquet/pqarrow/file_writer_test.go
@@ -172,7 +172,7 @@ func TestFileWriterTotalBytes(t *testing.T) {
// Verify total bytes & compressed bytes are correct
assert.Equal(t, int64(408), writer.TotalCompressedBytes())
- assert.Equal(t, int64(912), writer.TotalBytesWritten())
+ assert.Equal(t, int64(910), writer.TotalBytesWritten())
}
func TestFileWriterTotalBytesBuffered(t *testing.T) {
@@ -206,5 +206,5 @@ func TestFileWriterTotalBytesBuffered(t *testing.T) {
// Verify total bytes & compressed bytes are correct
assert.Equal(t, int64(596), writer.TotalCompressedBytes())
- assert.Equal(t, int64(1308), writer.TotalBytesWritten())
+ assert.Equal(t, int64(1306), writer.TotalBytesWritten())
}
diff --git a/parquet/schema/schema.go b/parquet/schema/schema.go
index 6d124eb1..3ff37689 100644
--- a/parquet/schema/schema.go
+++ b/parquet/schema/schema.go
@@ -272,6 +272,7 @@ func (t *toThriftVisitor) VisitPost(Node) {}
func ToThrift(schema *GroupNode) []*format.SchemaElement {
t := &toThriftVisitor{make([]*format.SchemaElement, 0)}
schema.Visit(t)
+ t.elements[0].RepetitionType = nil
return t.elements
}
diff --git a/parquet/schema/schema_flatten_test.go
b/parquet/schema/schema_flatten_test.go
index ecbb431c..a3939161 100644
--- a/parquet/schema/schema_flatten_test.go
+++ b/parquet/schema/schema_flatten_test.go
@@ -92,8 +92,10 @@ func (s *SchemaFlattenSuite) TestDecimalMetadata() {
func (s *SchemaFlattenSuite) TestNestedExample() {
elements := make([]*format.SchemaElement, 0)
+ root := NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /*
numChildren */, 0 /* fieldID */)
+ root.RepetitionType = nil
elements = append(elements,
- NewGroup(s.name, format.FieldRepetitionType_REPEATED, 2 /*
numChildren */, 0 /* fieldID */),
+ root,
NewPrimitive("a" /* name */,
format.FieldRepetitionType_REQUIRED, format.Type_INT32, 1 /* fieldID */),
NewGroup("bag" /* name */, format.FieldRepetitionType_OPTIONAL,
1 /* numChildren */, 2 /* fieldID */))
@@ -120,6 +122,23 @@ func TestSchemaFlatten(t *testing.T) {
suite.Run(t, new(SchemaFlattenSuite))
}
+func TestToThriftRootRepetitionStripped(t *testing.T) {
+ for _, rep := range []parquet.Repetition{
+ parquet.Repetitions.Repeated,
+ parquet.Repetitions.Required,
+ parquet.Repetitions.Optional,
+ } {
+ group := MustGroup(NewGroupNode("schema", rep, FieldList{
+ NewInt32Node("a", parquet.Repetitions.Required, -1),
+ }, -1))
+ elements := ToThrift(group)
+ assert.False(t, elements[0].IsSetRepetitionType(),
+ "root element should not have repetition_type set (was
%v)", rep)
+ assert.True(t, elements[1].IsSetRepetitionType(),
+ "non-root element must have repetition_type set")
+ }
+}
+
func TestInvalidConvertedTypeInDeserialize(t *testing.T) {
n := MustPrimitive(NewPrimitiveNodeLogical("string" /* name */,
parquet.Repetitions.Required, StringLogicalType{},
parquet.Types.ByteArray, -1 /* type len */, -1 /* fieldID */))