This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new fc9fc890 fix(parquet/metadata): do not pre-set hasDistinctCount in
stat factories (#807)
fc9fc890 is described below
commit fc9fc890a714f28f8a91c72fe732a622db0f76b1
Author: Sai Asish Y <[email protected]>
AuthorDate: Thu May 14 16:46:07 2026 -0700
fix(parquet/metadata): do not pre-set hasDistinctCount in stat factories
(#807)
### Rationale for this change
Fixes #806. The `New*Statistics` factories in
`parquet/metadata/statistics_types.gen.go` initialize `hasDistinctCount:
true` at construction time, so every fresh stat object reports a
distinct count of 0. `Encode()` then unconditionally calls
`enc.SetDistinctCount(0)`, which makes the writer emit `distinct_count:
0` in both `ColumnChunk.metadata.statistics` and
`DataPageHeader.statistics` for every column, even when no distinct
count has been computed.
The Parquet thrift `Statistics.distinct_count` field is OPTIONAL. Per
spec semantics, it must be absent when unknown. Emitting 0 conflates "no
distinct values" with "we don't know" and causes byte-asymmetry against
parquet-cpp (pyarrow), which leaves the field unset for the same input.
`IncDistinct()` is the only call site that should flip
`hasDistinctCount` to true, and grepping the repo confirms it is never
invoked from the writer paths, so flipping the default is safe.
### What changes are included in this PR?
- Remove `hasDistinctCount: true` from the factory initializer in
`statistics_types.gen.go.tmpl` and regenerate `statistics_types.gen.go`.
`hasNullCount: true` is kept as-is (its semantics differ: `IncNulls` is
called per page and `null_count: 0` is meaningful for a writer that
observes zero nulls).
- Add `TestNewStatisticsDistinctCountUnset` covering
Int32/Int64/Float32/Float64/Boolean/ByteArray, asserting both
`HasDistinctCount()` and `Encode().HasDistinctCount` start false and
flip to true after `IncDistinct`.
### Are these changes tested?
Yes. The new test fails on master (HasDistinctCount returns true on a
fresh stat object) and passes after the fix. `go test
./parquet/metadata/...` is green.
### Are there any user-facing changes?
Yes, writers that previously emitted `distinct_count: 0` for every
column will now omit the field, matching the spec and parquet-cpp.
Readers already tolerate both, so this is a wire-format normalization,
not a breaking change for downstream consumers.
---
parquet/metadata/statistics_test.go | 37 ++++++++++
parquet/metadata/statistics_types.gen.go | 99 ++++++++++++---------------
parquet/metadata/statistics_types.gen.go.tmpl | 1 -
parquet/pqarrow/file_writer_test.go | 8 +--
4 files changed, 86 insertions(+), 59 deletions(-)
diff --git a/parquet/metadata/statistics_test.go
b/parquet/metadata/statistics_test.go
index d312437a..a0d107fa 100644
--- a/parquet/metadata/statistics_test.go
+++ b/parquet/metadata/statistics_test.go
@@ -603,3 +603,40 @@ func TestBooleanStatisticsUpdateFromBitmapSpaced(t
*testing.T) {
assert.Equal(t, int64(8), stats.NullCount())
})
}
+
+func TestNewStatisticsDistinctCountUnset(t *testing.T) {
+ // Issue #806: factories must not pre-set hasDistinctCount, otherwise
+ // Encode() emits distinct_count=0 for every column instead of leaving
the
+ // optional Thrift field absent. IncDistinct() is responsible for
marking
+ // it set when an actual distinct count is computed.
+ mk := func(n *schema.PrimitiveNode) metadata.TypedStatistics {
+ col := schema.NewColumn(n, 0, 0)
+ return metadata.NewStatistics(col, memory.DefaultAllocator)
+ }
+ cases := []struct {
+ name string
+ s metadata.TypedStatistics
+ }{
+ {"Int32", mk(schema.NewInt32Node("i32",
parquet.Repetitions.Required, -1))},
+ {"Int64", mk(schema.NewInt64Node("i64",
parquet.Repetitions.Required, -1))},
+ {"Float32", mk(schema.NewFloat32Node("f32",
parquet.Repetitions.Required, -1))},
+ {"Float64", mk(schema.NewFloat64Node("f64",
parquet.Repetitions.Required, -1))},
+ {"Boolean", mk(schema.NewBooleanNode("b",
parquet.Repetitions.Required, -1))},
+ {"ByteArray", mk(schema.NewByteArrayNode("ba",
parquet.Repetitions.Required, -1))},
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ assert.False(t, c.s.HasDistinctCount(), "fresh stats
must not advertise a distinct count")
+ enc, err := c.s.Encode()
+ require.NoError(t, err)
+ assert.False(t, enc.HasDistinctCount, "encoded stats
must leave distinct_count unset until IncDistinct is called")
+
+ c.s.IncDistinct(3)
+ assert.True(t, c.s.HasDistinctCount())
+ enc, err = c.s.Encode()
+ require.NoError(t, err)
+ assert.True(t, enc.HasDistinctCount)
+ assert.Equal(t, int64(3), enc.DistinctCount)
+ })
+ }
+}
diff --git a/parquet/metadata/statistics_types.gen.go
b/parquet/metadata/statistics_types.gen.go
index a00010f9..a9a19ea1 100644
--- a/parquet/metadata/statistics_types.gen.go
+++ b/parquet/metadata/statistics_types.gen.go
@@ -57,12 +57,11 @@ func NewInt32Statistics(descr *schema.Column, mem
memory.Allocator) *Int32Statis
return &Int32Statistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -358,12 +357,11 @@ func NewInt64Statistics(descr *schema.Column, mem
memory.Allocator) *Int64Statis
return &Int64Statistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -659,12 +657,11 @@ func NewInt96Statistics(descr *schema.Column, mem
memory.Allocator) *Int96Statis
return &Int96Statistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -938,12 +935,11 @@ func NewFloat32Statistics(descr *schema.Column, mem
memory.Allocator) *Float32St
return &Float32Statistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -1231,12 +1227,11 @@ func NewFloat64Statistics(descr *schema.Column, mem
memory.Allocator) *Float64St
return &Float64Statistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -1524,12 +1519,11 @@ func NewBooleanStatistics(descr *schema.Column, mem
memory.Allocator) *BooleanSt
return &BooleanStatistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -1879,12 +1873,11 @@ func NewByteArrayStatistics(descr *schema.Column, mem
memory.Allocator) *ByteArr
return &ByteArrayStatistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
min: make([]byte, 0),
@@ -2187,12 +2180,11 @@ func NewFixedLenByteArrayStatistics(descr
*schema.Column, mem memory.Allocator)
return &FixedLenByteArrayStatistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
@@ -2502,12 +2494,11 @@ func NewFloat16Statistics(descr *schema.Column, mem
memory.Allocator) *Float16St
return &Float16Statistics{
statistics: statistics{
- descr: descr,
- hasNullCount: true,
- hasDistinctCount: true,
- order: descr.SortOrder(),
- encoder:
encoding.NewEncoder(descr.PhysicalType(), parquet.Encodings.Plain, false,
descr, mem),
- mem: mem,
+ descr: descr,
+ hasNullCount: true,
+ order: descr.SortOrder(),
+ encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
+ mem: mem,
},
}
}
diff --git a/parquet/metadata/statistics_types.gen.go.tmpl
b/parquet/metadata/statistics_types.gen.go.tmpl
index 83493998..76657943 100644
--- a/parquet/metadata/statistics_types.gen.go.tmpl
+++ b/parquet/metadata/statistics_types.gen.go.tmpl
@@ -62,7 +62,6 @@ func New{{.Name}}Statistics(descr *schema.Column, mem
memory.Allocator) *{{.Name
statistics: statistics{
descr: descr,
hasNullCount: true,
- hasDistinctCount: true,
order: descr.SortOrder(),
encoder: encoding.NewEncoder(descr.PhysicalType(),
parquet.Encodings.Plain, false, descr, mem),
mem: mem,
diff --git a/parquet/pqarrow/file_writer_test.go
b/parquet/pqarrow/file_writer_test.go
index 7b98b212..7ba4b85b 100644
--- a/parquet/pqarrow/file_writer_test.go
+++ b/parquet/pqarrow/file_writer_test.go
@@ -171,8 +171,8 @@ func TestFileWriterTotalBytes(t *testing.T) {
require.NoError(t, writer.Close())
// Verify total bytes & compressed bytes are correct
- assert.Equal(t, int64(340), writer.TotalCompressedBytes())
- assert.Equal(t, int64(799), writer.TotalBytesWritten())
+ assert.Equal(t, int64(332), writer.TotalCompressedBytes())
+ assert.Equal(t, int64(783), writer.TotalBytesWritten())
}
func TestFileWriterTotalBytesBuffered(t *testing.T) {
@@ -205,8 +205,8 @@ func TestFileWriterTotalBytesBuffered(t *testing.T) {
require.NoError(t, writer.Close())
// Verify total bytes & compressed bytes are correct
- assert.Equal(t, int64(494), writer.TotalCompressedBytes())
- assert.Equal(t, int64(1139), writer.TotalBytesWritten())
+ assert.Equal(t, int64(482), writer.TotalCompressedBytes())
+ assert.Equal(t, int64(1115), writer.TotalBytesWritten())
}
func TestWriteOnClosedFileWriter(t *testing.T) {