This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 8293e4a7 fix(parquet/variant): correct is_large bit position in
valueSize for arrays (#840)
8293e4a7 is described below
commit 8293e4a7464feb43745d67fddfc3e10fe913a243
Author: Jared Yu (余启正) <[email protected]>
AuthorDate: Mon Jun 8 11:03:12 2026 -0700
fix(parquet/variant): correct is_large bit position in valueSize for arrays
(#840)
### Rationale for this change
Closes #839
The `valueSize()` function in `parquet/variant/utils.go` uses `(typeInfo
>> 4) & 0x1` to check the `is_large` flag for both objects and arrays.
This is correct for objects (where `is_large` is at bit 4 of the
value_header) but incorrect for arrays (where `is_large` is at bit 2 per
the Variant Encoding Spec).
This causes `valueSize()` to return an incorrect size for arrays with
>255 elements, which can lead to silent data corruption when
`FinishObject()` compacts duplicate keys whose values are large arrays.
### What changes are included in this PR?
- **`parquet/variant/utils.go`**: Changed `(typeInfo >> 4)` to
`(typeInfo >> 2)` in the `BasicArray` case of `valueSize()`. The object
case remains unchanged (it was already correct).
- **`parquet/variant/valuesize_test.go`** (new): Added regression tests:
- `TestValueSizeLargeArray`: Builds a 300-element array and verifies
`valueSize()` returns the correct byte count.
- `TestValueSizeLargeObject`: Verifies that large objects (>255 fields)
still compute correctly after the fix.
### Are these changes tested?
Yes. Two new regression tests are included. The full existing test suite
passes with no regressions.
### Are there any user-facing changes?
No API changes. This is a correctness fix for an internal utility
function. Users who previously triggered the bug (allowed duplicate keys
in objects where a field value is a large array) will now get correct
behavior instead of silent data corruption.
---------
Co-authored-by: Copilot Autofix powered by AI
<[email protected]>
---
parquet/variant/utils.go | 2 +-
parquet/variant/valuesize_test.go | 76 +++++++++++++++++++++++++++++++++++++++
2 files changed, 77 insertions(+), 1 deletion(-)
diff --git a/parquet/variant/utils.go b/parquet/variant/utils.go
index ae9b8c15..edeea2de 100644
--- a/parquet/variant/utils.go
+++ b/parquet/variant/utils.go
@@ -129,7 +129,7 @@ func valueSize(v []byte) int {
return int(dataStart + readLEU32(v[idx:idx+offsetSize]))
case byte(BasicArray):
var szBytes uint8 = 1
- if ((typeInfo >> 4) & 0x1) != 0 {
+ if ((typeInfo >> 2) & 0x1) != 0 {
szBytes = 4
}
diff --git a/parquet/variant/valuesize_test.go
b/parquet/variant/valuesize_test.go
new file mode 100644
index 00000000..3381a57f
--- /dev/null
+++ b/parquet/variant/valuesize_test.go
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package variant
+
+import (
+ "testing"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+func TestValueSizeLargeArray(t *testing.T) {
+ // Build a large array with >255 elements to trigger is_large=true.
+ // This is a regression test for the is_large bit position in
valueSize().
+ var b Builder
+ start := b.Offset()
+ offsets := make([]int, 0, 300)
+ for i := 0; i < 300; i++ {
+ offsets = append(offsets, b.NextElement(start))
+ b.AppendNull()
+ }
+ require.NoError(t, b.FinishArray(start, offsets))
+
+ raw := b.BuildWithoutMeta()
+
+ // Verify the header has is_large set at the correct bit position
+ typeInfo := (raw[0] >> basicTypeBits) & typeInfoMask
+ isLarge := ((typeInfo >> 2) & 0x1) == 1
+ assert.True(t, isLarge, "expected is_large=true for 300-element array")
+
+ // Verify valueSize returns correct result
+ got := valueSize(raw)
+
+ // Compute expected size:
+ // header(1) + numElements(4, is_large=true) +
offsets((300+1)*offsetSize) + data(300*1)
+ offsetSize := int((typeInfo & 0b11) + 1)
+ expected := 1 + 4 + (300+1)*offsetSize + 300
+
+ assert.Equal(t, expected, got, "valueSize should correctly handle large
arrays")
+}
+
+func TestValueSizeLargeObject(t *testing.T) {
+ // Verify that valueSize still works correctly for large objects (>255
fields).
+ var b Builder
+ start := b.Offset()
+ fields := make([]FieldEntry, 0, 300)
+ for i := 0; i < 300; i++ {
+ fields = append(fields, b.NextField(start,
string(rune('a'+i%26))+string(rune('0'+i/26))))
+ b.AppendNull()
+ }
+ require.NoError(t, b.FinishObject(start, fields))
+
+ raw := b.BuildWithoutMeta()
+
+ // Verify the header has is_large set at the correct bit position for
objects (bit 4).
+ typeInfo := (raw[0] >> basicTypeBits) & typeInfoMask
+ isLarge := ((typeInfo >> 4) & 0x1) == 1
+ assert.True(t, isLarge, "expected is_large=true for 300-field object")
+
+ got := valueSize(raw)
+ assert.Equal(t, len(raw), got, "valueSize should return the full object
size")
+}