This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git


The following commit(s) were added to refs/heads/main by this push:
     new 8293e4a7 fix(parquet/variant): correct is_large bit position in 
valueSize for arrays (#840)
8293e4a7 is described below

commit 8293e4a7464feb43745d67fddfc3e10fe913a243
Author: Jared Yu (余启正) <[email protected]>
AuthorDate: Mon Jun 8 11:03:12 2026 -0700

    fix(parquet/variant): correct is_large bit position in valueSize for arrays 
(#840)
    
    ### Rationale for this change
    
    Closes #839
    
    The `valueSize()` function in `parquet/variant/utils.go` uses `(typeInfo
    >> 4) & 0x1` to check the `is_large` flag for both objects and arrays.
    This is correct for objects (where `is_large` is at bit 4 of the
    value_header) but incorrect for arrays (where `is_large` is at bit 2 per
    the Variant Encoding Spec).
    
    This causes `valueSize()` to return an incorrect size for arrays with
    >255 elements, which can lead to silent data corruption when
    `FinishObject()` compacts duplicate keys whose values are large arrays.
    
    ### What changes are included in this PR?
    
    - **`parquet/variant/utils.go`**: Changed `(typeInfo >> 4)` to
    `(typeInfo >> 2)` in the `BasicArray` case of `valueSize()`. The object
    case remains unchanged (it was already correct).
    
    - **`parquet/variant/valuesize_test.go`** (new): Added regression tests:
    - `TestValueSizeLargeArray`: Builds a 300-element array and verifies
    `valueSize()` returns the correct byte count.
    - `TestValueSizeLargeObject`: Verifies that large objects (>255 fields)
    still compute correctly after the fix.
    
    ### Are these changes tested?
    
    Yes. Two new regression tests are included. The full existing test suite
    passes with no regressions.
    
    ### Are there any user-facing changes?
    
    No API changes. This is a correctness fix for an internal utility
    function. Users who previously triggered the bug (allowed duplicate keys
    in objects where a field value is a large array) will now get correct
    behavior instead of silent data corruption.
    
    ---------
    
    Co-authored-by: Copilot Autofix powered by AI 
<[email protected]>
---
 parquet/variant/utils.go          |  2 +-
 parquet/variant/valuesize_test.go | 76 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/parquet/variant/utils.go b/parquet/variant/utils.go
index ae9b8c15..edeea2de 100644
--- a/parquet/variant/utils.go
+++ b/parquet/variant/utils.go
@@ -129,7 +129,7 @@ func valueSize(v []byte) int {
                return int(dataStart + readLEU32(v[idx:idx+offsetSize]))
        case byte(BasicArray):
                var szBytes uint8 = 1
-               if ((typeInfo >> 4) & 0x1) != 0 {
+               if ((typeInfo >> 2) & 0x1) != 0 {
                        szBytes = 4
                }
 
diff --git a/parquet/variant/valuesize_test.go 
b/parquet/variant/valuesize_test.go
new file mode 100644
index 00000000..3381a57f
--- /dev/null
+++ b/parquet/variant/valuesize_test.go
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package variant
+
+import (
+       "testing"
+
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+func TestValueSizeLargeArray(t *testing.T) {
+       // Build a large array with >255 elements to trigger is_large=true.
+       // This is a regression test for the is_large bit position in 
valueSize().
+       var b Builder
+       start := b.Offset()
+       offsets := make([]int, 0, 300)
+       for i := 0; i < 300; i++ {
+               offsets = append(offsets, b.NextElement(start))
+               b.AppendNull()
+       }
+       require.NoError(t, b.FinishArray(start, offsets))
+
+       raw := b.BuildWithoutMeta()
+
+       // Verify the header has is_large set at the correct bit position
+       typeInfo := (raw[0] >> basicTypeBits) & typeInfoMask
+       isLarge := ((typeInfo >> 2) & 0x1) == 1
+       assert.True(t, isLarge, "expected is_large=true for 300-element array")
+
+       // Verify valueSize returns correct result
+       got := valueSize(raw)
+
+       // Compute expected size:
+       // header(1) + numElements(4, is_large=true) + 
offsets((300+1)*offsetSize) + data(300*1)
+       offsetSize := int((typeInfo & 0b11) + 1)
+       expected := 1 + 4 + (300+1)*offsetSize + 300
+
+       assert.Equal(t, expected, got, "valueSize should correctly handle large 
arrays")
+}
+
+func TestValueSizeLargeObject(t *testing.T) {
+       // Verify that valueSize still works correctly for large objects (>255 
fields).
+       var b Builder
+       start := b.Offset()
+       fields := make([]FieldEntry, 0, 300)
+       for i := 0; i < 300; i++ {
+               fields = append(fields, b.NextField(start, 
string(rune('a'+i%26))+string(rune('0'+i/26))))
+               b.AppendNull()
+       }
+       require.NoError(t, b.FinishObject(start, fields))
+
+       raw := b.BuildWithoutMeta()
+
+       // Verify the header has is_large set at the correct bit position for 
objects (bit 4).
+       typeInfo := (raw[0] >> basicTypeBits) & typeInfoMask
+       isLarge := ((typeInfo >> 4) & 0x1) == 1
+       assert.True(t, isLarge, "expected is_large=true for 300-field object")
+
+       got := valueSize(raw)
+       assert.Equal(t, len(raw), got, "valueSize should return the full object 
size")
+}

Reply via email to