This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 5615d94f test(parquet/file): add regression test for mixed-size
ByteArray WriteBatch (#757)
5615d94f is described below
commit 5615d94fb283385a370c1c1999164d45b2d476af
Author: Karen Li <[email protected]>
AuthorDate: Mon Apr 13 11:57:12 2026 -0400
test(parquet/file): add regression test for mixed-size ByteArray WriteBatch
(#757)
### Rationale for this change
Issue #756 reports that small ByteArray values are silently dropped when
a large (≥1MB) value appears in the same WriteBatch. The underlying bug
was already fixed on main by #690, but no test covered this specific
mixed-size scenario.
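For context, the trigger is a single WriteBatch call whose values straddle the writer's data page size (1MB by default, and pinned explicitly in the test below). The following is a minimal, hypothetical sketch of that call shape (the `writeMixedBatch` helper and its filler bytes are invented for illustration; `ByteArrayColumnChunkWriter.WriteBatch` is the real arrow-go API, exercised the same way by the committed test):

```go
package repro

import (
	"bytes"

	"github.com/apache/arrow-go/v18/parquet"
	"github.com/apache/arrow-go/v18/parquet/file"
)

// writeMixedBatch shows the issue #756 call shape: small values and one
// >=1MB value submitted in a single WriteBatch. Before the fix in #690,
// the small values could be silently dropped.
func writeMixedBatch(w *file.ByteArrayColumnChunkWriter) error {
	vals := []parquet.ByteArray{
		[]byte("small-before"),
		bytes.Repeat([]byte{0xAB}, 2*1024*1024), // 2MB value, past the 1MB page size
		[]byte("small-after"),
	}
	// nil def/rep levels: the column is required, as in the test below.
	_, err := w.WriteBatch(vals, nil, nil)
	return err
}
```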
### What changes are included in this PR?
Adds a regression test that writes a batch of mixed-size ByteArray
values (small values flanking a 2MB value) and verifies all values
round-trip correctly.
### Are these changes tested?
The test itself is the change. It was confirmed to fail at bbf7ab75
(#655, the commit that introduced the bug) and to pass on main.
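For anyone re-checking locally, the new test should be runnable in isolation from the repository root with standard Go tooling:

```
go test -run TestMixedSizeByteArrayRoundTrip ./parquet/file/
```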
### Are there any user-facing changes?
No
---
parquet/file/large_value_test.go | 72 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/parquet/file/large_value_test.go b/parquet/file/large_value_test.go
index 66925f11..7791169d 100644
--- a/parquet/file/large_value_test.go
+++ b/parquet/file/large_value_test.go
@@ -236,6 +236,78 @@ func TestLargeByteArrayRoundTripCorrectness(t *testing.T) {
 	require.Equal(t, numValues, rowIdx, "did not read back all values")
 }
 
+// TestMixedSizeByteArrayRoundTrip verifies that small ByteArray values
+// are not dropped when a large (≥1MB) value appears in the same WriteBatch.
+func TestMixedSizeByteArrayRoundTrip(t *testing.T) {
+	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
+		schema.Must(schema.NewPrimitiveNode("data", parquet.Repetitions.Required, parquet.Types.ByteArray, -1, -1)),
+	}, -1)))
+
+	props := parquet.NewWriterProperties(
+		parquet.WithStats(true),
+		parquet.WithDictionaryDefault(false),
+		parquet.WithDataPageSize(1024*1024),
+	)
+
+	// Build values: small, small, small, 2MB, small, small
+	// Each has a unique pattern so corruption is detectable.
+	sizes := []int{65, 100, 200, 2 * 1024 * 1024, 50, 80}
+	values := make([]parquet.ByteArray, len(sizes))
+	for i, sz := range sizes {
+		buf := make([]byte, sz)
+		// Header: index for identification
+		buf[0] = byte(i)
+		// Fill with deterministic pattern
+		for j := 1; j < sz; j++ {
+			buf[j] = byte(i*31 + j)
+		}
+		values[i] = buf
+	}
+
+	// Write
+	out := &bytes.Buffer{}
+	writer := file.NewParquetWriter(out, sc.Root(), file.WithWriterProps(props))
+
+	rgw := writer.AppendRowGroup()
+	colWriter, err := rgw.NextColumn()
+	require.NoError(t, err)
+
+	byteArrayWriter := colWriter.(*file.ByteArrayColumnChunkWriter)
+	_, err = byteArrayWriter.WriteBatch(values, nil, nil)
+	require.NoError(t, err)
+
+	require.NoError(t, colWriter.Close())
+	require.NoError(t, rgw.Close())
+	require.NoError(t, writer.Close())
+
+	// Read back
+	rdr, err := file.NewParquetReader(bytes.NewReader(out.Bytes()))
+	require.NoError(t, err)
+	defer rdr.Close()
+
+	require.EqualValues(t, len(values), rdr.NumRows())
+
+	rgr := rdr.RowGroup(0)
+	colReader, err := rgr.Column(0)
+	require.NoError(t, err)
+
+	result := make([]parquet.ByteArray, len(values))
+	_, nVals, err := colReader.(*file.ByteArrayColumnChunkReader).ReadBatch(
+		int64(len(values)), result, nil, nil)
+	require.NoError(t, err)
+	require.Equal(t, len(values), nVals)
+
+	for i, expected := range values {
+		got := result[i]
+		require.Equal(t, len(expected), len(got),
+			"value %d: length mismatch (expected %d, got %d)", i, len(expected), len(got))
+		require.Equal(t, expected[0], got[0],
+			"value %d: header mismatch (data corruption)", i)
+		require.True(t, bytes.Equal(expected, got),
+			"value %d: content mismatch", i)
+	}
+}
+
 // TestLargeByteArrayRoundTripWithNulls verifies correctness of the
 // WriteBatchSpaced path (nullable column) with moderately-sized values.
 // Every 3rd value is null. Uses ~3MB total.