This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 5615d94f test(parquet/file): add regression test for mixed-size
ByteArray WriteBatch (#757)
5615d94f is described below
commit 5615d94fb283385a370c1c1999164d45b2d476af
Author: Karen Li <[email protected]>
AuthorDate: Mon Apr 13 11:57:12 2026 -0400
test(parquet/file): add regression test for mixed-size ByteArray WriteBatch
(#757)
### Rationale for this change
Issue #756 reports that small ByteArray values are silently dropped when
a large (≥1MB) value appears in the same WriteBatch. The underlying bug
was already fixed on main by #690, but no test covered this specific
mixed-size scenario.
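For context, the trigger is a single WriteBatch call whose values straddle the writer's data page size (1MB by default, and pinned explicitly in the test below). The following is a minimal, hypothetical sketch of that call shape (the `writeMixedBatch` helper and its filler bytes are invented for illustration; `ByteArrayColumnChunkWriter.WriteBatch` is the real arrow-go API, exercised the same way by the committed test):

```go
package repro

import (
	"bytes"

	"github.com/apache/arrow-go/v18/parquet"
	"github.com/apache/arrow-go/v18/parquet/file"
)

// writeMixedBatch shows the issue #756 call shape: small values and one
// >=1MB value submitted in a single WriteBatch. Before the fix in #690,
// the small values could be silently dropped.
func writeMixedBatch(w *file.ByteArrayColumnChunkWriter) error {
	vals := []parquet.ByteArray{
		[]byte("small-before"),
		bytes.Repeat([]byte{0xAB}, 2*1024*1024), // 2MB value, past the 1MB page size
		[]byte("small-after"),
	}
	// nil def/rep levels: the column is required, as in the test below.
	_, err := w.WriteBatch(vals, nil, nil)
	return err
}
```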
### What changes are included in this PR?
Adds a regression test that writes a batch of mixed-size ByteArray
values (small values flanking a 2MB value) and verifies all values
round-trip correctly.
### Are these changes tested?
The test itself is the change. It was confirmed to fail at bbf7ab75
(#655, the commit that introduced the bug) and to pass on main.
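For anyone re-checking locally, the new test should be runnable in isolation from the repository root with standard Go tooling:

```
go test -run TestMixedSizeByteArrayRoundTrip ./parquet/file/
```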
### Are there any user-facing changes?
No
---
parquet/file/large_value_test.go | 72 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 72 insertions(+)
diff --git a/parquet/file/large_value_test.go b/parquet/file/large_value_test.go
index 66925f11..7791169d 100644
--- a/parquet/file/large_value_test.go
+++ b/parquet/file/large_value_test.go
@@ -236,6 +236,78 @@ func TestLargeByteArrayRoundTripCorrectness(t *testing.T) {
 	require.Equal(t, numValues, rowIdx, "did not read back all values")
 }
 
+// TestMixedSizeByteArrayRoundTrip verifies that small ByteArray values
+// are not dropped when a large (≥1MB) value appears in the same WriteBatch.
+func TestMixedSizeByteArrayRoundTrip(t *testing.T) {
+	sc := schema.NewSchema(schema.MustGroup(schema.NewGroupNode("schema", parquet.Repetitions.Required, schema.FieldList{
+		schema.Must(schema.NewPrimitiveNode("data", parquet.Repetitions.Required, parquet.Types.ByteArray, -1, -1)),
+	}, -1)))
+
+	props := parquet.NewWriterProperties(
+		parquet.WithStats(true),
+		parquet.WithDictionaryDefault(false),
+		parquet.WithDataPageSize(1024*1024),
+	)
+
+	// Build values: small, small, small, 2MB, small, small
+	// Each has a unique pattern so corruption is detectable.
+	sizes := []int{65, 100, 200, 2 * 1024 * 1024, 50, 80}
+	values := make([]parquet.ByteArray, len(sizes))
+	for i, sz := range sizes {
+		buf := make([]byte, sz)
+		// Header: index for identification
+		buf[0] = byte(i)
+		// Fill with deterministic pattern
+		for j := 1; j < sz; j++ {
+			buf[j] = byte(i*31 + j)
+		}
+		values[i] = buf
+	}
+
+	// Write
+	out := &bytes.Buffer{}
+	writer := file.NewParquetWriter(out, sc.Root(), file.WithWriterProps(props))
+
+	rgw := writer.AppendRowGroup()
+	colWriter, err := rgw.NextColumn()
+	require.NoError(t, err)
+
+	byteArrayWriter := colWriter.(*file.ByteArrayColumnChunkWriter)
+	_, err = byteArrayWriter.WriteBatch(values, nil, nil)
+	require.NoError(t, err)
+
+	require.NoError(t, colWriter.Close())
+	require.NoError(t, rgw.Close())
+	require.NoError(t, writer.Close())
+
+	// Read back
+	rdr, err := file.NewParquetReader(bytes.NewReader(out.Bytes()))
+	require.NoError(t, err)
+	defer rdr.Close()
+
+	require.EqualValues(t, len(values), rdr.NumRows())
+
+	rgr := rdr.RowGroup(0)
+	colReader, err := rgr.Column(0)
+	require.NoError(t, err)
+
+	result := make([]parquet.ByteArray, len(values))
+	_, nVals, err := colReader.(*file.ByteArrayColumnChunkReader).ReadBatch(
+		int64(len(values)), result, nil, nil)
+	require.NoError(t, err)
+	require.Equal(t, len(values), nVals)
+
+	for i, expected := range values {
+		got := result[i]
+		require.Equal(t, len(expected), len(got),
+			"value %d: length mismatch (expected %d, got %d)", i, len(expected), len(got))
+		require.Equal(t, expected[0], got[0],
+			"value %d: header mismatch (data corruption)", i)
+		require.True(t, bytes.Equal(expected, got),
+			"value %d: content mismatch", i)
+	}
+}
+
 // TestLargeByteArrayRoundTripWithNulls verifies correctness of the
 // WriteBatchSpaced path (nullable column) with moderately-sized values.
 // Every 3rd value is null. Uses ~3MB total.