This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new f8eb6993 feat(arrow/array): add Validate/ValidateFull to binary and string arrays (#747)
f8eb6993 is described below
commit f8eb699372b69465e7f882f24abf8e05082239a0
Author: Matt Topol <[email protected]>
AuthorDate: Tue Apr 14 13:52:26 2026 -0400
feat(arrow/array): add Validate/ValidateFull to binary and string arrays (#747)
## Summary
Fixes #691
Adds `Validate()` and `ValidateFull()` methods to `Binary`,
`LargeBinary`, `String`, and `LargeString` array types, plus top-level
dispatch functions and record-level convenience helpers.
## Problem
The existing `setData` validation only checks the **last** offset
against the data buffer length. Subtly corrupted data — e.g.
non-monotonic or negative intermediate offsets — passes construction but
causes a runtime `panic: slice bounds out of range` when `Value(i)` is
called later, **after** the IPC reader's `recover()` scope has already
returned. Users receiving data from untrusted sources (e.g. Flight SQL
from Doris DB) have no way to detect this without crashing.
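For illustration, here is a minimal sketch (not part of this change) of how such data slips past construction; it builds a `String` array from raw buffers the same way the new tests do, then panics on access:
```go
// Hypothetical corrupted input: the last offset (5) fits the 5-byte data
// buffer, but offsets[2] < offsets[1], so the array is internally inconsistent.
offsets := []int32{0, 5, 3, 5}
offsetBuf := memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(offsets))
dataBuf := memory.NewBufferBytes([]byte("hello"))
d := array.NewData(arrow.BinaryTypes.String, 3,
    []*memory.Buffer{nil, offsetBuf, dataBuf}, nil, 0, 0)
arr := array.NewStringData(d) // construction succeeds today
_ = arr.Value(1)              // panic: slice bounds out of range
```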
## Solution
- `Validate()` — O(1): checks offset buffer size and that the last
offset is within the data buffer (mirrors existing `setData` checks, but
returns an error instead of panicking)
- `ValidateFull()` — O(n): additionally verifies that all offsets are
non-negative and monotonically non-decreasing (and, for string arrays, that
each value is valid UTF-8), catching the subtle corruption cases
- `Validate(arr arrow.Array) error` / `ValidateFull(arr arrow.Array)
error` — top-level dispatch via the new `Validator` interface
- `ValidateRecord(rec arrow.RecordBatch) error` /
`ValidateRecordFull(...)` — convenience wrappers that validate all
columns, with error messages including column index and name
## Usage
```go
rec, err := reader.Read()
if err != nil { ... }
if err := array.ValidateRecordFull(rec); err != nil {
log.Printf("skipping corrupted batch: %v", err)
rec.Release()
continue
}
```
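Single arrays (or individual record columns) can be checked the same way through the new top-level dispatch functions; a short sketch:
```go
col := rec.Column(0)
if err := array.Validate(col); err != nil {
    log.Printf("column 0 failed the O(1) structural check: %v", err)
}
if err := array.ValidateFull(col); err != nil {
    log.Printf("column 0 failed the O(n) check: %v", err)
}
```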
## Test plan
- [ ] `TestBinaryValidate` — valid arrays, sliced arrays, non-monotonic
offsets, negative first offset
- [ ] `TestLargeBinaryValidate` — same for large binary
- [ ] `TestStringValidate` — same for string
- [ ] `TestLargeStringValidate` — same for large string
- [ ] `TestTopLevelValidate` — dispatch to `Validator`, passthrough for
non-`Validator` types, `ValidateRecord` with mixed valid/corrupt columns
---
arrow/array/binary.go | 108 ++++++++++++++++++++++
arrow/array/string.go | 117 ++++++++++++++++++++++++
arrow/array/validate.go | 82 +++++++++++++++++
arrow/array/validate_test.go | 211 +++++++++++++++++++++++++++++++++++++++++++
4 files changed, 518 insertions(+)
diff --git a/arrow/array/binary.go b/arrow/array/binary.go
index 942fe307..a8e77ae9 100644
--- a/arrow/array/binary.go
+++ b/arrow/array/binary.go
@@ -169,6 +169,60 @@ func (a *Binary) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}
+// Validate performs a basic, O(1) consistency check on the array data.
+// It returns an error if:
+// - The offset buffer is too small for the array length and offset
+// - The last offset exceeds the data buffer length
+//
+// This is useful for detecting corrupted data from untrusted sources (e.g.
+// Arrow Flight / Flight SQL servers) before accessing values, which may
+// otherwise cause a runtime panic.
+func (a *Binary) Validate() error {
+    if a.data.length == 0 {
+        return nil
+    }
+    if a.data.buffers[1] == nil {
+        return fmt.Errorf("arrow/array: non-empty binary array has no offsets buffer")
+    }
+    expNumOffsets := a.data.offset + a.data.length + 1
+    if len(a.valueOffsets) < expNumOffsets {
+        return fmt.Errorf("arrow/array: binary offset buffer must have at least %d values, got %d", expNumOffsets, len(a.valueOffsets))
+    }
+    firstOffset := int(a.valueOffsets[a.data.offset])
+    if firstOffset > len(a.valueBytes) {
+        return fmt.Errorf("arrow/array: binary offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.valueBytes))
+    }
+
+    lastOffset := int(a.valueOffsets[expNumOffsets-1])
+    if lastOffset > len(a.valueBytes) {
+        return fmt.Errorf("arrow/array: binary offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.valueBytes))
+    }
+    return nil
+}
+
+// ValidateFull performs a full O(n) consistency check on the array data.
+// In addition to the checks performed by Validate, it also verifies that
+// all offsets are non-negative and monotonically non-decreasing.
+func (a *Binary) ValidateFull() error {
+    if err := a.Validate(); err != nil {
+        return err
+    }
+    if a.data.length == 0 {
+        return nil
+    }
+    offsets := a.valueOffsets[a.data.offset : a.data.offset+a.data.length+1]
+    if offsets[0] < 0 {
+        return fmt.Errorf("arrow/array: binary offset at index %d is negative: %d", a.data.offset, offsets[0])
+    }
+    for i := 1; i < len(offsets); i++ {
+        if offsets[i] < offsets[i-1] {
+            return fmt.Errorf("arrow/array: binary offsets are not monotonically non-decreasing at index %d: %d < %d",
+                a.data.offset+i, offsets[i], offsets[i-1])
+        }
+    }
+    return nil
+}
+
func arrayEqualBinary(left, right *Binary) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
@@ -309,6 +363,60 @@ func (a *LargeBinary) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}
+// Validate performs a basic, O(1) consistency check on the array data.
+// It returns an error if:
+// - The offset buffer is too small for the array length and offset
+// - The last offset exceeds the data buffer length
+//
+// This is useful for detecting corrupted data from untrusted sources (e.g.
+// Arrow Flight / Flight SQL servers) before accessing values, which may
+// otherwise cause a runtime panic.
+func (a *LargeBinary) Validate() error {
+    if a.data.length == 0 {
+        return nil
+    }
+    if a.data.buffers[1] == nil {
+        return fmt.Errorf("arrow/array: non-empty large binary array has no offsets buffer")
+    }
+    expNumOffsets := a.data.offset + a.data.length + 1
+    if len(a.valueOffsets) < expNumOffsets {
+        return fmt.Errorf("arrow/array: large binary offset buffer must have at least %d values, got %d", expNumOffsets, len(a.valueOffsets))
+    }
+    firstOffset := int(a.valueOffsets[a.data.offset])
+    if firstOffset > len(a.valueBytes) {
+        return fmt.Errorf("arrow/array: large binary offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.valueBytes))
+    }
+
+    lastOffset := int(a.valueOffsets[expNumOffsets-1])
+    if lastOffset > len(a.valueBytes) {
+        return fmt.Errorf("arrow/array: large binary offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.valueBytes))
+    }
+    return nil
+}
+
+// ValidateFull performs a full O(n) consistency check on the array data.
+// In addition to the checks performed by Validate, it also verifies that
+// all offsets are non-negative and monotonically non-decreasing.
+func (a *LargeBinary) ValidateFull() error {
+    if err := a.Validate(); err != nil {
+        return err
+    }
+    if a.data.length == 0 {
+        return nil
+    }
+    offsets := a.valueOffsets[a.data.offset : a.data.offset+a.data.length+1]
+    if offsets[0] < 0 {
+        return fmt.Errorf("arrow/array: large binary offset at index %d is negative: %d", a.data.offset, offsets[0])
+    }
+    for i := 1; i < len(offsets); i++ {
+        if offsets[i] < offsets[i-1] {
+            return fmt.Errorf("arrow/array: large binary offsets are not monotonically non-decreasing at index %d: %d < %d",
+                a.data.offset+i, offsets[i], offsets[i-1])
+        }
+    }
+    return nil
+}
+
func arrayEqualLargeBinary(left, right *LargeBinary) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
diff --git a/arrow/array/string.go b/arrow/array/string.go
index dd2e2d20..60323e37 100644
--- a/arrow/array/string.go
+++ b/arrow/array/string.go
@@ -21,6 +21,7 @@ import (
"fmt"
"reflect"
"strings"
+ "unicode/utf8"
"unsafe"
"github.com/apache/arrow-go/v18/arrow"
@@ -169,6 +170,64 @@ func (a *String) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}
+// Validate performs a basic, O(1) consistency check on the array data.
+// It returns an error if:
+// - The offset buffer is too small for the array length and offset
+// - The last offset exceeds the data buffer length
+//
+// This is useful for detecting corrupted data from untrusted sources (e.g.
+// Arrow Flight / Flight SQL servers) before accessing values, which may
+// otherwise cause a runtime panic.
+func (a *String) Validate() error {
+    if a.data.length == 0 {
+        return nil
+    }
+    if a.data.buffers[1] == nil {
+        return fmt.Errorf("arrow/array: non-empty string array has no offsets buffer")
+    }
+    expNumOffsets := a.data.offset + a.data.length + 1
+    if len(a.offsets) < expNumOffsets {
+        return fmt.Errorf("arrow/array: string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
+    }
+    firstOffset := int(a.offsets[a.data.offset])
+    if firstOffset > len(a.values) {
+        return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.values))
+    }
+    lastOffset := int(a.offsets[expNumOffsets-1])
+    if lastOffset > len(a.values) {
+        return fmt.Errorf("arrow/array: string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
+    }
+    return nil
+}
+
+// ValidateFull performs a full O(n) consistency check on the array data.
+// In addition to the checks performed by Validate, it also verifies that
+// all offsets are non-negative and monotonically non-decreasing.
+func (a *String) ValidateFull() error {
+    if err := a.Validate(); err != nil {
+        return err
+    }
+    if a.data.length == 0 {
+        return nil
+    }
+    offsets := a.offsets[a.data.offset : a.data.offset+a.data.length+1]
+    if offsets[0] < 0 {
+        return fmt.Errorf("arrow/array: string offset at index %d is negative: %d", a.data.offset, offsets[0])
+    }
+    for i := 1; i < len(offsets); i++ {
+        if offsets[i] < offsets[i-1] {
+            return fmt.Errorf("arrow/array: string offsets are not monotonically non-decreasing at index %d: %d < %d",
+                a.data.offset+i, offsets[i], offsets[i-1])
+        }
+        value := a.values[offsets[i-1]:offsets[i]]
+        if !utf8.ValidString(value) {
+            return fmt.Errorf("arrow/array: string at index %d is not valid utf8: %s", a.data.offset+i-1, value)
+        }
+    }
+
+    return nil
+}
+
func arrayEqualString(left, right *String) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
@@ -312,6 +371,64 @@ func (a *LargeString) MarshalJSON() ([]byte, error) {
return json.Marshal(vals)
}
+// Validate performs a basic, O(1) consistency check on the array data.
+// It returns an error if:
+// - The offset buffer is too small for the array length and offset
+// - The last offset exceeds the data buffer length
+//
+// This is useful for detecting corrupted data from untrusted sources (e.g.
+// Arrow Flight / Flight SQL servers) before accessing values, which may
+// otherwise cause a runtime panic.
+func (a *LargeString) Validate() error {
+    if a.data.length == 0 {
+        return nil
+    }
+    if a.data.buffers[1] == nil {
+        return fmt.Errorf("arrow/array: non-empty large string array has no offsets buffer")
+    }
+    expNumOffsets := a.data.offset + a.data.length + 1
+    if len(a.offsets) < expNumOffsets {
+        return fmt.Errorf("arrow/array: large string offset buffer must have at least %d values, got %d", expNumOffsets, len(a.offsets))
+    }
+    firstOffset := int(a.offsets[a.data.offset])
+    if firstOffset > len(a.values) {
+        return fmt.Errorf("arrow/array: large string offset %d out of bounds of data buffer (length %d)", firstOffset, len(a.values))
+    }
+
+    lastOffset := int(a.offsets[expNumOffsets-1])
+    if lastOffset > len(a.values) {
+        return fmt.Errorf("arrow/array: large string offset %d out of bounds of data buffer (length %d)", lastOffset, len(a.values))
+    }
+    return nil
+}
+
+// ValidateFull performs a full O(n) consistency check on the array data.
+// In addition to the checks performed by Validate, it also verifies that
+// all offsets are non-negative and monotonically non-decreasing.
+func (a *LargeString) ValidateFull() error {
+    if err := a.Validate(); err != nil {
+        return err
+    }
+    if a.data.length == 0 {
+        return nil
+    }
+    offsets := a.offsets[a.data.offset : a.data.offset+a.data.length+1]
+    if offsets[0] < 0 {
+        return fmt.Errorf("arrow/array: large string offset at index %d is negative: %d", a.data.offset, offsets[0])
+    }
+    for i := 1; i < len(offsets); i++ {
+        if offsets[i] < offsets[i-1] {
+            return fmt.Errorf("arrow/array: large string offsets are not monotonically non-decreasing at index %d: %d < %d",
+                a.data.offset+i, offsets[i], offsets[i-1])
+        }
+        value := a.values[offsets[i-1]:offsets[i]]
+        if !utf8.ValidString(value) {
+            return fmt.Errorf("arrow/array: string at index %d is not valid utf8: %s", a.data.offset+i-1, value)
+        }
+    }
+    return nil
+}
+
func arrayEqualLargeString(left, right *LargeString) bool {
for i := 0; i < left.Len(); i++ {
if left.IsNull(i) {
diff --git a/arrow/array/validate.go b/arrow/array/validate.go
new file mode 100644
index 00000000..70b7f669
--- /dev/null
+++ b/arrow/array/validate.go
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package array
+
+import (
+ "fmt"
+
+ "github.com/apache/arrow-go/v18/arrow"
+)
+
+// Validator is implemented by array types that can validate their internal
+// consistency. See Validate and ValidateFull for top-level dispatch.
+type Validator interface {
+ arrow.Array
+ // Validate performs a basic O(1) consistency check.
+ Validate() error
+ // ValidateFull performs a thorough O(n) consistency check.
+ ValidateFull() error
+}
+
+// Validate performs a basic O(1) consistency check on arr, returning an error
+// if the array's internal buffers are inconsistent. For array types that do
not
+// implement Validator, nil is returned.
+//
+// Use this to detect corrupted data from untrusted sources such as Arrow
Flight
+// or Flight SQL servers before accessing values, which may otherwise panic.
+func Validate(arr arrow.Array) error {
+ if v, ok := arr.(Validator); ok {
+ return v.Validate()
+ }
+ return nil
+}
+
+// ValidateFull performs a thorough O(n) consistency check on arr, returning an
+// error if the array's internal buffers are inconsistent. For array types that
+// do not implement Validator, nil is returned.
+//
+// Unlike Validate, this checks every element and is therefore O(n). Use this
+// when receiving data from untrusted sources where subtle corruption (e.g.
+// non-monotonic offsets) may not be detected by Validate alone.
+func ValidateFull(arr arrow.Array) error {
+ if v, ok := arr.(Validator); ok {
+ return v.ValidateFull()
+ }
+ return nil
+}
+
+// ValidateRecord validates each column in rec using Validate, returning the
+// first error encountered. The error includes the column index and field name.
+func ValidateRecord(rec arrow.RecordBatch) error {
+ for i := int64(0); i < rec.NumCols(); i++ {
+ if err := Validate(rec.Column(int(i))); err != nil {
+ return fmt.Errorf("column %d (%s): %w", i,
rec.Schema().Field(int(i)).Name, err)
+ }
+ }
+ return nil
+}
+
+// ValidateRecordFull validates each column in rec using ValidateFull,
returning
+// the first error encountered. The error includes the column index and field
name.
+func ValidateRecordFull(rec arrow.RecordBatch) error {
+ for i := int64(0); i < rec.NumCols(); i++ {
+ if err := ValidateFull(rec.Column(int(i))); err != nil {
+ return fmt.Errorf("column %d (%s): %w", i,
rec.Schema().Field(int(i)).Name, err)
+ }
+ }
+ return nil
+}
diff --git a/arrow/array/validate_test.go b/arrow/array/validate_test.go
new file mode 100644
index 00000000..5d45ce50
--- /dev/null
+++ b/arrow/array/validate_test.go
@@ -0,0 +1,211 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package array
+
+import (
+ "testing"
+
+ "github.com/apache/arrow-go/v18/arrow"
+ "github.com/apache/arrow-go/v18/arrow/memory"
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+// makeBinaryArrayRaw creates a Binary array directly from raw buffers,
+// bypassing builder validation. Used to simulate corrupted IPC data.
+func makeBinaryArrayRaw(t *testing.T, offsets []int32, data []byte, length,
offset int) *Binary {
+ t.Helper()
+ offsetBuf :=
memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(offsets))
+ dataBuf := memory.NewBufferBytes(data)
+ d := NewData(arrow.BinaryTypes.Binary, length, []*memory.Buffer{nil,
offsetBuf, dataBuf}, nil, 0, offset)
+ return NewBinaryData(d)
+}
+
+// makeLargeBinaryArrayRaw creates a LargeBinary array directly from raw
buffers.
+func makeLargeBinaryArrayRaw(t *testing.T, offsets []int64, data []byte,
length, offset int) *LargeBinary {
+ t.Helper()
+ offsetBuf :=
memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(offsets))
+ dataBuf := memory.NewBufferBytes(data)
+ d := NewData(arrow.BinaryTypes.LargeBinary, length,
[]*memory.Buffer{nil, offsetBuf, dataBuf}, nil, 0, offset)
+ return NewLargeBinaryData(d)
+}
+
+// makeStringArrayRaw creates a String array directly from raw buffers.
+func makeStringArrayRaw(t *testing.T, offsets []int32, data string, length,
offset int) *String {
+ t.Helper()
+ offsetBuf :=
memory.NewBufferBytes(arrow.Int32Traits.CastToBytes(offsets))
+ dataBuf := memory.NewBufferBytes([]byte(data))
+ d := NewData(arrow.BinaryTypes.String, length, []*memory.Buffer{nil,
offsetBuf, dataBuf}, nil, 0, offset)
+ return NewStringData(d)
+}
+
+// makeLargeStringArrayRaw creates a LargeString array directly from raw
buffers.
+func makeLargeStringArrayRaw(t *testing.T, offsets []int64, data string,
length, offset int) *LargeString {
+ t.Helper()
+ offsetBuf :=
memory.NewBufferBytes(arrow.Int64Traits.CastToBytes(offsets))
+ dataBuf := memory.NewBufferBytes([]byte(data))
+ d := NewData(arrow.BinaryTypes.LargeString, length,
[]*memory.Buffer{nil, offsetBuf, dataBuf}, nil, 0, offset)
+ return NewLargeStringData(d)
+}
+
+func TestBinaryValidate(t *testing.T) {
+    t.Run("valid array passes", func(t *testing.T) {
+        // offsets [0,3,6,9], data "abcdefghi" — 3 elements of 3 bytes each
+        arr := makeBinaryArrayRaw(t, []int32{0, 3, 6, 9}, []byte("abcdefghi"), 3, 0)
+        assert.NoError(t, arr.Validate())
+        assert.NoError(t, arr.ValidateFull())
+    })
+
+    t.Run("valid sliced array passes", func(t *testing.T) {
+        arr := makeBinaryArrayRaw(t, []int32{0, 3, 6, 9}, []byte("abcdefghi"), 1, 1)
+        assert.NoError(t, arr.Validate())
+        assert.NoError(t, arr.ValidateFull())
+    })
+
+    t.Run("empty array passes", func(t *testing.T) {
+        arr := makeBinaryArrayRaw(t, nil, nil, 0, 0)
+        assert.NoError(t, arr.Validate())
+        assert.NoError(t, arr.ValidateFull())
+    })
+
+    t.Run("non-monotonic offsets pass Validate but fail ValidateFull", func(t *testing.T) {
+        // last offset (5) is within data bounds so setData/Validate pass,
+        // but offset[1]=5 then offset[2]=3 is decreasing — ValidateFull must catch this.
+        arr := makeBinaryArrayRaw(t, []int32{0, 5, 3, 5}, []byte("hello"), 3, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "not monotonically non-decreasing")
+    })
+
+    t.Run("negative first offset passes Validate but fails ValidateFull", func(t *testing.T) {
+        // last offset (5) is within bounds, but first offset is negative.
+        arr := makeBinaryArrayRaw(t, []int32{-1, 2, 5}, []byte("hello"), 2, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "negative")
+    })
+}
+
+func TestLargeBinaryValidate(t *testing.T) {
+    t.Run("valid array passes", func(t *testing.T) {
+        arr := makeLargeBinaryArrayRaw(t, []int64{0, 3, 6, 9}, []byte("abcdefghi"), 3, 0)
+        assert.NoError(t, arr.Validate())
+        assert.NoError(t, arr.ValidateFull())
+    })
+
+    t.Run("non-monotonic offsets pass Validate but fail ValidateFull", func(t *testing.T) {
+        arr := makeLargeBinaryArrayRaw(t, []int64{0, 5, 3, 5}, []byte("hello"), 3, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "not monotonically non-decreasing")
+    })
+
+    t.Run("negative first offset passes Validate but fails ValidateFull", func(t *testing.T) {
+        arr := makeLargeBinaryArrayRaw(t, []int64{-1, 2, 5}, []byte("hello"), 2, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "negative")
+    })
+}
+
+func TestStringValidate(t *testing.T) {
+    t.Run("valid array passes", func(t *testing.T) {
+        arr := makeStringArrayRaw(t, []int32{0, 3, 5, 10}, "abcdeabcde", 3, 0)
+        assert.NoError(t, arr.Validate())
+        assert.NoError(t, arr.ValidateFull())
+    })
+
+    t.Run("non-monotonic offsets pass Validate but fail ValidateFull", func(t *testing.T) {
+        arr := makeStringArrayRaw(t, []int32{0, 5, 3, 5}, "hello", 3, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "not monotonically non-decreasing")
+    })
+
+    t.Run("negative first offset passes Validate but fails ValidateFull", func(t *testing.T) {
+        arr := makeStringArrayRaw(t, []int32{-1, 2, 5}, "hello", 2, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "negative")
+    })
+}
+
+func TestLargeStringValidate(t *testing.T) {
+    t.Run("valid array passes", func(t *testing.T) {
+        arr := makeLargeStringArrayRaw(t, []int64{0, 3, 5, 10}, "abcdeabcde", 3, 0)
+        assert.NoError(t, arr.Validate())
+        assert.NoError(t, arr.ValidateFull())
+    })
+
+    t.Run("non-monotonic offsets pass Validate but fail ValidateFull", func(t *testing.T) {
+        arr := makeLargeStringArrayRaw(t, []int64{0, 5, 3, 5}, "hello", 3, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "not monotonically non-decreasing")
+    })
+
+    t.Run("negative first offset passes Validate but fails ValidateFull", func(t *testing.T) {
+        arr := makeLargeStringArrayRaw(t, []int64{-1, 2, 5}, "hello", 2, 0)
+        assert.NoError(t, arr.Validate())
+        err := arr.ValidateFull()
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "negative")
+    })
+}
+
+func TestTopLevelValidate(t *testing.T) {
+    t.Run("Validate dispatches to Validator", func(t *testing.T) {
+        // non-monotonic string array: passes setData but ValidateFull must fail
+        arr := makeStringArrayRaw(t, []int32{0, 5, 3, 5}, "hello", 3, 0)
+        assert.NoError(t, Validate(arr))
+        require.Error(t, ValidateFull(arr))
+    })
+
+    t.Run("Validate returns nil for non-Validator types", func(t *testing.T) {
+        // Bool arrays don't implement Validator — should return nil
+        bldr := NewBooleanBuilder(memory.NewGoAllocator())
+        bldr.AppendValues([]bool{true, false}, nil)
+        arr := bldr.NewBooleanArray()
+        defer arr.Release()
+        assert.NoError(t, Validate(arr))
+        assert.NoError(t, ValidateFull(arr))
+    })
+
+    t.Run("ValidateRecord validates all columns", func(t *testing.T) {
+        validArr := makeStringArrayRaw(t, []int32{0, 3, 6}, "abcdef", 2, 0)
+        corruptArr := makeStringArrayRaw(t, []int32{0, 5, 3, 5}, "hello", 3, 0)
+
+        schema := arrow.NewSchema([]arrow.Field{
+            {Name: "ok", Type: arrow.BinaryTypes.String},
+            {Name: "bad", Type: arrow.BinaryTypes.String},
+        }, nil)
+        rec := NewRecordBatch(schema, []arrow.Array{validArr, corruptArr}, 2)
+        defer rec.Release()
+
+        assert.NoError(t, ValidateRecord(rec))
+        err := ValidateRecordFull(rec)
+        require.Error(t, err)
+        assert.Contains(t, err.Error(), "column 1 (bad)")
+    })
+}