This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new e0df4f91 `UnsafeAppendBoolToBitmap` for dictionary and REE builders
(#758)
e0df4f91 is described below
commit e0df4f91b5d00f647eaffd875ef40ffe159f8838
Author: Lucas Valente <[email protected]>
AuthorDate: Tue Apr 14 20:10:11 2026 +0200
`UnsafeAppendBoolToBitmap` for dictionary and REE builders (#758)
### Rationale for this change
For more context:
https://github.com/apache/arrow-go/pull/558/changes#r2758798540
The Builders for REE and dict-encoded arrays were inheriting the
`UnsafeAppendBoolToBitmap()` and `UnsafeAppend()` from the `Builder`
interface defined in `arrow/array/builder.go`. The default
implementation does not bump the length of the inner `idxBuilder` or
`runEndsBuilder`, which causes it to be in an inconsistent state where
the parent has greater length than the index/run-ends builder. This
causes a panic when instancing a record batch.
### What changes are included in this PR?
This PR makes `UnsafeAppendBoolToBitmap()` for `RunEndEncodedBuilder`
panic as there is not a good semantic meaning for this method in case of
REE.
It also implements `UnsafeAppendBoolToBitmap()` and `UnsafeAppend()` for
the Dictionary builders. This allows users to use `bldr.Reserve(n)`
followed by `n` calls to `UnsafeAppend()` or
`UnsafeAppendBoolToBitmap()` without the need to check if the array
needs to grow everytime.
### Are these changes tested?
Yes.
### Are there any user-facing changes?
No. The API was already there (inherited from `Builder`), but the
behavior was incorrect.
---------
Co-authored-by: Matt Topol <[email protected]>
---
arrow/array/dictionary.go | 93 +++++++++++++++++++++++++++++++++++++++++-
arrow/array/dictionary_test.go | 41 +++++++++++++++++++
arrow/array/encoded.go | 4 ++
3 files changed, 137 insertions(+), 1 deletion(-)
diff --git a/arrow/array/dictionary.go b/arrow/array/dictionary.go
index 109d2a97..8f9fe245 100644
--- a/arrow/array/dictionary.go
+++ b/arrow/array/dictionary.go
@@ -314,7 +314,8 @@ func arrayApproxEqualDict(l, r *Dictionary, opt
equalOption) bool {
// helper for building the properly typed indices of the dictionary builder
type IndexBuilder struct {
Builder
- Append func(int)
+ Append func(int)
+ UnsafeAppend func(int)
}
func createIndexBuilder(mem memory.Allocator, dt arrow.FixedWidthDataType)
(ret IndexBuilder, err error) {
@@ -324,34 +325,58 @@ func createIndexBuilder(mem memory.Allocator, dt
arrow.FixedWidthDataType) (ret
ret.Append = func(idx int) {
ret.Builder.(*Int8Builder).Append(int8(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Int8Builder).UnsafeAppend(int8(idx))
+ }
case arrow.UINT8:
ret.Append = func(idx int) {
ret.Builder.(*Uint8Builder).Append(uint8(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Uint8Builder).UnsafeAppend(uint8(idx))
+ }
case arrow.INT16:
ret.Append = func(idx int) {
ret.Builder.(*Int16Builder).Append(int16(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Int16Builder).UnsafeAppend(int16(idx))
+ }
case arrow.UINT16:
ret.Append = func(idx int) {
ret.Builder.(*Uint16Builder).Append(uint16(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Uint16Builder).UnsafeAppend(uint16(idx))
+ }
case arrow.INT32:
ret.Append = func(idx int) {
ret.Builder.(*Int32Builder).Append(int32(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Int32Builder).UnsafeAppend(int32(idx))
+ }
case arrow.UINT32:
ret.Append = func(idx int) {
ret.Builder.(*Uint32Builder).Append(uint32(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Uint32Builder).UnsafeAppend(uint32(idx))
+ }
case arrow.INT64:
ret.Append = func(idx int) {
ret.Builder.(*Int64Builder).Append(int64(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Int64Builder).UnsafeAppend(int64(idx))
+ }
case arrow.UINT64:
ret.Append = func(idx int) {
ret.Builder.(*Uint64Builder).Append(uint64(idx))
}
+ ret.UnsafeAppend = func(idx int) {
+ ret.Builder.(*Uint64Builder).UnsafeAppend(uint64(idx))
+ }
default:
debug.Assert(false, "dictionary index type must be integral")
err = fmt.Errorf("dictionary index type must be integral, not
%s", dt)
@@ -646,6 +671,14 @@ func (b *dictionaryBuilder) AppendEmptyValues(n int) {
}
}
+func (b *dictionaryBuilder) UnsafeAppendBoolToBitmap(v bool) {
+ if !v {
+ b.nulls += 1
+ }
+ b.length += 1
+ b.idxBuilder.UnsafeAppendBoolToBitmap(v)
+}
+
func (b *dictionaryBuilder) Reserve(n int) {
b.idxBuilder.Reserve(n)
}
@@ -781,6 +814,13 @@ func (b *dictionaryBuilder) insertDictBytes(val []byte)
error {
return err
}
+func (b *dictionaryBuilder) unsafeAppendValue(val interface{}) error {
+ idx, _, err := b.memoTable.GetOrInsert(val)
+ b.idxBuilder.UnsafeAppend(idx)
+ b.length += 1
+ return err
+}
+
func (b *dictionaryBuilder) appendValue(val interface{}) error {
idx, _, err := b.memoTable.GetOrInsert(val)
b.idxBuilder.Append(idx)
@@ -990,7 +1030,39 @@ type dictBuilder[T arrow.ValueType] struct {
dictionaryBuilder
}
+func (b *dictBuilder[T]) UnsafeAppend(v T) {
+ // SAFETY: it is safe to ignore the value returned by the calls to
`unsafeAppendValue()`
+ // here since `UnsafeAppend()` is statically typed and the only case in
which that method
+ // errors is when trying to insert an invalid `interface{}` into the
`memoTable`.
+ var err error
+ switch val := any(v).(type) {
+ case arrow.Duration:
+ err = b.unsafeAppendValue(int64(val))
+ case arrow.Timestamp:
+ err = b.unsafeAppendValue(int64(val))
+ case arrow.Time32:
+ err = b.unsafeAppendValue(int32(val))
+ case arrow.Time64:
+ err = b.unsafeAppendValue(int64(val))
+ case arrow.Date32:
+ err = b.unsafeAppendValue(int32(val))
+ case arrow.Date64:
+ err = b.unsafeAppendValue(int64(val))
+ case arrow.MonthInterval:
+ err = b.unsafeAppendValue(int32(val))
+ default:
+ err = b.unsafeAppendValue(v)
+ }
+ debug.Assert(err == nil, "Trying to insert wrong type into memoTable
even though this method is statically typed. This is an implementation bug.")
+}
+
func (b *dictBuilder[T]) Append(v T) error {
+ // TODO: it is safe to ignore the value returned by the calls to
`appendValue()`
+ // here since `Append()` is statically typed and the only case in which
that
+ // method errors is when trying to insert an invalid `interface{}` into
the `memoTable`.
+ //
+ // This would be a breaking change to the public API of `dictBuilder`,
so it needs
+ // to happen over a major release.
switch val := any(v).(type) {
case arrow.Duration:
return b.appendValue(int64(val))
@@ -1058,6 +1130,12 @@ func (b *BinaryDictionaryBuilder) Append(v []byte) error
{
return nil
}
+ // TODO: it is safe to ignore the value returned by the call to
`appendBytes()`
+ // here since `Append()` is statically typed and the only case in which
that
+ // method errors is when trying to insert an invalid `interface{}` into
the `memoTable`.
+ //
+ // This would be a breaking change to the public API of
`BinaryDictionaryBuilder`,
+ // so it needs to happen over a major release.
return b.appendBytes(v)
}
@@ -1134,6 +1212,13 @@ type fixedSizeDictionaryBuilder[T fsbType] struct {
}
func (b *fixedSizeDictionaryBuilder[T]) Append(v T) error {
+ // TODO: it is safe to ignore the value returned by the calls to
`appendValue()`
+ // and `appendBytes()` here since `Append()` is statically typed and
the only
+ // case in which these method error is when trying to insert an invalid
+ // `interface{}` into the `memoTable`.
+ //
+ // This would be a breaking change to the public API of
`fixedSizeDictionaryBuilder`,
+ // so it needs to happen over a major release.
if v, ok := any(v).([]byte); ok {
return b.appendBytes(v[:b.byteWidth])
}
@@ -1164,6 +1249,12 @@ type FixedSizeBinaryDictionaryBuilder struct {
}
func (b *FixedSizeBinaryDictionaryBuilder) Append(v []byte) error {
+ // TODO: it is safe to ignore the value returned by the calls to
`appendValue()`
+ // here since `Append()` is statically typed and the only case in which
that
+ // method errors is when trying to insert an invalid `interface{}` into
the `memoTable`.
+ //
+ // This would be a breaking change to the public API of
`FixedSizeBinaryDictionaryBuilder`,
+ // so it needs to happen over a major release.
return b.appendValue(v[:b.byteWidth])
}
diff --git a/arrow/array/dictionary_test.go b/arrow/array/dictionary_test.go
index 9b9d3b1f..24aab674 100644
--- a/arrow/array/dictionary_test.go
+++ b/arrow/array/dictionary_test.go
@@ -145,6 +145,47 @@ func (p *PrimitiveDictionaryTestSuite)
TestDictionaryBuilderInit() {
p.True(array.Equal(expected, arr))
}
+func (p *PrimitiveDictionaryTestSuite) TestDictionaryBuilderReserveAndAppend()
{
+ expectedType := &arrow.DictionaryType{IndexType: &arrow.Int8Type{},
ValueType: p.typ}
+ bldr := array.NewDictionaryBuilder(p.mem, expectedType)
+ defer bldr.Release()
+
+ builder := reflect.ValueOf(bldr)
+ appendFn := builder.MethodByName("UnsafeAppend")
+ validFn := builder.MethodByName("UnsafeAppendBoolToBitmap")
+
+ bldr.Reserve(7)
+ validFn.Call([]reflect.Value{reflect.ValueOf(true)})
+ validFn.Call([]reflect.Value{reflect.ValueOf(false)})
+ appendFn.Call([]reflect.Value{reflect.ValueOf(0).Convert(p.reftyp)})
+ appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)})
+ validFn.Call([]reflect.Value{reflect.ValueOf(false)})
+ appendFn.Call([]reflect.Value{reflect.ValueOf(1).Convert(p.reftyp)})
+ appendFn.Call([]reflect.Value{reflect.ValueOf(2).Convert(p.reftyp)})
+
+ p.EqualValues(7, bldr.Len())
+ p.EqualValues(2, bldr.NullN())
+
+ p.EqualValues(3, bldr.DictionarySize())
+
+ arr := bldr.NewArray().(*array.Dictionary)
+ defer arr.Release()
+
+ p.True(arrow.TypeEqual(expectedType, arr.DataType()))
+ expectedDict, _, err := array.FromJSON(p.mem, expectedType.ValueType,
strings.NewReader("[0, 1, 2]"))
+ p.NoError(err)
+ defer expectedDict.Release()
+
+ expectedIndices, _, err := array.FromJSON(p.mem,
expectedType.IndexType, strings.NewReader("[0, null, 0, 1, null, 1, 2]"))
+ p.NoError(err)
+ defer expectedIndices.Release()
+
+ expected := array.NewDictionaryArray(expectedType, expectedIndices,
expectedDict)
+ defer expected.Release()
+
+ p.True(array.Equal(expected, arr))
+}
+
func (p *PrimitiveDictionaryTestSuite) TestDictionaryNewBuilder() {
valueType := p.typ
dictArr, _, err := array.FromJSON(p.mem, valueType,
strings.NewReader("[1, 2]"))
diff --git a/arrow/array/encoded.go b/arrow/array/encoded.go
index 08800d4b..85432a13 100644
--- a/arrow/array/encoded.go
+++ b/arrow/array/encoded.go
@@ -398,6 +398,10 @@ func (b *RunEndEncodedBuilder) AppendNulls(n int) {
}
}
+func (b *RunEndEncodedBuilder) UnsafeAppendBoolToBitmap(v bool) {
+ panic("Calling UnsafeAppendBoolToBitmap on a run-end encoded array is
semantically undefined.")
+}
+
func (b *RunEndEncodedBuilder) NullN() int {
return UnknownNullCount
}