This is an automated email from the ASF dual-hosted git repository.
zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-go.git
The following commit(s) were added to refs/heads/main by this push:
new 8f0ce842 perf(parquet): optimize stats and bloom filters for bool
columns (#715)
8f0ce842 is described below
commit 8f0ce8424dbed073993ecb67fba180cbcb5c13f3
Author: Matt Topol <[email protected]>
AuthorDate: Thu Mar 19 13:23:47 2026 -0400
perf(parquet): optimize stats and bloom filters for bool columns (#715)
### Rationale for this change
The initial bool optimization (#707) added direct bitmap encoding with
`WriteBitmapBatch` and `WriteBitmapBatchSpaced` functions to reduce
allocations during encoding and decoding. However, this didn't implement
the same optimization for statistics updated, bloom filter inserts or
spaced encoding.
### What changes are included in this PR?
1. Update the bloom filter handling to compute hashes directly from
bitmaps, including for spaced values and reuse slice allocation across
iterations.
2. Update statistics handling to update min/max directly from bitmaps
for spaced and non-spaced scenarios.
3. Update the encoder interface for the boolean encoder to have a
`PutSpacedBitmap` and compress bitmaps with validity buffers.
4. Add `writeBitmapValues` and `writeBitmapValuesSpaced` for the boolean
column writer with fallback to `[]bool` conversions if the encoder
doesn't implement the interface.
### Are these changes tested?
Yes, unit tests are added for everything
### Are there any user-facing changes?
No user-facing API changes, this is a pure internal optimization change
for boolean columns.
*Benchmark Results*
*Statistics Update*
```
BEFORE (with bitmap → []bool conversion):
BenchmarkBooleanStatisticsWithConversion-16
153,398 ns/op 109,278 B/op (107 KB) 6 allocs/op
AFTER (direct bitmap operations):
BenchmarkBooleanStatisticsDirectBitmap-16
393 ns/op 2,698 B/op (2.6 KB) 5 allocs/op
```
* 390x faster
* 97.5% less memory
* 1 fewer allocation
*Bloom filter hashing*
```
BEFORE (with bitmap → []bool conversion):
BenchmarkBloomFilterHashingWithConversion-16
1,084,001 ns/op 3,309,593 B/op (3.23 MB) 3 allocs/op
AFTER (direct bitmap operations):
BenchmarkBloomFilterHashingDirectBitmap-16
448,882 ns/op 802,821 B/op (784 KB) 2 allocs/op
```
* 2.4x faster
* 76% less memory
* 2.5 MB saved/operation
* 1 fewer allocation
*Full write path (Stats + Bloom Filter)*
```
BEFORE (with bitmap → []bool conversion):
BenchmarkFullWritePathWithConversion-16
1,211,525 ns/op 3,315,566 B/op (3.24 MB) 15 allocs/op
AFTER (direct bitmap operations):
BenchmarkFullWritePathDirectBitmap-16
580,934 ns/op 807,640 B/op (789 KB) 13 allocs/op
```
* 2.1x faster
* 76% less memory
* 2.5 MB saved per 100k bools written
* 2 fewer allocations
---------
Co-authored-by: Matt <zero@gibson>
Co-authored-by: Claude Sonnet 4.5 <[email protected]>
---
parquet/file/column_writer_types.gen.go | 56 ++--
parquet/file/column_writer_types.gen.go.tmpl | 56 ++--
parquet/internal/encoding/boolean_encoder.go | 97 +++++++
parquet/internal/encoding/encoding_test.go | 198 ++++++++++++++
parquet/metadata/bitmap_benchmark_test.go | 98 +++++++
parquet/metadata/bloom_filter.go | 58 +++++
parquet/metadata/bloom_filter_test.go | 361 ++++++++++++++++++++++++++
parquet/metadata/statistics_test.go | 313 ++++++++++++++++++++++
parquet/metadata/statistics_types.gen.go | 79 +++++-
parquet/metadata/statistics_types.gen.go.tmpl | 80 +++++-
10 files changed, 1348 insertions(+), 48 deletions(-)
diff --git a/parquet/file/column_writer_types.gen.go
b/parquet/file/column_writer_types.gen.go
index 0f7feda0..e7f8c341 100644
--- a/parquet/file/column_writer_types.gen.go
+++ b/parquet/file/column_writer_types.gen.go
@@ -1277,39 +1277,49 @@ func (w *BooleanColumnChunkWriter)
writeBitmapValues(bitmap []byte, bitmapOffset
w.currentEncoder.(encoding.BooleanEncoder).Put(values)
}
- // Note: Statistics and bloom filter updates would require converting
back to []bool
- // For now, skip them to maintain the performance benefit
- // In production, we'd need bitmap-aware statistics/bloom filter methods
+ // Update statistics and bloom filter using bitmap-aware methods
if w.pageStatistics != nil {
- // Convert for statistics (unavoidable for now)
- values := make([]bool, numValues)
- for i := int64(0); i < numValues; i++ {
- values[i] = bitutil.BitIsSet(bitmap,
int(bitmapOffset+i))
- }
- w.pageStatistics.(*metadata.BooleanStatistics).Update(values,
numNulls)
+
w.pageStatistics.(*metadata.BooleanStatistics).UpdateFromBitmap(bitmap,
bitmapOffset, numValues, numNulls)
}
if w.bloomFilter != nil {
- // Convert for bloom filter (unavoidable for now)
- values := make([]bool, numValues)
- for i := int64(0); i < numValues; i++ {
- values[i] = bitutil.BitIsSet(bitmap,
int(bitmapOffset+i))
- }
-
w.bloomFilter.InsertBulk(metadata.GetHashes(w.bloomFilter.Hasher(), values))
+
w.bloomFilter.InsertBulk(metadata.GetHashesFromBitmap(w.bloomFilter.Hasher(),
bitmap, bitmapOffset, numValues))
}
}
// writeBitmapValuesSpaced writes boolean values from a bitmap with validity
information
func (w *BooleanColumnChunkWriter) writeBitmapValuesSpaced(bitmap []byte,
bitmapOffset int64, numRead, numValues int64, validBits []byte, validBitsOffset
int64) {
- // For spaced writes, we need to compress the bitmap according to
validity
- // This requires converting to []bool for now
- // A future optimization could implement bitmap-to-bitmap compression
- spacedValues := make([]bool, numValues)
- for i := int64(0); i < numValues; i++ {
- spacedValues[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
+ // Try to use bitmap-aware encoder interface if available
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues
int64, validBits []byte, validBitsOffset int64) int64
+ }
+
+ if enc, ok := w.currentEncoder.(bitmapSpacedEncoder); ok {
+ // Direct bitmap path - no []bool conversion!
+ enc.PutSpacedBitmap(bitmap, bitmapOffset, numValues, validBits,
validBitsOffset)
+ } else {
+ // Fallback: convert to []bool for encoders that don't support
bitmap interface
+ spacedValues := make([]bool, numValues)
+ for i := int64(0); i < numValues; i++ {
+ spacedValues[i] = bitutil.BitIsSet(bitmap,
int(bitmapOffset+i))
+ }
+
+ if len(spacedValues) != int(numRead) {
+
w.currentEncoder.(encoding.BooleanEncoder).PutSpaced(spacedValues, validBits,
validBitsOffset)
+ } else {
+
w.currentEncoder.(encoding.BooleanEncoder).Put(spacedValues)
+ }
}
- // Use existing spaced write logic
- w.writeValuesSpaced(spacedValues, numRead, numValues, validBits,
validBitsOffset)
+ // Use bitmap-aware statistics update
+ if w.pageStatistics != nil {
+ nulls := numValues - numRead
+
w.pageStatistics.(*metadata.BooleanStatistics).UpdateFromBitmapSpaced(bitmap,
bitmapOffset, numValues, validBits, validBitsOffset, nulls)
+ }
+
+ // Use bitmap-aware bloom filter hashing
+ if w.bloomFilter != nil {
+
w.bloomFilter.InsertBulk(metadata.GetSpacedHashesFromBitmap(w.bloomFilter.Hasher(),
numRead, bitmap, bitmapOffset, numValues, validBits, validBitsOffset))
+ }
}
func (w *BooleanColumnChunkWriter) checkDictionarySizeLimit() {
diff --git a/parquet/file/column_writer_types.gen.go.tmpl
b/parquet/file/column_writer_types.gen.go.tmpl
index d3ed37cf..9a22bb71 100644
--- a/parquet/file/column_writer_types.gen.go.tmpl
+++ b/parquet/file/column_writer_types.gen.go.tmpl
@@ -430,39 +430,49 @@ func (w *{{.Name}}ColumnChunkWriter)
writeBitmapValues(bitmap []byte, bitmapOffs
w.currentEncoder.(encoding.BooleanEncoder).Put(values)
}
- // Note: Statistics and bloom filter updates would require converting back
to []bool
- // For now, skip them to maintain the performance benefit
- // In production, we'd need bitmap-aware statistics/bloom filter methods
+ // Update statistics and bloom filter using bitmap-aware methods
if w.pageStatistics != nil {
- // Convert for statistics (unavoidable for now)
- values := make([]bool, numValues)
- for i := int64(0); i < numValues; i++ {
- values[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
- }
- w.pageStatistics.(*metadata.BooleanStatistics).Update(values, numNulls)
+ w.pageStatistics.(*metadata.BooleanStatistics).UpdateFromBitmap(bitmap,
bitmapOffset, numValues, numNulls)
}
if w.bloomFilter != nil {
- // Convert for bloom filter (unavoidable for now)
- values := make([]bool, numValues)
- for i := int64(0); i < numValues; i++ {
- values[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
- }
- w.bloomFilter.InsertBulk(metadata.GetHashes(w.bloomFilter.Hasher(),
values))
+
w.bloomFilter.InsertBulk(metadata.GetHashesFromBitmap(w.bloomFilter.Hasher(),
bitmap, bitmapOffset, numValues))
}
}
// writeBitmapValuesSpaced writes boolean values from a bitmap with validity
information
func (w *{{.Name}}ColumnChunkWriter) writeBitmapValuesSpaced(bitmap []byte,
bitmapOffset int64, numRead, numValues int64, validBits []byte, validBitsOffset
int64) {
- // For spaced writes, we need to compress the bitmap according to validity
- // This requires converting to []bool for now
- // A future optimization could implement bitmap-to-bitmap compression
- spacedValues := make([]bool, numValues)
- for i := int64(0); i < numValues; i++ {
- spacedValues[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
+ // Try to use bitmap-aware encoder interface if available
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues int64,
validBits []byte, validBitsOffset int64) int64
}
- // Use existing spaced write logic
- w.writeValuesSpaced(spacedValues, numRead, numValues, validBits,
validBitsOffset)
+ if enc, ok := w.currentEncoder.(bitmapSpacedEncoder); ok {
+ // Direct bitmap path - no []bool conversion!
+ enc.PutSpacedBitmap(bitmap, bitmapOffset, numValues, validBits,
validBitsOffset)
+ } else {
+ // Fallback: convert to []bool for encoders that don't support bitmap
interface
+ spacedValues := make([]bool, numValues)
+ for i := int64(0); i < numValues; i++ {
+ spacedValues[i] = bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
+ }
+
+ if len(spacedValues) != int(numRead) {
+ w.currentEncoder.(encoding.BooleanEncoder).PutSpaced(spacedValues,
validBits, validBitsOffset)
+ } else {
+ w.currentEncoder.(encoding.BooleanEncoder).Put(spacedValues)
+ }
+ }
+
+ // Use bitmap-aware statistics update
+ if w.pageStatistics != nil {
+ nulls := numValues - numRead
+
w.pageStatistics.(*metadata.BooleanStatistics).UpdateFromBitmapSpaced(bitmap,
bitmapOffset, numValues, validBits, validBitsOffset, nulls)
+ }
+
+ // Use bitmap-aware bloom filter hashing
+ if w.bloomFilter != nil {
+
w.bloomFilter.InsertBulk(metadata.GetSpacedHashesFromBitmap(w.bloomFilter.Hasher(),
numRead, bitmap, bitmapOffset, numValues, validBits, validBitsOffset))
+ }
}
{{end}}
diff --git a/parquet/internal/encoding/boolean_encoder.go
b/parquet/internal/encoding/boolean_encoder.go
index 37a8309b..e012b98a 100644
--- a/parquet/internal/encoding/boolean_encoder.go
+++ b/parquet/internal/encoding/boolean_encoder.go
@@ -20,6 +20,7 @@ import (
"encoding/binary"
"github.com/apache/arrow-go/v18/arrow/bitutil"
+ "github.com/apache/arrow-go/v18/internal/bitutils"
"github.com/apache/arrow-go/v18/parquet"
"github.com/apache/arrow-go/v18/parquet/internal/debug"
"github.com/apache/arrow-go/v18/parquet/internal/utils"
@@ -30,6 +31,41 @@ const (
boolsInBuf = boolBufSize * 8
)
+// compressBitmapWithValidity extracts only the valid bits from a source
bitmap,
+// compressing it into a contiguous destination bitmap. Uses SetBitRunReader
for
+// efficient iteration over valid runs.
+func compressBitmapWithValidity(
+ srcBitmap []byte,
+ srcOffset int64,
+ numValues int64,
+ validBits []byte,
+ validBitsOffset int64,
+ numValid int64,
+) []byte {
+ if numValid == 0 {
+ return []byte{}
+ }
+
+ // Allocate destination bitmap to hold only valid bits
+ dstBitmap := make([]byte, bitutil.BytesForBits(numValid))
+ dstWriter := utils.NewBitmapWriter(dstBitmap, 0, int(numValid))
+
+ // Use SetBitRunReader to efficiently iterate over valid runs
+ reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset,
numValues)
+ for {
+ run := reader.NextRun()
+ if run.Length == 0 {
+ break
+ }
+
+ // Copy this run of valid bits from source to destination
+ dstWriter.AppendBitmap(srcBitmap, srcOffset+run.Pos, run.Length)
+ }
+
+ dstWriter.Finish()
+ return dstBitmap
+}
+
// PlainBooleanEncoder encodes bools as a bitmap as per the Plain Encoding
type PlainBooleanEncoder struct {
encoder
@@ -99,6 +135,29 @@ func (enc *PlainBooleanEncoder) PutSpaced(in []bool,
validBits []byte, validBits
enc.Put(bufferOut[:nvalid])
}
+// PutSpacedBitmap encodes boolean values directly from a bitmap with validity
information,
+// without converting to []bool. This avoids the 8x memory overhead of bool
slices.
+// It compresses the bitmap by extracting only valid (non-null) bits.
+func (enc *PlainBooleanEncoder) PutSpacedBitmap(bitmap []byte, bitmapOffset
int64, numValues int64, validBits []byte, validBitsOffset int64) int64 {
+ if numValues == 0 {
+ return 0
+ }
+
+ // Count the number of valid values to pre-allocate destination bitmap
+ numValid := int64(bitutil.CountSetBits(validBits, int(validBitsOffset),
int(numValues)))
+ if numValid == 0 {
+ return 0
+ }
+
+ // Compress bitmap: extract only valid bits
+ compressedBitmap := compressBitmapWithValidity(bitmap, bitmapOffset,
numValues, validBits, validBitsOffset, numValid)
+
+ // Encode the compressed bitmap
+ enc.PutBitmap(compressedBitmap, 0, numValid)
+
+ return numValid
+}
+
// EstimatedDataEncodedSize returns the current number of bytes that have
// been buffered so far
func (enc *PlainBooleanEncoder) EstimatedDataEncodedSize() int64 {
@@ -140,6 +199,44 @@ func (enc *RleBooleanEncoder) PutSpaced(in []bool,
validBits []byte, validBitsOf
enc.Put(bufferOut[:nvalid])
}
+// PutSpacedBitmap encodes boolean values from a bitmap with validity
information.
+// Note: RleBooleanEncoder buffers values as []bool for RLE encoding at flush
time,
+// so we extract valid bits to []bool. This is still better than the caller
doing
+// the full bitmap-to-[]bool conversion for all values.
+func (enc *RleBooleanEncoder) PutSpacedBitmap(bitmap []byte, bitmapOffset
int64, numValues int64, validBits []byte, validBitsOffset int64) int64 {
+ if numValues == 0 {
+ return 0
+ }
+
+ // Count valid values
+ numValid := int64(bitutil.CountSetBits(validBits, int(validBitsOffset),
int(numValues)))
+ if numValid == 0 {
+ return 0
+ }
+
+ // Extract valid bits to []bool for buffering
+ // Use SetBitRunReader to efficiently iterate over valid values
+ bufferOut := make([]bool, numValid)
+ idx := 0
+
+ reader := bitutils.NewSetBitRunReader(validBits, validBitsOffset,
numValues)
+ for {
+ run := reader.NextRun()
+ if run.Length == 0 {
+ break
+ }
+
+ // Convert this run of bits to bools
+ for i := int64(0); i < run.Length; i++ {
+ bufferOut[idx] = bitutil.BitIsSet(bitmap,
int(bitmapOffset+run.Pos+i))
+ idx++
+ }
+ }
+
+ enc.Put(bufferOut)
+ return numValid
+}
+
func (enc *RleBooleanEncoder) EstimatedDataEncodedSize() int64 {
return rleLengthInBytes + int64(enc.maxRleBufferSize())
}
diff --git a/parquet/internal/encoding/encoding_test.go
b/parquet/internal/encoding/encoding_test.go
index 721015cc..26e87aa6 100644
--- a/parquet/internal/encoding/encoding_test.go
+++ b/parquet/internal/encoding/encoding_test.go
@@ -1069,3 +1069,201 @@ func TestBooleanPlainDecoderDecodeToBitmapUnaligned(t
*testing.T) {
t.Skip("Decoder does not support DecodeToBitmap")
}
}
+
+func TestPlainBooleanEncoderPutSpacedBitmapBasic(t *testing.T) {
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Optional, -1), 0, 0)
+ enc := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.Plain, false, descr, memory.DefaultAllocator)
+ benc := enc.(encoding.BooleanEncoder)
+
+ // Cast to access PutSpacedBitmap
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues
int64, validBits []byte, validBitsOffset int64) int64
+ }
+ spacedEnc := benc.(bitmapSpacedEncoder)
+
+ // Data bitmap: [true, false, true, true, false, false, true, false]
+ bitmap := []byte{0b01001101} // LSB first
+ validBits := []byte{0xFF} // all valid
+
+ numValid := spacedEnc.PutSpacedBitmap(bitmap, 0, 8, validBits, 0)
+ assert.Equal(t, int64(8), numValid)
+
+ buf, err := enc.FlushValues()
+ require.NoError(t, err)
+
+ // Decode and verify
+ dec := encoding.NewDecoder(parquet.Types.Boolean,
parquet.Encodings.Plain, descr, memory.DefaultAllocator)
+ bdec := dec.(encoding.BooleanDecoder)
+ err = bdec.SetData(8, buf.Bytes())
+ require.NoError(t, err)
+
+ out := make([]bool, 8)
+ n, err := bdec.Decode(out)
+ require.NoError(t, err)
+ assert.Equal(t, 8, n)
+
+ expected := []bool{true, false, true, true, false, false, true, false}
+ assert.Equal(t, expected, out)
+}
+
+func TestPlainBooleanEncoderPutSpacedBitmapWithNulls(t *testing.T) {
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Optional, -1), 0, 0)
+ enc := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.Plain, false, descr, memory.DefaultAllocator)
+ benc := enc.(encoding.BooleanEncoder)
+
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues
int64, validBits []byte, validBitsOffset int64) int64
+ }
+ spacedEnc := benc.(bitmapSpacedEncoder)
+
+ // Data bitmap: [1,1,1,1,0,0,0,0]
+ bitmap := []byte{0b00001111}
+ // Valid bits: [1,0,1,0,1,0,1,0] - only positions 0,2,4,6 are valid
+ validBits := []byte{0b01010101}
+
+ numValid := spacedEnc.PutSpacedBitmap(bitmap, 0, 8, validBits, 0)
+ assert.Equal(t, int64(4), numValid, "should encode 4 valid values")
+
+ buf, err := enc.FlushValues()
+ require.NoError(t, err)
+
+ // Decode and verify - should get compressed bitmap (only valid values)
+ dec := encoding.NewDecoder(parquet.Types.Boolean,
parquet.Encodings.Plain, descr, memory.DefaultAllocator)
+ bdec := dec.(encoding.BooleanDecoder)
+ err = bdec.SetData(4, buf.Bytes())
+ require.NoError(t, err)
+
+ out := make([]bool, 4)
+ n, err := bdec.Decode(out)
+ require.NoError(t, err)
+ assert.Equal(t, 4, n)
+
+ // Valid positions 0,2,4,6 have data bits 1,1,0,0
+ expected := []bool{true, true, false, false}
+ assert.Equal(t, expected, out)
+}
+
+func TestPlainBooleanEncoderPutSpacedBitmapConsistency(t *testing.T) {
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Optional, -1), 0, 0)
+
+ // Verify PutSpacedBitmap produces same output as PutSpaced with []bool
+ bitmap := []byte{0b10110010}
+ validBits := []byte{0xFF}
+ numValues := int64(8)
+
+ // Encode using PutSpacedBitmap
+ enc1 := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.Plain, false, descr, memory.DefaultAllocator)
+ benc1 := enc1.(encoding.BooleanEncoder)
+
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues
int64, validBits []byte, validBitsOffset int64) int64
+ }
+ spacedEnc1 := benc1.(bitmapSpacedEncoder)
+
+ spacedEnc1.PutSpacedBitmap(bitmap, 0, numValues, validBits, 0)
+ buf1, err := enc1.FlushValues()
+ require.NoError(t, err)
+
+ // Encode using PutSpaced with []bool
+ enc2 := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.Plain, false, descr, memory.DefaultAllocator)
+ benc2 := enc2.(encoding.BooleanEncoder)
+
+ bools := make([]bool, numValues)
+ for i := int64(0); i < numValues; i++ {
+ bools[i] = bitutil.BitIsSet(bitmap, int(i))
+ }
+ benc2.PutSpaced(bools, validBits, 0)
+ buf2, err := enc2.FlushValues()
+ require.NoError(t, err)
+
+ // Both should produce identical encoded output
+ assert.Equal(t, buf2.Bytes(), buf1.Bytes(), "encoded output should
match")
+}
+
+func TestRleBooleanEncoderPutSpacedBitmap(t *testing.T) {
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Optional, -1), 0, 0)
+ enc := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.RLE, false, descr, memory.DefaultAllocator)
+ benc := enc.(encoding.BooleanEncoder)
+
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues
int64, validBits []byte, validBitsOffset int64) int64
+ }
+ spacedEnc, ok := benc.(bitmapSpacedEncoder)
+ require.True(t, ok, "RleBooleanEncoder should implement
bitmapSpacedEncoder interface")
+
+ // Data bitmap: [true,true,false,true,false,false,true,false]
+ bitmap := []byte{0b01001011}
+ // Valid bits: [1,0,1,0,1,0,1,0] - positions 0,2,4,6 are valid
+ validBits := []byte{0b01010101}
+
+ numValid := spacedEnc.PutSpacedBitmap(bitmap, 0, 8, validBits, 0)
+ assert.Equal(t, int64(4), numValid, "should encode 4 valid values")
+
+ buf, err := enc.FlushValues()
+ require.NoError(t, err)
+ defer buf.Release()
+
+ // Decode and verify - should get only the valid positions: [true,
false, false, true]
+ dec := encoding.NewDecoder(parquet.Types.Boolean,
parquet.Encodings.RLE, descr, memory.DefaultAllocator)
+ bdec := dec.(encoding.BooleanDecoder)
+ err = bdec.SetData(4, buf.Bytes())
+ require.NoError(t, err)
+
+ out := make([]bool, 4)
+ n, err := bdec.Decode(out)
+ require.NoError(t, err)
+ assert.Equal(t, 4, n)
+
+ // Valid values are at positions 0,2,4,6 with values
true,false,false,true
+ expected := []bool{true, false, false, true}
+ assert.Equal(t, expected, out)
+}
+
+func TestRleBooleanEncoderPutSpacedBitmapConsistency(t *testing.T) {
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Optional, -1), 0, 0)
+
+ // Create test data
+ bitmap := []byte{0b10110101}
+ validBits := []byte{0b11110011}
+
+ // Encode with bitmap method
+ enc1 := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.RLE, false, descr, memory.DefaultAllocator)
+ benc1 := enc1.(encoding.BooleanEncoder)
+ type bitmapSpacedEncoder interface {
+ PutSpacedBitmap(bitmap []byte, bitmapOffset int64, numValues
int64, validBits []byte, validBitsOffset int64) int64
+ }
+ spacedEnc := benc1.(bitmapSpacedEncoder)
+ numValid := spacedEnc.PutSpacedBitmap(bitmap, 0, 8, validBits, 0)
+ buf1, _ := enc1.FlushValues()
+ defer buf1.Release()
+
+ // Encode with []bool method (reference)
+ enc2 := encoding.NewEncoder(parquet.Types.Boolean,
parquet.Encodings.RLE, false, descr, memory.DefaultAllocator)
+ benc2 := enc2.(encoding.BooleanEncoder)
+ spacedValues := make([]bool, 8)
+ for i := 0; i < 8; i++ {
+ spacedValues[i] = bitutil.BitIsSet(bitmap, i)
+ }
+ benc2.PutSpaced(spacedValues, validBits, 0)
+ buf2, _ := enc2.FlushValues()
+ defer buf2.Release()
+
+ // Both should produce identical output
+ assert.Equal(t, buf2.Bytes(), buf1.Bytes(), "bitmap and []bool methods
should produce identical output")
+
+ // Decode both and verify they match
+ dec1 := encoding.NewDecoder(parquet.Types.Boolean,
parquet.Encodings.RLE, descr, memory.DefaultAllocator)
+ bdec1 := dec1.(encoding.BooleanDecoder)
+ bdec1.SetData(int(numValid), buf1.Bytes())
+
+ dec2 := encoding.NewDecoder(parquet.Types.Boolean,
parquet.Encodings.RLE, descr, memory.DefaultAllocator)
+ bdec2 := dec2.(encoding.BooleanDecoder)
+ bdec2.SetData(int(numValid), buf2.Bytes())
+
+ out1 := make([]bool, numValid)
+ out2 := make([]bool, numValid)
+ bdec1.Decode(out1)
+ bdec2.Decode(out2)
+
+ assert.Equal(t, out2, out1, "decoded values should match")
+}
diff --git a/parquet/metadata/bitmap_benchmark_test.go
b/parquet/metadata/bitmap_benchmark_test.go
new file mode 100644
index 00000000..ef61d6ff
--- /dev/null
+++ b/parquet/metadata/bitmap_benchmark_test.go
@@ -0,0 +1,98 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package metadata
+
+import (
+ "testing"
+
+ "github.com/apache/arrow-go/v18/arrow/bitutil"
+ "github.com/apache/arrow-go/v18/arrow/memory"
+ "github.com/apache/arrow-go/v18/parquet"
+ "github.com/apache/arrow-go/v18/parquet/schema"
+)
+
+// BenchmarkBooleanStatisticsFromBitmap benchmarks statistics update using
direct bitmap operations
+func BenchmarkBooleanStatisticsFromBitmap(b *testing.B) {
+ const numValues = 100000
+
+ bitmap := make([]byte, bitutil.BytesForBits(int64(numValues)))
+ for i := 0; i < numValues; i++ {
+ if i%2 == 0 {
+ bitutil.SetBit(bitmap, i)
+ }
+ }
+
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Required, -1), 0, 0)
+
+ b.ResetTimer()
+ b.ReportAllocs()
+
+ for i := 0; i < b.N; i++ {
+ stats := NewBooleanStatistics(descr, memory.DefaultAllocator)
+ stats.UpdateFromBitmap(bitmap, 0, numValues, 0)
+ }
+}
+
+// BenchmarkBloomFilterHashingFromBitmap benchmarks bloom filter hashing using
direct bitmap operations
+func BenchmarkBloomFilterHashingFromBitmap(b *testing.B) {
+ const numValues = 100000
+
+ bitmap := make([]byte, bitutil.BytesForBits(int64(numValues)))
+ for i := 0; i < numValues; i++ {
+ if i%2 == 0 {
+ bitutil.SetBit(bitmap, i)
+ }
+ }
+
+ bloom := NewBloomFilter(1024, 1024, memory.DefaultAllocator)
+
+ b.ResetTimer()
+ b.ReportAllocs()
+
+ for i := 0; i < b.N; i++ {
+ hashes := GetHashesFromBitmap(bloom.Hasher(), bitmap, 0,
numValues)
+ _ = hashes
+ }
+}
+
+// BenchmarkBooleanWritePathBitmap benchmarks the complete write path with
bitmap operations
+func BenchmarkBooleanWritePathBitmap(b *testing.B) {
+ const numValues = 100000
+
+ bitmap := make([]byte, bitutil.BytesForBits(int64(numValues)))
+ for i := 0; i < numValues; i++ {
+ if i%2 == 0 {
+ bitutil.SetBit(bitmap, i)
+ }
+ }
+
+ descr := schema.NewColumn(schema.NewBooleanNode("bool",
parquet.Repetitions.Required, -1), 0, 0)
+
+ b.ResetTimer()
+ b.ReportAllocs()
+
+ for i := 0; i < b.N; i++ {
+ // Statistics update
+ stats := NewBooleanStatistics(descr, memory.DefaultAllocator)
+ stats.UpdateFromBitmap(bitmap, 0, numValues, 0)
+
+ // Bloom filter update
+ bloom := NewBloomFilter(1024, 1024, memory.DefaultAllocator)
+ hashes := GetHashesFromBitmap(bloom.Hasher(), bitmap, 0,
numValues)
+ bloom.InsertBulk(hashes)
+ }
+}
diff --git a/parquet/metadata/bloom_filter.go b/parquet/metadata/bloom_filter.go
index f0ec8faf..13a4730c 100644
--- a/parquet/metadata/bloom_filter.go
+++ b/parquet/metadata/bloom_filter.go
@@ -134,6 +134,64 @@ func GetSpacedHashes[T parquet.ColumnTypes](h Hasher,
numValid int64, vals []T,
return out
}
+// GetHashesFromBitmap computes hashes for boolean values directly from a
bitmap
+// without converting to []bool, avoiding 8x memory overhead.
+func GetHashesFromBitmap(h Hasher, bitmap []byte, bitmapOffset int64,
numValues int64) []uint64 {
+ if numValues == 0 {
+ return []uint64{}
+ }
+
+ // Convert each bool bit to []byte for hashing
+ // Reuse a single-byte slice to avoid allocating per value
+ out := make([]uint64, numValues)
+ b := []byte{0}
+ for i := range numValues {
+ val := bitutil.BitIsSet(bitmap, int(bitmapOffset+i))
+ // Hash the boolean as a single byte (0x00 or 0x01)
+ if val {
+ b[0] = 1
+ } else {
+ b[0] = 0
+ }
+ out[i] = h.Sum64(b)
+ }
+ return out
+}
+
+// GetSpacedHashesFromBitmap computes hashes for boolean values from a bitmap
with validity information,
+// only hashing valid (non-null) values. Returns hashes for numValid values.
+// This avoids the 8x memory overhead of converting to []bool.
+func GetSpacedHashesFromBitmap(h Hasher, numValid int64, bitmap []byte,
bitmapOffset int64, numValues int64, validBits []byte, validBitsOffset int64)
[]uint64 {
+ if numValid == 0 {
+ return []uint64{}
+ }
+
+ out := make([]uint64, 0, numValid)
+
+ // Use SetBitRunReader to efficiently iterate over valid values
+ // Reuse a single-byte slice to avoid allocating per value
+ b := []byte{0}
+ setReader := bitutils.NewSetBitRunReader(validBits, validBitsOffset,
numValues)
+ for {
+ run := setReader.NextRun()
+ if run.Length == 0 {
+ break
+ }
+
+ // Hash each valid bit in this run
+ for i := range run.Length {
+ val := bitutil.BitIsSet(bitmap,
int(bitmapOffset+run.Pos+i))
+ if val {
+ b[0] = 1
+ } else {
+ b[0] = 0
+ }
+ out = append(out, h.Sum64(b))
+ }
+ }
+ return out
+}
+
func getBytes[T parquet.ColumnTypes](v T) []byte {
switch v := any(v).(type) {
case parquet.ByteArray:
diff --git a/parquet/metadata/bloom_filter_test.go
b/parquet/metadata/bloom_filter_test.go
index ea671438..011f67a5 100644
--- a/parquet/metadata/bloom_filter_test.go
+++ b/parquet/metadata/bloom_filter_test.go
@@ -360,3 +360,364 @@ func TestAdaptiveBloomFilterEndToEnd(t *testing.T) {
len(bf.candidates),
bf.optimalCandidate().bloomFilter.Size())
})
}
+
+func TestGetHashesFromBitmap(t *testing.T) {
+ mem := memory.DefaultAllocator
+ bf := NewBloomFilter(32, 1024, mem)
+ hasher := bf.Hasher()
+
+ t.Run("empty bitmap", func(t *testing.T) {
+ hashes := GetHashesFromBitmap(hasher, []byte{}, 0, 0)
+ assert.Empty(t, hashes)
+ })
+
+ t.Run("single byte aligned", func(t *testing.T) {
+ bitmap := []byte{0b10101010} // alternating true/false
+ numValues := int64(8)
+
+ hashes := GetHashesFromBitmap(hasher, bitmap, 0, numValues)
+ assert.Len(t, hashes, 8)
+
+ // Verify each hash corresponds to the bit value
+ // Bit 0 = 0 (false), Bit 1 = 1 (true), Bit 2 = 0 (false), etc.
+ for i := int64(0); i < numValues; i++ {
+ val := bitutil.BitIsSet(bitmap, int(i))
+ expectedHash := GetHash(hasher, val)
+ assert.Equal(t, expectedHash, hashes[i], "hash mismatch
at position %d", i)
+ }
+ })
+
+ t.Run("unaligned offset", func(t *testing.T) {
+ // Create bitmap: [0,0,0,1,1,0,1,0, 1,1,1,...]
+ bitmap := make([]byte, 2)
+ bitutil.SetBit(bitmap, 3)
+ bitutil.SetBit(bitmap, 4)
+ bitutil.SetBit(bitmap, 6)
+ bitutil.SetBit(bitmap, 8)
+ bitutil.SetBit(bitmap, 9)
+ bitutil.SetBit(bitmap, 10)
+
+ // Read 5 bits starting from offset 3: [1,1,0,1,0]
+ offset := int64(3)
+ numValues := int64(5)
+
+ hashes := GetHashesFromBitmap(hasher, bitmap, offset, numValues)
+ assert.Len(t, hashes, 5)
+
+ // Verify the hashes match the bit values at the offset
+ for i := int64(0); i < numValues; i++ {
+ val := bitutil.BitIsSet(bitmap, int(offset+i))
+ expectedHash := GetHash(hasher, val)
+ assert.Equal(t, expectedHash, hashes[i], "hash mismatch
at offset %d position %d", offset, i)
+ }
+ })
+
+ t.Run("all true bits", func(t *testing.T) {
+ bitmap := []byte{0xFF, 0xFF} // all bits set
+ numValues := int64(16)
+
+ hashes := GetHashesFromBitmap(hasher, bitmap, 0, numValues)
+ assert.Len(t, hashes, 16)
+
+ // All hashes should be identical (hash of true)
+ expectedHash := GetHash(hasher, true)
+ for i, hash := range hashes {
+ assert.Equal(t, expectedHash, hash, "hash mismatch at
position %d", i)
+ }
+ })
+
+ t.Run("all false bits", func(t *testing.T) {
+ bitmap := []byte{0x00, 0x00} // all bits clear
+ numValues := int64(16)
+
+ hashes := GetHashesFromBitmap(hasher, bitmap, 0, numValues)
+ assert.Len(t, hashes, 16)
+
+ // All hashes should be identical (hash of false)
+ expectedHash := GetHash(hasher, false)
+ for i, hash := range hashes {
+ assert.Equal(t, expectedHash, hash, "hash mismatch at
position %d", i)
+ }
+ })
+
+ t.Run("large bitmap", func(t *testing.T) {
+ numValues := int64(1000)
+ bitmap := make([]byte, bitutil.BytesForBits(numValues))
+
+ // Set every 3rd bit
+ for i := int64(0); i < numValues; i += 3 {
+ bitutil.SetBit(bitmap, int(i))
+ }
+
+ hashes := GetHashesFromBitmap(hasher, bitmap, 0, numValues)
+ assert.Len(t, hashes, 1000)
+
+ // Verify hashes match bit values
+ hashTrue := GetHash(hasher, true)
+ hashFalse := GetHash(hasher, false)
+
+ for i := int64(0); i < numValues; i++ {
+ if i%3 == 0 {
+ assert.Equal(t, hashTrue, hashes[i], "expected
true hash at position %d", i)
+ } else {
+ assert.Equal(t, hashFalse, hashes[i], "expected
false hash at position %d", i)
+ }
+ }
+ })
+
+ t.Run("consistency with GetHash", func(t *testing.T) {
+ // Verify that GetHashesFromBitmap produces same results as
GetHash for each bool
+ testCases := []struct {
+ name string
+ bitmap []byte
+ offset int64
+ count int64
+ }{
+ {"aligned_8", []byte{0b10110100}, 0, 8},
+ {"unaligned_start", []byte{0xFF, 0x00}, 3, 10},
+ {"single_bit", []byte{0b00000001}, 0, 1},
+ {"partial_byte", []byte{0b11110000}, 2, 5},
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ hashes := GetHashesFromBitmap(hasher,
tc.bitmap, tc.offset, tc.count)
+
+ // Generate expected hashes using GetHash on
individual bools
+ for i := int64(0); i < tc.count; i++ {
+ val := bitutil.BitIsSet(tc.bitmap,
int(tc.offset+i))
+ expectedHash := GetHash(hasher, val)
+ assert.Equal(t, expectedHash, hashes[i],
+ "hash mismatch at position %d
(bit value: %v)", i, val)
+ }
+ })
+ }
+ })
+}
+
+func TestGetSpacedHashesFromBitmap(t *testing.T) {
+ mem := memory.DefaultAllocator
+ bf := NewBloomFilter(32, 1024, mem)
+ hasher := bf.Hasher()
+
+ t.Run("all valid bits", func(t *testing.T) {
+ bitmap := []byte{0b10101010} // alternating true/false
+ validBits := []byte{0xFF} // all valid
+ numValues := int64(8)
+ numValid := int64(8)
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
0, numValues, validBits, 0)
+ assert.Len(t, hashes, 8)
+
+ // Should match non-spaced version since all are valid
+ expectedHashes := GetHashesFromBitmap(hasher, bitmap, 0,
numValues)
+ assert.Equal(t, expectedHashes, hashes)
+ })
+
+ t.Run("some null values", func(t *testing.T) {
+ // Data bitmap: [1,1,1,1,0,0,0,0]
+ bitmap := []byte{0b00001111}
+ // Valid bits: [1,0,1,0,1,0,1,0] - only positions 0,2,4,6 are
valid
+ validBits := []byte{0b01010101}
+ numValues := int64(8)
+ numValid := int64(4)
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
0, numValues, validBits, 0)
+ assert.Len(t, hashes, 4, "should only hash valid values")
+
+ // Manually verify the valid positions: 0,2,4,6 with data bits:
1,1,0,0
+ expectedHashes := []uint64{
+ GetHash(hasher, true), // position 0: bit is set
+ GetHash(hasher, true), // position 2: bit is set
+ GetHash(hasher, false), // position 4: bit is clear
+ GetHash(hasher, false), // position 6: bit is clear
+ }
+ assert.Equal(t, expectedHashes, hashes)
+ })
+
+ t.Run("all null values", func(t *testing.T) {
+ bitmap := []byte{0xFF}
+ validBits := []byte{0x00} // all null
+
+ hashes := GetSpacedHashesFromBitmap(hasher, 0, bitmap, 0, 8,
validBits, 0)
+ assert.Empty(t, hashes, "should return empty for all nulls")
+ })
+
+ t.Run("unaligned offsets", func(t *testing.T) {
+ // Create larger bitmaps with offsets
+ bitmap := make([]byte, 3)
+ validBits := make([]byte, 3)
+
+ // Set data bits starting from offset 5
+ bitutil.SetBit(bitmap, 5) // true
+ bitutil.SetBit(bitmap, 6) // true
+ // bit 7 = false
+ bitutil.SetBit(bitmap, 8) // true
+ // bit 9 = false
+ bitutil.SetBit(bitmap, 10) // true
+
+ // Set valid bits (skip position 6)
+ bitutil.SetBit(validBits, 5) // valid
+ // bit 6 is null
+ bitutil.SetBit(validBits, 7) // valid
+ bitutil.SetBit(validBits, 8) // valid
+ bitutil.SetBit(validBits, 9) // valid
+ bitutil.SetBit(validBits, 10) // valid
+
+ numValues := int64(6)
+ numValid := int64(5) // all except position 6
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
5, numValues, validBits, 5)
+ assert.Len(t, hashes, 5)
+
+ // Verify each hash: positions 5(true), 7(false), 8(true),
9(false), 10(true)
+ expectedHashes := []uint64{
+ GetHash(hasher, true), // position 5
+ GetHash(hasher, false), // position 7
+ GetHash(hasher, true), // position 8
+ GetHash(hasher, false), // position 9
+ GetHash(hasher, true), // position 10
+ }
+ assert.Equal(t, expectedHashes, hashes)
+ })
+
+ t.Run("first half valid", func(t *testing.T) {
+ bitmap := []byte{0xFF, 0xFF} // all true
+ validBits := []byte{0xFF, 0x00} // first 8 valid, last 8 null
+ numValues := int64(16)
+ numValid := int64(8)
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
0, numValues, validBits, 0)
+ assert.Len(t, hashes, 8)
+
+ // All should be hash of true
+ expectedHash := GetHash(hasher, true)
+ for i, hash := range hashes {
+ assert.Equal(t, expectedHash, hash, "position %d", i)
+ }
+ })
+
+ t.Run("second half valid", func(t *testing.T) {
+ bitmap := []byte{0xFF, 0x00} // first 8 true, last 8 false
+ validBits := []byte{0x00, 0xFF} // first 8 null, last 8 valid
+ numValues := int64(16)
+ numValid := int64(8)
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
0, numValues, validBits, 0)
+ assert.Len(t, hashes, 8)
+
+ // All should be hash of false
+ expectedHash := GetHash(hasher, false)
+ for i, hash := range hashes {
+ assert.Equal(t, expectedHash, hash, "position %d", i)
+ }
+ })
+
+ t.Run("sparse valid bits", func(t *testing.T) {
+ // Every 3rd bit is valid
+ numValues := int64(24)
+ bitmap := make([]byte, bitutil.BytesForBits(numValues))
+ validBits := make([]byte, bitutil.BytesForBits(numValues))
+
+ // Set data: alternating pattern
+ for i := int64(0); i < numValues; i++ {
+ if i%2 == 0 {
+ bitutil.SetBit(bitmap, int(i))
+ }
+ }
+
+ // Set valid: every 3rd bit
+ numValid := int64(0)
+ for i := int64(0); i < numValues; i += 3 {
+ bitutil.SetBit(validBits, int(i))
+ numValid++
+ }
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
0, numValues, validBits, 0)
+ assert.Len(t, hashes, int(numValid))
+
+ // Verify each hash matches the expected bit value
+ hashIdx := 0
+ for i := int64(0); i < numValues; i += 3 {
+ val := bitutil.BitIsSet(bitmap, int(i))
+ expectedHash := GetHash(hasher, val)
+ assert.Equal(t, expectedHash, hashes[hashIdx], "hash
mismatch at position %d", i)
+ hashIdx++
+ }
+ })
+
+ t.Run("consistency with GetSpacedHashes", func(t *testing.T) {
+ // Verify GetSpacedHashesFromBitmap produces same results as
GetSpacedHashes with []bool
+ testCases := []struct {
+ name string
+ bitmap []byte
+ validBits []byte
+ bitmapOffset int64
+ validOffset int64
+ numValues int64
+ numValid int64
+ }{
+ {"all_valid", []byte{0b10110010}, []byte{0xFF}, 0, 0,
8, 8},
+ {"half_valid", []byte{0xFF}, []byte{0x0F}, 0, 0, 8, 4},
+ {"sparse", []byte{0xFF, 0x00}, []byte{0b01010101,
0b10101010}, 0, 0, 16, 8},
+ {"unaligned", []byte{0xFF, 0xFF, 0xFF}, []byte{0xFF,
0x0F, 0x00}, 3, 3, 10, 9},
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ // Get hashes using bitmap method
+ hashesFromBitmap :=
GetSpacedHashesFromBitmap(hasher, tc.numValid, tc.bitmap, tc.bitmapOffset,
tc.numValues, tc.validBits, tc.validOffset)
+
+ // Get hashes using []bool method
+ bools := make([]bool, tc.numValues)
+ for i := int64(0); i < tc.numValues; i++ {
+ bools[i] = bitutil.BitIsSet(tc.bitmap,
int(tc.bitmapOffset+i))
+ }
+ hashesFromBool := GetSpacedHashes(hasher,
tc.numValid, bools, tc.validBits, tc.validOffset)
+
+ // Both should produce identical results
+ assert.Equal(t, hashesFromBool,
hashesFromBitmap, "hash mismatch")
+ assert.Len(t, hashesFromBitmap,
int(tc.numValid))
+ })
+ }
+ })
+
+ t.Run("large bitmap with sparse valid", func(t *testing.T) {
+ numValues := int64(1000)
+ bitmap := make([]byte, bitutil.BytesForBits(numValues))
+ validBits := make([]byte, bitutil.BytesForBits(numValues))
+
+ // Set every 5th bit in bitmap
+ for i := int64(0); i < numValues; i += 5 {
+ bitutil.SetBit(bitmap, int(i))
+ }
+
+ // Set every 7th bit as valid
+ numValid := int64(0)
+ for i := int64(0); i < numValues; i += 7 {
+ bitutil.SetBit(validBits, int(i))
+ numValid++
+ }
+
+ hashes := GetSpacedHashesFromBitmap(hasher, numValid, bitmap,
0, numValues, validBits, 0)
+ assert.Len(t, hashes, int(numValid))
+
+ // Verify count of true vs false hashes
+ hashTrue := GetHash(hasher, true)
+ hashFalse := GetHash(hasher, false)
+
+ trueCount, falseCount := 0, 0
+ for _, h := range hashes {
+ switch h {
+ case hashTrue:
+ trueCount++
+ case hashFalse:
+ falseCount++
+ default:
+ t.Fatalf("unexpected hash value")
+ }
+ }
+
+ assert.Equal(t, int(numValid), trueCount+falseCount, "all
hashes should be true or false")
+ })
+}
diff --git a/parquet/metadata/statistics_test.go
b/parquet/metadata/statistics_test.go
index 70dd3ebc..d312437a 100644
--- a/parquet/metadata/statistics_test.go
+++ b/parquet/metadata/statistics_test.go
@@ -290,3 +290,316 @@ func TestUnsignedDefaultStats(t *testing.T) {
assert.Zero(t, maxv)
}
}
+
+func TestBooleanStatisticsUpdateFromBitmap(t *testing.T) {
+ mem := memory.DefaultAllocator
+ node, err := schema.NewPrimitiveNode("test",
parquet.Repetitions.Required, parquet.Types.Boolean, 0, 0)
+ require.NoError(t, err)
+ descr := schema.NewColumn(node, 0, 0)
+
+ t.Run("all true bits", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ bitmap := []byte{0xFF, 0xFF} // all bits set
+ numValues := int64(16)
+
+ stats.UpdateFromBitmap(bitmap, 0, numValues, 0)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, true, stats.Min(), "min should be true when all
bits are true")
+ assert.Equal(t, true, stats.Max(), "max should be true when all
bits are true")
+ assert.Equal(t, numValues, stats.NumValues())
+ assert.Equal(t, int64(0), stats.NullCount())
+ })
+
+ t.Run("all false bits", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ bitmap := []byte{0x00, 0x00} // all bits clear
+ numValues := int64(16)
+
+ stats.UpdateFromBitmap(bitmap, 0, numValues, 0)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, false, stats.Min(), "min should be false when
all bits are false")
+ assert.Equal(t, false, stats.Max(), "max should be false when
all bits are false")
+ assert.Equal(t, numValues, stats.NumValues())
+ assert.Equal(t, int64(0), stats.NullCount())
+ })
+
+ t.Run("mixed true and false", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ bitmap := []byte{0b10101010} // alternating bits
+ numValues := int64(8)
+
+ stats.UpdateFromBitmap(bitmap, 0, numValues, 0)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, false, stats.Min(), "min should be false with
mixed bits")
+ assert.Equal(t, true, stats.Max(), "max should be true with
mixed bits")
+ assert.Equal(t, numValues, stats.NumValues())
+ })
+
+ t.Run("unaligned offset", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ // Bitmap: [0,0,0,1,1,0,1,0, 1,1,...]
+ bitmap := make([]byte, 2)
+ bitutil.SetBit(bitmap, 3)
+ bitutil.SetBit(bitmap, 4)
+ bitutil.SetBit(bitmap, 6)
+ bitutil.SetBit(bitmap, 8)
+ bitutil.SetBit(bitmap, 9)
+
+ // Read 5 bits starting from offset 3: [1,1,0,1,0] -> has both
true and false
+ offset := int64(3)
+ numValues := int64(5)
+
+ stats.UpdateFromBitmap(bitmap, offset, numValues, 0)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, false, stats.Min(), "should find false bit")
+ assert.Equal(t, true, stats.Max(), "should find true bit")
+ assert.Equal(t, numValues, stats.NumValues())
+ })
+
+ t.Run("with nulls", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ bitmap := []byte{0xFF}
+ numValues := int64(8)
+ numNulls := int64(3)
+
+ stats.UpdateFromBitmap(bitmap, 0, numValues, numNulls)
+
+ assert.Equal(t, numValues, stats.NumValues())
+ assert.Equal(t, numNulls, stats.NullCount())
+ })
+
+ t.Run("multiple updates", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+
+ // First update: all true
+ bitmap1 := []byte{0xFF}
+ stats.UpdateFromBitmap(bitmap1, 0, 8, 0)
+ assert.Equal(t, true, stats.Min())
+ assert.Equal(t, true, stats.Max())
+
+ // Second update: has false - should update min
+ bitmap2 := []byte{0x00}
+ stats.UpdateFromBitmap(bitmap2, 0, 8, 0)
+ assert.Equal(t, false, stats.Min(), "min should update to
false")
+ assert.Equal(t, true, stats.Max(), "max should remain true")
+ assert.Equal(t, int64(16), stats.NumValues())
+ })
+
+ t.Run("early exit optimization", func(t *testing.T) {
+ // Create a large bitmap with both true and false in first few
bits
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ numValues := int64(10000)
+ bitmap := make([]byte, bitutil.BytesForBits(numValues))
+
+ // Set first bit to true, second to false (early exit should
trigger)
+ bitutil.SetBit(bitmap, 0) // true
+ // bit 1 is false
+
+ stats.UpdateFromBitmap(bitmap, 0, numValues, 0)
+
+ assert.Equal(t, false, stats.Min())
+ assert.Equal(t, true, stats.Max())
+ })
+
+ t.Run("consistency with Update method", func(t *testing.T) {
+ // Verify UpdateFromBitmap produces same results as Update with
[]bool
+ testCases := []struct {
+ name string
+ bitmap []byte
+ offset int64
+ count int64
+ }{
+ {"all_true", []byte{0xFF}, 0, 8},
+ {"all_false", []byte{0x00}, 0, 8},
+ {"mixed", []byte{0b10110010}, 0, 8},
+ {"unaligned", []byte{0xFF, 0x00}, 3, 10},
+ }
+
+ for _, tc := range testCases {
+ t.Run(tc.name, func(t *testing.T) {
+ stats1 := metadata.NewBooleanStatistics(descr,
mem)
+ stats2 := metadata.NewBooleanStatistics(descr,
mem)
+
+ // Update using bitmap
+ stats1.UpdateFromBitmap(tc.bitmap, tc.offset,
tc.count, 0)
+
+ // Update using []bool
+ bools := make([]bool, tc.count)
+ for i := int64(0); i < tc.count; i++ {
+ bools[i] = bitutil.BitIsSet(tc.bitmap,
int(tc.offset+i))
+ }
+ stats2.Update(bools, 0)
+
+ // Both should produce same results
+ assert.Equal(t, stats2.Min(), stats1.Min(),
"min mismatch")
+ assert.Equal(t, stats2.Max(), stats1.Max(),
"max mismatch")
+ assert.Equal(t, stats2.NumValues(),
stats1.NumValues(), "numValues mismatch")
+ })
+ }
+ })
+}
+
+func TestBooleanStatisticsUpdateFromBitmapSpaced(t *testing.T) {
+ mem := memory.DefaultAllocator
+ node, err := schema.NewPrimitiveNode("test",
parquet.Repetitions.Optional, parquet.Types.Boolean, 0, 0)
+ require.NoError(t, err)
+ descr := schema.NewColumn(node, 0, 0)
+
+ t.Run("all valid bits", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ bitmap := []byte{0b10101010} // data: alternating
+ validBits := []byte{0xFF} // all valid
+ numValues := int64(8)
+
+ stats.UpdateFromBitmapSpaced(bitmap, 0, numValues, validBits,
0, 0)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, false, stats.Min())
+ assert.Equal(t, true, stats.Max())
+ assert.Equal(t, numValues, stats.NumValues())
+ })
+
+ t.Run("some null values", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ // Data bitmap: [1,1,1,1,0,0,0,0]
+ bitmap := []byte{0b00001111}
+ // Valid bits: [1,0,1,0,1,0,1,0] - only positions 0,2,4,6 are
valid
+ validBits := []byte{0b01010101}
+ numValues := int64(8)
+ numNulls := int64(4)
+
+ stats.UpdateFromBitmapSpaced(bitmap, 0, numValues, validBits,
0, numNulls)
+
+ assert.True(t, stats.HasMinMax())
+ // Valid positions are 0,2,4,6 with data bits: 1,1,0,0
+ assert.Equal(t, false, stats.Min(), "should find false at
position 4 or 6")
+ assert.Equal(t, true, stats.Max(), "should find true at
position 0 or 2")
+ assert.Equal(t, int64(4), stats.NumValues(), "should count only
valid values")
+ assert.Equal(t, numNulls, stats.NullCount())
+ })
+
+ t.Run("all null values", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ bitmap := []byte{0xFF}
+ validBits := []byte{0x00} // all null
+ numValues := int64(8)
+
+ stats.UpdateFromBitmapSpaced(bitmap, 0, numValues, validBits,
0, numValues)
+
+ assert.False(t, stats.HasMinMax(), "should not have min/max
with all nulls")
+ assert.Equal(t, int64(0), stats.NumValues())
+ assert.Equal(t, numValues, stats.NullCount())
+ })
+
+ t.Run("unaligned offsets", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ // Create larger bitmaps with offset
+ bitmap := make([]byte, 3)
+ validBits := make([]byte, 3)
+
+ // Set some data bits starting from offset 5
+ bitutil.SetBit(bitmap, 5) // true
+ bitutil.SetBit(bitmap, 6) // true
+ // bit 7 = false
+ bitutil.SetBit(bitmap, 8) // true
+ // bit 9 = false
+
+ // Set valid bits (skip position 6)
+ bitutil.SetBit(validBits, 5)
+ // bit 6 is null
+ bitutil.SetBit(validBits, 7)
+ bitutil.SetBit(validBits, 8)
+ bitutil.SetBit(validBits, 9)
+
+ stats.UpdateFromBitmapSpaced(bitmap, 5, 5, validBits, 5, 1)
+
+ assert.True(t, stats.HasMinMax())
+ // Valid positions: 5(true), 7(false), 8(true), 9(false)
+ assert.Equal(t, false, stats.Min())
+ assert.Equal(t, true, stats.Max())
+ assert.Equal(t, int64(4), stats.NumValues())
+ })
+
+ t.Run("only true values valid", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ // Data: [1,1,1,1,0,0,0,0]
+ bitmap := []byte{0b00001111}
+ // Valid: only first 4 bits (all true in data)
+ validBits := []byte{0b00001111}
+
+ stats.UpdateFromBitmapSpaced(bitmap, 0, 8, validBits, 0, 4)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, true, stats.Min(), "only true values are valid")
+ assert.Equal(t, true, stats.Max())
+ })
+
+ t.Run("only false values valid", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+ // Data: [1,1,1,1,0,0,0,0]
+ bitmap := []byte{0b00001111}
+ // Valid: only last 4 bits (all false in data)
+ validBits := []byte{0b11110000}
+
+ stats.UpdateFromBitmapSpaced(bitmap, 0, 8, validBits, 0, 4)
+
+ assert.True(t, stats.HasMinMax())
+ assert.Equal(t, false, stats.Min(), "only false values are
valid")
+ assert.Equal(t, false, stats.Max())
+ })
+
+ t.Run("consistency with UpdateSpaced", func(t *testing.T) {
+ // Verify UpdateFromBitmapSpaced produces same results as
UpdateSpaced with []bool
+ bitmap := []byte{0b10110010}
+ validBits := []byte{0b11010110}
+ numValues := int64(8)
+ numNulls := int64(3)
+
+ stats1 := metadata.NewBooleanStatistics(descr, mem)
+ stats2 := metadata.NewBooleanStatistics(descr, mem)
+
+ // Update using bitmap
+ stats1.UpdateFromBitmapSpaced(bitmap, 0, numValues, validBits,
0, numNulls)
+
+ // Update using []bool
+ bools := make([]bool, numValues)
+ for i := int64(0); i < numValues; i++ {
+ bools[i] = bitutil.BitIsSet(bitmap, int(i))
+ }
+ stats2.UpdateSpaced(bools, validBits, 0, numNulls)
+
+ // Both should produce same results
+ assert.Equal(t, stats2.HasMinMax(), stats1.HasMinMax())
+ if stats1.HasMinMax() {
+ assert.Equal(t, stats2.Min(), stats1.Min(), "min
mismatch")
+ assert.Equal(t, stats2.Max(), stats1.Max(), "max
mismatch")
+ }
+ assert.Equal(t, stats2.NumValues(), stats1.NumValues(),
"numValues mismatch")
+ assert.Equal(t, stats2.NullCount(), stats1.NullCount(),
"nullCount mismatch")
+ })
+
+ t.Run("multiple updates accumulate correctly", func(t *testing.T) {
+ stats := metadata.NewBooleanStatistics(descr, mem)
+
+ // First update: only true values
+ bitmap1 := []byte{0xFF}
+ validBits1 := []byte{0x0F} // first 4 valid
+ stats.UpdateFromBitmapSpaced(bitmap1, 0, 8, validBits1, 0, 4)
+ assert.Equal(t, true, stats.Min())
+ assert.Equal(t, true, stats.Max())
+
+ // Second update: has false values
+ bitmap2 := []byte{0x00}
+ validBits2 := []byte{0xF0} // last 4 valid
+ stats.UpdateFromBitmapSpaced(bitmap2, 0, 8, validBits2, 0, 4)
+
+ assert.Equal(t, false, stats.Min(), "should update to include
false")
+ assert.Equal(t, true, stats.Max())
+ assert.Equal(t, int64(8), stats.NumValues())
+ assert.Equal(t, int64(8), stats.NullCount())
+ })
+}
diff --git a/parquet/metadata/statistics_types.gen.go
b/parquet/metadata/statistics_types.gen.go
index 73988e5e..a00010f9 100644
--- a/parquet/metadata/statistics_types.gen.go
+++ b/parquet/metadata/statistics_types.gen.go
@@ -24,6 +24,7 @@ import (
"github.com/apache/arrow-go/v18/arrow"
"github.com/apache/arrow-go/v18/arrow/array"
+ "github.com/apache/arrow-go/v18/arrow/bitutil"
"github.com/apache/arrow-go/v18/arrow/float16"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/internal/bitutils"
@@ -1713,6 +1714,82 @@ func (s *BooleanStatistics) UpdateFromArrow(values
arrow.Array, updateCounts boo
return fmt.Errorf("%w: update boolean stats from Arrow",
arrow.ErrNotImplemented)
}
+// UpdateFromBitmap updates statistics from a boolean bitmap without
converting to []bool.
+// This avoids the 8x memory overhead and provides better performance.
+func (s *BooleanStatistics) UpdateFromBitmap(bitmap []byte, bitmapOffset
int64, numValues int64, numNulls int64) {
+ s.IncNulls(numNulls)
+ s.nvalues += numValues
+
+ if numValues == 0 {
+ return
+ }
+
+ s.SetMinMax(s.getMinMaxFromBitmap(bitmap, bitmapOffset, numValues))
+}
+
+// UpdateFromBitmapSpaced updates statistics from a spaced boolean bitmap.
+func (s *BooleanStatistics) UpdateFromBitmapSpaced(bitmap []byte, bitmapOffset
int64, numValues int64, validBits []byte, validBitsOffset int64, numNull int64)
{
+ s.IncNulls(numNull)
+ notnull := numValues - numNull
+ s.nvalues += notnull
+
+ if notnull == 0 {
+ return
+ }
+
+ // For spaced case, we need to check valid bits
+ min := s.defaultMin()
+ max := s.defaultMax()
+
+ if s.bitSetReader == nil {
+ s.bitSetReader = bitutils.NewSetBitRunReader(validBits,
validBitsOffset, numValues)
+ } else {
+ s.bitSetReader.Reset(validBits, validBitsOffset, numValues)
+ }
+
+ for {
+ run := s.bitSetReader.NextRun()
+ if run.Length == 0 {
+ break
+ }
+ // Check bits in this valid run
+ for i := int64(0); i < run.Length; i++ {
+ if bitutil.BitIsSet(bitmap,
int(bitmapOffset+run.Pos+i)) {
+ max = true
+ } else {
+ min = false
+ }
+ // Early exit if we've found both values
+ if !min && max {
+ s.SetMinMax(min, max)
+ return
+ }
+ }
+ }
+
+ s.SetMinMax(min, max)
+}
+
+// getMinMaxFromBitmap finds min/max directly from bitmap.
+// For booleans: min is false if any false exists, max is true if any true
exists.
+func (s *BooleanStatistics) getMinMaxFromBitmap(bitmap []byte, bitmapOffset
int64, numValues int64) (min, max bool) {
+ min = true // Will become false if we find any false
+ max = false // Will become true if we find any true
+
+ for i := int64(0); i < numValues; i++ {
+ if bitutil.BitIsSet(bitmap, int(bitmapOffset+i)) {
+ max = true
+ } else {
+ min = false
+ }
+ // Early exit optimization: if we've seen both values, we're
done
+ if !min && max {
+ return
+ }
+ }
+ return
+}
+
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the
current max
func (s *BooleanStatistics) SetMinMax(argMin, argMax bool) {
@@ -2005,7 +2082,7 @@ func (s *ByteArrayStatistics) UpdateFromArrow(values
arrow.Array, updateCounts b
)
for i := 0; i < arr.Len(); i++ {
- nextOffset := curOffset + int64(arr.ValueLen(i))
+ nextOffset := curOffset + int64(arr.ValueLen(i))
v := data[curOffset:nextOffset]
curOffset = nextOffset
diff --git a/parquet/metadata/statistics_types.gen.go.tmpl
b/parquet/metadata/statistics_types.gen.go.tmpl
index fb14ad43..83493998 100644
--- a/parquet/metadata/statistics_types.gen.go.tmpl
+++ b/parquet/metadata/statistics_types.gen.go.tmpl
@@ -412,7 +412,7 @@ func (s *{{.Name}}Statistics) UpdateFromArrow(values
arrow.Array, updateCounts b
return fmt.Errorf("%w: update float16 stats from Arrow",
arrow.ErrNotImplemented)
{{else}}
if values.DataType().(arrow.FixedWidthDataType).Bytes() !=
arrow.{{.Name}}SizeBytes {
- return fmt.Errorf("%w: cannot update {{.name}} stats with %s arrow array",
+ return fmt.Errorf("%w: cannot update {{.name}} stats with %s arrow array",
arrow.ErrInvalid, values.DataType())
}
@@ -422,6 +422,84 @@ func (s *{{.Name}}Statistics) UpdateFromArrow(values
arrow.Array, updateCounts b
{{end -}}
}
+{{if eq .Name "Boolean"}}
+// UpdateFromBitmap updates statistics from a boolean bitmap without
converting to []bool.
+// This avoids the 8x memory overhead and provides better performance.
+func (s *{{.Name}}Statistics) UpdateFromBitmap(bitmap []byte, bitmapOffset
int64, numValues int64, numNulls int64) {
+ s.IncNulls(numNulls)
+ s.nvalues += numValues
+
+ if numValues == 0 {
+ return
+ }
+
+ s.SetMinMax(s.getMinMaxFromBitmap(bitmap, bitmapOffset, numValues))
+}
+
+// UpdateFromBitmapSpaced updates statistics from a spaced boolean bitmap.
+func (s *{{.Name}}Statistics) UpdateFromBitmapSpaced(bitmap []byte,
bitmapOffset int64, numValues int64, validBits []byte, validBitsOffset int64,
numNull int64) {
+ s.IncNulls(numNull)
+ notnull := numValues - numNull
+ s.nvalues += notnull
+
+ if notnull == 0 {
+ return
+ }
+
+ // For spaced case, we need to check valid bits
+ min := s.defaultMin()
+ max := s.defaultMax()
+
+ if s.bitSetReader == nil {
+ s.bitSetReader = bitutils.NewSetBitRunReader(validBits, validBitsOffset,
numValues)
+ } else {
+ s.bitSetReader.Reset(validBits, validBitsOffset, numValues)
+ }
+
+ for {
+ run := s.bitSetReader.NextRun()
+ if run.Length == 0 {
+ break
+ }
+ // Check bits in this valid run
+ for i := int64(0); i < run.Length; i++ {
+ if bitutil.BitIsSet(bitmap, int(bitmapOffset+run.Pos+i)) {
+ max = true
+ } else {
+ min = false
+ }
+ // Early exit if we've found both values
+ if !min && max {
+ s.SetMinMax(min, max)
+ return
+ }
+ }
+ }
+
+ s.SetMinMax(min, max)
+}
+
+// getMinMaxFromBitmap finds min/max directly from bitmap.
+// For booleans: min is false if any false exists, max is true if any true
exists.
+func (s *{{.Name}}Statistics) getMinMaxFromBitmap(bitmap []byte, bitmapOffset
int64, numValues int64) (min, max bool) {
+ min = true // Will become false if we find any false
+ max = false // Will become true if we find any true
+
+ for i := int64(0); i < numValues; i++ {
+ if bitutil.BitIsSet(bitmap, int(bitmapOffset+i)) {
+ max = true
+ } else {
+ min = false
+ }
+ // Early exit optimization: if we've seen both values, we're done
+ if !min && max {
+ return
+ }
+ }
+ return
+}
+{{end}}
+
// SetMinMax updates the min and max values only if they are not currently set
// or if argMin is less than the current min / argMax is greater than the
current max
func (s *{{.Name}}Statistics) SetMinMax(argMin, argMax {{.name}}) {