Copilot commented on code in PR #844:
URL: https://github.com/apache/skywalking-banyandb/pull/844#discussion_r2512993394
##########
pkg/filter/bloom_filter.go:
##########
@@ -130,3 +139,13 @@ func (bf *BloomFilter) ResizeBits(n int) {
bits = bits[:n]
bf.bits = bits
}
+
+// OptimalBitsSize returns the optimal number of uint64s needed for n items.
+// With B=16, this is simply n/4 (n >> 2), with a minimum of 1.
+func OptimalBitsSize(n int) int {
+ size := n >> 2
+ if size == 0 {
+ return 1
+ }
+ return size
Review Comment:
The `OptimalBitsSize` function uses the same incorrect bit shift calculation
`n >> 2` which doesn't properly round up when `n` is not a multiple of 4. This
can lead to insufficient bit allocation.
For correct rounding up with `B=16`, the formula should be: `(n * 16 + 63) /
64`, or with shifts `((n << 4) + 63) >> 6` (parenthesized explicitly to avoid
precedence confusion), which simplifies to `(n + 3) >> 2`, i.e. ceil(n/4).
##########
banyand/internal/sidx/block_scanner.go:
##########
@@ -84,92 +84,62 @@ type blockScanner struct {
minKey int64
maxKey int64
asc bool
+ batchSize int
}
func (bsn *blockScanner) scan(ctx context.Context, blockCh chan
*blockScanResultBatch) {
if len(bsn.parts) < 1 {
return
}
- // Check for context cancellation before starting expensive operations
- select {
- case <-ctx.Done():
+ if !bsn.checkContext(ctx) {
return
- default:
}
- bma := generateBlockMetadataArray()
- defer releaseBlockMetadataArray(bma)
-
it := generateIter()
defer releaseIter(it)
- it.init(bma, bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey,
bsn.filter)
+ it.init(bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey, bsn.filter,
bsn.asc)
batch := generateBlockScanResultBatch()
if it.Error() != nil {
batch.err = fmt.Errorf("cannot init iter: %w", it.Error())
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
- }
+ bsn.sendBatch(ctx, blockCh, batch)
return
}
var totalBlockBytes uint64
for it.nextBlock() {
- // Check for context cancellation during iteration
- select {
- case <-ctx.Done():
+ if !bsn.checkContext(ctx) {
releaseBlockScanResultBatch(batch)
return
- default:
}
- p := it.piHeap[0]
+ bm, p := it.current()
+ if err := bsn.validateBlockMetadata(bm, p, it); err != nil {
+ batch.err = err
+ bsn.sendBatch(ctx, blockCh, batch)
+ return
+ }
- // Get block size before adding to batch
- blockSize := p.curBlock.uncompressedSize
+ blockSize := bm.uncompressedSize
// Check if adding this block would exceed quota
- quota := bsn.pm.AvailableBytes()
- if quota >= 0 && totalBlockBytes+blockSize > uint64(quota) {
- if len(batch.bss) > 0 {
- // Send current batch without error
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
- }
- return
- }
- // Batch is empty, send error
- err := fmt.Errorf("sidx block scan quota exceeded:
block size %s, quota is %s", humanize.Bytes(blockSize),
humanize.Bytes(uint64(quota)))
- batch.err = err
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
- return
+ if exceeded, err := bsn.checkQuotaExceeded(totalBlockBytes,
blockSize, batch); exceeded {
+ if err != nil {
+ batch.err = err
}
+ bsn.sendBatch(ctx, blockCh, batch)
return
}
// Quota OK, add block to batch
- batch.bss = append(batch.bss, blockScanResult{
- p: p.p,
- })
- bs := &batch.bss[len(batch.bss)-1]
- bs.bm.copyFrom(p.curBlock)
+ bsn.addBlockToBatch(batch, bm, p)
totalBlockBytes += blockSize
// Check if batch is full
- if len(batch.bss) >= cap(batch.bss) {
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
+ if len(batch.bss) >= bsn.batchSize || len(batch.bss) >=
cap(batch.bss) {
Review Comment:
When `req.MaxBatchSize` is 0 (which is allowed by validation), the condition
`len(batch.bss) >= bsn.batchSize` will always be true, causing the batch to be
sent immediately after adding the first block. This defeats the purpose of
batching and could impact performance.
Consider adding a check to use a default batch size when `bsn.batchSize` is
0:
```go
batchLimit := bsn.batchSize
if batchLimit <= 0 {
batchLimit = cap(batch.bss)
}
if len(batch.bss) >= batchLimit || len(batch.bss) >= cap(batch.bss) {
```
```suggestion
batchLimit := bsn.batchSize
if batchLimit <= 0 {
batchLimit = cap(batch.bss)
}
        if len(batch.bss) >= batchLimit || len(batch.bss) >= cap(batch.bss) {
```
##########
pkg/filter/bloom_filter.go:
##########
@@ -39,8 +42,14 @@ type BloomFilter struct {
// NewBloomFilter creates a new Bloom filter with the number of items n and
false positive rate p.
func NewBloomFilter(n int) *BloomFilter {
- m := n * B
- bits := make([]uint64, (m+63)/64)
+ // With B=16, we can optimize: m = n * 16 = n << 4
+ // Number of uint64s needed: (n * 16) / 64 = n / 4 = n >> 2
+ // Ensure at least 1 uint64 to avoid empty slice
+ numBits := n >> 2
Review Comment:
The bit shift calculation `n >> 2` doesn't account for cases where `n` is
not a multiple of 4, which could lead to insufficient bit allocation. For
example, if `n = 5`, then `n >> 2 = 1`, which gives only 64 bits total instead
of the expected `5 * 16 = 80 bits` (requires 2 uint64s). This could cause
out-of-bounds access or incorrect bloom filter behavior.
The calculation should be: `(n * B + 63) / 64` or equivalently `((n << 4) +
63) >> 6` — which reduces to `(n + 3) >> 2`, i.e. ceil(n/4) — to round up properly.
```suggestion
	numBits := ((n << 4) + 63) >> 6 // (n*B + 63) / 64: number of uint64 words, rounded up so every item gets B bits
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]