Copilot commented on code in PR #844:
URL: https://github.com/apache/skywalking-banyandb/pull/844#discussion_r2512993394
##########
pkg/filter/bloom_filter.go:
##########
@@ -130,3 +139,13 @@ func (bf *BloomFilter) ResizeBits(n int) {
bits = bits[:n]
bf.bits = bits
}
+
+// OptimalBitsSize returns the optimal number of uint64s needed for n items.
+// With B=16, this is simply n/4 (n >> 2), with a minimum of 1.
+func OptimalBitsSize(n int) int {
+ size := n >> 2
+ if size == 0 {
+ return 1
+ }
+ return size
Review Comment:
The `OptimalBitsSize` function uses the same incorrect bit shift calculation
`n >> 2` which doesn't properly round up when `n` is not a multiple of 4. This
can lead to insufficient bit allocation.
For correct rounding up with `B=16`, the formula should be: `(n * 16 + 63) /
64`, or with shifts `((n << 4) + 63) >> 6` (parenthesized explicitly to avoid
precedence confusion), which simplifies to `(n + 3) >> 2`, i.e. ceil(n/4).
##########
banyand/internal/sidx/block_scanner.go:
##########
@@ -84,92 +84,62 @@ type blockScanner struct {
minKey int64
maxKey int64
asc bool
+ batchSize int
}
func (bsn *blockScanner) scan(ctx context.Context, blockCh chan
*blockScanResultBatch) {
if len(bsn.parts) < 1 {
return
}
- // Check for context cancellation before starting expensive operations
- select {
- case <-ctx.Done():
+ if !bsn.checkContext(ctx) {
return
- default:
}
- bma := generateBlockMetadataArray()
- defer releaseBlockMetadataArray(bma)
-
it := generateIter()
defer releaseIter(it)
- it.init(bma, bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey,
bsn.filter)
+ it.init(bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey, bsn.filter,
bsn.asc)
batch := generateBlockScanResultBatch()
if it.Error() != nil {
batch.err = fmt.Errorf("cannot init iter: %w", it.Error())
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
- }
+ bsn.sendBatch(ctx, blockCh, batch)
return
}
var totalBlockBytes uint64
for it.nextBlock() {
- // Check for context cancellation during iteration
- select {
- case <-ctx.Done():
+ if !bsn.checkContext(ctx) {
releaseBlockScanResultBatch(batch)
return
- default:
}
- p := it.piHeap[0]
+ bm, p := it.current()
+ if err := bsn.validateBlockMetadata(bm, p, it); err != nil {
+ batch.err = err
+ bsn.sendBatch(ctx, blockCh, batch)
+ return
+ }
- // Get block size before adding to batch
- blockSize := p.curBlock.uncompressedSize
+ blockSize := bm.uncompressedSize
// Check if adding this block would exceed quota
- quota := bsn.pm.AvailableBytes()
- if quota >= 0 && totalBlockBytes+blockSize > uint64(quota) {
- if len(batch.bss) > 0 {
- // Send current batch without error
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
- }
- return
- }
- // Batch is empty, send error
- err := fmt.Errorf("sidx block scan quota exceeded:
block size %s, quota is %s", humanize.Bytes(blockSize),
humanize.Bytes(uint64(quota)))
- batch.err = err
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
- return
+ if exceeded, err := bsn.checkQuotaExceeded(totalBlockBytes,
blockSize, batch); exceeded {
+ if err != nil {
+ batch.err = err
}
+ bsn.sendBatch(ctx, blockCh, batch)
return
}
// Quota OK, add block to batch
- batch.bss = append(batch.bss, blockScanResult{
- p: p.p,
- })
- bs := &batch.bss[len(batch.bss)-1]
- bs.bm.copyFrom(p.curBlock)
+ bsn.addBlockToBatch(batch, bm, p)
totalBlockBytes += blockSize
// Check if batch is full
- if len(batch.bss) >= cap(batch.bss) {
- select {
- case blockCh <- batch:
- case <-ctx.Done():
- releaseBlockScanResultBatch(batch)
+ if len(batch.bss) >= bsn.batchSize || len(batch.bss) >=
cap(batch.bss) {
Review Comment:
When `req.MaxBatchSize` is 0 (which is allowed by validation), the condition
`len(batch.bss) >= bsn.batchSize` will always be true, causing the batch to be
sent immediately after adding the first block. This defeats the purpose of
batching and could impact performance.
Consider adding a check to use a default batch size when `bsn.batchSize` is
0:
```go
batchLimit := bsn.batchSize
if batchLimit <= 0 {
batchLimit = cap(batch.bss)
}
if len(batch.bss) >= batchLimit || len(batch.bss) >= cap(batch.bss) {
```
```suggestion
batchLimit := bsn.batchSize
if batchLimit <= 0 {
batchLimit = cap(batch.bss)
}
        if len(batch.bss) >= batchLimit || len(batch.bss) >= cap(batch.bss) {
```
##########
pkg/filter/bloom_filter.go:
##########
@@ -39,8 +42,14 @@ type BloomFilter struct {
// NewBloomFilter creates a new Bloom filter with the number of items n and
false positive rate p.
func NewBloomFilter(n int) *BloomFilter {
- m := n * B
- bits := make([]uint64, (m+63)/64)
+ // With B=16, we can optimize: m = n * 16 = n << 4
+ // Number of uint64s needed: (n * 16) / 64 = n / 4 = n >> 2
+ // Ensure at least 1 uint64 to avoid empty slice
+ numBits := n >> 2
Review Comment:
The bit shift calculation `n >> 2` doesn't account for cases where `n` is
not a multiple of 4, which could lead to insufficient bit allocation. For
example, if `n = 5`, then `n >> 2 = 1`, which gives only 64 bits total instead
of the expected `5 * 16 = 80 bits` (requires 2 uint64s). This could cause
out-of-bounds access or incorrect bloom filter behavior.
The calculation should be: `(n * B + 63) / 64` or equivalently `((n << 4) +
63) >> 6` — which reduces to `(n + 3) >> 2`, i.e. ceil(n/4) — to round up properly.
```suggestion
	numBits := ((n << 4) + 63) >> 6 // (n*B + 63) / 64: number of uint64 words, rounded up so every item gets B bits
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]