Re: [PR] Fix sidx tag filter range check issue [skywalking-banyandb]

via GitHub Thu, 05 Mar 2026 17:48:23 -0800


Copilot commented on code in PR #991:
URL: 
https://github.com/apache/skywalking-banyandb/pull/991#discussion_r2893328218



##########
banyand/cmd/dump/sidx.go:
##########
@@ -1077,3 +1089,81 @@ func outputScanResultsAsCSV(results 
[]*sidx.QueryResponse, seriesMap map[common.
 
        return nil
 }
+
+func discoverProjectionTagTypes(sidxPath string, projectionTagNames []string) 
(map[string]pbv1.ValueType, error) {
+       if len(projectionTagNames) == 0 {
+               return nil, nil
+       }
+       partEntries, err := os.ReadDir(sidxPath)
+       if err != nil {
+               return nil, fmt.Errorf("failed to read sidx path %s: %w", 
sidxPath, err)
+       }
+       partPaths := make([]string, 0, len(partEntries))
+       for _, entry := range partEntries {
+               if !entry.IsDir() {
+                       continue
+               }
+               partPaths = append(partPaths, filepath.Join(sidxPath, 
entry.Name()))
+       }
+       sort.Strings(partPaths)
+
+       result := make(map[string]pbv1.ValueType, len(projectionTagNames))
+       for _, tagName := range projectionTagNames {
+               for _, partPath := range partPaths {
+                       tmPath := filepath.Join(partPath, tagName+".tm")
+                       data, readErr := os.ReadFile(tmPath)
+                       if readErr != nil {
+                               continue
+                       }
+                       valueType, parseErr := parseSidxTagValueType(data)
+                       if parseErr != nil {
+                               fmt.Fprintf(os.Stderr, "Warning: failed to 
parse %s: %v\n", tmPath, parseErr)
+                               break
+                       }
+                       result[tagName] = valueType
+                       break
+               }
+       }
+       return result, nil
+}
+
+func parseSidxTagValueType(data []byte) (pbv1.ValueType, error) {
+       src, _, err := encoding.DecodeBytes(data)
+       if err != nil {
+               return pbv1.ValueTypeUnknown, fmt.Errorf("cannot decode tag 
name: %w", err)
+       }
+       if len(src) < 1 {
+               return pbv1.ValueTypeUnknown, fmt.Errorf("invalid tag metadata: 
missing value type")
+       }
+       return pbv1.ValueType(src[0]), nil
+}
+
+func decodeProjectedTagValue(raw string, valueType pbv1.ValueType) string {
+       if raw == "" {
+               return raw
+       }
+       rawBytes := []byte(raw)
+
+       switch valueType {
+       case pbv1.ValueTypeInt64:
+               if len(rawBytes) < 8 {
+                       return raw
+               }
+               return strconv.FormatInt(convert.BytesToInt64(rawBytes), 10)
+       case pbv1.ValueTypeFloat64:
+               if len(rawBytes) < 8 {
+                       return raw
+               }
+               return strconv.FormatFloat(convert.BytesToFloat64(rawBytes), 
'f', -1, 64)
+       case pbv1.ValueTypeTimestamp:
+               if len(rawBytes) < 8 {

Review Comment:
   `decodeProjectedTagValue` checks `len(rawBytes) < 8` before decoding int64. 
If the projected value is unexpectedly longer than 8 bytes, 
`convert.BytesToInt64` will decode only the first 8 bytes and silently produce 
a wrong value. Safer is to require an exact length (e.g., `len(rawBytes) == 8`) 
for numeric/timestamp types and fall back to returning `raw` otherwise.
   ```suggestion
                if len(rawBytes) != 8 {
                        return raw
                }
                return strconv.FormatInt(convert.BytesToInt64(rawBytes), 10)
        case pbv1.ValueTypeFloat64:
                if len(rawBytes) != 8 {
                        return raw
                }
                return strconv.FormatFloat(convert.BytesToFloat64(rawBytes), 
'f', -1, 64)
        case pbv1.ValueTypeTimestamp:
                if len(rawBytes) != 8 {
   ```



##########
banyand/internal/sidx/block_scanner.go:
##########
@@ -96,10 +119,35 @@ func (bsn *blockScanner) scan(ctx context.Context, blockCh 
chan *blockScanResult
                return
        }
 
+       var (
+               totalBlockBytes uint64
+               scannedBlocks   int
+       )
+
+       if tracer := query.GetTracer(ctx); tracer != nil {
+               span, spanCtx := tracer.StartSpan(ctx, "sidx.scan-blocks")
+               bsn.span = span
+               ctx = spanCtx
+               span.Tagf("part_count", "%d", len(bsn.parts))
+               span.Tagf("series_id_count", "%d", len(bsn.seriesIDs))
+               span.Tagf("min_key", "%d", bsn.minKey)
+               span.Tagf("max_key", "%d", bsn.maxKey)
+               span.Tagf("ascending", "%t", bsn.asc)
+               span.Tagf("batch_size", "%d", bsn.batchSize)
+               defer func() {
+                       if span != nil {
+                               span.Tagf("scanned_blocks", "%d", scannedBlocks)
+                               span.Tagf("total_block_bytes", "%d", 
totalBlockBytes)
+                               bsn.flushSkipSummaryToSpan()
+                               span.Stop()
+                       }
+               }()
+       }
+
        it := generateIter()
        defer releaseIter(it)
 
-       it.init(bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey, bsn.filter, 
bsn.asc)
+       it.init(bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey, bsn.filter, 
bsn.asc, bsn.recordSkip)
 

Review Comment:
   `it.init(..., bsn.recordSkip)` passes the skip recorder even when there is 
no tracer/span on the context. In those cases `recordSkip` will still allocate 
and update `skipStats` for every skipped block, but the data is never flushed 
anywhere (because `bsn.span` stays nil). Consider only wiring the skipRecorder 
when a tracer/span is present (pass nil otherwise), or make `recordSkip` a 
no-op unless `bsn.span != nil` to avoid unnecessary overhead in non-traced 
scans.
   ```suggestion
        var skipRecorder blockSkipRecorderFunc
        if bsn.span != nil {
                skipRecorder = bsn.recordSkip
        }
   
        it.init(bsn.parts, bsn.seriesIDs, bsn.minKey, bsn.maxKey, bsn.filter, 
bsn.asc, skipRecorder)
   ```



##########
banyand/cmd/dump/sidx.go:
##########
@@ -1077,3 +1089,81 @@ func outputScanResultsAsCSV(results 
[]*sidx.QueryResponse, seriesMap map[common.
 
        return nil
 }
+
+func discoverProjectionTagTypes(sidxPath string, projectionTagNames []string) 
(map[string]pbv1.ValueType, error) {
+       if len(projectionTagNames) == 0 {
+               return nil, nil
+       }
+       partEntries, err := os.ReadDir(sidxPath)
+       if err != nil {
+               return nil, fmt.Errorf("failed to read sidx path %s: %w", 
sidxPath, err)
+       }
+       partPaths := make([]string, 0, len(partEntries))
+       for _, entry := range partEntries {
+               if !entry.IsDir() {
+                       continue
+               }
+               partPaths = append(partPaths, filepath.Join(sidxPath, 
entry.Name()))
+       }
+       sort.Strings(partPaths)
+
+       result := make(map[string]pbv1.ValueType, len(projectionTagNames))
+       for _, tagName := range projectionTagNames {
+               for _, partPath := range partPaths {
+                       tmPath := filepath.Join(partPath, tagName+".tm")
+                       data, readErr := os.ReadFile(tmPath)
+                       if readErr != nil {
+                               continue

Review Comment:
   `discoverProjectionTagTypes` reads entire `*.tm` files via `os.ReadFile`, 
but tag-metadata files can be large (one record per block). Since you only need 
the first record to determine `valueType`, consider reading just enough bytes 
to decode the first `EncodeBytes(name)` + 1 byte valueType (e.g., open the file 
and read a small prefix, then extend only if needed) to reduce memory and IO 
overhead.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix sidx tag filter range check issue [skywalking-banyandb]

Reply via email to