Copilot commented on code in PR #943:
URL: https://github.com/apache/skywalking-banyandb/pull/943#discussion_r2694672268
##########
pkg/query/logical/measure/measure_plan_distributed.go:
##########
@@ -531,26 +542,54 @@ func (s *pushedDownAggregatedIterator) Current() []*measurev1.DataPoint {
return []*measurev1.DataPoint{s.dataPoints[s.index-1]}
}
+func (s *pushedDownAggregatedIterator) CurrentShardID() common.ShardID {
+	// In distributed query with aggregation, data comes from multiple shards, so shard ID is not applicable
+ return 0
+}
+
func (s *pushedDownAggregatedIterator) Close() error {
return nil
}
-// deduplicateAggregatedDataPoints removes duplicate aggregated results from multiple replicas.
-func deduplicateAggregatedDataPoints(dataPoints []*measurev1.DataPoint, groupByTagsRefs [][]*logical.TagRef) ([]*measurev1.DataPoint, error) {
+// deduplicateAggregatedDataPointsWithShard removes duplicate aggregated results from multiple replicas
+// of the same shard, while preserving results from different shards.
+func deduplicateAggregatedDataPointsWithShard(dataPoints []*measurev1.InternalDataPoint, groupByTagsRefs [][]*logical.TagRef) ([]*measurev1.DataPoint, error) {
if len(groupByTagsRefs) == 0 {
- return dataPoints, nil
+ return extractDataPoints(dataPoints), nil
}
+ // key = hash(shard_id, group_key)
+ // Same shard with same group key will be deduplicated
+ // Different shards with same group key will be preserved
groupMap := make(map[uint64]struct{})
result := make([]*measurev1.DataPoint, 0, len(dataPoints))
- for _, dp := range dataPoints {
- key, err := formatGroupByKey(dp, groupByTagsRefs)
- if err != nil {
- return nil, err
+ for _, idp := range dataPoints {
+		groupKey, keyErr := formatGroupByKey(idp.DataPoint, groupByTagsRefs)
+ if keyErr != nil {
+ return nil, keyErr
}
+ // Include shard_id in key calculation
+ key := hashWithShard(uint64(idp.ShardId), groupKey)
if _, exists := groupMap[key]; !exists {
groupMap[key] = struct{}{}
- result = append(result, dp)
+ result = append(result, idp.DataPoint)
}
}
return result, nil
}
Review Comment:
The new deduplicateAggregatedDataPointsWithShard function lacks test coverage. Given that this function implements critical deduplication logic that distinguishes between replicas and shards, it should have comprehensive unit tests (see the sketch below) to verify:
1. Deduplication of data from replicas of the same shard
2. Preservation of data from different shards with the same group key
3. Handling of empty groupByTagsRefs
4. Handling of error cases from formatGroupByKey
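
A minimal table-driven sketch of such a test. This is not the project's code: `newDP` and `groupByRefs` are hypothetical fixtures standing in for helpers that build a DataPoint carrying a single group-by tag and the matching `[][]*logical.TagRef`; the real fixtures should come from the package's existing test utilities.

```go
func TestDeduplicateAggregatedDataPointsWithShard(t *testing.T) {
	dpA := newDP("svc-a") // hypothetical: DataPoint whose group-by tag value is "svc-a"
	dpB := newDP("svc-b")
	refs := groupByRefs("service") // hypothetical: TagRefs resolving the "service" tag

	tests := []struct {
		name string
		refs [][]*logical.TagRef
		in   []*measurev1.InternalDataPoint
		want int // expected number of surviving data points
	}{
		{
			name: "replicas of the same shard are deduplicated",
			refs: refs,
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpA, ShardId: 1},
			},
			want: 1,
		},
		{
			name: "same group key on different shards is preserved",
			refs: refs,
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpA, ShardId: 2},
			},
			want: 2,
		},
		{
			name: "distinct group keys on one shard are preserved",
			refs: refs,
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpB, ShardId: 1},
			},
			want: 2,
		},
		{
			name: "empty groupByTagsRefs passes everything through",
			refs: nil, // no grouping: nothing should be deduplicated
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpA, ShardId: 1},
			},
			want: 2,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := deduplicateAggregatedDataPointsWithShard(tt.in, tt.refs)
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if len(got) != tt.want {
				t.Fatalf("got %d data points, want %d", len(got), tt.want)
			}
		})
	}
	// TODO: a case where formatGroupByKey fails (e.g. a TagRef pointing at a
	// missing tag family) should assert that a non-nil error is returned.
}
```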
##########
banyand/query/processor.go:
##########
@@ -161,6 +162,138 @@ type measureQueryProcessor struct {
*bus.UnImplementedHealthyListener
}
+// measureExecutionContext holds the common execution context for measure queries.
+type measureExecutionContext struct {
+ ml *logger.Logger
+ ecc []executor.MeasureExecutionContext
+ metadata []*commonv1.Metadata
+ schemas []logical.Schema
+}
+
+// buildMeasureContext builds the execution context for a measure query.
+func buildMeasureContext(measureService measure.Service, log *logger.Logger, queryCriteria *measurev1.QueryRequest, logPrefix string) (*measureExecutionContext, error) {
+ var metadata []*commonv1.Metadata
+ var schemas []logical.Schema
+ var ecc []executor.MeasureExecutionContext
+ for i := range queryCriteria.Groups {
+ meta := &commonv1.Metadata{
+ Name: queryCriteria.Name,
+ Group: queryCriteria.Groups[i],
+ }
+ ec, ecErr := measureService.Measure(meta)
+ if ecErr != nil {
+			return nil, fmt.Errorf("fail to get execution context for measure %s: %w", meta.GetName(), ecErr)
+ }
+		s, schemaErr := logical_measure.BuildSchema(ec.GetSchema(), ec.GetIndexRules())
+ if schemaErr != nil {
+			return nil, fmt.Errorf("fail to build schema for measure %s: %w", meta.GetName(), schemaErr)
+ }
+ ecc = append(ecc, ec)
+ schemas = append(schemas, s)
+ metadata = append(metadata, meta)
+ }
+ ml := log.Named(logPrefix, queryCriteria.Groups[0], queryCriteria.Name)
+ return &measureExecutionContext{
+ metadata: metadata,
+ schemas: schemas,
+ ecc: ecc,
+ ml: ml,
+ }, nil
+}
+
+// executeMeasurePlan executes the measure query plan and returns the iterator.
+func executeMeasurePlan(ctx context.Context, queryCriteria *measurev1.QueryRequest, mctx *measureExecutionContext) (executor.MIterator, logical.Plan, error) {
+	plan, planErr := logical_measure.Analyze(queryCriteria, mctx.metadata, mctx.schemas, mctx.ecc)
+ if planErr != nil {
+		return nil, nil, fmt.Errorf("fail to analyze the query request for measure %s: %w", queryCriteria.GetName(), planErr)
+ }
+ if e := mctx.ml.Debug(); e.Enabled() {
+ e.Str("plan", plan.String()).Msg("query plan")
+ }
+ mIterator, execErr := plan.(executor.MeasureExecutable).Execute(ctx)
+ if execErr != nil {
+		mctx.ml.Error().Err(execErr).RawJSON("req", logger.Proto(queryCriteria)).Msg("fail to query")
+		return nil, nil, fmt.Errorf("fail to execute the query plan for measure %s: %w", queryCriteria.GetName(), execErr)
+ }
+ return mIterator, plan, nil
+}
+
+// extractTagValuesFromInternalDataPoints extracts tag values from InternalDataPoints for RewriteAggTopNResult.
+func extractTagValuesFromInternalDataPoints(dataPoints []*measurev1.InternalDataPoint, groupByTags []string) map[string][]*modelv1.TagValue {
+ tagValueMap := make(map[string][]*modelv1.TagValue)
+ for _, idp := range dataPoints {
+ dp := idp.DataPoint
+ if dp == nil {
+ continue
+ }
+ for _, tagFamily := range dp.GetTagFamilies() {
+ for _, tag := range tagFamily.GetTags() {
+ tagName := tag.GetKey()
+				if len(groupByTags) == 0 || slices.Contains(groupByTags, tagName) {
+					tagValueMap[tagName] = append(tagValueMap[tagName], tag.GetValue())
+ }
+ }
+ }
+ }
+ return tagValueMap
+}
+
+// collectInternalDataPoints collects InternalDataPoints (with ShardId) from the iterator.
+func collectInternalDataPoints(mIterator executor.MIterator) []*measurev1.InternalDataPoint {
+ result := make([]*measurev1.InternalDataPoint, 0)
+ for mIterator.Next() {
+ current := mIterator.Current()
+ if len(current) > 0 {
+ dp := current[0]
+ internalDp := &measurev1.InternalDataPoint{
+ DataPoint: dp,
+ ShardId: uint32(mIterator.CurrentShardID()),
+ }
+ result = append(result, internalDp)
+ }
+ }
+ return result
+}
+
+// extractTagValuesFromDataPoints extracts tag values from DataPoints for RewriteAggTopNResult.
+func extractTagValuesFromDataPoints(dataPoints []*measurev1.DataPoint, groupByTags []string) map[string][]*modelv1.TagValue {
+ tagValueMap := make(map[string][]*modelv1.TagValue)
+ for _, dp := range dataPoints {
+ for _, tagFamily := range dp.GetTagFamilies() {
+ for _, tag := range tagFamily.GetTags() {
+ tagName := tag.GetKey()
+				if len(groupByTags) == 0 || slices.Contains(groupByTags, tagName) {
+					tagValueMap[tagName] = append(tagValueMap[tagName], tag.GetValue())
+ }
+ }
+ }
+ }
+ return tagValueMap
+}
Review Comment:
The functions extractTagValuesFromDataPoints and extractTagValuesFromInternalDataPoints have almost identical implementations; the only difference is the input type, which creates code duplication. Consider refactoring by having extractTagValuesFromInternalDataPoints call extractDataPoints first and then reuse extractTagValuesFromDataPoints (a sketch follows below), or extract a common helper function that operates on DataPoint objects.
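
A sketch of the first option, assuming extractDataPoints (referenced in the other hunk) unwraps InternalDataPoints into the underlying []*measurev1.DataPoint — the whole Internal variant could collapse to:

```go
// extractTagValuesFromInternalDataPoints extracts tag values from
// InternalDataPoints by unwrapping them and delegating to the
// DataPoint-based helper, removing the duplicated nested loops.
func extractTagValuesFromInternalDataPoints(dataPoints []*measurev1.InternalDataPoint, groupByTags []string) map[string][]*modelv1.TagValue {
	return extractTagValuesFromDataPoints(extractDataPoints(dataPoints), groupByTags)
}
```

The explicit nil check in the current Internal variant then becomes unnecessary: protobuf-generated getters such as GetTagFamilies return nil on a nil receiver, so a nil DataPoint simply contributes no tags.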
##########
banyand/measure/query.go:
##########
@@ -848,6 +855,7 @@ func (qr *queryResult) merge(storedIndexValue map[common.SeriesID]map[string]*mo
return result
}
lastSid = topBC.bm.seriesID
+ result.ShardID = topBC.shardID
Review Comment:
The ShardID assignment inside the merge loop will overwrite the shard ID on every iteration. If a single MeasureResult contains data from multiple blocks with different shard IDs (which shouldn't happen but isn't explicitly prevented), this could lead to an incorrect final shard ID. While this may work correctly in practice if all blocks in a single result come from the same shard, it would be clearer to either assert this condition (a sketch of that variant follows after the suggestion) or set the shard ID only once before the loop.
```suggestion
if result.ShardID == 0 {
result.ShardID = topBC.shardID
}
```
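
If the assertion route is preferred instead, a rough sketch of what it could look like, assuming a hypothetical `shardIDSet bool` declared alongside `lastSid` before the loop and `fmt` being importable in this file:

```go
// Fail loudly if blocks from different shards are mixed into one result,
// instead of silently keeping the shard ID of the last block seen.
if shardIDSet && result.ShardID != topBC.shardID {
	panic(fmt.Sprintf("merge: block from shard %d mixed into a result already tagged with shard %d",
		topBC.shardID, result.ShardID))
}
result.ShardID = topBC.shardID
shardIDSet = true
```

Unlike the `result.ShardID == 0` guard above, the boolean also behaves correctly when the legitimate shard ID is 0.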