Copilot commented on code in PR #943:
URL: https://github.com/apache/skywalking-banyandb/pull/943#discussion_r2694672268
##########
pkg/query/logical/measure/measure_plan_distributed.go:
##########
@@ -531,26 +542,54 @@ func (s *pushedDownAggregatedIterator) Current() []*measurev1.DataPoint {
return []*measurev1.DataPoint{s.dataPoints[s.index-1]}
}
+func (s *pushedDownAggregatedIterator) CurrentShardID() common.ShardID {
+	// In distributed query with aggregation, data comes from multiple shards, so shard ID is not applicable
+ return 0
+}
+
func (s *pushedDownAggregatedIterator) Close() error {
return nil
}
-// deduplicateAggregatedDataPoints removes duplicate aggregated results from multiple replicas.
-func deduplicateAggregatedDataPoints(dataPoints []*measurev1.DataPoint, groupByTagsRefs [][]*logical.TagRef) ([]*measurev1.DataPoint, error) {
+// deduplicateAggregatedDataPointsWithShard removes duplicate aggregated results from multiple replicas
+// of the same shard, while preserving results from different shards.
+func deduplicateAggregatedDataPointsWithShard(dataPoints []*measurev1.InternalDataPoint, groupByTagsRefs [][]*logical.TagRef) ([]*measurev1.DataPoint, error) {
if len(groupByTagsRefs) == 0 {
- return dataPoints, nil
+ return extractDataPoints(dataPoints), nil
}
+ // key = hash(shard_id, group_key)
+ // Same shard with same group key will be deduplicated
+ // Different shards with same group key will be preserved
groupMap := make(map[uint64]struct{})
result := make([]*measurev1.DataPoint, 0, len(dataPoints))
- for _, dp := range dataPoints {
- key, err := formatGroupByKey(dp, groupByTagsRefs)
- if err != nil {
- return nil, err
+ for _, idp := range dataPoints {
+		groupKey, keyErr := formatGroupByKey(idp.DataPoint, groupByTagsRefs)
+ if keyErr != nil {
+ return nil, keyErr
}
+ // Include shard_id in key calculation
+ key := hashWithShard(uint64(idp.ShardId), groupKey)
if _, exists := groupMap[key]; !exists {
groupMap[key] = struct{}{}
- result = append(result, dp)
+ result = append(result, idp.DataPoint)
}
}
return result, nil
}
Review Comment:
The new deduplicateAggregatedDataPointsWithShard function lacks test coverage. Given that this function implements critical deduplication logic that distinguishes between replicas and shards, it should have comprehensive unit tests (see the sketch below) to verify:
1. Deduplication of data from replicas of the same shard
2. Preservation of data from different shards with the same group key
3. Handling of empty groupByTagsRefs
4. Handling of error cases from formatGroupByKey
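
A minimal table-driven sketch of such a test. This is not the project's code: `newDP` and `groupByRefs` are hypothetical fixtures standing in for helpers that build a DataPoint carrying a single group-by tag and the matching `[][]*logical.TagRef`; the real fixtures should come from the package's existing test utilities.

```go
func TestDeduplicateAggregatedDataPointsWithShard(t *testing.T) {
	dpA := newDP("svc-a") // hypothetical: DataPoint whose group-by tag value is "svc-a"
	dpB := newDP("svc-b")
	refs := groupByRefs("service") // hypothetical: TagRefs resolving the "service" tag

	tests := []struct {
		name string
		refs [][]*logical.TagRef
		in   []*measurev1.InternalDataPoint
		want int // expected number of surviving data points
	}{
		{
			name: "replicas of the same shard are deduplicated",
			refs: refs,
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpA, ShardId: 1},
			},
			want: 1,
		},
		{
			name: "same group key on different shards is preserved",
			refs: refs,
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpA, ShardId: 2},
			},
			want: 2,
		},
		{
			name: "distinct group keys on one shard are preserved",
			refs: refs,
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpB, ShardId: 1},
			},
			want: 2,
		},
		{
			name: "empty groupByTagsRefs passes everything through",
			refs: nil, // no grouping: nothing should be deduplicated
			in: []*measurev1.InternalDataPoint{
				{DataPoint: dpA, ShardId: 1},
				{DataPoint: dpA, ShardId: 1},
			},
			want: 2,
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			got, err := deduplicateAggregatedDataPointsWithShard(tt.in, tt.refs)
			if err != nil {
				t.Fatalf("unexpected error: %v", err)
			}
			if len(got) != tt.want {
				t.Fatalf("got %d data points, want %d", len(got), tt.want)
			}
		})
	}
	// TODO: a case where formatGroupByKey fails (e.g. a TagRef pointing at a
	// missing tag family) should assert that a non-nil error is returned.
}
```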
##########
banyand/query/processor.go:
##########
@@ -161,6 +162,138 @@ type measureQueryProcessor struct {
*bus.UnImplementedHealthyListener
}
+// measureExecutionContext holds the common execution context for measure queries.
+type measureExecutionContext struct {
+ ml *logger.Logger
+ ecc []executor.MeasureExecutionContext
+ metadata []*commonv1.Metadata
+ schemas []logical.Schema
+}
+
+// buildMeasureContext builds the execution context for a measure query.
+func buildMeasureContext(measureService measure.Service, log *logger.Logger, queryCriteria *measurev1.QueryRequest, logPrefix string) (*measureExecutionContext, error) {
+ var metadata []*commonv1.Metadata
+ var schemas []logical.Schema
+ var ecc []executor.MeasureExecutionContext
+ for i := range queryCriteria.Groups {
+ meta := &commonv1.Metadata{
+ Name: queryCriteria.Name,
+ Group: queryCriteria.Groups[i],
+ }
+ ec, ecErr := measureService.Measure(meta)
+ if ecErr != nil {
+			return nil, fmt.Errorf("fail to get execution context for measure %s: %w", meta.GetName(), ecErr)
+ }
+		s, schemaErr := logical_measure.BuildSchema(ec.GetSchema(), ec.GetIndexRules())
+ if schemaErr != nil {
+			return nil, fmt.Errorf("fail to build schema for measure %s: %w", meta.GetName(), schemaErr)
+ }
+ ecc = append(ecc, ec)
+ schemas = append(schemas, s)
+ metadata = append(metadata, meta)
+ }
+ ml := log.Named(logPrefix, queryCriteria.Groups[0], queryCriteria.Name)
+ return &measureExecutionContext{
+ metadata: metadata,
+ schemas: schemas,
+ ecc: ecc,
+ ml: ml,
+ }, nil
+}
+
+// executeMeasurePlan executes the measure query plan and returns the iterator.
+func executeMeasurePlan(ctx context.Context, queryCriteria *measurev1.QueryRequest, mctx *measureExecutionContext) (executor.MIterator, logical.Plan, error) {
+	plan, planErr := logical_measure.Analyze(queryCriteria, mctx.metadata, mctx.schemas, mctx.ecc)
+ if planErr != nil {
+		return nil, nil, fmt.Errorf("fail to analyze the query request for measure %s: %w", queryCriteria.GetName(), planErr)
+ }
+ if e := mctx.ml.Debug(); e.Enabled() {
+ e.Str("plan", plan.String()).Msg("query plan")
+ }
+ mIterator, execErr := plan.(executor.MeasureExecutable).Execute(ctx)
+ if execErr != nil {
+		mctx.ml.Error().Err(execErr).RawJSON("req", logger.Proto(queryCriteria)).Msg("fail to query")
+		return nil, nil, fmt.Errorf("fail to execute the query plan for measure %s: %w", queryCriteria.GetName(), execErr)
+ }
+ return mIterator, plan, nil
+}
+
+// extractTagValuesFromInternalDataPoints extracts tag values from InternalDataPoints for RewriteAggTopNResult.
+func extractTagValuesFromInternalDataPoints(dataPoints []*measurev1.InternalDataPoint, groupByTags []string) map[string][]*modelv1.TagValue {
+ tagValueMap := make(map[string][]*modelv1.TagValue)
+ for _, idp := range dataPoints {
+ dp := idp.DataPoint
+ if dp == nil {
+ continue
+ }
+ for _, tagFamily := range dp.GetTagFamilies() {
+ for _, tag := range tagFamily.GetTags() {
+ tagName := tag.GetKey()
+				if len(groupByTags) == 0 || slices.Contains(groupByTags, tagName) {
+					tagValueMap[tagName] = append(tagValueMap[tagName], tag.GetValue())
+ }
+ }
+ }
+ }
+ return tagValueMap
+}
+
+// collectInternalDataPoints collects InternalDataPoints (with ShardId) from the iterator.
+func collectInternalDataPoints(mIterator executor.MIterator) []*measurev1.InternalDataPoint {
+ result := make([]*measurev1.InternalDataPoint, 0)
+ for mIterator.Next() {
+ current := mIterator.Current()
+ if len(current) > 0 {
+ dp := current[0]
+ internalDp := &measurev1.InternalDataPoint{
+ DataPoint: dp,
+ ShardId: uint32(mIterator.CurrentShardID()),
+ }
+ result = append(result, internalDp)
+ }
+ }
+ return result
+}
+
+// extractTagValuesFromDataPoints extracts tag values from DataPoints for RewriteAggTopNResult.
+func extractTagValuesFromDataPoints(dataPoints []*measurev1.DataPoint, groupByTags []string) map[string][]*modelv1.TagValue {
+ tagValueMap := make(map[string][]*modelv1.TagValue)
+ for _, dp := range dataPoints {
+ for _, tagFamily := range dp.GetTagFamilies() {
+ for _, tag := range tagFamily.GetTags() {
+ tagName := tag.GetKey()
+				if len(groupByTags) == 0 || slices.Contains(groupByTags, tagName) {
+					tagValueMap[tagName] = append(tagValueMap[tagName], tag.GetValue())
+ }
+ }
+ }
+ }
+ return tagValueMap
+}
Review Comment:
The functions extractTagValuesFromDataPoints and extractTagValuesFromInternalDataPoints have almost identical implementations; the only difference is the input type, which creates code duplication. Consider refactoring by having extractTagValuesFromInternalDataPoints call extractDataPoints first and then reuse extractTagValuesFromDataPoints (a sketch follows below), or extract a common helper function that operates on DataPoint objects.
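
A sketch of the first option, assuming extractDataPoints (referenced in the other hunk) unwraps InternalDataPoints into the underlying []*measurev1.DataPoint — the whole Internal variant could collapse to:

```go
// extractTagValuesFromInternalDataPoints extracts tag values from
// InternalDataPoints by unwrapping them and delegating to the
// DataPoint-based helper, removing the duplicated nested loops.
func extractTagValuesFromInternalDataPoints(dataPoints []*measurev1.InternalDataPoint, groupByTags []string) map[string][]*modelv1.TagValue {
	return extractTagValuesFromDataPoints(extractDataPoints(dataPoints), groupByTags)
}
```

The explicit nil check in the current Internal variant then becomes unnecessary: protobuf-generated getters such as GetTagFamilies return nil on a nil receiver, so a nil DataPoint simply contributes no tags.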
##########
banyand/measure/query.go:
##########
@@ -848,6 +855,7 @@ func (qr *queryResult) merge(storedIndexValue map[common.SeriesID]map[string]*mo
return result
}
lastSid = topBC.bm.seriesID
+ result.ShardID = topBC.shardID
Review Comment:
The ShardID assignment inside the merge loop will overwrite the shard ID on every iteration. If a single MeasureResult contains data from multiple blocks with different shard IDs (which shouldn't happen but isn't explicitly prevented), this could lead to an incorrect final shard ID. While this may work correctly in practice if all blocks in a single result come from the same shard, it would be clearer to either assert this condition (a sketch of that variant follows after the suggestion) or set the shard ID only once before the loop.
```suggestion
if result.ShardID == 0 {
result.ShardID = topBC.shardID
}
```
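
If the assertion route is preferred instead, a rough sketch of what it could look like, assuming a hypothetical `shardIDSet bool` declared alongside `lastSid` before the loop and `fmt` being importable in this file:

```go
// Fail loudly if blocks from different shards are mixed into one result,
// instead of silently keeping the shard ID of the last block seen.
if shardIDSet && result.ShardID != topBC.shardID {
	panic(fmt.Sprintf("merge: block from shard %d mixed into a result already tagged with shard %d",
		topBC.shardID, result.ShardID))
}
result.ShardID = topBC.shardID
shardIDSet = true
```

Unlike the `result.ShardID == 0` guard above, the boolean also behaves correctly when the legitimate shard ID is 0.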