This is an automated email from the ASF dual-hosted git repository. wusheng pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/skywalking.git
The following commit(s) were added to refs/heads/master by this push: new 062b31f4ba Fix AlarmRule expression validation: add labeled metrics mock data for check. (#11437) 062b31f4ba is described below commit 062b31f4ba5586c33f301dd4052f602894b784c4 Author: Wan Kai <wankai...@foxmail.com> AuthorDate: Fri Oct 20 10:45:15 2023 +0800 Fix AlarmRule expression validation: add labeled metrics mock data for check. (#11437) --- docs/en/changes/changes.md | 1 + .../oap/server/core/alarm/provider/AlarmRule.java | 18 +----- .../provider/expr/rt/AlarmMQEVerifyVisitor.java | 66 ++++++++++++++++++++-- .../server/core/alarm/provider/AlarmRuleTest.java | 18 ++++-- test/e2e-v2/cases/alarm/alarm-settings.yml | 11 ++-- .../alarm/expected/silence-after-graphql-warn.yml | 48 ++++++++++++++++ .../alarm/expected/silence-before-graphql-warn.yml | 24 ++++++++ 7 files changed, 156 insertions(+), 30 deletions(-) diff --git a/docs/en/changes/changes.md b/docs/en/changes/changes.md index a027a04a80..cebc12bf8c 100644 --- a/docs/en/changes/changes.md +++ b/docs/en/changes/changes.md @@ -25,6 +25,7 @@ * Fix `AlarmCore` doAlarm: catch exception for each callback to avoid interruption. * Optimize queryBasicTraces in TraceQueryEsDAO. * Fix `WebhookCallback` send incorrect messages, add catch exception for each callback HTTP Post. +* Fix AlarmRule expression validation: add labeled metrics mock data for check. #### UI diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java index 5094957265..76c6e6cc1a 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java @@ -21,7 +21,6 @@ package org.apache.skywalking.oap.server.core.alarm.provider; import java.util.ArrayList; import java.util.HashSet; import java.util.Map; -import java.util.Optional; import java.util.Set; import lombok.AccessLevel; import lombok.Data; @@ -39,7 +38,6 @@ import org.apache.skywalking.mqe.rt.grammar.MQEParser; import org.apache.skywalking.mqe.rt.type.ExpressionResult; import org.apache.skywalking.mqe.rt.type.ExpressionResultType; import org.apache.skywalking.oap.server.core.alarm.provider.expr.rt.AlarmMQEVerifyVisitor; -import org.apache.skywalking.oap.server.core.storage.annotation.Column; import org.apache.skywalking.oap.server.core.storage.annotation.ValueColumnMetadata; import org.apache.skywalking.oap.server.library.util.StringUtil; @@ -95,25 +93,11 @@ public class AlarmRule { private void verifyIncludeMetrics(Set<String> includeMetrics, String expression) throws IllegalExpressionException { Set<String> scopeSet = new HashSet<>(); for (String metricName : includeMetrics) { - Optional<ValueColumnMetadata.ValueColumn> valueColumn = ValueColumnMetadata.INSTANCE.readValueColumnDefinition( - metricName); - if (valueColumn.isEmpty()) { - throw new IllegalExpressionException("Metric: [" + metricName + "] dose not exist."); - } scopeSet.add(ValueColumnMetadata.INSTANCE.getScope(metricName).name()); - Column.ValueDataType dataType = valueColumn.get().getDataType(); - switch (dataType) { - case COMMON_VALUE: - case LABELED_VALUE: - break; - default: - throw new IllegalExpressionException( - "Metric dose not supported in alarm, metric: [" + metricName + "] is not a common or labeled metric."); - } } if (scopeSet.size() != 1) { throw new IllegalExpressionException( - "The metrics in expression: " + expression + " must have the same scope level, but got: " + scopeSet); + "The metrics in expression: " + expression + " must have the same scope level, but got: " + scopeSet + "."); } } } diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java index 78c93afdd7..c6c556f9c8 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/expr/rt/AlarmMQEVerifyVisitor.java @@ -18,7 +18,12 @@ package org.apache.skywalking.oap.server.core.alarm.provider.expr.rt; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; +import java.util.List; +import java.util.Optional; import java.util.Set; import lombok.Getter; import lombok.extern.slf4j.Slf4j; @@ -28,6 +33,13 @@ import org.apache.skywalking.mqe.rt.type.ExpressionResult; import org.apache.skywalking.mqe.rt.type.ExpressionResultType; import org.apache.skywalking.mqe.rt.type.MQEValue; import org.apache.skywalking.mqe.rt.type.MQEValues; +import org.apache.skywalking.mqe.rt.type.Metadata; +import org.apache.skywalking.oap.server.core.Const; +import org.apache.skywalking.oap.server.core.query.type.KeyValue; +import org.apache.skywalking.oap.server.core.storage.annotation.Column; +import org.apache.skywalking.oap.server.core.storage.annotation.ValueColumnMetadata; +import org.apache.skywalking.oap.server.library.util.CollectionUtils; +import org.apache.skywalking.oap.server.library.util.StringUtil; /** * Used for verify the alarm expression and get the metrics name when read the alarm rules. @@ -41,6 +53,14 @@ public class AlarmMQEVerifyVisitor extends MQEVisitorBase { public ExpressionResult visitMetric(MQEParser.MetricContext ctx) { ExpressionResult result = new ExpressionResult(); String metricName = ctx.metricName().getText(); + Optional<ValueColumnMetadata.ValueColumn> valueColumn = ValueColumnMetadata.INSTANCE.readValueColumnDefinition( + metricName); + if (valueColumn.isEmpty()) { + result.setType(ExpressionResultType.UNKNOWN); + result.setError("Metric: [" + metricName + "] dose not exist."); + return result; + } + this.includeMetrics.add(metricName); if (ctx.parent instanceof MQEParser.TopNOPContext) { @@ -48,13 +68,49 @@ public class AlarmMQEVerifyVisitor extends MQEVisitorBase { result.setError("Unsupported operation: [top_n] in alarm expression."); return result; } + Column.ValueDataType dataType = valueColumn.get().getDataType(); - MQEValues mqeValues = new MQEValues(); + MQEValues mockMqeValues = new MQEValues(); MQEValue mqeValue = new MQEValue(); mqeValue.setEmptyValue(true); - mqeValues.getValues().add(mqeValue); - result.getResults().add(mqeValues); - result.setType(ExpressionResultType.TIME_SERIES_VALUES); - return result; + mockMqeValues.getValues().add(mqeValue); + if (dataType == Column.ValueDataType.COMMON_VALUE) { + result.getResults().add(mockMqeValues); + result.setType(ExpressionResultType.TIME_SERIES_VALUES); + return result; + } else if (dataType == Column.ValueDataType.LABELED_VALUE) { + List<String> labelValues = Collections.emptyList(); + if (ctx.label() != null) { + String labelValue = ctx.label().labelValue().getText(); + String labelValueTrim = labelValue.substring(1, labelValue.length() - 1); + if (StringUtil.isNotBlank(labelValueTrim)) { + labelValues = Arrays.asList(labelValueTrim.split(Const.COMMA)); + } + } + ArrayList<MQEValues> mqeValuesList = new ArrayList<>(); + if (CollectionUtils.isEmpty(labelValues)) { + KeyValue label = new KeyValue(GENERAL_LABEL_NAME, GENERAL_LABEL_NAME); + Metadata metadata = new Metadata(); + metadata.getLabels().add(label); + mockMqeValues.setMetric(metadata); + mqeValuesList.add(mockMqeValues); + } else { + for (String value : labelValues) { + Metadata metadata = new Metadata(); + KeyValue label = new KeyValue(GENERAL_LABEL_NAME, value); + metadata.getLabels().add(label); + mockMqeValues.setMetric(metadata); + mqeValuesList.add(mockMqeValues); + } + } + result.setType(ExpressionResultType.TIME_SERIES_VALUES); + result.setResults(mqeValuesList); + result.setLabeledResult(true); + return result; + } else { + result.setType(ExpressionResultType.UNKNOWN); + result.setError("Metric dose not supported in alarm, metric: [" + metricName + "] is not a common or labeled metric."); + return result; + } } } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java index 098d1d3592..64e353587a 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRuleTest.java @@ -41,6 +41,10 @@ public class AlarmRuleTest { "endpoint_percent", "testColumn", Column.ValueDataType.COMMON_VALUE, Function.Avg, 0, Scope.Endpoint.getScopeId() ); + ValueColumnMetadata.INSTANCE.putIfAbsent( + "meter_status_code", "testColumn", Column.ValueDataType.LABELED_VALUE, Function.Avg, 0, + Scope.Service.getScopeId() + ); ValueColumnMetadata.INSTANCE.putIfAbsent( "record", "testColumn", Column.ValueDataType.SAMPLED_RECORD, Function.Avg, 0, Scope.Endpoint.getScopeId() @@ -58,9 +62,15 @@ public class AlarmRuleTest { @Test public void testExpressionVerify() throws IllegalExpressionException { AlarmRule rule = new AlarmRule(); - //normal + //normal common metric rule.setExpression("sum(service_percent < 85) >= 3"); - + //normal labeled metric + //4xx + 5xx > 10 + rule.setExpression("sum(aggregate_labels(meter_status_code{_='4xx,5xx'},sum) > 10) > 3"); + rule.setExpression("sum(aggregate_labels(meter_status_code,sum) > 10) > 3"); + //4xx or 5xx > 10 + rule.setExpression("sum(meter_status_code{_='4xx,5xx'} > 10) >= 3"); + rule.setExpression("sum(meter_status_code > 10) >= 3"); //illegal expression Assertions.assertThrows(IllegalExpressionException.class, () -> { rule.setExpression("what? sum(service_percent < 85) >= 3"); @@ -68,7 +78,7 @@ public class AlarmRuleTest { //not exist metric Assertions.assertEquals( - "Metric: [service_percent111] dose not exist.", + "Expression: sum(service_percent111 < 85) >= 3 error: Metric: [service_percent111] dose not exist.", Assertions.assertThrows(IllegalExpressionException.class, () -> { rule.setExpression("sum(service_percent111 < 85) >= 3"); }).getMessage() @@ -92,7 +102,7 @@ public class AlarmRuleTest { //not a common or labeled metric Assertions.assertEquals( - "Metric dose not supported in alarm, metric: [record] is not a common or labeled metric.", + "Expression: sum(record < 85) > 1 error: Metric dose not supported in alarm, metric: [record] is not a common or labeled metric.", Assertions.assertThrows(IllegalExpressionException.class, () -> { rule.setExpression("sum(record < 85) > 1"); }).getMessage() diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml b/test/e2e-v2/cases/alarm/alarm-settings.yml index 4046d44740..40df6f0a75 100755 --- a/test/e2e-v2/cases/alarm/alarm-settings.yml +++ b/test/e2e-v2/cases/alarm/alarm-settings.yml @@ -25,12 +25,15 @@ rules: receivers: lisi hooks: - webhook.custom - # service sla > 1% - service_sla_rule: - expression: sum(service_sla > 100) >= 1 + # service_percentile > 10ms + service_percentile_rule: + expression: sum(service_percentile{_='0,1,2,3,4'} > 10) >= 3 period: 10 silence-period: 1 - message: Successful rate of service {name} is more than 1% in 1 minutes of last 10 minutes + message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + tags: + level: WARNING + receivers: lisi hooks: - webhook.none comp_rule: diff --git a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml index 6fd21b956a..641150f9da 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-after-graphql-warn.yml @@ -15,6 +15,30 @@ msgs: {{- contains .msgs }} + - starttime: {{ gt .starttime 0 }} + scope: Service + id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + events: + {{- contains .events }} + - uuid: {{ notEmpty .uuid }} + source: + service: e2e-service-provider + serviceinstance: "" + endpoint: "" + name: Alarm + type: "" + message: {{ notEmpty .message }} + parameters: [] + starttime: {{ gt .starttime 0 }} + endtime: {{ gt .endtime 0 }} + layer: GENERAL + {{- end }} - starttime: {{ gt .starttime 0 }} scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 @@ -39,6 +63,30 @@ msgs: endtime: {{ gt .endtime 0 }} layer: GENERAL {{- end }} + - starttime: {{ gt .starttime 0 }} + scope: Service + id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + events: + {{- contains .events }} + - uuid: {{ notEmpty .uuid }} + source: + service: e2e-service-provider + serviceinstance: "" + endpoint: "" + name: Alarm + type: "" + message: {{ notEmpty .message }} + parameters: [] + starttime: {{ gt .starttime 0 }} + endtime: {{ gt .endtime 0 }} + layer: GENERAL + {{- end }} - starttime: {{ gt .starttime 0 }} scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 diff --git a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml index 4a6e55462e..e8d5c1ca88 100644 --- a/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml +++ b/test/e2e-v2/cases/alarm/expected/silence-before-graphql-warn.yml @@ -15,6 +15,30 @@ msgs: {{- contains .msgs }} + - starttime: {{ gt .starttime 0 }} + scope: Service + id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1 + message: Percentile response time of service e2e-service-provider alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 10, p75 > 10, p90 > 10, p95 > 10, p99 > 10. + tags: + - key: level + value: WARNING + - key: receivers + value: lisi + events: + {{- contains .events }} + - uuid: {{ notEmpty .uuid }} + source: + service: e2e-service-provider + serviceinstance: "" + endpoint: "" + name: Alarm + type: "" + message: {{ notEmpty .message }} + parameters: [] + starttime: {{ gt .starttime 0 }} + endtime: {{ gt .endtime 0 }} + layer: GENERAL + {{- end }} - starttime: {{ gt .starttime 0 }} scope: Service id: ZTJlLXNlcnZpY2UtcHJvdmlkZXI=.1