This is an automated email from the ASF dual-hosted git repository.
wankai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking.git
The following commit(s) were added to refs/heads/master by this push:
new 75670e4e6e Simplify the alarm recovery record. (#13585)
75670e4e6e is described below
commit 75670e4e6ef36547caea44f5dee4ccfc90a14895
Author: Wan Kai <[email protected]>
AuthorDate: Wed Nov 19 21:12:42 2025 +0800
Simplify the alarm recovery record. (#13585)
---
docs/en/status/query_alarm_runtime_status.md | 40 +++++++-----
.../core/alarm/provider/AlarmStatusWatcher.java | 7 +++
.../server/core/alarm/provider/RunningRule.java | 1 +
.../alarm/provider/status/AlarmRunningContext.java | 3 +
.../oap/server/core/alarm/AlarmRecoveryRecord.java | 71 +---------------------
.../core/alarm/AlarmStandardPersistence.java | 15 -----
.../banyandb/stream/BanyanDBAlarmQueryDAO.java | 6 +-
7 files changed, 40 insertions(+), 103 deletions(-)
diff --git a/docs/en/status/query_alarm_runtime_status.md
b/docs/en/status/query_alarm_runtime_status.md
index 21cb44e5ab..803bce1d21 100644
--- a/docs/en/status/query_alarm_runtime_status.md
+++ b/docs/en/status/query_alarm_runtime_status.md
@@ -73,7 +73,7 @@ Return the detailed information of the alarm running rule.
{
"scope": "SERVICE",
"name": "mock_b_service",
- "formattedMessage": "Response time of mock_b_service is more than
upper baseline in 1 minutes of last 10 minutes."
+ "formattedMessage": "Service mock_b_service response time is more
than 1000ms of last 10 minutes."
}
],
"tags": [
@@ -108,12 +108,12 @@ Return the detailed information of the alarm running rule.
{
"scope": "SERVICE",
"name": "mock_a_service",
- "formattedMessage": "Response time of mock_a_service is more than
upper baseline in 1 minutes of last 10 minutes."
+ "formattedMessage": "Service mock_a_service response time is more
than 1000ms of last 10 minutes."
},
{
"scope": "SERVICE",
"name": "mock_c_service",
- "formattedMessage": "Response time of service mock_c_service is
more than upper baseline in 1 minutes of last 10 minutes."
+ "formattedMessage": "Service mock_c_service response time is more
than 1000ms of last 10 minutes."
}
],
"tags": [
@@ -155,13 +155,13 @@ Return the running context of the alarm rule.
"status": {
"ruleId": "service_resp_time_rule",
"expression": "sum(service_resp_time > 1000) >= 1",
- "endTime": "2025-11-10T09:39:00.000",
+ "endTime": "2025-11-19T15:20:00.000",
"additionalPeriod": 0,
"size": 10,
- "silencePeriod": 3,
- "recoveryObservationPeriod": 2,
+ "silencePeriod": 10,
+ "recoveryObservationPeriod": 0,
"silenceCountdown": 10,
- "recoveryObservationCountdown": 2,
+ "recoveryObservationCountdown": 0,
"currentState": "FIRING",
"entityName": "mock_b_service",
"windowValues": [
@@ -195,26 +195,30 @@ Return the running context of the alarm rule.
},
{
"index": 7,
+ "metrics": []
+ },
+ {
+ "index": 8,
"metrics": [
{
"name": "service_resp_time",
- "timeBucket": 202502121437,
+ "timeBucket": 202511191519,
"value": "6000"
}
]
},
- {
- "index": 8,
- "metrics": []
- },
{
"index": 9,
"metrics": []
}
],
"mqeMetricsSnapshot": {
- "service_resp_time":
"[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121431\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121432\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121433\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121434\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202502121435\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"2
[...]
- "baseline(service_resp_time,upper)":
"[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202502121430\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121431\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121432\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121433\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121434\",\"doubleValue\":10.0,\"isEmptyValue\":false},{\"id\":\"202502121435\",\"doubleValue\":10.0,\"isEmp
[...]
+ "service_resp_time":
"[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202511191511\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191512\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191513\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191514\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191515\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191516\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"2
[...]
+ },
+ "lastAlarmTime": 1763536823628,
+ "lastAlarmMessage": "Service mock_b_service response time is more than
1000ms of last 10 minutes.",
+ "lastAlarmMqeMetricsSnapshot": {
+ "service_resp_time":
"[{\"metric\":{\"labels\":[]},\"values\":[{\"id\":\"202511191511\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191512\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191513\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191514\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191515\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"202511191516\",\"doubleValue\":0.0,\"isEmptyValue\":true},{\"id\":\"2
[...]
}
}
},
@@ -227,7 +231,8 @@ Return the running context of the alarm rule.
"size": 0,
"silenceCountdown": 0,
"recoveryObservationCountdown": 0,
- "windowValues": []
+ "windowValues": [],
+ "lastAlarmTime": 0
}
}
]
@@ -237,8 +242,11 @@ Return the running context of the alarm rule.
`silenceCountdown` is the countdown of the silence period. -1 means silence
countdown is not running.
`recoveryObservationCountdown` is the countdown of the recovery observation
period.
`windowValues` is the original metrics data when the metrics come in. The
`index` is the index of the window, starting from 0.
-`mqeMetricsSnapshot` is the metrics data in the MQE format which is generated
when executing the checking.
+`mqeMetricsSnapshot` is the current metrics data in the MQE format which is
generated when executing the checking.
These data will be calculated according to the expression.
+`lastAlarmTime` is the time (Unix epoch, in milliseconds) at which the alarm was
last triggered. It is reset to 0 when the alarm recovers.
+`lastAlarmMessage` is the message of the most recently triggered alarm.
+`lastAlarmMqeMetricsSnapshot` is the metrics data snapshot, in the MQE format,
taken when the last alarm was triggered.
## Get Errors When Querying Status from OAP Instances
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
index 3342894ecc..859bda0e1a 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
@@ -22,6 +22,7 @@ import com.google.gson.Gson;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
+import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.AlarmModule;
import org.apache.skywalking.oap.server.core.alarm.AlarmRulesWatcherService;
import org.apache.skywalking.oap.server.core.alarm.AlarmStatusWatcherService;
@@ -169,6 +170,12 @@ public class AlarmStatusWatcher implements
AlarmStatusWatcherService {
}
});
runningContext.setMqeMetricsSnapshot(window.getMqeMetricsSnapshot());
+ AlarmMessage lastAlarmMessage = window.getLastAlarmMessage();
+ if (lastAlarmMessage != null) {
+
runningContext.setLastAlarmTime(window.getLastAlarmMessage().getStartTime());
+
runningContext.setLastAlarmMessage(window.getLastAlarmMessage().getAlarmMessage());
+
runningContext.setLastAlarmMqeMetricsSnapshot(window.getLastAlarmMessage().getMqeMetricsSnapshot());
+ }
return GSON.toJson(runningContext);
}
}
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
index 4bcf133cf1..a0d3563362 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
@@ -260,6 +260,7 @@ public class RunningRule {
private final AlarmStateMachine stateMachine;
private LinkedList<Map<String, Metrics>> values;
private ReentrantLock lock = new ReentrantLock();
+ @Getter
private AlarmMessage lastAlarmMessage;
@Getter
private JsonObject mqeMetricsSnapshot;
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
index 96a1447bcb..16e58828f1 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
@@ -38,6 +38,9 @@ public class AlarmRunningContext {
private String entityName;
private List<WindowValue> windowValues = new ArrayList<>();
private JsonObject mqeMetricsSnapshot;
+ private long lastAlarmTime;
+ private String lastAlarmMessage;
+ private JsonObject lastAlarmMqeMetricsSnapshot;
@Data
public static class Metric {
diff --git
a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java
b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java
index d569a900ab..882f68a31d 100644
---
a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java
+++
b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmRecoveryRecord.java
@@ -21,7 +21,6 @@ package org.apache.skywalking.oap.server.core.alarm;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.analysis.Stream;
-import org.apache.skywalking.oap.server.core.analysis.manual.searchtag.Tag;
import org.apache.skywalking.oap.server.core.analysis.record.Record;
import
org.apache.skywalking.oap.server.core.analysis.worker.RecordStreamProcessor;
import org.apache.skywalking.oap.server.core.source.DefaultScopeDefine;
@@ -30,114 +29,50 @@ import
org.apache.skywalking.oap.server.core.storage.StorageID;
import org.apache.skywalking.oap.server.core.storage.annotation.BanyanDB;
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
import org.apache.skywalking.oap.server.core.storage.annotation.ElasticSearch;
-import org.apache.skywalking.oap.server.core.storage.annotation.SQLDatabase;
import org.apache.skywalking.oap.server.core.storage.type.Convert2Entity;
import org.apache.skywalking.oap.server.core.storage.type.Convert2Storage;
import org.apache.skywalking.oap.server.core.storage.type.StorageBuilder;
-import java.util.List;
-
import static
org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.ALARM_RECOVERY;
-import static
org.apache.skywalking.oap.server.core.storage.StorageData.TIME_BUCKET;
@Getter
@Setter
@ScopeDeclaration(id = ALARM_RECOVERY, name = "AlarmRecovery")
@Stream(name = AlarmRecoveryRecord.INDEX_NAME, scopeId =
DefaultScopeDefine.ALARM_RECOVERY, builder = AlarmRecoveryRecord.Builder.class,
processor = RecordStreamProcessor.class)
[email protected](additionalTable =
AlarmRecoveryRecord.ADDITIONAL_TAG_TABLE, parentColumn = TIME_BUCKET)
[email protected](AlarmRecoveryRecord.START_TIME)
[email protected](AlarmRecoveryRecord.RECOVERY_TIME)
@BanyanDB.Group(streamGroup = BanyanDB.StreamGroup.RECORDS)
public class AlarmRecoveryRecord extends Record {
public static final String INDEX_NAME = "alarm_recovery_record";
- public static final String ADDITIONAL_TAG_TABLE = "alarm_record_tag";
public static final String UUID = "uuid";
- public static final String SCOPE = "scope";
- public static final String NAME = "name";
- public static final String ID0 = "id0";
- public static final String ID1 = "id1";
- public static final String START_TIME = "start_time";
public static final String RECOVERY_TIME = "recovery_time";
- public static final String ALARM_MESSAGE = "alarm_message";
- public static final String RULE_NAME = "rule_name";
- public static final String TAGS = "tags";
- public static final String TAGS_RAW_DATA = "tags_raw_data";
- public static final String SNAPSHOT = "snapshot";
@Override
public StorageID id() {
- return new StorageID()
- .append(TIME_BUCKET, getTimeBucket())
- .append(RULE_NAME, ruleName)
- .append(ID0, id0)
- .append(ID1, id1);
+ return new StorageID().append(UUID, uuid);
}
- @Column(name = SCOPE)
- private int scope;
- @Column(name = NAME, storageOnly = true, length = 512)
- private String name;
- @Column(name = ID0, storageOnly = true, length = 512)
- @BanyanDB.SeriesID(index = 0)
- private String id0;
- @Column(name = ID1, storageOnly = true)
- private String id1;
- @ElasticSearch.EnableDocValues
- @Column(name = START_TIME)
- private long startTime;
@ElasticSearch.EnableDocValues
@Column(name = RECOVERY_TIME)
private long recoveryTime;
- @Column(name = ALARM_MESSAGE, length = 512)
- @ElasticSearch.MatchQuery
- @BanyanDB.MatchQuery(analyzer = BanyanDB.MatchQuery.AnalyzerType.SIMPLE)
- private String alarmMessage;
- @Column(name = RULE_NAME)
- private String ruleName;
+ @BanyanDB.SeriesID(index = 0)
@Column(name = UUID)
private String uuid;
- @Column(name = TAGS, indexOnly = true)
- @SQLDatabase.AdditionalEntity(additionalTables = {ADDITIONAL_TAG_TABLE})
- private List<String> tagsInString;
- @Column(name = TAGS_RAW_DATA, storageOnly = true, length = Tag.TAG_LENGTH)
- private byte[] tagsRawData;
- @Column(name = SNAPSHOT, storageOnly = true, length = 50000)
- private String snapshot;
public static class Builder implements StorageBuilder<AlarmRecoveryRecord>
{
@Override
public AlarmRecoveryRecord storage2Entity(final Convert2Entity
converter) {
AlarmRecoveryRecord record = new AlarmRecoveryRecord();
- record.setScope(((Number) converter.get(SCOPE)).intValue());
- record.setName((String) converter.get(NAME));
record.setUuid((String) converter.get(UUID));
- record.setId0((String) converter.get(ID0));
- record.setId1((String) converter.get(ID1));
- record.setAlarmMessage((String) converter.get(ALARM_MESSAGE));
- record.setStartTime(((Number)
converter.get(START_TIME)).longValue());
record.setRecoveryTime(((Number)
converter.get(RECOVERY_TIME)).longValue());
record.setTimeBucket(((Number)
converter.get(TIME_BUCKET)).longValue());
- record.setRuleName((String) converter.get(RULE_NAME));
- record.setTagsRawData(converter.getBytes(TAGS_RAW_DATA));
- record.setSnapshot((String) converter.get(SNAPSHOT));
- // Don't read the TAGS as they are only for query.
return record;
}
@Override
public void entity2Storage(final AlarmRecoveryRecord storageData,
final Convert2Storage converter) {
- converter.accept(SCOPE, storageData.getScope());
- converter.accept(NAME, storageData.getName());
converter.accept(UUID, storageData.getUuid());
- converter.accept(ID0, storageData.getId0());
- converter.accept(ID1, storageData.getId1());
- converter.accept(ALARM_MESSAGE, storageData.getAlarmMessage());
- converter.accept(START_TIME, storageData.getStartTime());
converter.accept(RECOVERY_TIME, storageData.getRecoveryTime());
converter.accept(TIME_BUCKET, storageData.getTimeBucket());
- converter.accept(RULE_NAME, storageData.getRuleName());
- converter.accept(TAGS_RAW_DATA, storageData.getTagsRawData());
- converter.accept(TAGS, storageData.getTagsInString());
- converter.accept(SNAPSHOT, storageData.getSnapshot());
}
}
}
diff --git
a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java
b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java
index c4acfa8a00..b8b3d47bf9 100644
---
a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java
+++
b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/alarm/AlarmStandardPersistence.java
@@ -89,23 +89,8 @@ public class AlarmStandardPersistence implements
AlarmCallback {
AlarmRecoveryMessage alarmRecoveryMessage = (AlarmRecoveryMessage)
message;
AlarmRecoveryRecord record = new AlarmRecoveryRecord();
record.setUuid(message.getUuid());
- record.setScope(message.getScopeId());
- record.setId0(message.getId0());
- record.setId1(message.getId1());
- record.setName(message.getName());
- record.setAlarmMessage(message.getAlarmMessage());
- record.setStartTime(message.getStartTime());
record.setRecoveryTime(alarmRecoveryMessage.getRecoveryTime());
record.setTimeBucket(TimeBucket.getRecordTimeBucket(message.getStartTime()));
- record.setRuleName(message.getRuleName());
- Collection<Tag> tags = appendSearchableTags(message.getTags());
- addAutocompleteTags(tags,
TimeBucket.getMinuteTimeBucket(message.getStartTime()));
-
record.setTagsRawData(gson.toJson(message.getTags()).getBytes(Charsets.UTF_8));
- record.setTagsInString(Tag.Util.toStringList(new
ArrayList<>(tags)));
- AlarmSnapshotRecord snapshot = new AlarmSnapshotRecord();
- snapshot.setExpression(message.getExpression());
- snapshot.setMetrics(message.getMqeMetricsSnapshot());
- record.setSnapshot(gson.toJson(snapshot));
RecordStreamProcessor.getInstance().in(record);
});
}
diff --git
a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java
b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java
index a2908bdcd8..799f497d23 100644
---
a/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java
+++
b/oap-server/server-storage-plugin/storage-banyandb-plugin/src/main/java/org/apache/skywalking/oap/server/storage/plugin/banyandb/stream/BanyanDBAlarmQueryDAO.java
@@ -52,10 +52,8 @@ public class BanyanDBAlarmQueryDAO extends
AbstractBanyanDBDAO implements IAlarm
private static final Set<String> TAGS = ImmutableSet.of(AlarmRecord.SCOPE,
AlarmRecord.NAME, AlarmRecord.ID0, AlarmRecord.ID1,
AlarmRecord.UUID, AlarmRecord.ALARM_MESSAGE,
AlarmRecord.START_TIME, AlarmRecord.RULE_NAME, AlarmRecord.TAGS,
AlarmRecord.TAGS_RAW_DATA, AlarmRecord.SNAPSHOT);
- private static final Set<String> RECOVERY_TAGS =
ImmutableSet.of(AlarmRecoveryRecord.SCOPE,
- AlarmRecoveryRecord.NAME, AlarmRecord.ID0,
AlarmRecoveryRecord.ID1, AlarmRecoveryRecord.UUID,
- AlarmRecoveryRecord.ALARM_MESSAGE, AlarmRecoveryRecord.START_TIME,
AlarmRecoveryRecord.RECOVERY_TIME,
- AlarmRecoveryRecord.RULE_NAME, AlarmRecoveryRecord.TAGS,
AlarmRecoveryRecord.TAGS_RAW_DATA, AlarmRecoveryRecord.SNAPSHOT);
+ private static final Set<String> RECOVERY_TAGS = ImmutableSet.of(
+ AlarmRecoveryRecord.UUID, AlarmRecoveryRecord.RECOVERY_TIME);
public BanyanDBAlarmQueryDAO(BanyanDBStorageClient client) {
super(client);