This is an automated email from the ASF dual-hosted git repository.
wusheng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking.git
The following commit(s) were added to refs/heads/master by this push:
new e57eeeb7a5 Polish alarm recovery logic. (#13581)
e57eeeb7a5 is described below
commit e57eeeb7a535f8fa6a10a983aec71e171093a498
Author: Wan Kai <[email protected]>
AuthorDate: Tue Nov 18 18:57:50 2025 +0800
Polish alarm recovery logic. (#13581)
---
docs/en/setup/backend/backend-alarm.md | 16 +++++++------
docs/en/status/query_alarm_runtime_status.md | 8 +++++--
.../core/alarm/provider/AlarmStatusWatcher.java | 3 +++
.../server/core/alarm/provider/RunningRule.java | 26 ++++++++++----------
.../alarm/provider/status/AlarmRunningContext.java | 3 +++
.../core/alarm/provider/RunningRuleTest.java | 28 +++++++++++++---------
test/e2e-v2/cases/alarm/alarm-settings.yml | 2 +-
7 files changed, 53 insertions(+), 33 deletions(-)
diff --git a/docs/en/setup/backend/backend-alarm.md
b/docs/en/setup/backend/backend-alarm.md
index 60a60eaae8..f09a3ca1ab 100644
--- a/docs/en/setup/backend/backend-alarm.md
+++ b/docs/en/setup/backend/backend-alarm.md
@@ -40,7 +40,8 @@ The metrics names in the expression could be found in the
[list of all potential
- **Silence period**. After the alarm is triggered at Time-N (TN), there will
be silence during the **TN -> TN + period**.
By default, it works in the same manner as **period**. The same Alarm (having
the same ID in the same metrics name) may only be triggered once within a
period.
- **Recovery observation period**. Defines the number of consecutive periods
that the alarm condition must remain false before the alarm is considered
recovered. When the alarm condition becomes false, the system enters an
observation period. If the condition remains false for the specified number of
periods, a recovery notification is sent. If the condition becomes true again
during the observation period, the alarm returns to the FIRING state.
-The default value is 0, which means immediate recovery notification when the
condition becomes false.
+The default value is 0, which means immediate recovery notification when the
condition becomes false.
+**Notice:** because the alarm will not be triggered again during the silence
period, recovery won't be triggered during the silence period after an alarm is
fired. It will stay in the OBSERVING_RECOVERY state, and the recovery will be
triggered only after the silence period is over and the condition remains false
for the specified observation periods.
Such as for a metric, there is a shifting window as following at T7.
@@ -523,15 +524,16 @@ stateDiagram-v2
[*] --> NORMAL
NORMAL --> FIRING: Expression true<br/>not in silence period
- FIRING --> SILENCED: Expression true<br/>in silence period
- FIRING --> OBSERVING_RECOVERY: Expression false<br/>in recovery window
- FIRING --> RECOVERED: Expression false<br/>not in recovery window
+ FIRING --> SILENCED_FIRING: Expression true<br/>in silence period
+ FIRING --> OBSERVING_RECOVERY: Expression false<br/>in recovery window or
in silence period
+ FIRING --> RECOVERED: Expression false<br/>not in recovery window and not
in silence period
OBSERVING_RECOVERY --> FIRING: Expression true<br/>not in silence period
- OBSERVING_RECOVERY --> RECOVERED: Expression false<br/>not in recovery
window
+ OBSERVING_RECOVERY --> SILENCED_FIRING: Expression true<br/>in silence
period
+ OBSERVING_RECOVERY --> RECOVERED: Expression false<br/>not in recovery
window and not in silence period
- SILENCED --> RECOVERED: Expression false<br/>not in recovery window
- SILENCED --> OBSERVING_RECOVERY: Expression false<br/>in recovery window
+ SILENCED_FIRING --> RECOVERED: Expression false<br/>not in recovery window
and not in silence period
+ SILENCED_FIRING --> OBSERVING_RECOVERY: Expression false<br/>in recovery
window or in silence period
RECOVERED --> FIRING: Expression true<br/>not in silence period
RECOVERED --> NORMAL: Expression false
diff --git a/docs/en/status/query_alarm_runtime_status.md
b/docs/en/status/query_alarm_runtime_status.md
index 3568b2a495..21cb44e5ab 100644
--- a/docs/en/status/query_alarm_runtime_status.md
+++ b/docs/en/status/query_alarm_runtime_status.md
@@ -158,8 +158,11 @@ Return the running context of the alarm rule.
"endTime": "2025-11-10T09:39:00.000",
"additionalPeriod": 0,
"size": 10,
+ "silencePeriod": 3,
+ "recoveryObservationPeriod": 2,
"silenceCountdown": 10,
"recoveryObservationCountdown": 2,
+ "currentState": "FIRING",
"entityName": "mock_b_service",
"windowValues": [
{
@@ -233,8 +236,9 @@ Return the running context of the alarm rule.
`size` is the window size. Equal to the `period + additionalPeriod`.
`silenceCountdown` is the countdown of the silence period. -1 means silence
countdown is not running.
`recoveryObservationCountdown` is the countdown of the recovery observation
period.
-`windowValues` is the original metrics data. The `index` is the index of the
window, starting from 0.
-`mqeMetricsSnapshot` is the metrics data in the MQE format. When checking
conditions, these data will be calculated according to the expression.
+`windowValues` is the original metrics data when the metrics come in. The
`index` is the index of the window, starting from 0.
+`mqeMetricsSnapshot` is the metrics data in the MQE format which is generated
when executing the checking.
+These data will be calculated according to the expression.
## Get Errors When Querying Status from OAP Instances
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
index 1cda9bb005..3342894ecc 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmStatusWatcher.java
@@ -136,8 +136,11 @@ public class AlarmStatusWatcher implements
AlarmStatusWatcherService {
runningContext.setEndTime(window.getEndTime().toString());
runningContext.setAdditionalPeriod(window.getAdditionalPeriod());
runningContext.setSize(window.getSize());
+
runningContext.setSilencePeriod(window.getStateMachine().getSilencePeriod());
+
runningContext.setRecoveryObservationPeriod(window.getStateMachine().getRecoveryObservationPeriod());
runningContext.setSilenceCountdown(window.getStateMachine().getSilenceCountdown());
runningContext.setRecoveryObservationCountdown(window.getStateMachine().getRecoveryObservationCountdown());
+
runningContext.setCurrentState(window.getStateMachine().getCurrentState().name());
window.scanWindowValues(values -> {
for (int i = 0; i < values.size(); i++) {
AlarmRunningContext.WindowValue windowValue = new
AlarmRunningContext.WindowValue();
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
index e226b950dd..4bcf133cf1 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java
@@ -237,7 +237,7 @@ public class RunningRule {
public enum State {
NORMAL,
FIRING,
- SILENCED,
+ SILENCED_FIRING,
OBSERVING_RECOVERY,
RECOVERED
}
@@ -477,14 +477,12 @@ public class RunningRule {
}
}
+ @Getter
public class AlarmStateMachine {
- @Getter
private int silenceCountdown;
- @Getter
private int recoveryObservationCountdown;
private final int silencePeriod;
private final int recoveryObservationPeriod;
- @Getter
private State currentState;
public AlarmStateMachine(int silencePeriod, int
recoveryObservationPeriod) {
@@ -503,16 +501,20 @@ public class RunningRule {
silenceCountdown--;
switch (currentState) {
case NORMAL:
- case SILENCED:
+ transitionTo(State.FIRING);
+ break;
+ case SILENCED_FIRING:
case OBSERVING_RECOVERY:
case RECOVERED:
if (silenceCountdown < 0) {
transitionTo(State.FIRING);
+ } else {
+ transitionTo(State.SILENCED_FIRING);
}
break;
case FIRING:
if (silenceCountdown >= 0) {
- transitionTo(State.SILENCED);
+ transitionTo(State.SILENCED_FIRING);
}
break;
default:
@@ -531,15 +533,15 @@ public class RunningRule {
silenceCountdown--;
switch (currentState) {
case FIRING:
- case SILENCED:
- if (this.recoveryObservationCountdown < 0) {
+ case SILENCED_FIRING:
+ if (this.recoveryObservationCountdown < 0 &&
silenceCountdown < 0) {
transitionTo(State.RECOVERED);
} else {
transitionTo(State.OBSERVING_RECOVERY);
}
break;
case OBSERVING_RECOVERY:
- if (recoveryObservationCountdown < 0) {
+ if (recoveryObservationCountdown < 0 &&
silenceCountdown < 0) {
transitionTo(State.RECOVERED);
}
break;
@@ -564,9 +566,9 @@ public class RunningRule {
break;
case FIRING:
this.silenceCountdown = this.silencePeriod;
- this.recoveryObservationCountdown =
recoveryObservationPeriod;
+ this.recoveryObservationCountdown =
this.recoveryObservationPeriod;
break;
- case SILENCED:
+ case SILENCED_FIRING:
break;
case OBSERVING_RECOVERY:
this.recoveryObservationCountdown =
this.recoveryObservationPeriod - 1;
@@ -578,7 +580,7 @@ public class RunningRule {
}
private void resetCountdowns() {
- recoveryObservationCountdown = this.recoveryObservationPeriod;
+ this.recoveryObservationCountdown =
this.recoveryObservationPeriod;
}
}
diff --git
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
index 8d98d8960f..96a1447bcb 100644
---
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
+++
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/status/AlarmRunningContext.java
@@ -30,8 +30,11 @@ public class AlarmRunningContext {
private String endTime;
private int additionalPeriod;
private int size;
+ private int silencePeriod;
+ private int recoveryObservationPeriod;
private int silenceCountdown;
private int recoveryObservationCountdown;
+ private String currentState;
private String entityName;
private List<WindowValue> windowValues = new ArrayList<>();
private JsonObject mqeMetricsSnapshot;
diff --git
a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
index e6030177f0..69d425cc43 100644
---
a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
+++
b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java
@@ -740,13 +740,13 @@ public class RunningRuleTest {
runningRule.in(getMetaInAlarm(123),
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(1).getMillis()),
72));
alarmMessages = getAlarmFiringMessageList(runningRule.check());
Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced");
- Assertions.assertEquals(RunningRule.State.SILENCED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(RunningRule.State.SILENCED_FIRING,
stateMachine.getCurrentState());
runningRule.in(getMetaInAlarm(123),
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(2).getMillis()),
72));
runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime());
alarmMessages = getAlarmFiringMessageList(runningRule.check());
Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced");
- Assertions.assertEquals(RunningRule.State.SILENCED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(RunningRule.State.SILENCED_FIRING,
stateMachine.getCurrentState());
runningRule.in(getMetaInAlarm(123),
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(3).getMillis()),
80));
runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime());
@@ -758,16 +758,22 @@ public class RunningRuleTest {
runningRule.moveTo(startTime.plusMinutes(4).toLocalDateTime());
alarmMessages = getAlarmFiringMessageList(runningRule.check());
Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced");
- Assertions.assertEquals(RunningRule.State.SILENCED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(RunningRule.State.SILENCED_FIRING,
stateMachine.getCurrentState());
runningRule.in(getMetaInAlarm(123),
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(5).getMillis()),
80));
runningRule.moveTo(startTime.plusMinutes(5).toLocalDateTime());
alarmMessages = getAlarmRecoveryMessageList(runningRule.check());
- Assertions.assertEquals(1, alarmMessages.size(), "Should recover
immediately");
- Assertions.assertEquals(RunningRule.State.RECOVERED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(0, alarmMessages.size(), "Should not recover
immediately");
+ Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY,
stateMachine.getCurrentState());
runningRule.in(getMetaInAlarm(123),
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(6).getMillis()),
80));
runningRule.moveTo(startTime.plusMinutes(6).toLocalDateTime());
+ alarmMessages = getAlarmRecoveryMessageList(runningRule.check());
+ Assertions.assertEquals(1, alarmMessages.size(), "Should recover after
silence period");
+ Assertions.assertEquals(RunningRule.State.RECOVERED,
stateMachine.getCurrentState());
+
+ runningRule.in(getMetaInAlarm(123),
getMetrics(TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(7).getMillis()),
80));
+ runningRule.moveTo(startTime.plusMinutes(7).toLocalDateTime());
alarmMessages = getAlarmFiringMessageList(runningRule.check());
Assertions.assertEquals(0, alarmMessages.size(), "Should be normal");
Assertions.assertEquals(RunningRule.State.NORMAL,
stateMachine.getCurrentState());
@@ -858,23 +864,23 @@ public class RunningRuleTest {
alarmMessages = getAlarmFiringMessageList(runningRule.check());
if (i < 3) {
Assertions.assertEquals(0, alarmMessages.size(), "Should be
silenced at minute " + i);
- Assertions.assertEquals(RunningRule.State.SILENCED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(RunningRule.State.SILENCED_FIRING,
stateMachine.getCurrentState());
} else {
Assertions.assertEquals(1, alarmMessages.size(), "Should fire
after silence period");
Assertions.assertEquals(RunningRule.State.FIRING,
stateMachine.getCurrentState());
}
}
- for (int i = 0; i <= 2; i++) {
+ for (int i = 0; i <= 3; i++) {
runningRule.moveTo(startTime.plusMinutes(8 + i).toLocalDateTime());
runningRule.in(getMetaInAlarm(123), getMetrics(
TimeBucket.getMinuteTimeBucket(startTime.plusMinutes(8 +
i).getMillis()), 80));
- if (i < 2) {
+ if (i < 3) {
List<AlarmMessage> recoveryMessages =
getAlarmRecoveryMessageList(runningRule.check());
Assertions.assertEquals(0, recoveryMessages.size(), "Should
not recover immediately");
Assertions.assertEquals(RunningRule.State.OBSERVING_RECOVERY,
stateMachine.getCurrentState());
} else {
List<AlarmMessage> recoveryMessages =
getAlarmRecoveryMessageList(runningRule.check());
- Assertions.assertEquals(1, recoveryMessages.size(), "Should
recover after observation period");
+ Assertions.assertEquals(1, recoveryMessages.size(), "Should
recover after silence period");
Assertions.assertEquals(RunningRule.State.RECOVERED,
stateMachine.getCurrentState());
}
}
@@ -914,12 +920,12 @@ public class RunningRuleTest {
runningRule.moveTo(startTime.plusMinutes(1).toLocalDateTime());
alarmMessages = getAlarmFiringMessageList(runningRule.check());
Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced");
- Assertions.assertEquals(RunningRule.State.SILENCED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(RunningRule.State.SILENCED_FIRING,
stateMachine.getCurrentState());
runningRule.moveTo(startTime.plusMinutes(2).toLocalDateTime());
alarmMessages = getAlarmFiringMessageList(runningRule.check());
Assertions.assertEquals(0, alarmMessages.size(), "Should be silenced");
- Assertions.assertEquals(RunningRule.State.SILENCED,
stateMachine.getCurrentState());
+ Assertions.assertEquals(RunningRule.State.SILENCED_FIRING,
stateMachine.getCurrentState());
runningRule.moveTo(startTime.plusMinutes(3).toLocalDateTime());
alarmMessages = getAlarmFiringMessageList(runningRule.check());
diff --git a/test/e2e-v2/cases/alarm/alarm-settings.yml
b/test/e2e-v2/cases/alarm/alarm-settings.yml
index 8679261c44..2fc34b05e5 100755
--- a/test/e2e-v2/cases/alarm/alarm-settings.yml
+++ b/test/e2e-v2/cases/alarm/alarm-settings.yml
@@ -38,7 +38,7 @@ rules:
- webhook.custom
comp_rule:
expression: sum((service_resp_time > 100) && (service_sla > 1)) >= 1
- period: 10
+ period: 5
recovery-observation-period: 3
message: Service {name} response time is more than 100ms and sla is more
than 1%.
tags: