This is an automated email from the ASF dual-hosted git repository.
manishswaminathan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new e05ef7b068 Adds metrics for task states (#14785)
e05ef7b068 is described below
commit e05ef7b0683241490cb0f15343e93579e8cfcf44
Author: NOOB <[email protected]>
AuthorDate: Wed Jan 22 21:56:27 2025 +0530
Adds metrics for task states (#14785)
* Adds metrics for task states
* Adds tests
---
.../configs/controller.yml | 2 +-
.../pinot/common/metrics/ControllerGauge.java | 4 +++
.../ControllerPrometheusMetricsTest.java | 3 ++
.../core/minion/PinotHelixTaskResourceManager.java | 30 +++++++++++++++++-
.../helix/core/minion/TaskMetricsEmitter.java | 18 +++++++++++
.../helix/core/minion/TaskMetricsEmitterTest.java | 37 ++++++++++++++++++++--
6 files changed, 89 insertions(+), 5 deletions(-)
diff --git
a/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
b/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
index 2281a9ea41..a1b945ab4a 100644
--- a/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
+++ b/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
@@ -17,7 +17,7 @@ rules:
tableType: "$6"
partition: "$7"
# Gauges that accept the controller taskType
-- pattern:
"\"org\\.apache\\.pinot\\.common\\.metrics\"<type=\"ControllerMetrics\",
name=\"pinot\\.controller\\.(numMinionTasksInProgress|numMinionSubtasksRunning|numMinionSubtasksWaiting|numMinionSubtasksError|numMinionSubtasksUnknown|percentMinionSubtasksInQueue|percentMinionSubtasksInError)\\.(\\w+)\"><>(\\w+)"
+- pattern:
"\"org\\.apache\\.pinot\\.common\\.metrics\"<type=\"ControllerMetrics\",
name=\"pinot\\.controller\\.(numMinionTasksInProgress|numMinionSubtasksRunning|numMinionSubtasksWaiting|numMinionSubtasksError|numMinionSubtasksUnknown|numMinionSubtasksDropped|numMinionSubtasksTimedOut|numMinionSubtasksAborted|percentMinionSubtasksInQueue|percentMinionSubtasksInError)\\.(\\w+)\"><>(\\w+)"
name: "pinot_controller_$1_$3"
cache: true
labels:
diff --git
a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
index a978219343..99bd892066 100644
---
a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
+++
b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
@@ -64,11 +64,15 @@ public enum ControllerGauge implements
AbstractMetrics.Gauge {
TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE("TimeMsSinceLastMinionTaskMetadataUpdate",
false),
TIME_MS_SINCE_LAST_SUCCESSFUL_MINION_TASK_GENERATION("TimeMsSinceLastSuccessfulMinionTaskGeneration",
false),
LAST_MINION_TASK_GENERATION_ENCOUNTERS_ERROR("LastMinionTaskGenerationEncountersError",
false),
+ // TODO: Unify below subtask metrics into a single metric with status label
NUM_MINION_TASKS_IN_PROGRESS("NumMinionTasksInProgress", true),
NUM_MINION_SUBTASKS_WAITING("NumMinionSubtasksWaiting", true),
NUM_MINION_SUBTASKS_RUNNING("NumMinionSubtasksRunning", true),
NUM_MINION_SUBTASKS_ERROR("NumMinionSubtasksError", true),
NUM_MINION_SUBTASKS_UNKNOWN("NumMinionSubtasksUnknown", true),
+ NUM_MINION_SUBTASKS_DROPPED("NumMinionSubtasksDropped", true),
+ NUM_MINION_SUBTASKS_TIMED_OUT("NumMinionSubtasksTimedOut", true),
+ NUM_MINION_SUBTASKS_ABORTED("NumMinionSubtasksAborted", true),
PERCENT_MINION_SUBTASKS_IN_QUEUE("PercentMinionSubtasksInQueue", true),
PERCENT_MINION_SUBTASKS_IN_ERROR("PercentMinionSubtasksInError", true),
TIER_BACKEND_TABLE_COUNT("TierBackendTableCount", true),
diff --git
a/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
b/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
index 1f458a4448..1061a0af54 100644
---
a/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
+++
b/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
@@ -41,6 +41,9 @@ public abstract class ControllerPrometheusMetricsTest extends
PinotPrometheusMet
List.of(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS,
ControllerGauge.NUM_MINION_SUBTASKS_RUNNING,
ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
ControllerGauge.NUM_MINION_SUBTASKS_ERROR,
ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN,
+ ControllerGauge.NUM_MINION_SUBTASKS_DROPPED,
+ ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT,
+ ControllerGauge.NUM_MINION_SUBTASKS_ABORTED,
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE,
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR);
//local gauges that accept partition
diff --git
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
index f8b258265a..87580c30b0 100644
---
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
+++
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
@@ -1126,7 +1126,7 @@ public class PinotHelixTaskResourceManager {
}
}
- @JsonPropertyOrder({"total", "completed", "running", "waiting", "error",
"unknown"})
+ @JsonPropertyOrder({"total", "completed", "running", "waiting", "error",
"unknown", "dropped", "timedOut", "aborted"})
public static class TaskCount {
private int _waiting; // Number of tasks waiting to be scheduled on
minions
private int _error; // Number of tasks in error
@@ -1134,6 +1134,10 @@ public class PinotHelixTaskResourceManager {
private int _completed; // Number of tasks completed normally
private int _unknown; // Number of tasks with all other states
private int _total; // Total number of tasks in the batch
+ private int _dropped; // Total number of tasks dropped
+ // (Task can be dropped due to no available assigned instance, etc.)
+ private int _timedOut; // Total number of tasks timed out
+ private int _aborted; // Total number of tasks aborted
public TaskCount() {
}
@@ -1157,6 +1161,15 @@ public class PinotHelixTaskResourceManager {
case COMPLETED:
_completed++;
break;
+ case DROPPED:
+ _dropped++;
+ break;
+ case TIMED_OUT:
+ _timedOut++;
+ break;
+ case TASK_ABORTED:
+ _aborted++;
+ break;
default:
_unknown++;
break;
@@ -1188,6 +1201,18 @@ public class PinotHelixTaskResourceManager {
return _unknown;
}
+ public int getDropped() {
+ return _dropped;
+ }
+
+ public int getTimedOut() {
+ return _timedOut;
+ }
+
+ public int getAborted() {
+ return _aborted;
+ }
+
public void accumulate(TaskCount other) {
_waiting += other.getWaiting();
_running += other.getRunning();
@@ -1195,6 +1220,9 @@ public class PinotHelixTaskResourceManager {
_completed += other.getCompleted();
_unknown += other.getUnknown();
_total += other.getTotal();
+ _dropped += other.getDropped();
+ _timedOut += other.getTimedOut();
+ _aborted += other.getAborted();
}
}
}
diff --git
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
index ace3694485..9eb6921053 100644
---
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
+++
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
@@ -116,6 +116,12 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
taskTypeAccumulatedCount.getError());
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN,
taskType,
taskTypeAccumulatedCount.getUnknown());
+
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_DROPPED,
taskType,
+ taskTypeAccumulatedCount.getDropped());
+
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT,
taskType,
+ taskTypeAccumulatedCount.getTimedOut());
+
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ABORTED,
taskType,
+ taskTypeAccumulatedCount.getAborted());
int total = taskTypeAccumulatedCount.getTotal();
int percent = total != 0
? (taskTypeAccumulatedCount.getWaiting() +
taskTypeAccumulatedCount.getRunning()) * 100 / total : 0;
@@ -133,6 +139,12 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError());
_controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN,
taskCount.getUnknown());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_DROPPED,
taskCount.getDropped());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT,
taskCount.getTimedOut());
+ _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+ ControllerGauge.NUM_MINION_SUBTASKS_ABORTED,
taskCount.getAborted());
int tableTotal = taskCount.getTotal();
int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() +
taskCount.getRunning()) * 100 / tableTotal : 0;
_controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
@@ -168,6 +180,9 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
_controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.NUM_MINION_SUBTASKS_WAITING);
_controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.NUM_MINION_SUBTASKS_ERROR);
_controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN);
+ _controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.NUM_MINION_SUBTASKS_DROPPED);
+ _controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT);
+ _controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.NUM_MINION_SUBTASKS_ABORTED);
_controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE);
_controllerMetrics.removeGlobalGauge(taskType,
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR);
// remove table task type level gauges
@@ -198,6 +213,9 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
_controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_WAITING);
_controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_ERROR);
_controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN);
+ _controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_DROPPED);
+ _controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT);
+ _controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.NUM_MINION_SUBTASKS_ABORTED);
_controllerMetrics.removeTableGauge(tableNameWithType, taskType,
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE);
_controllerMetrics.removeTableGauge(tableNameWithType, taskType,
diff --git
a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
index bd88f2731c..6ed301a069 100644
---
a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
+++
b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
@@ -84,7 +84,7 @@ public class TaskMetricsEmitterTest {
Mockito.when(_pinotHelixTaskResourceManager.getTasksInProgress(taskType)).thenReturn(ImmutableSet.of());
_taskMetricsEmitter.runTask(null);
- Assert.assertEquals(metricsRegistry.allMetrics().size(), 8);
+ Assert.assertEquals(metricsRegistry.allMetrics().size(), 11);
Assert.assertTrue(metricsRegistry.allMetrics().containsKey(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.onlineMinionInstances")));
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
@@ -99,6 +99,18 @@ public class TaskMetricsEmitterTest {
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksError.taskType1"))
.getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksUnknown.taskType1"))
+ .getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksDropped.taskType1"))
+ .getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksTimedOut.taskType1"))
+ .getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksAborted.taskType1"))
+ .getMetric()).value(), 0L);
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.percentMinionSubtasksInQueue.taskType1"))
.getMetric()).value(), 0L);
@@ -144,7 +156,7 @@ public class TaskMetricsEmitterTest {
private void runAndAssertForTaskType1WithTwoTables() {
PinotMetricsRegistry metricsRegistry =
_controllerMetrics.getMetricsRegistry();
_taskMetricsEmitter.runTask(null);
- Assert.assertEquals(metricsRegistry.allMetrics().size(), 20);
+ Assert.assertEquals(metricsRegistry.allMetrics().size(), 29);
Assert.assertTrue(metricsRegistry.allMetrics().containsKey(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.onlineMinionInstances")));
@@ -160,6 +172,9 @@ public class TaskMetricsEmitterTest {
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksError.taskType1"))
.getMetric()).value(), 1L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksDropped.taskType1"))
+ .getMetric()).value(), 0L);
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.percentMinionSubtasksInQueue.taskType1"))
.getMetric()).value(), 50L);
@@ -179,6 +194,10 @@ public class TaskMetricsEmitterTest {
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksError.table1_OFFLINE.taskType1"))
.getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
+
"pinot.controller.numMinionSubtasksDropped.table1_OFFLINE.taskType1"))
+ .getMetric()).value(), 0L);
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.percentMinionSubtasksInQueue.table1_OFFLINE.taskType1"))
@@ -200,6 +219,10 @@ public class TaskMetricsEmitterTest {
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.numMinionSubtasksError.table2_OFFLINE.taskType1"))
.getMetric()).value(), 1L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
+
"pinot.controller.numMinionSubtasksDropped.table2_OFFLINE.taskType1"))
+ .getMetric()).value(), 0L);
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.percentMinionSubtasksInQueue.table2_OFFLINE.taskType1"))
@@ -231,7 +254,7 @@ public class TaskMetricsEmitterTest {
PinotMetricsRegistry metricsRegistry =
_controllerMetrics.getMetricsRegistry();
_taskMetricsEmitter.runTask(null);
- Assert.assertEquals(metricsRegistry.allMetrics().size(), 14);
+ Assert.assertEquals(metricsRegistry.allMetrics().size(), 20);
Assert.assertTrue(metricsRegistry.allMetrics().containsKey(
new YammerMetricName(ControllerMetrics.class,
"pinot.controller.onlineMinionInstances")));
@@ -251,6 +274,10 @@ public class TaskMetricsEmitterTest {
new YammerMetricName(ControllerMetrics.class,
String.format("pinot.controller.numMinionSubtasksError.%s",
taskType)))
.getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
+ String.format("pinot.controller.numMinionSubtasksDropped.%s",
taskType)))
+ .getMetric()).value(), 0L);
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
String.format("pinot.controller.percentMinionSubtasksInQueue.%s", taskType)))
@@ -272,6 +299,10 @@ public class TaskMetricsEmitterTest {
new YammerMetricName(ControllerMetrics.class,
String.format("pinot.controller.numMinionSubtasksError.%s.%s",
tableName, taskType)))
.getMetric()).value(), 0L);
+ Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
+ new YammerMetricName(ControllerMetrics.class,
+
String.format("pinot.controller.numMinionSubtasksDropped.%s.%s", tableName,
taskType)))
+ .getMetric()).value(), 0L);
Assert.assertEquals(((YammerSettableGauge<?>)
metricsRegistry.allMetrics().get(
new YammerMetricName(ControllerMetrics.class,
String.format("pinot.controller.percentMinionSubtasksInQueue.%s.%s", tableName,
taskType)))
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]