This is an automated email from the ASF dual-hosted git repository.
mlbiscoc pushed a commit to branch feature/SOLR-17458
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/feature/SOLR-17458 by this
push:
new 90b92cccc1c SOLR-17806: Migrate UpdateLog metrics to OTEL (#3438)
90b92cccc1c is described below
commit 90b92cccc1c8dd31b38418677fc8e5c24cfb32dc
Author: Matthew Biscocho <[email protected]>
AuthorDate: Wed Aug 6 10:33:08 2025 -0400
SOLR-17806: Migrate UpdateLog metrics to OTEL (#3438)
* Init migration
* Better test cleanup
* Change to OPERATION_ATTR
* Clean up switch
---
.../src/java/org/apache/solr/update/UpdateLog.java | 118 ++++++++++++++-------
.../org/apache/solr/cloud/TestTlogReplica.java | 13 ++-
.../test/org/apache/solr/search/TestRecovery.java | 82 +++++++++-----
3 files changed, 140 insertions(+), 73 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
index e3f5423292f..4eee008c151 100644
--- a/solr/core/src/java/org/apache/solr/update/UpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/UpdateLog.java
@@ -21,8 +21,7 @@ import static
org.apache.solr.update.processor.DistributingUpdateProcessorFactor
import com.carrotsearch.hppc.LongHashSet;
import com.carrotsearch.hppc.LongSet;
-import com.codahale.metrics.Gauge;
-import com.codahale.metrics.Meter;
+import io.opentelemetry.api.common.AttributeKey;
import io.opentelemetry.api.common.Attributes;
import java.io.Closeable;
import java.io.IOException;
@@ -80,6 +79,7 @@ import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.core.SolrPaths;
import org.apache.solr.metrics.SolrMetricProducer;
import org.apache.solr.metrics.SolrMetricsContext;
+import org.apache.solr.metrics.otel.instruments.AttributedLongCounter;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrRequestInfo;
@@ -264,10 +264,9 @@ public class UpdateLog implements PluginInfoInitialized,
SolrMetricProducer {
protected List<Long> startingVersions;
// metrics
- protected Gauge<Integer> bufferedOpsGauge;
- protected Meter applyingBufferedOpsMeter;
- protected Meter replayOpsMeter;
- protected Meter copyOverOldUpdatesMeter;
+ protected AttributedLongCounter applyingBufferedOpsCounter;
+ protected AttributedLongCounter replayOpsCounter;
+ protected AttributedLongCounter copyOverOldUpdatesCounter;
protected SolrMetricsContext solrMetricsContext;
public static class LogPtr {
@@ -627,40 +626,65 @@ public class UpdateLog implements PluginInfoInitialized,
SolrMetricProducer {
}
}
- // TODO SOLR-17458: Migrate to Otel
@Override
public void initializeMetrics(
SolrMetricsContext parentContext, Attributes attributes, String scope) {
solrMetricsContext = parentContext.getChildContext(this);
- bufferedOpsGauge =
- () -> {
- if (state == State.BUFFERING) {
- if (bufferTlog == null) return 0;
- // numRecords counts header as a record
- return bufferTlog.numRecords() - 1;
- }
- if (tlog == null) {
- return 0;
- } else if (state == State.APPLYING_BUFFERED) {
- // numRecords counts header as a record
- return tlog.numRecords()
- - 1
- - recoveryInfo.adds
- - recoveryInfo.deleteByQuery
- - recoveryInfo.deletes
- - recoveryInfo.errors.get();
- } else {
- return 0;
- }
- };
- solrMetricsContext.gauge(bufferedOpsGauge, true, "ops", scope, "buffered");
- solrMetricsContext.gauge(() -> logs.size(), true, "logs", scope, "replay",
"remaining");
- solrMetricsContext.gauge(() -> getTotalLogsSize(), true, "bytes", scope,
"replay", "remaining");
- applyingBufferedOpsMeter = solrMetricsContext.meter("ops", scope,
"applyingBuffered");
- replayOpsMeter = solrMetricsContext.meter("ops", scope, "replay");
- copyOverOldUpdatesMeter = solrMetricsContext.meter("ops", scope,
"copyOverOldUpdates");
- solrMetricsContext.gauge(() -> state.getValue(), true, "state", scope);
+ // NOCOMMIT: We do not need a scope attribute
+ // Will need to fix up SolrCoreMetricManager instead of directly removing
it from builder.
+ var baseAttributes =
+ attributes.toBuilder()
+ .remove(AttributeKey.stringKey("scope"))
+ .put(CATEGORY_ATTR, SolrInfoBean.Category.TLOG.toString())
+ .build();
+
+ solrMetricsContext.observableLongGauge(
+ "solr_core_update_log_buffered_ops",
+ "The current number of buffered operations",
+ (observableLongMeasurement ->
+ observableLongMeasurement.record(computeBufferedOps(),
baseAttributes)));
+
+ solrMetricsContext.observableLongGauge(
+ "solr_core_update_log_replay_logs_remaining",
+ "The current number of tlogs remaining to be replayed",
+ (observableLongMeasurement -> {
+ observableLongMeasurement.record(logs.size(), baseAttributes);
+ }));
+
+ solrMetricsContext.observableLongGauge(
+ "solr_core_update_log_size_remaining",
+ "The total size in bytes of all tlogs remaining to be replayed",
+ (observableLongMeasurement -> {
+ observableLongMeasurement.record(getTotalLogsSize(), baseAttributes);
+ }),
+ "By");
+
+ // NOCOMMIT: Do we want to keep this? Metric was just state with the
numeric enum value.
+ // Without context this doesn't mean anything and can be very confusing.
Maybe keep the numeric
+ // value and put the value meaning in the
+ // description?
+ // solrMetricsContext.gauge(() -> state.getValue(), true, "state",
scope);
+
+ applyingBufferedOpsCounter =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_update_log_applied_buffered_ops",
+ "Total number of buffered operations applied"),
+ baseAttributes);
+
+ replayOpsCounter =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_update_log_replay_ops", "Total number of log replay
operations"),
+ baseAttributes);
+
+ copyOverOldUpdatesCounter =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_update_log_old_updates_copied",
+ "Total number of updates copied from previous tlog or last
tlog to a new tlog"),
+ baseAttributes);
}
@Override
@@ -668,6 +692,24 @@ public class UpdateLog implements PluginInfoInitialized,
SolrMetricProducer {
return solrMetricsContext;
}
+ private long computeBufferedOps() {
+ return switch (state) {
+ // numRecords counts header as a record
+ case BUFFERING -> (bufferTlog == null ? 0 : bufferTlog.numRecords() - 1);
+ case APPLYING_BUFFERED -> {
+ if (tlog == null) yield 0;
+ // numRecords counts header as a record
+ yield tlog.numRecords()
+ - 1
+ - recoveryInfo.adds
+ - recoveryInfo.deleteByQuery
+ - recoveryInfo.deletes
+ - recoveryInfo.errors.get();
+ }
+ default -> 0;
+ };
+ }
+
/**
* Returns a new {@link org.apache.solr.update.TransactionLog}. Sub-classes
can override this
* method to change the implementation of the transaction log.
@@ -1506,7 +1548,7 @@ public class UpdateLog implements PluginInfoInitialized,
SolrMetricProducer {
* over
*/
public void copyOverOldUpdates(long commitVersion, TransactionLog oldTlog) {
- copyOverOldUpdatesMeter.mark();
+ copyOverOldUpdatesCounter.inc();
SolrQueryRequest req = new LocalSolrQueryRequest(uhandler.core, new
ModifiableSolrParams());
TransactionLog.LogReader logReader = null;
@@ -2252,9 +2294,9 @@ public class UpdateLog implements PluginInfoInitialized,
SolrMetricProducer {
throw rsp.getException();
}
if (state == State.REPLAYING) {
- replayOpsMeter.mark();
+ replayOpsCounter.inc();
} else if (state == State.APPLYING_BUFFERED) {
- applyingBufferedOpsMeter.mark();
+ applyingBufferedOpsCounter.inc();
} else {
// XXX should not happen?
}
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
index 89dc89aa646..b7b8de30958 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestTlogReplica.java
@@ -19,7 +19,6 @@ package org.apache.solr.cloud;
import static org.apache.solr.cloud.TestPullReplica.getHypotheticalTlogDir;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
-import com.codahale.metrics.Meter;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Files;
@@ -1239,11 +1238,11 @@ public class TestTlogReplica extends SolrCloudTestCase {
}
private long getTimesCopyOverOldUpdates(SolrCore core) {
- return ((Meter)
- core.getSolrMetricsContext()
- .getMetricRegistry()
- .getMetrics()
- .get("TLOG.copyOverOldUpdates.ops"))
- .getCount();
+ var metric =
+ SolrMetricTestUtils.getCounterDatapoint(
+ core,
+ "solr_core_update_log_old_updates_copied",
+ SolrMetricTestUtils.newCloudLabelsBuilder(core).label("category",
"TLOG").build());
+ return (metric != null) ? (long) metric.getValue() : 0L;
}
}
diff --git a/solr/core/src/test/org/apache/solr/search/TestRecovery.java
b/solr/core/src/test/org/apache/solr/search/TestRecovery.java
index ec2dead4da4..b4ad7161251 100644
--- a/solr/core/src/test/org/apache/solr/search/TestRecovery.java
+++ b/solr/core/src/test/org/apache/solr/search/TestRecovery.java
@@ -20,8 +20,6 @@ import static
org.apache.solr.search.TestRecovery.VersionProvider.getNextVersion
import static
org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
import static org.apache.solr.util.SolrMatchers.subListMatches;
-import com.codahale.metrics.Gauge;
-import com.codahale.metrics.Meter;
import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricRegistry;
import java.io.RandomAccessFile;
@@ -46,6 +44,7 @@ import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.metrics.SolrMetricManager;
+import org.apache.solr.metrics.SolrMetricTestUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.UpdateHandler;
@@ -230,8 +229,6 @@ public class TestRecovery extends SolrTestCaseJ4 {
h.close();
createCore();
- Map<String, Metric> metrics = getMetrics(); // live map view
-
// Solr should kick this off now
// h.getCore().getUpdateHandler().getUpdateLog().recoverFromLog();
@@ -244,18 +241,27 @@ public class TestRecovery extends SolrTestCaseJ4 {
assertEquals(
UpdateLog.State.REPLAYING,
h.getCore().getUpdateHandler().getUpdateLog().getState());
+
+ var attributes =
+ SolrMetricTestUtils.newStandaloneLabelsBuilder(h.getCore())
+ .label("category", "TLOG")
+ .build();
+
// check metrics
- @SuppressWarnings({"unchecked"})
- Gauge<Integer> state = (Gauge<Integer>) metrics.get("TLOG.state");
- assertEquals(UpdateLog.State.REPLAYING.ordinal(),
state.getValue().intValue());
- @SuppressWarnings({"unchecked"})
- Gauge<Integer> replayingLogs = (Gauge<Integer>)
metrics.get("TLOG.replay.remaining.logs");
- assertTrue(replayingLogs.getValue() > 0);
- @SuppressWarnings({"unchecked"})
- Gauge<Long> replayingDocs = (Gauge<Long>)
metrics.get("TLOG.replay.remaining.bytes");
- assertTrue(replayingDocs.getValue() > 0);
- Meter replayDocs = (Meter) metrics.get("TLOG.replay.ops");
- long initialOps = replayDocs.getCount();
+ // NOCOMMIT: Need to decide if we are keeping this state metric or not
+ // @SuppressWarnings({"unchecked"})
+ // Gauge<Integer> state = (Gauge<Integer>)
metrics.get("TLOG.state");
+ // assertEquals(UpdateLog.State.REPLAYING.ordinal(),
state.getValue().intValue());
+
+ var actualReplayingLogs =
+ SolrMetricTestUtils.getGaugeDatapoint(
+ h.getCore(), "solr_core_update_log_replay_logs_remaining",
attributes);
+ assertTrue(actualReplayingLogs.getValue() > 0);
+
+ var actualReplayingDocs =
+ SolrMetricTestUtils.getGaugeDatapoint(
+ h.getCore(), "solr_core_update_log_size_remaining_bytes",
attributes);
+ assertTrue(actualReplayingDocs.getValue() > 0);
// unblock recovery
logReplay.release(1000);
@@ -271,8 +277,14 @@ public class TestRecovery extends SolrTestCaseJ4 {
assertJQ(req("q", "*:*"), "/response/numFound==3");
- assertEquals(7L, replayDocs.getCount() - initialOps);
- assertEquals(UpdateLog.State.ACTIVE.ordinal(),
state.getValue().intValue());
+ var actualReplayOps =
+ SolrMetricTestUtils.getCounterDatapoint(
+ h.getCore(), "solr_core_update_log_replay_ops", attributes)
+ .getValue();
+ assertEquals(7.0, actualReplayOps, 0.0);
+
+ // NOCOMMIT: Need to decide if we are keeping this state metric or not
+ // assertEquals(UpdateLog.State.ACTIVE.ordinal(),
state.getValue().intValue());
// make sure we can still access versions after recovery
assertJQ(req("qt", "/get", "getVersions", "" + versions.size()),
"/versions==" + versions);
@@ -662,14 +674,16 @@ public class TestRecovery extends SolrTestCaseJ4 {
ulog.bufferUpdates();
assertEquals(UpdateLog.State.BUFFERING, ulog.getState());
- @SuppressWarnings({"unchecked"})
- Gauge<Integer> state = (Gauge<Integer>) metrics.get("TLOG.state");
- assertEquals(UpdateLog.State.BUFFERING.ordinal(),
state.getValue().intValue());
- @SuppressWarnings({"unchecked"})
- Gauge<Integer> bufferedOps = (Gauge<Integer>)
metrics.get("TLOG.buffered.ops");
- int initialOps = bufferedOps.getValue();
- Meter applyingBuffered = (Meter)
metrics.get("TLOG.applyingBuffered.ops");
- long initialApplyingOps = applyingBuffered.getCount();
+
+ var attributes =
+ SolrMetricTestUtils.newStandaloneLabelsBuilder(h.getCore())
+ .label("category", "TLOG")
+ .build();
+
+ // NOCOMMIT: Need to decide if we are keeping this state metric or not
+ // @SuppressWarnings({"unchecked"})
+ // Gauge<Integer> state = (Gauge<Integer>)
metrics.get("TLOG.state");
+ // assertEquals(UpdateLog.State.BUFFERING.ordinal(),
state.getValue().intValue());
String v3 = getNextVersion();
String v940_del = "-" + getNextVersion();
@@ -722,7 +736,11 @@ public class TestRecovery extends SolrTestCaseJ4 {
// be bad for updates to be visible if we're just buffering.
assertJQ(req("qt", "/get", "id", "B3"), "=={'doc':null}");
- assertEquals(6, bufferedOps.getValue() - initialOps);
+ var actualBufferedOpsValue =
+ SolrMetricTestUtils.getGaugeDatapoint(
+ h.getCore(), "solr_core_update_log_buffered_ops", attributes)
+ .getValue();
+ assertEquals(6, actualBufferedOpsValue, 0.0);
rinfoFuture = ulog.applyBufferedUpdates();
assertNotNull(rinfoFuture);
@@ -734,7 +752,11 @@ public class TestRecovery extends SolrTestCaseJ4 {
UpdateLog.RecoveryInfo rinfo = rinfoFuture.get();
assertEquals(UpdateLog.State.ACTIVE, ulog.getState());
- assertEquals(6L, applyingBuffered.getCount() - initialApplyingOps);
+ var actualAppliedBufferedOpsValue =
+ SolrMetricTestUtils.getCounterDatapoint(
+ h.getCore(), "solr_core_update_log_applied_buffered_ops",
attributes)
+ .getValue();
+ assertEquals(6, actualAppliedBufferedOpsValue, 0.0);
assertThatJQ(
req("qt", "/get", "getVersions", "6"),
@@ -868,7 +890,11 @@ public class TestRecovery extends SolrTestCaseJ4 {
assertEquals(
UpdateLog.State.ACTIVE, ulog.getState()); // leave each test method
in a good state
- assertEquals(0, bufferedOps.getValue().intValue());
+ actualBufferedOpsValue =
+ SolrMetricTestUtils.getGaugeDatapoint(
+ h.getCore(), "solr_core_update_log_buffered_ops", attributes)
+ .getValue();
+ assertEquals(0, actualBufferedOpsValue, 0.0);
} finally {
UpdateLog.testing_logReplayHook = null;
UpdateLog.testing_logReplayFinishHook = null;