This is an automated email from the ASF dual-hosted git repository.
mlbiscoc pushed a commit to branch feature/SOLR-17458
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/feature/SOLR-17458 by this
push:
new 41ed55349fe SOLR-17806: Migrate PeerSync and PeerSyncWithLeader
metrics to OTEL (#3440)
41ed55349fe is described below
commit 41ed55349fe2e6efd356be40cc19664616215692
Author: Matthew Biscocho <[email protected]>
AuthorDate: Tue Jul 29 16:05:05 2025 -0400
SOLR-17806: Migrate PeerSync and PeerSyncWithLeader metrics to OTEL (#3440)
* Migrate peersync metrics to OTEL
* metric utils rename
---
.../src/java/org/apache/solr/update/PeerSync.java | 38 +++++++++++++-------
.../org/apache/solr/update/PeerSyncWithLeader.java | 38 ++++++++++++++------
.../apache/solr/cloud/PeerSyncReplicationTest.java | 40 +++++++++++++++-------
.../org/apache/solr/cloud/TestCloudRecovery.java | 35 ++++++++-----------
.../apache/solr/metrics/SolrMetricTestUtils.java | 9 +++--
5 files changed, 102 insertions(+), 58 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java
b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index a045e88958a..3e201746f53 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -21,8 +21,6 @@ import static org.apache.solr.common.params.CommonParams.ID;
import static
org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase.FROMLEADER;
import static
org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Timer;
import com.google.common.annotations.VisibleForTesting;
import io.opentelemetry.api.common.Attributes;
import java.io.IOException;
@@ -51,6 +49,8 @@ import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.metrics.SolrMetricProducer;
import org.apache.solr.metrics.SolrMetricsContext;
+import org.apache.solr.metrics.otel.instruments.AttributedLongCounter;
+import org.apache.solr.metrics.otel.instruments.AttributedLongTimer;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
@@ -90,9 +90,9 @@ public class PeerSync implements SolrMetricProducer {
private MissedUpdatesFinder missedUpdatesFinder;
// metrics
- private Timer syncTime;
- private Counter syncErrors;
- private Counter syncSkipped;
+ private AttributedLongTimer syncTime;
+ private AttributedLongCounter syncErrors;
+ private AttributedLongCounter syncSkipped;
private SolrMetricsContext solrMetricsContext;
// comparator that sorts by absolute value, putting highest first
@@ -143,14 +143,28 @@ public class PeerSync implements SolrMetricProducer {
return solrMetricsContext;
}
- // TODO SOLR-17458: Migrate to Otel
@Override
public void initializeMetrics(
SolrMetricsContext parentContext, Attributes attributes, String scope) {
this.solrMetricsContext = parentContext.getChildContext(this);
- syncTime = solrMetricsContext.timer("time", scope, METRIC_SCOPE);
- syncErrors = solrMetricsContext.counter("errors", scope, METRIC_SCOPE);
- syncSkipped = solrMetricsContext.counter("skipped", scope, METRIC_SCOPE);
+ var baseAttributes =
+ attributes.toBuilder()
+ .put("category", SolrInfoBean.Category.REPLICATION.toString())
+ .build();
+ syncErrors =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_peer_sync_errors", "Total number of sync errors
with peer"),
+ baseAttributes);
+ syncSkipped =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_peer_sync_skipped", "Total number of skipped syncs
with peer"),
+ baseAttributes);
+ syncTime =
+ new AttributedLongTimer(
+ solrMetricsContext.longHistogram("solr_core_peer_sync_time", "Peer
sync times", "ms"),
+ baseAttributes);
}
public static long percentile(List<Long> arr, float frac) {
@@ -181,7 +195,7 @@ public class PeerSync implements SolrMetricProducer {
syncErrors.inc();
return PeerSyncResult.failure();
}
- Timer.Context timerContext = null;
+ AttributedLongTimer.MetricTimer timerContext = null;
try {
if (log.isInfoEnabled()) {
log.info("{} START replicas={} nUpdates={}", msg(), replicas,
nUpdates);
@@ -194,7 +208,7 @@ public class PeerSync implements SolrMetricProducer {
}
// measure only when actual sync is performed
- timerContext = syncTime.time();
+ timerContext = syncTime.start();
// Fire off the requests before getting our own recent updates (for
better concurrency)
// This also allows us to avoid getting updates we don't need... if we
got our updates and
@@ -273,7 +287,7 @@ public class PeerSync implements SolrMetricProducer {
return success ? PeerSyncResult.success() : PeerSyncResult.failure();
} finally {
if (timerContext != null) {
- timerContext.close();
+ timerContext.stop();
}
}
}
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
index f9acc0f0746..79713dd1d94 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
@@ -22,8 +22,6 @@ import static
org.apache.solr.update.PeerSync.MissedUpdatesRequest;
import static org.apache.solr.update.PeerSync.absComparator;
import static org.apache.solr.update.PeerSync.percentile;
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Timer;
import io.opentelemetry.api.common.Attributes;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
@@ -43,6 +41,8 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.metrics.SolrMetricProducer;
import org.apache.solr.metrics.SolrMetricsContext;
+import org.apache.solr.metrics.otel.instruments.AttributedLongCounter;
+import org.apache.solr.metrics.otel.instruments.AttributedLongTimer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -67,9 +67,9 @@ public class PeerSyncWithLeader implements SolrMetricProducer
{
private Set<Long> bufferedUpdates;
// metrics
- private Timer syncTime;
- private Counter syncErrors;
- private Counter syncSkipped;
+ private AttributedLongTimer syncTime;
+ private AttributedLongCounter syncErrors;
+ private AttributedLongCounter syncSkipped;
private SolrMetricsContext solrMetricsContext;
public PeerSyncWithLeader(SolrCore core, String leaderUrl, int nUpdates) {
@@ -103,9 +103,25 @@ public class PeerSyncWithLeader implements
SolrMetricProducer {
public void initializeMetrics(
SolrMetricsContext parentContext, Attributes attributes, String scope) {
this.solrMetricsContext = parentContext.getChildContext(this);
- syncTime = solrMetricsContext.timer("time", scope, METRIC_SCOPE);
- syncErrors = solrMetricsContext.counter("errors", scope, METRIC_SCOPE);
- syncSkipped = solrMetricsContext.counter("skipped", scope, METRIC_SCOPE);
+ var baseAttributes =
+ attributes.toBuilder()
+ .put("category", SolrInfoBean.Category.REPLICATION.toString())
+ .build();
+ syncErrors =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_sync_with_leader_errors", "Total number of sync
errors with leader"),
+ baseAttributes);
+ syncSkipped =
+ new AttributedLongCounter(
+ solrMetricsContext.longCounter(
+ "solr_core_sync_with_leader_skipped", "Total number of skipped
syncs with leader"),
+ baseAttributes);
+ syncTime =
+ new AttributedLongTimer(
+ solrMetricsContext.longHistogram(
+ "solr_core_sync_with_leader_time", "leader sync times", "ms"),
+ baseAttributes);
}
// start of peersync related debug messages. includes the core name for
correlation.
@@ -137,7 +153,7 @@ public class PeerSyncWithLeader implements
SolrMetricProducer {
return PeerSync.PeerSyncResult.failure();
}
- Timer.Context timerContext = null;
+ AttributedLongTimer.MetricTimer timerContext = null;
try {
if (log.isInfoEnabled()) {
log.info("{} START leader={} nUpdates={}", msg(), leaderUrl, nUpdates);
@@ -153,7 +169,7 @@ public class PeerSyncWithLeader implements
SolrMetricProducer {
}
// measure only when actual sync is performed
- timerContext = syncTime.time();
+ timerContext = syncTime.start();
List<Long> ourUpdates;
try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
@@ -197,7 +213,7 @@ public class PeerSyncWithLeader implements
SolrMetricProducer {
return success ? PeerSync.PeerSyncResult.success() :
PeerSync.PeerSyncResult.failure();
} finally {
if (timerContext != null) {
- timerContext.close();
+ timerContext.stop();
}
}
}
diff --git
a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
index 6695fcef951..cb0d85fabc6 100644
--- a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
@@ -20,8 +20,6 @@ package org.apache.solr.cloud;
import static java.util.Collections.singletonList;
import com.carrotsearch.randomizedtesting.generators.RandomStrings;
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricRegistry;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
@@ -32,7 +30,6 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
-import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
@@ -44,7 +41,9 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.SolrCore;
import org.apache.solr.metrics.SolrMetricManager;
+import org.apache.solr.metrics.SolrMetricTestUtils;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -186,15 +185,32 @@ public class PeerSyncReplicationTest extends
AbstractFullDistribZkTestBase {
}
}
assertNotNull(registry);
- Map<String, Metric> metrics = registry.getMetrics();
- assertTrue(
- "REPLICATION.peerSync.time present",
metrics.containsKey("REPLICATION.peerSync.time"));
- assertTrue(
- "REPLICATION.peerSync.errors present",
- metrics.containsKey("REPLICATION.peerSync.errors"));
-
- Counter counter = (Counter) metrics.get("REPLICATION.peerSync.errors");
- assertEquals(0L, counter.getCount());
+ CoreContainer cc = nodePeerSynced.jetty.getCoreContainer();
+ String coreName =
+ cc.getAllCoreNames().stream()
+ .filter(n -> n.contains(DEFAULT_TEST_COLLECTION_NAME))
+ .findFirst()
+ .orElseThrow(
+ () ->
+ new IllegalStateException(
+ "Couldn't find core for " +
nodePeerSynced.coreNodeName));
+ try (SolrCore core = cc.getCore(coreName)) {
+ assertTrue(
+ SolrMetricTestUtils.getHistogramDatapoint(
+ core,
+ "solr_core_leader_sync_time_milliseconds",
+ SolrMetricTestUtils.newCloudLabelsBuilder(core)
+ .label("category", "REPLICATION")
+ .build())
+ .hasCount());
+ assertNull(
+ SolrMetricTestUtils.getCounterDatapoint(
+ core,
+ "solr_core_leader_sync_errors",
+ SolrMetricTestUtils.newCloudLabelsBuilder(core)
+ .label("category", "REPLICATION")
+ .build()));
+ }
success = true;
} finally {
System.clearProperty("solr.disableFingerprint");
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
index d64c62b5ab5..3562b4bec46 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
@@ -17,17 +17,12 @@
package org.apache.solr.cloud;
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Metric;
-import com.codahale.metrics.Timer;
import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
-import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@@ -35,9 +30,10 @@ import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.cloud.ClusterStateUtil;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.embedded.JettySolrRunner;
-import org.apache.solr.metrics.SolrMetricManager;
+import org.apache.solr.metrics.SolrMetricTestUtils;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.util.TestInjection;
@@ -140,22 +136,19 @@ public class TestCloudRecovery extends SolrCloudTestCase {
// check metrics
long replicationCount = 0;
- long errorsCount = 0;
- long skippedCount = 0;
for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
- SolrMetricManager manager = jetty.getCoreContainer().getMetricManager();
- List<String> registryNames =
- manager.registryNames().stream()
- .filter(s -> s.startsWith("solr.core."))
- .collect(Collectors.toList());
- for (String registry : registryNames) {
- Map<String, Metric> metrics = manager.registry(registry).getMetrics();
- Timer timer = (Timer) metrics.get("REPLICATION.peerSync.time");
- Counter counter = (Counter) metrics.get("REPLICATION.peerSync.errors");
- Counter skipped = (Counter)
metrics.get("REPLICATION.peerSync.skipped");
- replicationCount += timer.getCount();
- errorsCount += counter.getCount();
- skippedCount += skipped.getCount();
+ CoreContainer cc = jetty.getCoreContainer();
+ for (String coreName : cc.getAllCoreNames()) {
+ try (SolrCore core = cc.getCore(coreName)) {
+ var replicationDatapoint =
+ SolrMetricTestUtils.getHistogramDatapoint(
+ core,
+ "solr_core_peer_sync_time_milliseconds",
+ SolrMetricTestUtils.newCloudLabelsBuilder(core)
+ .label("category", "REPLICATION")
+ .build());
+ replicationCount += (replicationDatapoint != null) ?
replicationDatapoint.getCount() : 0;
+ }
}
}
if (onlyLeaderIndexes) {
diff --git
a/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
b/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
index 479615ddf93..f7878f19c8e 100644
--- a/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
+++ b/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
@@ -22,6 +22,7 @@ import
io.opentelemetry.exporter.prometheus.PrometheusMetricReader;
import io.prometheus.metrics.model.snapshots.CounterSnapshot;
import io.prometheus.metrics.model.snapshots.DataPointSnapshot;
import io.prometheus.metrics.model.snapshots.GaugeSnapshot;
+import io.prometheus.metrics.model.snapshots.HistogramSnapshot;
import io.prometheus.metrics.model.snapshots.Labels;
import java.util.ArrayList;
import java.util.HashMap;
@@ -173,9 +174,7 @@ public final class SolrMetricTestUtils {
private static <T> T getDatapoint(
SolrCore core, String metricName, Labels labels, Class<T> snapshotType) {
-
var reader = getPrometheusMetricReader(core);
-
return snapshotType.cast(SolrMetricTestUtils.getDataPointSnapshot(reader,
metricName, labels));
}
@@ -188,4 +187,10 @@ public final class SolrMetricTestUtils {
SolrCore core, String metricName, Labels labels) {
return getDatapoint(core, metricName, labels,
CounterSnapshot.CounterDataPointSnapshot.class);
}
+
+ public static HistogramSnapshot.HistogramDataPointSnapshot
getHistogramDatapoint(
+ SolrCore core, String metricName, Labels labels) {
+ return getDatapoint(
+ core, metricName, labels,
HistogramSnapshot.HistogramDataPointSnapshot.class);
+ }
}