This is an automated email from the ASF dual-hosted git repository. mlbiscoc pushed a commit to branch feature/SOLR-17458-rebased in repository https://gitbox.apache.org/repos/asf/solr.git
commit ed7f2d116ade9af13ec79ba2bea4de8cb610f225 Author: Matthew Biscocho <[email protected]> AuthorDate: Tue Jul 29 16:05:05 2025 -0400 SOLR-17806: Migrate PeerSync and PeerSyncWithLeader metrics to OTEL (#3440) * Migrate peersync metrics to OTEL * metric utils rename --- .../src/java/org/apache/solr/update/PeerSync.java | 38 +++++++++++++------- .../org/apache/solr/update/PeerSyncWithLeader.java | 38 ++++++++++++++------ .../apache/solr/cloud/PeerSyncReplicationTest.java | 40 +++++++++++++++------- .../org/apache/solr/cloud/TestCloudRecovery.java | 35 ++++++++----------- .../apache/solr/metrics/SolrMetricTestUtils.java | 9 +++-- 5 files changed, 102 insertions(+), 58 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java b/solr/core/src/java/org/apache/solr/update/PeerSync.java index 0036c856a6b..e4fa8e8bc39 100644 --- a/solr/core/src/java/org/apache/solr/update/PeerSync.java +++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java @@ -21,8 +21,6 @@ import static org.apache.solr.common.params.CommonParams.ID; import static org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase.FROMLEADER; import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM; -import com.codahale.metrics.Counter; -import com.codahale.metrics.Timer; import com.google.common.annotations.VisibleForTesting; import io.opentelemetry.api.common.Attributes; import java.io.IOException; @@ -51,6 +49,8 @@ import org.apache.solr.handler.component.ShardRequest; import org.apache.solr.handler.component.ShardResponse; import org.apache.solr.metrics.SolrMetricProducer; import org.apache.solr.metrics.SolrMetricsContext; +import org.apache.solr.metrics.otel.instruments.AttributedLongCounter; +import org.apache.solr.metrics.otel.instruments.AttributedLongTimer; import org.apache.solr.request.LocalSolrQueryRequest; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; @@ -90,9 +90,9 @@ public class PeerSync implements SolrMetricProducer { private MissedUpdatesFinder missedUpdatesFinder; // metrics - private Timer syncTime; - private Counter syncErrors; - private Counter syncSkipped; + private AttributedLongTimer syncTime; + private AttributedLongCounter syncErrors; + private AttributedLongCounter syncSkipped; private SolrMetricsContext solrMetricsContext; // comparator that sorts by absolute value, putting highest first @@ -144,14 +144,28 @@ public class PeerSync implements SolrMetricProducer { return solrMetricsContext; } - // TODO SOLR-17458: Migrate to Otel @Override public void initializeMetrics( SolrMetricsContext parentContext, Attributes attributes, String scope) { this.solrMetricsContext = parentContext.getChildContext(this); - syncTime = solrMetricsContext.timer("time", scope, METRIC_SCOPE); - syncErrors = solrMetricsContext.counter("errors", scope, METRIC_SCOPE); - syncSkipped = solrMetricsContext.counter("skipped", scope, METRIC_SCOPE); + var baseAttributes = + attributes.toBuilder() + .put("category", SolrInfoBean.Category.REPLICATION.toString()) + .build(); + syncErrors = + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_core_peer_sync_errors", "Total number of sync errors with peer"), + baseAttributes); + syncSkipped = + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_core_peer_sync_skipped", "Total number of skipped syncs with peer"), + baseAttributes); + syncTime = + new AttributedLongTimer( + solrMetricsContext.longHistogram("solr_core_peer_sync_time", "Peer sync times", "ms"), + baseAttributes); } public static long percentile(List<Long> arr, float frac) { @@ -182,7 +196,7 @@ public class PeerSync implements SolrMetricProducer { syncErrors.inc(); return PeerSyncResult.failure(); } - Timer.Context timerContext = null; + AttributedLongTimer.MetricTimer timerContext = null; try { if (log.isInfoEnabled()) { log.info("{} START replicas={} nUpdates={}", msg(), replicas, nUpdates); @@ -195,7 +209,7 @@ public class PeerSync implements SolrMetricProducer { } // measure only when actual sync is performed - timerContext = syncTime.time(); + timerContext = syncTime.start(); // Fire off the requests before getting our own recent updates (for better concurrency) // This also allows us to avoid getting updates we don't need... if we got our updates and @@ -274,7 +288,7 @@ public class PeerSync implements SolrMetricProducer { return success ? PeerSyncResult.success() : PeerSyncResult.failure(); } finally { if (timerContext != null) { - timerContext.close(); + timerContext.stop(); } } } diff --git a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java index d6630dc46b5..94c6252dbe1 100644 --- a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java +++ b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java @@ -22,8 +22,6 @@ import static org.apache.solr.update.PeerSync.MissedUpdatesRequest; import static org.apache.solr.update.PeerSync.absComparator; import static org.apache.solr.update.PeerSync.percentile; -import com.codahale.metrics.Counter; -import com.codahale.metrics.Timer; import io.opentelemetry.api.common.Attributes; import java.io.IOException; import java.lang.invoke.MethodHandles; @@ -43,6 +41,8 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrInfoBean; import org.apache.solr.metrics.SolrMetricProducer; import org.apache.solr.metrics.SolrMetricsContext; +import org.apache.solr.metrics.otel.instruments.AttributedLongCounter; +import org.apache.solr.metrics.otel.instruments.AttributedLongTimer; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,9 +67,9 @@ public class PeerSyncWithLeader implements SolrMetricProducer { private Set<Long> bufferedUpdates; // metrics - private Timer syncTime; - private Counter syncErrors; - private Counter syncSkipped; + private AttributedLongTimer syncTime; + private AttributedLongCounter syncErrors; + private AttributedLongCounter syncSkipped; private SolrMetricsContext solrMetricsContext; public PeerSyncWithLeader(SolrCore core, String leaderUrl, int nUpdates) { @@ -104,9 +104,25 @@ public class PeerSyncWithLeader implements SolrMetricProducer { public void initializeMetrics( SolrMetricsContext parentContext, Attributes attributes, String scope) { this.solrMetricsContext = parentContext.getChildContext(this); - syncTime = solrMetricsContext.timer("time", scope, METRIC_SCOPE); - syncErrors = solrMetricsContext.counter("errors", scope, METRIC_SCOPE); - syncSkipped = solrMetricsContext.counter("skipped", scope, METRIC_SCOPE); + var baseAttributes = + attributes.toBuilder() + .put("category", SolrInfoBean.Category.REPLICATION.toString()) + .build(); + syncErrors = + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_core_sync_with_leader_errors", "Total number of sync errors with leader"), + baseAttributes); + syncSkipped = + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_core_sync_with_leader_skipped", "Total number of skipped syncs with leader"), + baseAttributes); + syncTime = + new AttributedLongTimer( + solrMetricsContext.longHistogram( + "solr_core_sync_with_leader_time", "leader sync times", "ms"), + baseAttributes); } // start of peersync related debug messages. includes the core name for correlation. @@ -138,7 +154,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer { return PeerSync.PeerSyncResult.failure(); } - Timer.Context timerContext = null; + AttributedLongTimer.MetricTimer timerContext = null; try { if (log.isInfoEnabled()) { log.info("{} START leader={} nUpdates={}", msg(), leaderUrl, nUpdates); @@ -154,7 +170,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer { } // measure only when actual sync is performed - timerContext = syncTime.time(); + timerContext = syncTime.start(); List<Long> ourUpdates; try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) { @@ -198,7 +214,7 @@ public class PeerSyncWithLeader implements SolrMetricProducer { return success ? PeerSync.PeerSyncResult.success() : PeerSync.PeerSyncResult.failure(); } finally { if (timerContext != null) { - timerContext.close(); + timerContext.stop(); } } } diff --git a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java index 128423b736a..469f3f2cd31 100644 --- a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java +++ b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java @@ -20,8 +20,6 @@ package org.apache.solr.cloud; import static java.util.Collections.singletonList; import com.carrotsearch.randomizedtesting.generators.RandomStrings; -import com.codahale.metrics.Counter; -import com.codahale.metrics.Metric; import com.codahale.metrics.MetricRegistry; import java.io.IOException; import java.lang.invoke.MethodHandles; @@ -32,7 +30,6 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -44,7 +41,9 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.cloud.Replica; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.core.CoreContainer; +import org.apache.solr.core.SolrCore; import org.apache.solr.metrics.SolrMetricManager; +import org.apache.solr.metrics.SolrMetricTestUtils; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -186,15 +185,32 @@ public class PeerSyncReplicationTest extends AbstractFullDistribZkTestBase { } } assertNotNull(registry); - Map<String, Metric> metrics = registry.getMetrics(); - assertTrue( - "REPLICATION.peerSync.time present", metrics.containsKey("REPLICATION.peerSync.time")); - assertTrue( - "REPLICATION.peerSync.errors present", - metrics.containsKey("REPLICATION.peerSync.errors")); - - Counter counter = (Counter) metrics.get("REPLICATION.peerSync.errors"); - assertEquals(0L, counter.getCount()); + CoreContainer cc = nodePeerSynced.jetty.getCoreContainer(); + String coreName = + cc.getAllCoreNames().stream() + .filter(n -> n.contains(DEFAULT_TEST_COLLECTION_NAME)) + .findFirst() + .orElseThrow( + () -> + new IllegalStateException( + "Couldn't find core for " + nodePeerSynced.coreNodeName)); + try (SolrCore core = cc.getCore(coreName)) { + assertTrue( + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_core_leader_sync_time_milliseconds", + SolrMetricTestUtils.newCloudLabelsBuilder(core) + .label("category", "REPLICATION") + .build()) + .hasCount()); + assertNull( + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_core_leader_sync_errors", + SolrMetricTestUtils.newCloudLabelsBuilder(core) + .label("category", "REPLICATION") + .build())); + } success = true; } finally { System.clearProperty("solr.index.replication.fingerprint.enabled"); diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java index d64c62b5ab5..3562b4bec46 100644 --- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java +++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java @@ -17,17 +17,12 @@ package org.apache.solr.cloud; -import com.codahale.metrics.Counter; -import com.codahale.metrics.Metric; -import com.codahale.metrics.Timer; import java.io.FileOutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; -import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.request.CollectionAdminRequest; @@ -35,9 +30,10 @@ import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.cloud.ClusterStateUtil; import org.apache.solr.common.cloud.ZkStateReader; import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.core.CoreContainer; import org.apache.solr.core.SolrCore; import org.apache.solr.embedded.JettySolrRunner; -import org.apache.solr.metrics.SolrMetricManager; +import org.apache.solr.metrics.SolrMetricTestUtils; import org.apache.solr.update.UpdateLog; import org.apache.solr.update.UpdateShardHandler; import org.apache.solr.util.TestInjection; @@ -140,22 +136,19 @@ public class TestCloudRecovery extends SolrCloudTestCase { // check metrics long replicationCount = 0; - long errorsCount = 0; - long skippedCount = 0; for (JettySolrRunner jetty : cluster.getJettySolrRunners()) { - SolrMetricManager manager = jetty.getCoreContainer().getMetricManager(); - List<String> registryNames = - manager.registryNames().stream() - .filter(s -> s.startsWith("solr.core.")) - .collect(Collectors.toList()); - for (String registry : registryNames) { - Map<String, Metric> metrics = manager.registry(registry).getMetrics(); - Timer timer = (Timer) metrics.get("REPLICATION.peerSync.time"); - Counter counter = (Counter) metrics.get("REPLICATION.peerSync.errors"); - Counter skipped = (Counter) metrics.get("REPLICATION.peerSync.skipped"); - replicationCount += timer.getCount(); - errorsCount += counter.getCount(); - skippedCount += skipped.getCount(); + CoreContainer cc = jetty.getCoreContainer(); + for (String coreName : cc.getAllCoreNames()) { + try (SolrCore core = cc.getCore(coreName)) { + var replicationDatapoint = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_core_peer_sync_time_milliseconds", + SolrMetricTestUtils.newCloudLabelsBuilder(core) + .label("category", "REPLICATION") + .build()); + replicationCount += (replicationDatapoint != null) ? replicationDatapoint.getCount() : 0; + } } } if (onlyLeaderIndexes) { diff --git a/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java b/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java index 479615ddf93..f7878f19c8e 100644 --- a/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java +++ b/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java @@ -22,6 +22,7 @@ import io.opentelemetry.exporter.prometheus.PrometheusMetricReader; import io.prometheus.metrics.model.snapshots.CounterSnapshot; import io.prometheus.metrics.model.snapshots.DataPointSnapshot; import io.prometheus.metrics.model.snapshots.GaugeSnapshot; +import io.prometheus.metrics.model.snapshots.HistogramSnapshot; import io.prometheus.metrics.model.snapshots.Labels; import java.util.ArrayList; import java.util.HashMap; @@ -173,9 +174,7 @@ public final class SolrMetricTestUtils { private static <T> T getDatapoint( SolrCore core, String metricName, Labels labels, Class<T> snapshotType) { - var reader = getPrometheusMetricReader(core); - return snapshotType.cast(SolrMetricTestUtils.getDataPointSnapshot(reader, metricName, labels)); } @@ -188,4 +187,10 @@ public final class SolrMetricTestUtils { SolrCore core, String metricName, Labels labels) { return getDatapoint(core, metricName, labels, CounterSnapshot.CounterDataPointSnapshot.class); } + + public static HistogramSnapshot.HistogramDataPointSnapshot getHistogramDatapoint( + SolrCore core, String metricName, Labels labels) { + return getDatapoint( + core, metricName, labels, HistogramSnapshot.HistogramDataPointSnapshot.class); + } }
