This is an automated email from the ASF dual-hosted git repository.

mlbiscoc pushed a commit to branch feature/SOLR-17458
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/feature/SOLR-17458 by this 
push:
     new 41ed55349fe SOLR-17806: Migrate PeerSync and PeerSyncWithLeader 
metrics to OTEL (#3440)
41ed55349fe is described below

commit 41ed55349fe2e6efd356be40cc19664616215692
Author: Matthew Biscocho <[email protected]>
AuthorDate: Tue Jul 29 16:05:05 2025 -0400

    SOLR-17806: Migrate PeerSync and PeerSyncWithLeader metrics to OTEL (#3440)
    
    * Migrate peersync metrics to OTEL
    
    * metric utils rename
---
 .../src/java/org/apache/solr/update/PeerSync.java  | 38 +++++++++++++-------
 .../org/apache/solr/update/PeerSyncWithLeader.java | 38 ++++++++++++++------
 .../apache/solr/cloud/PeerSyncReplicationTest.java | 40 +++++++++++++++-------
 .../org/apache/solr/cloud/TestCloudRecovery.java   | 35 ++++++++-----------
 .../apache/solr/metrics/SolrMetricTestUtils.java   |  9 +++--
 5 files changed, 102 insertions(+), 58 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/update/PeerSync.java 
b/solr/core/src/java/org/apache/solr/update/PeerSync.java
index a045e88958a..3e201746f53 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSync.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSync.java
@@ -21,8 +21,6 @@ import static org.apache.solr.common.params.CommonParams.ID;
 import static 
org.apache.solr.update.processor.DistributedUpdateProcessor.DistribPhase.FROMLEADER;
 import static 
org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
 
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Timer;
 import com.google.common.annotations.VisibleForTesting;
 import io.opentelemetry.api.common.Attributes;
 import java.io.IOException;
@@ -51,6 +49,8 @@ import org.apache.solr.handler.component.ShardRequest;
 import org.apache.solr.handler.component.ShardResponse;
 import org.apache.solr.metrics.SolrMetricProducer;
 import org.apache.solr.metrics.SolrMetricsContext;
+import org.apache.solr.metrics.otel.instruments.AttributedLongCounter;
+import org.apache.solr.metrics.otel.instruments.AttributedLongTimer;
 import org.apache.solr.request.LocalSolrQueryRequest;
 import org.apache.solr.request.SolrQueryRequest;
 import org.apache.solr.response.SolrQueryResponse;
@@ -90,9 +90,9 @@ public class PeerSync implements SolrMetricProducer {
   private MissedUpdatesFinder missedUpdatesFinder;
 
   // metrics
-  private Timer syncTime;
-  private Counter syncErrors;
-  private Counter syncSkipped;
+  private AttributedLongTimer syncTime;
+  private AttributedLongCounter syncErrors;
+  private AttributedLongCounter syncSkipped;
   private SolrMetricsContext solrMetricsContext;
 
   // comparator that sorts by absolute value, putting highest first
@@ -143,14 +143,28 @@ public class PeerSync implements SolrMetricProducer {
     return solrMetricsContext;
   }
 
-  // TODO SOLR-17458: Migrate to Otel
   @Override
   public void initializeMetrics(
       SolrMetricsContext parentContext, Attributes attributes, String scope) {
     this.solrMetricsContext = parentContext.getChildContext(this);
-    syncTime = solrMetricsContext.timer("time", scope, METRIC_SCOPE);
-    syncErrors = solrMetricsContext.counter("errors", scope, METRIC_SCOPE);
-    syncSkipped = solrMetricsContext.counter("skipped", scope, METRIC_SCOPE);
+    var baseAttributes =
+        attributes.toBuilder()
+            .put("category", SolrInfoBean.Category.REPLICATION.toString())
+            .build();
+    syncErrors =
+        new AttributedLongCounter(
+            solrMetricsContext.longCounter(
+                "solr_core_peer_sync_errors", "Total number of sync errors 
with peer"),
+            baseAttributes);
+    syncSkipped =
+        new AttributedLongCounter(
+            solrMetricsContext.longCounter(
+                "solr_core_peer_sync_skipped", "Total number of skipped syncs 
with peer"),
+            baseAttributes);
+    syncTime =
+        new AttributedLongTimer(
+            solrMetricsContext.longHistogram("solr_core_peer_sync_time", "Peer 
sync times", "ms"),
+            baseAttributes);
   }
 
   public static long percentile(List<Long> arr, float frac) {
@@ -181,7 +195,7 @@ public class PeerSync implements SolrMetricProducer {
       syncErrors.inc();
       return PeerSyncResult.failure();
     }
-    Timer.Context timerContext = null;
+    AttributedLongTimer.MetricTimer timerContext = null;
     try {
       if (log.isInfoEnabled()) {
         log.info("{} START replicas={} nUpdates={}", msg(), replicas, 
nUpdates);
@@ -194,7 +208,7 @@ public class PeerSync implements SolrMetricProducer {
       }
 
       // measure only when actual sync is performed
-      timerContext = syncTime.time();
+      timerContext = syncTime.start();
 
       // Fire off the requests before getting our own recent updates (for 
better concurrency)
       // This also allows us to avoid getting updates we don't need... if we 
got our updates and
@@ -273,7 +287,7 @@ public class PeerSync implements SolrMetricProducer {
       return success ? PeerSyncResult.success() : PeerSyncResult.failure();
     } finally {
       if (timerContext != null) {
-        timerContext.close();
+        timerContext.stop();
       }
     }
   }
diff --git a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java 
b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
index f9acc0f0746..79713dd1d94 100644
--- a/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
+++ b/solr/core/src/java/org/apache/solr/update/PeerSyncWithLeader.java
@@ -22,8 +22,6 @@ import static 
org.apache.solr.update.PeerSync.MissedUpdatesRequest;
 import static org.apache.solr.update.PeerSync.absComparator;
 import static org.apache.solr.update.PeerSync.percentile;
 
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Timer;
 import io.opentelemetry.api.common.Attributes;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
@@ -43,6 +41,8 @@ import org.apache.solr.core.SolrCore;
 import org.apache.solr.core.SolrInfoBean;
 import org.apache.solr.metrics.SolrMetricProducer;
 import org.apache.solr.metrics.SolrMetricsContext;
+import org.apache.solr.metrics.otel.instruments.AttributedLongCounter;
+import org.apache.solr.metrics.otel.instruments.AttributedLongTimer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -67,9 +67,9 @@ public class PeerSyncWithLeader implements SolrMetricProducer 
{
   private Set<Long> bufferedUpdates;
 
   // metrics
-  private Timer syncTime;
-  private Counter syncErrors;
-  private Counter syncSkipped;
+  private AttributedLongTimer syncTime;
+  private AttributedLongCounter syncErrors;
+  private AttributedLongCounter syncSkipped;
   private SolrMetricsContext solrMetricsContext;
 
   public PeerSyncWithLeader(SolrCore core, String leaderUrl, int nUpdates) {
@@ -103,9 +103,25 @@ public class PeerSyncWithLeader implements 
SolrMetricProducer {
   public void initializeMetrics(
       SolrMetricsContext parentContext, Attributes attributes, String scope) {
     this.solrMetricsContext = parentContext.getChildContext(this);
-    syncTime = solrMetricsContext.timer("time", scope, METRIC_SCOPE);
-    syncErrors = solrMetricsContext.counter("errors", scope, METRIC_SCOPE);
-    syncSkipped = solrMetricsContext.counter("skipped", scope, METRIC_SCOPE);
+    var baseAttributes =
+        attributes.toBuilder()
+            .put("category", SolrInfoBean.Category.REPLICATION.toString())
+            .build();
+    syncErrors =
+        new AttributedLongCounter(
+            solrMetricsContext.longCounter(
+                "solr_core_sync_with_leader_errors", "Total number of sync 
errors with leader"),
+            baseAttributes);
+    syncSkipped =
+        new AttributedLongCounter(
+            solrMetricsContext.longCounter(
+                "solr_core_sync_with_leader_skipped", "Total number of skipped 
syncs with leader"),
+            baseAttributes);
+    syncTime =
+        new AttributedLongTimer(
+            solrMetricsContext.longHistogram(
+                "solr_core_sync_with_leader_time", "leader sync times", "ms"),
+            baseAttributes);
   }
 
   // start of peersync related debug messages.  includes the core name for 
correlation.
@@ -137,7 +153,7 @@ public class PeerSyncWithLeader implements 
SolrMetricProducer {
       return PeerSync.PeerSyncResult.failure();
     }
 
-    Timer.Context timerContext = null;
+    AttributedLongTimer.MetricTimer timerContext = null;
     try {
       if (log.isInfoEnabled()) {
         log.info("{} START leader={} nUpdates={}", msg(), leaderUrl, nUpdates);
@@ -153,7 +169,7 @@ public class PeerSyncWithLeader implements 
SolrMetricProducer {
       }
 
       // measure only when actual sync is performed
-      timerContext = syncTime.time();
+      timerContext = syncTime.start();
 
       List<Long> ourUpdates;
       try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
@@ -197,7 +213,7 @@ public class PeerSyncWithLeader implements 
SolrMetricProducer {
       return success ? PeerSync.PeerSyncResult.success() : 
PeerSync.PeerSyncResult.failure();
     } finally {
       if (timerContext != null) {
-        timerContext.close();
+        timerContext.stop();
       }
     }
   }
diff --git 
a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java 
b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
index 6695fcef951..cb0d85fabc6 100644
--- a/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/PeerSyncReplicationTest.java
@@ -20,8 +20,6 @@ package org.apache.solr.cloud;
 import static java.util.Collections.singletonList;
 
 import com.carrotsearch.randomizedtesting.generators.RandomStrings;
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Metric;
 import com.codahale.metrics.MetricRegistry;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
@@ -32,7 +30,6 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.stream.Collectors;
@@ -44,7 +41,9 @@ import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.SolrCore;
 import org.apache.solr.metrics.SolrMetricManager;
+import org.apache.solr.metrics.SolrMetricTestUtils;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -186,15 +185,32 @@ public class PeerSyncReplicationTest extends 
AbstractFullDistribZkTestBase {
         }
       }
       assertNotNull(registry);
-      Map<String, Metric> metrics = registry.getMetrics();
-      assertTrue(
-          "REPLICATION.peerSync.time present", 
metrics.containsKey("REPLICATION.peerSync.time"));
-      assertTrue(
-          "REPLICATION.peerSync.errors present",
-          metrics.containsKey("REPLICATION.peerSync.errors"));
-
-      Counter counter = (Counter) metrics.get("REPLICATION.peerSync.errors");
-      assertEquals(0L, counter.getCount());
+      CoreContainer cc = nodePeerSynced.jetty.getCoreContainer();
+      String coreName =
+          cc.getAllCoreNames().stream()
+              .filter(n -> n.contains(DEFAULT_TEST_COLLECTION_NAME))
+              .findFirst()
+              .orElseThrow(
+                  () ->
+                      new IllegalStateException(
+                          "Couldn't find core for " + 
nodePeerSynced.coreNodeName));
+      try (SolrCore core = cc.getCore(coreName)) {
+        assertTrue(
+            SolrMetricTestUtils.getHistogramDatapoint(
+                    core,
+                    "solr_core_leader_sync_time_milliseconds",
+                    SolrMetricTestUtils.newCloudLabelsBuilder(core)
+                        .label("category", "REPLICATION")
+                        .build())
+                .hasCount());
+        assertNull(
+            SolrMetricTestUtils.getCounterDatapoint(
+                core,
+                "solr_core_leader_sync_errors",
+                SolrMetricTestUtils.newCloudLabelsBuilder(core)
+                    .label("category", "REPLICATION")
+                    .build()));
+      }
       success = true;
     } finally {
       System.clearProperty("solr.disableFingerprint");
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java 
b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
index d64c62b5ab5..3562b4bec46 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestCloudRecovery.java
@@ -17,17 +17,12 @@
 
 package org.apache.solr.cloud;
 
-import com.codahale.metrics.Counter;
-import com.codahale.metrics.Metric;
-import com.codahale.metrics.Timer;
 import java.io.FileOutputStream;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import org.apache.solr.client.solrj.impl.CloudSolrClient;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
@@ -35,9 +30,10 @@ import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.common.cloud.ClusterStateUtil;
 import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.ModifiableSolrParams;
+import org.apache.solr.core.CoreContainer;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.embedded.JettySolrRunner;
-import org.apache.solr.metrics.SolrMetricManager;
+import org.apache.solr.metrics.SolrMetricTestUtils;
 import org.apache.solr.update.UpdateLog;
 import org.apache.solr.update.UpdateShardHandler;
 import org.apache.solr.util.TestInjection;
@@ -140,22 +136,19 @@ public class TestCloudRecovery extends SolrCloudTestCase {
 
     // check metrics
     long replicationCount = 0;
-    long errorsCount = 0;
-    long skippedCount = 0;
     for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
-      SolrMetricManager manager = jetty.getCoreContainer().getMetricManager();
-      List<String> registryNames =
-          manager.registryNames().stream()
-              .filter(s -> s.startsWith("solr.core."))
-              .collect(Collectors.toList());
-      for (String registry : registryNames) {
-        Map<String, Metric> metrics = manager.registry(registry).getMetrics();
-        Timer timer = (Timer) metrics.get("REPLICATION.peerSync.time");
-        Counter counter = (Counter) metrics.get("REPLICATION.peerSync.errors");
-        Counter skipped = (Counter) 
metrics.get("REPLICATION.peerSync.skipped");
-        replicationCount += timer.getCount();
-        errorsCount += counter.getCount();
-        skippedCount += skipped.getCount();
+      CoreContainer cc = jetty.getCoreContainer();
+      for (String coreName : cc.getAllCoreNames()) {
+        try (SolrCore core = cc.getCore(coreName)) {
+          var replicationDatapoint =
+              SolrMetricTestUtils.getHistogramDatapoint(
+                  core,
+                  "solr_core_peer_sync_time_milliseconds",
+                  SolrMetricTestUtils.newCloudLabelsBuilder(core)
+                      .label("category", "REPLICATION")
+                      .build());
+          replicationCount += (replicationDatapoint != null) ? 
replicationDatapoint.getCount() : 0;
+        }
       }
     }
     if (onlyLeaderIndexes) {
diff --git 
a/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java 
b/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
index 479615ddf93..f7878f19c8e 100644
--- a/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
+++ b/solr/core/src/test/org/apache/solr/metrics/SolrMetricTestUtils.java
@@ -22,6 +22,7 @@ import 
io.opentelemetry.exporter.prometheus.PrometheusMetricReader;
 import io.prometheus.metrics.model.snapshots.CounterSnapshot;
 import io.prometheus.metrics.model.snapshots.DataPointSnapshot;
 import io.prometheus.metrics.model.snapshots.GaugeSnapshot;
+import io.prometheus.metrics.model.snapshots.HistogramSnapshot;
 import io.prometheus.metrics.model.snapshots.Labels;
 import java.util.ArrayList;
 import java.util.HashMap;
@@ -173,9 +174,7 @@ public final class SolrMetricTestUtils {
 
   private static <T> T getDatapoint(
       SolrCore core, String metricName, Labels labels, Class<T> snapshotType) {
-
     var reader = getPrometheusMetricReader(core);
-
     return snapshotType.cast(SolrMetricTestUtils.getDataPointSnapshot(reader, 
metricName, labels));
   }
 
@@ -188,4 +187,10 @@ public final class SolrMetricTestUtils {
       SolrCore core, String metricName, Labels labels) {
     return getDatapoint(core, metricName, labels, 
CounterSnapshot.CounterDataPointSnapshot.class);
   }
+
+  public static HistogramSnapshot.HistogramDataPointSnapshot 
getHistogramDatapoint(
+      SolrCore core, String metricName, Labels labels) {
+    return getDatapoint(
+        core, metricName, labels, 
HistogramSnapshot.HistogramDataPointSnapshot.class);
+  }
 }

Reply via email to