This is an automated email from the ASF dual-hosted git repository.

mlbiscoc pushed a commit to branch feature/SOLR-17458
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/feature/SOLR-17458 by this 
push:
     new 78c9c2f6305 SOLR-17806: Migrate Overseer and Blockcache metrics to 
OTEL (#3607)
78c9c2f6305 is described below

commit 78c9c2f6305da5164b2458e1291f59bdcd9a328d
Author: Matthew Biscocho <[email protected]>
AuthorDate: Mon Sep 8 11:37:55 2025 -0400

    SOLR-17806: Migrate Overseer and Blockcache metrics to OTEL (#3607)
    
    * Migrate Overseer and Blockcache metrics
    
    * Single close
    
    ---------
    
    Co-authored-by: Matthew Biscocho <[email protected]>
---
 .../java/org/apache/solr/blockcache/Metrics.java   | 109 +++++++++++++--------
 .../src/java/org/apache/solr/cloud/Overseer.java   |  48 +++++++--
 .../apache/solr/cloud/OverseerTaskProcessor.java   |  44 ++++++++-
 .../java/org/apache/solr/core/SolrInfoBean.java    |   1 +
 .../org/apache/solr/metrics/SolrMetricManager.java |  27 ++++-
 .../apache/solr/metrics/SolrMetricsContext.java    |   9 ++
 .../apache/solr/blockcache/BufferStoreTest.java    |  59 ++++++-----
 7 files changed, 215 insertions(+), 82 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/blockcache/Metrics.java 
b/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
index dcdb4550d8c..98c9b2dbef6 100644
--- a/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
+++ b/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
@@ -17,11 +17,9 @@
 package org.apache.solr.blockcache;
 
 import io.opentelemetry.api.common.Attributes;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicLong;
+import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.core.SolrInfoBean;
-import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.SolrMetricsContext;
 import org.apache.solr.search.SolrCacheBase;
 
@@ -50,19 +48,31 @@ public class Metrics extends SolrCacheBase implements 
SolrInfoBean {
   public AtomicLong shardBuffercacheAllocate = new AtomicLong(0);
   public AtomicLong shardBuffercacheLost = new AtomicLong(0);
 
-  private MetricsMap metricsMap;
-  private Set<String> metricNames = ConcurrentHashMap.newKeySet();
   private SolrMetricsContext solrMetricsContext;
   private long previous = System.nanoTime();
 
-  // TODO SOLR-17458: Migrate to Otel
+  private AutoCloseable toClose;
+
   @Override
   public void initializeMetrics(
       SolrMetricsContext parentContext, Attributes attributes, String scope) {
     solrMetricsContext = parentContext.getChildContext(this);
-    metricsMap =
-        new MetricsMap(
-            map -> {
+    var baseAttributes =
+        attributes.toBuilder().put(CATEGORY_ATTR, 
getCategory().toString()).build();
+    var blockcacheStats =
+        solrMetricsContext.longMeasurement("solr_block_cache_stats", "Block 
cache stats");
+    var hitRatio =
+        solrMetricsContext.doubleMeasurement("solr_block_cache_hit_ratio", 
"Block cache hit ratio");
+    var perSecStats =
+        solrMetricsContext.doubleMeasurement(
+            "solr_block_cache_stats_per_second", "Block cache per second 
stats");
+    var bufferCacheStats =
+        solrMetricsContext.doubleMeasurement(
+            "solr_buffer_cache_stats", "Buffer cache per second stats");
+
+    this.toClose =
+        solrMetricsContext.batchCallback(
+            () -> {
               long now = System.nanoTime();
               long delta = Math.max(now - previous, 1);
               double seconds = delta / 1000000000.0;
@@ -86,44 +96,63 @@ public class Metrics extends SolrCacheBase implements 
SolrInfoBean {
               long lookups_delta = hits_delta + miss_delta;
               long lookups_total = hits_total + miss_total;
 
-              map.put("size", blockCacheSize.get());
-              map.put("lookups", lookups_total);
-              map.put("hits", hits_total);
-              map.put("evictions", evict_total);
-              map.put("storeFails", storeFail_total);
-              map.put(
-                  "hitratio_current",
-                  calcHitRatio(lookups_delta, hits_delta)); // hit ratio since 
the last call
-              map.put(
-                  "lookups_persec",
-                  getPerSecond(lookups_delta, seconds)); // lookups per second 
since the last call
-              map.put(
-                  "hits_persec",
-                  getPerSecond(hits_delta, seconds)); // hits per second since 
the last call
-              map.put(
-                  "evictions_persec",
-                  getPerSecond(evict_delta, seconds)); // evictions per second 
since the last call
-              map.put(
-                  "storeFails_persec",
-                  getPerSecond(
-                      storeFail_delta, seconds)); // evictions per second 
since the last call
-              map.put("time_delta", seconds); // seconds since last call
-
-              // TODO: these aren't really related to the BlockCache
-              map.put(
-                  "buffercache.allocations",
-                  getPerSecond(shardBuffercacheAllocate.getAndSet(0), 
seconds));
-              map.put("buffercache.lost", 
getPerSecond(shardBuffercacheLost.getAndSet(0), seconds));
-
+              blockcacheStats.record(
+                  blockCacheSize.get(), 
baseAttributes.toBuilder().put(TYPE_ATTR, "size").build());
+              blockcacheStats.record(
+                  lookups_total, baseAttributes.toBuilder().put(TYPE_ATTR, 
"lookups").build());
+              blockcacheStats.record(
+                  hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, 
"hits").build());
+              blockcacheStats.record(
+                  hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, 
"evictions").build());
+              blockcacheStats.record(
+                  storeFail_total,
+                  baseAttributes.toBuilder().put(TYPE_ATTR, 
"store_fails").build());
+              perSecStats.record(
+                  getPerSecond(lookups_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "lookups")
+                      .build()); // lookups per second since the last call
+              perSecStats.record(
+                  getPerSecond(hits_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "hits")
+                      .build()); // hits per second since the last call
+              perSecStats.record(
+                  getPerSecond(evict_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "evictions")
+                      .build()); // evictions per second since the last call
+              perSecStats.record(
+                  getPerSecond(storeFail_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "store_fails")
+                      .build()); // evictions per second since the last call
+              hitRatio.record(
+                  calcHitRatio(lookups_delta, hits_delta),
+                  baseAttributes); // hit ratio since the last call
+              bufferCacheStats.record(
+                  getPerSecond(shardBuffercacheAllocate.getAndSet(0), seconds),
+                  baseAttributes.toBuilder().put(TYPE_ATTR, 
"allocations").build());
+              bufferCacheStats.record(
+                  getPerSecond(shardBuffercacheLost.getAndSet(0), seconds),
+                  baseAttributes.toBuilder().put(TYPE_ATTR, "lost").build());
               previous = now;
-            });
-    solrMetricsContext.gauge(metricsMap, true, getName(), 
getCategory().toString(), scope);
+            },
+            blockcacheStats,
+            perSecStats,
+            hitRatio,
+            bufferCacheStats);
   }
 
   private float getPerSecond(long value, double seconds) {
     return (float) (value / seconds);
   }
 
+  @Override
+  public void close() {
+    IOUtils.closeQuietly(toClose);
+  }
+
   // SolrInfoBean methods
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java 
b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index b1a9f72c1f7..51007b736ec 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 import static org.apache.solr.common.params.CommonParams.ID;
 
 import com.codahale.metrics.Timer;
+import io.opentelemetry.api.common.Attributes;
 import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
@@ -167,7 +168,7 @@ public class Overseer implements SolrCloseable {
    *
    * <p>The cluster state updater is a single thread dequeueing and executing 
requests.
    */
-  private class ClusterStateUpdater implements Runnable, Closeable {
+  private class ClusterStateUpdater implements SolrInfoBean, Runnable, 
Closeable {
 
     private final ZkStateReader reader;
     private final SolrZkClient zkClient;
@@ -195,6 +196,8 @@ public class Overseer implements SolrCloseable {
 
     private boolean isClosed = false;
 
+    private AutoCloseable toClose;
+
     public ClusterStateUpdater(
         final ZkStateReader reader,
         final String myId,
@@ -213,12 +216,27 @@ public class Overseer implements SolrCloseable {
       this.minStateByteLenForCompression = minStateByteLenForCompression;
       this.compressor = compressor;
 
-      clusterStateUpdaterMetricContext = 
solrMetricsContext.getChildContext(this);
-      clusterStateUpdaterMetricContext.gauge(
-          () -> stateUpdateQueue.getZkStats().getQueueLength(),
-          true,
-          "stateUpdateQueueSize",
-          "queue");
+      this.clusterStateUpdaterMetricContext = 
solrMetricsContext.getChildContext(this);
+      initializeMetrics(
+          solrMetricsContext, Attributes.of(CATEGORY_ATTR, 
getCategory().toString()), "");
+    }
+
+    @Override
+    public void initializeMetrics(
+        SolrMetricsContext parentContext, Attributes attributes, String scope) 
{
+      this.toClose =
+          parentContext.observableLongGauge(
+              "solr_overseer_state_update_queue_size",
+              "Size of overseer's update queue",
+              (observableLongMeasurement) -> {
+                observableLongMeasurement.record(
+                    stateUpdateQueue.getZkStats().getQueueLength(), 
attributes);
+              });
+    }
+
+    @Override
+    public SolrMetricsContext getSolrMetricsContext() {
+      return clusterStateUpdaterMetricContext;
     }
 
     public Stats getStateUpdateQueueStats() {
@@ -641,8 +659,24 @@ public class Overseer implements SolrCloseable {
     @Override
     public void close() {
       this.isClosed = true;
+      IOUtils.closeQuietly(toClose);
       clusterStateUpdaterMetricContext.unregister();
     }
+
+    @Override
+    public String getName() {
+      return this.getClass().getName();
+    }
+
+    @Override
+    public String getDescription() {
+      return "Cluster leader responsible for processing state updates";
+    }
+
+    @Override
+    public Category getCategory() {
+      return Category.OVERSEER;
+    }
   }
 
   public static class OverseerThread extends Thread implements Closeable {
diff --git 
a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java 
b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
index 3751d850f09..e29df029de7 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
@@ -20,6 +20,7 @@ import static 
org.apache.solr.common.params.CommonAdminParams.ASYNC;
 import static org.apache.solr.common.params.CommonParams.ID;
 
 import com.codahale.metrics.Timer;
+import io.opentelemetry.api.common.Attributes;
 import java.io.Closeable;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
@@ -44,6 +45,7 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.SolrInfoBean;
 import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.metrics.SolrMetricsContext;
 import org.apache.zookeeper.KeeperException;
@@ -59,7 +61,7 @@ import org.slf4j.LoggerFactory;
  * <p>An {@link OverseerMessageHandlerSelector} determines which {@link 
OverseerMessageHandler}
  * handles specific messages in the queue.
  */
-public class OverseerTaskProcessor implements Runnable, Closeable {
+public class OverseerTaskProcessor implements SolrInfoBean, Runnable, 
Closeable {
 
   /** Maximum number of overseer collection operations which can be executed 
concurrently */
   public static final int MAX_PARALLEL_TASKS = 100;
@@ -91,6 +93,7 @@ public class OverseerTaskProcessor implements Runnable, 
Closeable {
 
   private final Stats stats;
   private final SolrMetricsContext overseerTaskProcessorMetricsContext;
+  private AutoCloseable toClose;
 
   /**
    * This map may contain tasks which are read from work queue but could not 
be executed because
@@ -148,9 +151,9 @@ public class OverseerTaskProcessor implements Runnable, 
Closeable {
     this.runningTasks = new HashMap<>();
     thisNode = MDCLoggingContext.getNodeName();
 
-    overseerTaskProcessorMetricsContext = 
solrMetricsContext.getChildContext(this);
-    overseerTaskProcessorMetricsContext.gauge(
-        () -> workQueue.getZkStats().getQueueLength(), true, 
"collectionWorkQueueSize", "queue");
+    this.overseerTaskProcessorMetricsContext = 
solrMetricsContext.getChildContext(this);
+    initializeMetrics(
+        solrMetricsContext, Attributes.of(CATEGORY_ATTR, 
getCategory().toString()), "");
   }
 
   @Override
@@ -403,6 +406,23 @@ public class OverseerTaskProcessor implements Runnable, 
Closeable {
     runningTasks.entrySet().removeIf(e -> e.getValue().isDone());
   }
 
+  @Override
+  public void initializeMetrics(
+      SolrMetricsContext parentContext, Attributes attributes, String scope) {
+    this.toClose =
+        parentContext.observableLongGauge(
+            "solr_overseer_collection_work_queue_size",
+            "Size of overseer's collection work queue",
+            (observableLongMeasurement) -> {
+              
observableLongMeasurement.record(workQueue.getZkStats().getQueueLength(), 
attributes);
+            });
+  }
+
+  @Override
+  public SolrMetricsContext getSolrMetricsContext() {
+    return overseerTaskProcessorMetricsContext;
+  }
+
   @Override
   public void close() {
     isClosed = true;
@@ -413,6 +433,7 @@ public class OverseerTaskProcessor implements Runnable, 
Closeable {
       }
     }
     IOUtils.closeQuietly(selector);
+    IOUtils.closeQuietly(toClose);
   }
 
   public static List<String> getSortedOverseerNodeNames(SolrZkClient zk)
@@ -673,6 +694,21 @@ public class OverseerTaskProcessor implements Runnable, 
Closeable {
     return myId;
   }
 
+  @Override
+  public String getName() {
+    return this.getClass().getName();
+  }
+
+  @Override
+  public String getDescription() {
+    return "Processor in Overseer handling items added to a distributed work 
queue";
+  }
+
+  @Override
+  public Category getCategory() {
+    return Category.OVERSEER;
+  }
+
   /**
    * An interface to determine which {@link OverseerMessageHandler} handles a 
given message. This
    * could be a single OverseerMessageHandler for the case where a single type 
of message is handled
diff --git a/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java 
b/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
index d62f51e439e..0689edb0d84 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
@@ -39,6 +39,7 @@ public interface SolrInfoBean extends SolrMetricProducer {
     DIRECTORY,
     HTTP,
     SECURITY,
+    OVERSEER,
     OTHER
   }
 
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java 
b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
index 4ddc6443f80..e179306eaae 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
@@ -358,11 +358,6 @@ public class SolrMetricManager {
     return builder.buildWithCallback(callback);
   }
 
-  ObservableLongMeasurement longMeasurement(
-      String registry, String gaugeName, String description, OtelUnit unit) {
-    return longGaugeBuilder(registry, gaugeName, description, 
unit).buildObserver();
-  }
-
   BatchCallback batchCallback(
       String registry,
       Runnable callback,
@@ -373,6 +368,16 @@ public class SolrMetricManager {
         .batchCallback(callback, measurement, additionalMeasurements);
   }
 
+  ObservableLongMeasurement longMeasurement(
+      String registry, String gaugeName, String description, OtelUnit unit) {
+    return longGaugeBuilder(registry, gaugeName, description, 
unit).buildObserver();
+  }
+
+  ObservableDoubleMeasurement doubleMeasurement(
+      String registry, String gaugeName, String description, OtelUnit unit) {
+    return doubleGaugeBuilder(registry, gaugeName, description, 
unit).buildObserver();
+  }
+
   private LongGaugeBuilder longGaugeBuilder(
       String registry, String gaugeName, String description, OtelUnit unit) {
     LongGaugeBuilder builder =
@@ -386,6 +391,18 @@ public class SolrMetricManager {
     return builder;
   }
 
+  private DoubleGaugeBuilder doubleGaugeBuilder(
+      String registry, String gaugeName, String description, OtelUnit unit) {
+    DoubleGaugeBuilder builder =
+        meterProvider(registry)
+            .get(OTEL_SCOPE_NAME)
+            .gaugeBuilder(gaugeName)
+            .setDescription(description);
+    if (unit != null) builder.setUnit(unit.getSymbol());
+
+    return builder;
+  }
+
   // for unit tests
   public MetricRegistry.MetricSupplier<Counter> getCounterSupplier() {
     return counterSupplier;
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java 
b/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
index 14b6affa475..696ace6aa0d 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
@@ -285,6 +285,15 @@ public class SolrMetricsContext {
     return metricManager.longMeasurement(registryName, metricName, 
description, unit);
   }
 
+  public ObservableDoubleMeasurement doubleMeasurement(String metricName, 
String description) {
+    return doubleMeasurement(metricName, description, null);
+  }
+
+  public ObservableDoubleMeasurement doubleMeasurement(
+      String metricName, String description, OtelUnit unit) {
+    return metricManager.doubleMeasurement(registryName, metricName, 
description, unit);
+  }
+
   public BatchCallback batchCallback(
       Runnable callback,
       ObservableMeasurement measurement,
diff --git a/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java 
b/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
index 46bc29685ca..a628b9263a7 100644
--- a/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
+++ b/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
@@ -17,11 +17,10 @@
 package org.apache.solr.blockcache;
 
 import io.opentelemetry.api.common.Attributes;
-import java.math.BigDecimal;
-import java.util.Map;
+import io.prometheus.metrics.model.snapshots.GaugeSnapshot;
+import io.prometheus.metrics.model.snapshots.Labels;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.solr.SolrTestCase;
-import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.SolrMetricManager;
 import org.apache.solr.metrics.SolrMetricsContext;
 import org.junit.After;
@@ -32,27 +31,19 @@ public class BufferStoreTest extends SolrTestCase {
   private static final int blockSize = 1024;
 
   private Metrics metrics;
-  private MetricsMap metricsMap;
+  private SolrMetricManager metricManager;
+  private String registry;
 
   private Store store;
 
   @Before
   public void setup() {
     metrics = new Metrics();
-    SolrMetricManager metricManager = new SolrMetricManager();
-    String registry = TestUtil.randomSimpleString(random(), 2, 10);
+    metricManager = new SolrMetricManager();
+    registry = TestUtil.randomSimpleString(random(), 2, 10);
     String scope = TestUtil.randomSimpleString(random(), 2, 10);
     SolrMetricsContext solrMetricsContext = new 
SolrMetricsContext(metricManager, registry, "foo");
-    // TODO SOLR-17458: Fix test later
     metrics.initializeMetrics(solrMetricsContext, Attributes.empty(), scope);
-    metricsMap =
-        (MetricsMap)
-            ((SolrMetricManager.GaugeWrapper)
-                    metricManager
-                        .registry(registry)
-                        .getMetrics()
-                        .get("CACHE." + scope + ".hdfsBlockCache"))
-                .getGauge();
     BufferStore.initNewBuffer(blockSize, blockSize, metrics);
     store = BufferStore.instance(blockSize);
   }
@@ -99,19 +90,35 @@ public class BufferStoreTest extends SolrTestCase {
    * @param lost whether buffers should have been lost since the last call
    */
   private void assertGaugeMetricsChanged(boolean allocated, boolean lost) {
-    Map<String, Object> stats = metricsMap.getValue();
-
-    assertEquals(
-        "Buffer allocation metric not updating correctly.",
-        allocated,
-        isMetricPositive(stats, "buffercache.allocations"));
+    var gauge =
+        metricManager.getPrometheusMetricReader(registry).collect().stream()
+            .filter(ms -> 
ms.getMetadata().getPrometheusName().equals("solr_buffer_cache_stats"))
+            .map(GaugeSnapshot.class::cast)
+            .findFirst()
+            .orElseThrow(() -> new AssertionError("Missing gauge metric 
solr_buffer_cache_stats"));
+
+    var actualAllocatedValue = getBufferCacheStatsValue(gauge, "allocations");
+    var actualLostValue = getBufferCacheStatsValue(gauge, "lost");
     assertEquals(
-        "Buffer lost metric not updating correctly.",
-        lost,
-        isMetricPositive(stats, "buffercache.lost"));
+        "Buffer allocation metric not updating correctly.", allocated, 
actualAllocatedValue > 0);
+    assertEquals("Buffer allocation metric not updating correctly.", lost, 
actualLostValue > 0);
   }
 
-  private boolean isMetricPositive(Map<String, Object> stats, String metric) {
-    return new 
BigDecimal(stats.get(metric).toString()).compareTo(BigDecimal.ZERO) > 0;
+  private Double getBufferCacheStatsValue(GaugeSnapshot gaugeSnapshot, String 
type) {
+    return gaugeSnapshot.getDataPoints().stream()
+        .filter(
+            (dp) ->
+                dp.getLabels()
+                    .equals(
+                        Labels.of(
+                            "category",
+                            "CACHE",
+                            "otel_scope_name",
+                            "org.apache.solr",
+                            "type",
+                            type)))
+        .findFirst()
+        .orElseThrow(() -> new AssertionError("Missing type=" + type + " label 
on metric"))
+        .getValue();
   }
 }

Reply via email to