This is an automated email from the ASF dual-hosted git repository.
mlbiscoc pushed a commit to branch feature/SOLR-17458
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/feature/SOLR-17458 by this
push:
new 78c9c2f6305 SOLR-17806: Migrate Overseer and Blockcache metrics to
OTEL (#3607)
78c9c2f6305 is described below
commit 78c9c2f6305da5164b2458e1291f59bdcd9a328d
Author: Matthew Biscocho <[email protected]>
AuthorDate: Mon Sep 8 11:37:55 2025 -0400
SOLR-17806: Migrate Overseer and Blockcache metrics to OTEL (#3607)
* Migrate Overseer and Blockcache metrics
* Single close
---------
Co-authored-by: Matthew Biscocho <[email protected]>
---
.../java/org/apache/solr/blockcache/Metrics.java | 109 +++++++++++++--------
.../src/java/org/apache/solr/cloud/Overseer.java | 48 +++++++--
.../apache/solr/cloud/OverseerTaskProcessor.java | 44 ++++++++-
.../java/org/apache/solr/core/SolrInfoBean.java | 1 +
.../org/apache/solr/metrics/SolrMetricManager.java | 27 ++++-
.../apache/solr/metrics/SolrMetricsContext.java | 9 ++
.../apache/solr/blockcache/BufferStoreTest.java | 59 ++++++-----
7 files changed, 215 insertions(+), 82 deletions(-)
diff --git a/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
b/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
index dcdb4550d8c..98c9b2dbef6 100644
--- a/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
+++ b/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
@@ -17,11 +17,9 @@
package org.apache.solr.blockcache;
import io.opentelemetry.api.common.Attributes;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;
+import org.apache.solr.common.util.IOUtils;
import org.apache.solr.core.SolrInfoBean;
-import org.apache.solr.metrics.MetricsMap;
import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.solr.search.SolrCacheBase;
@@ -50,19 +48,31 @@ public class Metrics extends SolrCacheBase implements
SolrInfoBean {
public AtomicLong shardBuffercacheAllocate = new AtomicLong(0);
public AtomicLong shardBuffercacheLost = new AtomicLong(0);
- private MetricsMap metricsMap;
- private Set<String> metricNames = ConcurrentHashMap.newKeySet();
private SolrMetricsContext solrMetricsContext;
private long previous = System.nanoTime();
- // TODO SOLR-17458: Migrate to Otel
+ private AutoCloseable toClose;
+
@Override
public void initializeMetrics(
SolrMetricsContext parentContext, Attributes attributes, String scope) {
solrMetricsContext = parentContext.getChildContext(this);
- metricsMap =
- new MetricsMap(
- map -> {
+ var baseAttributes =
+ attributes.toBuilder().put(CATEGORY_ATTR,
getCategory().toString()).build();
+ var blockcacheStats =
+ solrMetricsContext.longMeasurement("solr_block_cache_stats", "Block
cache stats");
+ var hitRatio =
+ solrMetricsContext.doubleMeasurement("solr_block_cache_hit_ratio",
"Block cache hit ratio");
+ var perSecStats =
+ solrMetricsContext.doubleMeasurement(
+ "solr_block_cache_stats_per_second", "Block cache per second
stats");
+ var bufferCacheStats =
+ solrMetricsContext.doubleMeasurement(
+ "solr_buffer_cache_stats", "Buffer cache per second stats");
+
+ this.toClose =
+ solrMetricsContext.batchCallback(
+ () -> {
long now = System.nanoTime();
long delta = Math.max(now - previous, 1);
double seconds = delta / 1000000000.0;
@@ -86,44 +96,63 @@ public class Metrics extends SolrCacheBase implements
SolrInfoBean {
long lookups_delta = hits_delta + miss_delta;
long lookups_total = hits_total + miss_total;
- map.put("size", blockCacheSize.get());
- map.put("lookups", lookups_total);
- map.put("hits", hits_total);
- map.put("evictions", evict_total);
- map.put("storeFails", storeFail_total);
- map.put(
- "hitratio_current",
- calcHitRatio(lookups_delta, hits_delta)); // hit ratio since
the last call
- map.put(
- "lookups_persec",
- getPerSecond(lookups_delta, seconds)); // lookups per second
since the last call
- map.put(
- "hits_persec",
- getPerSecond(hits_delta, seconds)); // hits per second since
the last call
- map.put(
- "evictions_persec",
- getPerSecond(evict_delta, seconds)); // evictions per second
since the last call
- map.put(
- "storeFails_persec",
- getPerSecond(
- storeFail_delta, seconds)); // evictions per second
since the last call
- map.put("time_delta", seconds); // seconds since last call
-
- // TODO: these aren't really related to the BlockCache
- map.put(
- "buffercache.allocations",
- getPerSecond(shardBuffercacheAllocate.getAndSet(0),
seconds));
- map.put("buffercache.lost",
getPerSecond(shardBuffercacheLost.getAndSet(0), seconds));
-
+ blockcacheStats.record(
+ blockCacheSize.get(),
baseAttributes.toBuilder().put(TYPE_ATTR, "size").build());
+ blockcacheStats.record(
+ lookups_total, baseAttributes.toBuilder().put(TYPE_ATTR,
"lookups").build());
+ blockcacheStats.record(
+ hits_total, baseAttributes.toBuilder().put(TYPE_ATTR,
"hits").build());
+ blockcacheStats.record(
+ hits_total, baseAttributes.toBuilder().put(TYPE_ATTR,
"evictions").build());
+ blockcacheStats.record(
+ storeFail_total,
+ baseAttributes.toBuilder().put(TYPE_ATTR,
"store_fails").build());
+ perSecStats.record(
+ getPerSecond(lookups_delta, seconds),
+ baseAttributes.toBuilder()
+ .put(TYPE_ATTR, "lookups")
+ .build()); // lookups per second since the last call
+ perSecStats.record(
+ getPerSecond(hits_delta, seconds),
+ baseAttributes.toBuilder()
+ .put(TYPE_ATTR, "hits")
+ .build()); // hits per second since the last call
+ perSecStats.record(
+ getPerSecond(evict_delta, seconds),
+ baseAttributes.toBuilder()
+ .put(TYPE_ATTR, "evictions")
+ .build()); // evictions per second since the last call
+ perSecStats.record(
+ getPerSecond(storeFail_delta, seconds),
+ baseAttributes.toBuilder()
+ .put(TYPE_ATTR, "store_fails")
+ .build()); // evictions per second since the last call
+ hitRatio.record(
+ calcHitRatio(lookups_delta, hits_delta),
+ baseAttributes); // hit ratio since the last call
+ bufferCacheStats.record(
+ getPerSecond(shardBuffercacheAllocate.getAndSet(0), seconds),
+ baseAttributes.toBuilder().put(TYPE_ATTR,
"allocations").build());
+ bufferCacheStats.record(
+ getPerSecond(shardBuffercacheLost.getAndSet(0), seconds),
+ baseAttributes.toBuilder().put(TYPE_ATTR, "lost").build());
previous = now;
- });
- solrMetricsContext.gauge(metricsMap, true, getName(),
getCategory().toString(), scope);
+ },
+ blockcacheStats,
+ perSecStats,
+ hitRatio,
+ bufferCacheStats);
}
private float getPerSecond(long value, double seconds) {
return (float) (value / seconds);
}
+ @Override
+ public void close() {
+ IOUtils.closeQuietly(toClose);
+ }
+
// SolrInfoBean methods
@Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index b1a9f72c1f7..51007b736ec 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
import static org.apache.solr.common.params.CommonParams.ID;
import com.codahale.metrics.Timer;
+import io.opentelemetry.api.common.Attributes;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
@@ -167,7 +168,7 @@ public class Overseer implements SolrCloseable {
*
* <p>The cluster state updater is a single thread dequeueing and executing
requests.
*/
- private class ClusterStateUpdater implements Runnable, Closeable {
+ private class ClusterStateUpdater implements SolrInfoBean, Runnable,
Closeable {
private final ZkStateReader reader;
private final SolrZkClient zkClient;
@@ -195,6 +196,8 @@ public class Overseer implements SolrCloseable {
private boolean isClosed = false;
+ private AutoCloseable toClose;
+
public ClusterStateUpdater(
final ZkStateReader reader,
final String myId,
@@ -213,12 +216,27 @@ public class Overseer implements SolrCloseable {
this.minStateByteLenForCompression = minStateByteLenForCompression;
this.compressor = compressor;
- clusterStateUpdaterMetricContext =
solrMetricsContext.getChildContext(this);
- clusterStateUpdaterMetricContext.gauge(
- () -> stateUpdateQueue.getZkStats().getQueueLength(),
- true,
- "stateUpdateQueueSize",
- "queue");
+ this.clusterStateUpdaterMetricContext =
solrMetricsContext.getChildContext(this);
+ initializeMetrics(
+ solrMetricsContext, Attributes.of(CATEGORY_ATTR,
getCategory().toString()), "");
+ }
+
+ @Override
+ public void initializeMetrics(
+ SolrMetricsContext parentContext, Attributes attributes, String scope)
{
+ this.toClose =
+ parentContext.observableLongGauge(
+ "solr_overseer_state_update_queue_size",
+ "Size of overseer's update queue",
+ (observableLongMeasurement) -> {
+ observableLongMeasurement.record(
+ stateUpdateQueue.getZkStats().getQueueLength(),
attributes);
+ });
+ }
+
+ @Override
+ public SolrMetricsContext getSolrMetricsContext() {
+ return clusterStateUpdaterMetricContext;
}
public Stats getStateUpdateQueueStats() {
@@ -641,8 +659,24 @@ public class Overseer implements SolrCloseable {
@Override
public void close() {
this.isClosed = true;
+ IOUtils.closeQuietly(toClose);
clusterStateUpdaterMetricContext.unregister();
}
+
+ @Override
+ public String getName() {
+ return this.getClass().getName();
+ }
+
+ @Override
+ public String getDescription() {
+ return "Cluster leader responsible for processing state updates";
+ }
+
+ @Override
+ public Category getCategory() {
+ return Category.OVERSEER;
+ }
}
public static class OverseerThread extends Thread implements Closeable {
diff --git
a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
index 3751d850f09..e29df029de7 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
@@ -20,6 +20,7 @@ import static
org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.ID;
import com.codahale.metrics.Timer;
+import io.opentelemetry.api.common.Attributes;
import java.io.Closeable;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
@@ -44,6 +45,7 @@ import org.apache.solr.common.util.IOUtils;
import org.apache.solr.common.util.SolrNamedThreadFactory;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.metrics.SolrMetricsContext;
import org.apache.zookeeper.KeeperException;
@@ -59,7 +61,7 @@ import org.slf4j.LoggerFactory;
* <p>An {@link OverseerMessageHandlerSelector} determines which {@link
OverseerMessageHandler}
* handles specific messages in the queue.
*/
-public class OverseerTaskProcessor implements Runnable, Closeable {
+public class OverseerTaskProcessor implements SolrInfoBean, Runnable,
Closeable {
/** Maximum number of overseer collection operations which can be executed
concurrently */
public static final int MAX_PARALLEL_TASKS = 100;
@@ -91,6 +93,7 @@ public class OverseerTaskProcessor implements Runnable,
Closeable {
private final Stats stats;
private final SolrMetricsContext overseerTaskProcessorMetricsContext;
+ private AutoCloseable toClose;
/**
* This map may contain tasks which are read from work queue but could not
be executed because
@@ -148,9 +151,9 @@ public class OverseerTaskProcessor implements Runnable,
Closeable {
this.runningTasks = new HashMap<>();
thisNode = MDCLoggingContext.getNodeName();
- overseerTaskProcessorMetricsContext =
solrMetricsContext.getChildContext(this);
- overseerTaskProcessorMetricsContext.gauge(
- () -> workQueue.getZkStats().getQueueLength(), true,
"collectionWorkQueueSize", "queue");
+ this.overseerTaskProcessorMetricsContext =
solrMetricsContext.getChildContext(this);
+ initializeMetrics(
+ solrMetricsContext, Attributes.of(CATEGORY_ATTR,
getCategory().toString()), "");
}
@Override
@@ -403,6 +406,23 @@ public class OverseerTaskProcessor implements Runnable,
Closeable {
runningTasks.entrySet().removeIf(e -> e.getValue().isDone());
}
+ @Override
+ public void initializeMetrics(
+ SolrMetricsContext parentContext, Attributes attributes, String scope) {
+ this.toClose =
+ parentContext.observableLongGauge(
+ "solr_overseer_collection_work_queue_size",
+ "Size of overseer's collection work queue",
+ (observableLongMeasurement) -> {
+
observableLongMeasurement.record(workQueue.getZkStats().getQueueLength(),
attributes);
+ });
+ }
+
+ @Override
+ public SolrMetricsContext getSolrMetricsContext() {
+ return overseerTaskProcessorMetricsContext;
+ }
+
@Override
public void close() {
isClosed = true;
@@ -413,6 +433,7 @@ public class OverseerTaskProcessor implements Runnable,
Closeable {
}
}
IOUtils.closeQuietly(selector);
+ IOUtils.closeQuietly(toClose);
}
public static List<String> getSortedOverseerNodeNames(SolrZkClient zk)
@@ -673,6 +694,21 @@ public class OverseerTaskProcessor implements Runnable,
Closeable {
return myId;
}
+ @Override
+ public String getName() {
+ return this.getClass().getName();
+ }
+
+ @Override
+ public String getDescription() {
+ return "Processor in Overseer handling items added to a distributed work
queue";
+ }
+
+ @Override
+ public Category getCategory() {
+ return Category.OVERSEER;
+ }
+
/**
* An interface to determine which {@link OverseerMessageHandler} handles a
given message. This
* could be a single OverseerMessageHandler for the case where a single type
of message is handled
diff --git a/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
b/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
index d62f51e439e..0689edb0d84 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
@@ -39,6 +39,7 @@ public interface SolrInfoBean extends SolrMetricProducer {
DIRECTORY,
HTTP,
SECURITY,
+ OVERSEER,
OTHER
}
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
index 4ddc6443f80..e179306eaae 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
@@ -358,11 +358,6 @@ public class SolrMetricManager {
return builder.buildWithCallback(callback);
}
- ObservableLongMeasurement longMeasurement(
- String registry, String gaugeName, String description, OtelUnit unit) {
- return longGaugeBuilder(registry, gaugeName, description,
unit).buildObserver();
- }
-
BatchCallback batchCallback(
String registry,
Runnable callback,
@@ -373,6 +368,16 @@ public class SolrMetricManager {
.batchCallback(callback, measurement, additionalMeasurements);
}
+ ObservableLongMeasurement longMeasurement(
+ String registry, String gaugeName, String description, OtelUnit unit) {
+ return longGaugeBuilder(registry, gaugeName, description,
unit).buildObserver();
+ }
+
+ ObservableDoubleMeasurement doubleMeasurement(
+ String registry, String gaugeName, String description, OtelUnit unit) {
+ return doubleGaugeBuilder(registry, gaugeName, description,
unit).buildObserver();
+ }
+
private LongGaugeBuilder longGaugeBuilder(
String registry, String gaugeName, String description, OtelUnit unit) {
LongGaugeBuilder builder =
@@ -386,6 +391,18 @@ public class SolrMetricManager {
return builder;
}
+ private DoubleGaugeBuilder doubleGaugeBuilder(
+ String registry, String gaugeName, String description, OtelUnit unit) {
+ DoubleGaugeBuilder builder =
+ meterProvider(registry)
+ .get(OTEL_SCOPE_NAME)
+ .gaugeBuilder(gaugeName)
+ .setDescription(description);
+ if (unit != null) builder.setUnit(unit.getSymbol());
+
+ return builder;
+ }
+
// for unit tests
public MetricRegistry.MetricSupplier<Counter> getCounterSupplier() {
return counterSupplier;
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
b/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
index 14b6affa475..696ace6aa0d 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
@@ -285,6 +285,15 @@ public class SolrMetricsContext {
return metricManager.longMeasurement(registryName, metricName,
description, unit);
}
+ public ObservableDoubleMeasurement doubleMeasurement(String metricName,
String description) {
+ return doubleMeasurement(metricName, description, null);
+ }
+
+ public ObservableDoubleMeasurement doubleMeasurement(
+ String metricName, String description, OtelUnit unit) {
+ return metricManager.doubleMeasurement(registryName, metricName,
description, unit);
+ }
+
public BatchCallback batchCallback(
Runnable callback,
ObservableMeasurement measurement,
diff --git a/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
b/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
index 46bc29685ca..a628b9263a7 100644
--- a/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
+++ b/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
@@ -17,11 +17,10 @@
package org.apache.solr.blockcache;
import io.opentelemetry.api.common.Attributes;
-import java.math.BigDecimal;
-import java.util.Map;
+import io.prometheus.metrics.model.snapshots.GaugeSnapshot;
+import io.prometheus.metrics.model.snapshots.Labels;
import org.apache.lucene.tests.util.TestUtil;
import org.apache.solr.SolrTestCase;
-import org.apache.solr.metrics.MetricsMap;
import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.metrics.SolrMetricsContext;
import org.junit.After;
@@ -32,27 +31,19 @@ public class BufferStoreTest extends SolrTestCase {
private static final int blockSize = 1024;
private Metrics metrics;
- private MetricsMap metricsMap;
+ private SolrMetricManager metricManager;
+ private String registry;
private Store store;
@Before
public void setup() {
metrics = new Metrics();
- SolrMetricManager metricManager = new SolrMetricManager();
- String registry = TestUtil.randomSimpleString(random(), 2, 10);
+ metricManager = new SolrMetricManager();
+ registry = TestUtil.randomSimpleString(random(), 2, 10);
String scope = TestUtil.randomSimpleString(random(), 2, 10);
SolrMetricsContext solrMetricsContext = new
SolrMetricsContext(metricManager, registry, "foo");
- // TODO SOLR-17458: Fix test later
metrics.initializeMetrics(solrMetricsContext, Attributes.empty(), scope);
- metricsMap =
- (MetricsMap)
- ((SolrMetricManager.GaugeWrapper)
- metricManager
- .registry(registry)
- .getMetrics()
- .get("CACHE." + scope + ".hdfsBlockCache"))
- .getGauge();
BufferStore.initNewBuffer(blockSize, blockSize, metrics);
store = BufferStore.instance(blockSize);
}
@@ -99,19 +90,35 @@ public class BufferStoreTest extends SolrTestCase {
* @param lost whether buffers should have been lost since the last call
*/
private void assertGaugeMetricsChanged(boolean allocated, boolean lost) {
- Map<String, Object> stats = metricsMap.getValue();
-
- assertEquals(
- "Buffer allocation metric not updating correctly.",
- allocated,
- isMetricPositive(stats, "buffercache.allocations"));
+ var gauge =
+ metricManager.getPrometheusMetricReader(registry).collect().stream()
+ .filter(ms ->
ms.getMetadata().getPrometheusName().equals("solr_buffer_cache_stats"))
+ .map(GaugeSnapshot.class::cast)
+ .findFirst()
+ .orElseThrow(() -> new AssertionError("Missing gauge metric
solr_buffer_cache_stats"));
+
+ var actualAllocatedValue = getBufferCacheStatsValue(gauge, "allocations");
+ var actualLostValue = getBufferCacheStatsValue(gauge, "lost");
assertEquals(
- "Buffer lost metric not updating correctly.",
- lost,
- isMetricPositive(stats, "buffercache.lost"));
+ "Buffer allocation metric not updating correctly.", allocated,
actualAllocatedValue > 0);
+ assertEquals("Buffer allocation metric not updating correctly.", lost,
actualLostValue > 0);
}
- private boolean isMetricPositive(Map<String, Object> stats, String metric) {
- return new
BigDecimal(stats.get(metric).toString()).compareTo(BigDecimal.ZERO) > 0;
+ private Double getBufferCacheStatsValue(GaugeSnapshot gaugeSnapshot, String
type) {
+ return gaugeSnapshot.getDataPoints().stream()
+ .filter(
+ (dp) ->
+ dp.getLabels()
+ .equals(
+ Labels.of(
+ "category",
+ "CACHE",
+ "otel_scope_name",
+ "org.apache.solr",
+ "type",
+ type)))
+ .findFirst()
+ .orElseThrow(() -> new AssertionError("Missing type=" + type + " label
on metric"))
+ .getValue();
}
}