This is an automated email from the ASF dual-hosted git repository.

mlbiscoc pushed a commit to branch feature/SOLR-17458-rebased
in repository https://gitbox.apache.org/repos/asf/solr.git
commit e6d4d446d757ae3c45675ab9c143a6af8845b3fb
Author:     Matthew Biscocho <[email protected]>
AuthorDate: Mon Sep 8 11:37:55 2025 -0400

    SOLR-17806: Migrate Overseer and Blockcache metrics to OTEL (#3607)

    * Migrate Overseer and Blockcache metrics

    * Single close

    ---------

    Co-authored-by: Matthew Biscocho <[email protected]>
---
 .../java/org/apache/solr/blockcache/Metrics.java   | 109 +++++++++++++--------
 .../src/java/org/apache/solr/cloud/Overseer.java   |  48 +++++++--
 .../apache/solr/cloud/OverseerTaskProcessor.java   |  44 ++++++++-
 .../java/org/apache/solr/core/SolrInfoBean.java    |   1 +
 .../org/apache/solr/metrics/SolrMetricManager.java |  27 ++++-
 .../apache/solr/metrics/SolrMetricsContext.java    |   9 ++
 .../apache/solr/blockcache/BufferStoreTest.java    |  59 ++++++-----
 7 files changed, 215 insertions(+), 82 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/blockcache/Metrics.java b/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
index dcdb4550d8c..98c9b2dbef6 100644
--- a/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
+++ b/solr/core/src/java/org/apache/solr/blockcache/Metrics.java
@@ -17,11 +17,9 @@
 package org.apache.solr.blockcache;
 
 import io.opentelemetry.api.common.Attributes;
-import java.util.Set;
-import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicLong;
+import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.core.SolrInfoBean;
-import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.SolrMetricsContext;
 import org.apache.solr.search.SolrCacheBase;
 
@@ -50,19 +48,31 @@ public class Metrics extends SolrCacheBase implements SolrInfoBean {
   public AtomicLong shardBuffercacheAllocate = new AtomicLong(0);
   public AtomicLong shardBuffercacheLost = new AtomicLong(0);
 
-  private MetricsMap metricsMap;
-  private Set<String> metricNames = ConcurrentHashMap.newKeySet();
   private SolrMetricsContext solrMetricsContext;
 
   private long previous = System.nanoTime();
-  // TODO SOLR-17458: Migrate to Otel
+  private AutoCloseable toClose;
+
   @Override
   public void initializeMetrics(
       SolrMetricsContext parentContext, Attributes attributes, String scope) {
     solrMetricsContext = parentContext.getChildContext(this);
-    metricsMap =
-        new MetricsMap(
-            map -> {
+    var baseAttributes =
+        attributes.toBuilder().put(CATEGORY_ATTR, getCategory().toString()).build();
+    var blockcacheStats =
+        solrMetricsContext.longMeasurement("solr_block_cache_stats", "Block cache stats");
+    var hitRatio =
+        solrMetricsContext.doubleMeasurement("solr_block_cache_hit_ratio", "Block cache hit ratio");
+    var perSecStats =
+        solrMetricsContext.doubleMeasurement(
+            "solr_block_cache_stats_per_second", "Block cache per second stats");
+    var bufferCacheStats =
+        solrMetricsContext.doubleMeasurement(
+            "solr_buffer_cache_stats", "Buffer cache per second stats");
+
+    this.toClose =
+        solrMetricsContext.batchCallback(
+            () -> {
               long now = System.nanoTime();
               long delta = Math.max(now - previous, 1);
               double seconds = delta / 1000000000.0;
@@ -86,44 +96,63 @@ public class Metrics extends SolrCacheBase implements SolrInfoBean {
               long lookups_delta = hits_delta + miss_delta;
               long lookups_total = hits_total + miss_total;
 
-              map.put("size", blockCacheSize.get());
-              map.put("lookups", lookups_total);
-              map.put("hits", hits_total);
-              map.put("evictions", evict_total);
-              map.put("storeFails", storeFail_total);
-              map.put(
-                  "hitratio_current",
-                  calcHitRatio(lookups_delta, hits_delta)); // hit ratio since the last call
-              map.put(
-                  "lookups_persec",
-                  getPerSecond(lookups_delta, seconds)); // lookups per second since the last call
-              map.put(
-                  "hits_persec",
-                  getPerSecond(hits_delta, seconds)); // hits per second since the last call
-              map.put(
-                  "evictions_persec",
-                  getPerSecond(evict_delta, seconds)); // evictions per second since the last call
-              map.put(
-                  "storeFails_persec",
-                  getPerSecond(
-                      storeFail_delta, seconds)); // evictions per second since the last call
-              map.put("time_delta", seconds); // seconds since last call
-
-              // TODO: these aren't really related to the BlockCache
-              map.put(
-                  "buffercache.allocations",
-                  getPerSecond(shardBuffercacheAllocate.getAndSet(0), seconds));
-              map.put("buffercache.lost", getPerSecond(shardBuffercacheLost.getAndSet(0), seconds));
-
+              blockcacheStats.record(
+                  blockCacheSize.get(), baseAttributes.toBuilder().put(TYPE_ATTR, "size").build());
+              blockcacheStats.record(
+                  lookups_total, baseAttributes.toBuilder().put(TYPE_ATTR, "lookups").build());
+              blockcacheStats.record(
+                  hits_total, baseAttributes.toBuilder().put(TYPE_ATTR, "hits").build());
+              blockcacheStats.record(
+                  evict_total, baseAttributes.toBuilder().put(TYPE_ATTR, "evictions").build());
+              blockcacheStats.record(
+                  storeFail_total,
+                  baseAttributes.toBuilder().put(TYPE_ATTR, "store_fails").build());
+              perSecStats.record(
+                  getPerSecond(lookups_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "lookups")
+                      .build()); // lookups per second since the last call
+              perSecStats.record(
+                  getPerSecond(hits_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "hits")
+                      .build()); // hits per second since the last call
+              perSecStats.record(
+                  getPerSecond(evict_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "evictions")
+                      .build()); // evictions per second since the last call
+              perSecStats.record(
+                  getPerSecond(storeFail_delta, seconds),
+                  baseAttributes.toBuilder()
+                      .put(TYPE_ATTR, "store_fails")
+                      .build()); // store fails per second since the last call
+              hitRatio.record(
+                  calcHitRatio(lookups_delta, hits_delta),
+                  baseAttributes); // hit ratio since the last call
+              bufferCacheStats.record(
+                  getPerSecond(shardBuffercacheAllocate.getAndSet(0), seconds),
+                  baseAttributes.toBuilder().put(TYPE_ATTR, "allocations").build());
+              bufferCacheStats.record(
+                  getPerSecond(shardBuffercacheLost.getAndSet(0), seconds),
+                  baseAttributes.toBuilder().put(TYPE_ATTR, "lost").build());
               previous = now;
-            });
-    solrMetricsContext.gauge(metricsMap, true, getName(), getCategory().toString(), scope);
+            },
+            blockcacheStats,
+            perSecStats,
+            hitRatio,
+            bufferCacheStats);
   }
 
   private float getPerSecond(long value, double seconds) {
     return (float) (value / seconds);
   }
 
+  @Override
+  public void close() {
+    IOUtils.closeQuietly(toClose);
+  }
+
   // SolrInfoBean methods
 
   @Override
diff --git a/solr/core/src/java/org/apache/solr/cloud/Overseer.java b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
index c6ca1426cdb..6f7cba71767 100644
--- a/solr/core/src/java/org/apache/solr/cloud/Overseer.java
+++ b/solr/core/src/java/org/apache/solr/cloud/Overseer.java
@@ -19,6 +19,7 @@ package org.apache.solr.cloud;
 import static org.apache.solr.common.params.CommonParams.ID;
 
 import com.codahale.metrics.Timer;
+import io.opentelemetry.api.common.Attributes;
 import java.io.Closeable;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
@@ -167,7 +168,7 @@ public class Overseer implements SolrCloseable {
    *
    * <p>The cluster state updater is a single thread dequeueing and executing requests.
    */
-  private class ClusterStateUpdater implements Runnable, Closeable {
+  private class ClusterStateUpdater implements SolrInfoBean, Runnable, Closeable {
 
     private final ZkStateReader reader;
     private final SolrZkClient zkClient;
@@ -195,6 +196,8 @@ public class Overseer implements SolrCloseable {
 
     private boolean isClosed = false;
 
+    private AutoCloseable toClose;
+
     public ClusterStateUpdater(
         final ZkStateReader reader,
         final String myId,
@@ -213,12 +216,27 @@ public class Overseer implements SolrCloseable {
       this.minStateByteLenForCompression = minStateByteLenForCompression;
       this.compressor = compressor;
 
-      clusterStateUpdaterMetricContext = solrMetricsContext.getChildContext(this);
-      clusterStateUpdaterMetricContext.gauge(
-          () -> stateUpdateQueue.getZkStats().getQueueLength(),
-          true,
-          "stateUpdateQueueSize",
-          "queue");
+      this.clusterStateUpdaterMetricContext = solrMetricsContext.getChildContext(this);
+      initializeMetrics(
+          solrMetricsContext, Attributes.of(CATEGORY_ATTR, getCategory().toString()), "");
+    }
+
+    @Override
+    public void initializeMetrics(
+        SolrMetricsContext parentContext, Attributes attributes, String scope) {
+      this.toClose =
+          parentContext.observableLongGauge(
+              "solr_overseer_state_update_queue_size",
+              "Size of overseer's update queue",
+              (observableLongMeasurement) -> {
+                observableLongMeasurement.record(
+                    stateUpdateQueue.getZkStats().getQueueLength(), attributes);
+              });
+    }
+
+    @Override
+    public SolrMetricsContext getSolrMetricsContext() {
+      return clusterStateUpdaterMetricContext;
     }
 
     public Stats getStateUpdateQueueStats() {
@@ -641,8 +659,24 @@ public class Overseer implements SolrCloseable {
     @Override
     public void close() {
       this.isClosed = true;
+      IOUtils.closeQuietly(toClose);
       clusterStateUpdaterMetricContext.unregister();
     }
+
+    @Override
+    public String getName() {
+      return this.getClass().getName();
+    }
+
+    @Override
+    public String getDescription() {
+      return "Cluster leader responsible for processing state updates";
+    }
+
+    @Override
+    public Category getCategory() {
+      return Category.OVERSEER;
+    }
   }
 
   public static class OverseerThread extends Thread implements Closeable {
diff --git a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
index 3751d850f09..e29df029de7 100644
--- a/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
+++ b/solr/core/src/java/org/apache/solr/cloud/OverseerTaskProcessor.java
@@ -20,6 +20,7 @@ import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
 import static org.apache.solr.common.params.CommonParams.ID;
 
 import com.codahale.metrics.Timer;
+import io.opentelemetry.api.common.Attributes;
 import java.io.Closeable;
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
@@ -44,6 +45,7 @@ import org.apache.solr.common.util.IOUtils;
 import org.apache.solr.common.util.SolrNamedThreadFactory;
 import org.apache.solr.common.util.StrUtils;
 import org.apache.solr.common.util.Utils;
+import org.apache.solr.core.SolrInfoBean;
 import org.apache.solr.logging.MDCLoggingContext;
 import org.apache.solr.metrics.SolrMetricsContext;
 import org.apache.zookeeper.KeeperException;
@@ -59,7 +61,7 @@ import org.slf4j.LoggerFactory;
  * <p>An {@link OverseerMessageHandlerSelector} determines which {@link OverseerMessageHandler}
  * handles specific messages in the queue.
  */
-public class OverseerTaskProcessor implements Runnable, Closeable {
+public class OverseerTaskProcessor implements SolrInfoBean, Runnable, Closeable {
 
   /** Maximum number of overseer collection operations which can be executed concurrently */
   public static final int MAX_PARALLEL_TASKS = 100;
@@ -91,6 +93,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
   private final Stats stats;
 
   private final SolrMetricsContext overseerTaskProcessorMetricsContext;
+  private AutoCloseable toClose;
 
   /**
    * This map may contain tasks which are read from work queue but could not be executed because
@@ -148,9 +151,9 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
     this.runningTasks = new HashMap<>();
     thisNode = MDCLoggingContext.getNodeName();
 
-    overseerTaskProcessorMetricsContext = solrMetricsContext.getChildContext(this);
-    overseerTaskProcessorMetricsContext.gauge(
-        () -> workQueue.getZkStats().getQueueLength(), true, "collectionWorkQueueSize", "queue");
+    this.overseerTaskProcessorMetricsContext = solrMetricsContext.getChildContext(this);
+    initializeMetrics(
+        solrMetricsContext, Attributes.of(CATEGORY_ATTR, getCategory().toString()), "");
   }
 
   @Override
@@ -403,6 +406,23 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
     runningTasks.entrySet().removeIf(e -> e.getValue().isDone());
   }
 
+  @Override
+  public void initializeMetrics(
+      SolrMetricsContext parentContext, Attributes attributes, String scope) {
+    this.toClose =
+        parentContext.observableLongGauge(
+            "solr_overseer_collection_work_queue_size",
+            "Size of overseer's collection work queue",
+            (observableLongMeasurement) -> {
+              observableLongMeasurement.record(workQueue.getZkStats().getQueueLength(), attributes);
+            });
+  }
+
+  @Override
+  public SolrMetricsContext getSolrMetricsContext() {
+    return overseerTaskProcessorMetricsContext;
+  }
+
   @Override
   public void close() {
     isClosed = true;
@@ -413,6 +433,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
       }
     }
     IOUtils.closeQuietly(selector);
+    IOUtils.closeQuietly(toClose);
   }
 
   public static List<String> getSortedOverseerNodeNames(SolrZkClient zk)
@@ -673,6 +694,21 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
     return myId;
   }
 
+  @Override
+  public String getName() {
+    return this.getClass().getName();
+  }
+
+  @Override
+  public String getDescription() {
+    return "Processor in Overseer handling items added to a distributed work queue";
+  }
+
+  @Override
+  public Category getCategory() {
+    return Category.OVERSEER;
+  }
+
   /**
    * An interface to determine which {@link OverseerMessageHandler} handles a given message. This
    * could be a single OverseerMessageHandler for the case where a single type of message is handled
diff --git a/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java b/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
index d62f51e439e..0689edb0d84 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrInfoBean.java
@@ -39,6 +39,7 @@ public interface SolrInfoBean extends SolrMetricProducer {
     DIRECTORY,
     HTTP,
     SECURITY,
+    OVERSEER,
     OTHER
   }
 
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
index 4ddc6443f80..e179306eaae 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricManager.java
@@ -358,11 +358,6 @@ public class SolrMetricManager {
     return builder.buildWithCallback(callback);
   }
 
-  ObservableLongMeasurement longMeasurement(
-      String registry, String gaugeName, String description, OtelUnit unit) {
-    return longGaugeBuilder(registry, gaugeName, description, unit).buildObserver();
-  }
-
   BatchCallback batchCallback(
       String registry,
       Runnable callback,
@@ -373,6 +368,16 @@ public class SolrMetricManager {
         .batchCallback(callback, measurement, additionalMeasurements);
   }
 
+  ObservableLongMeasurement longMeasurement(
+      String registry, String gaugeName, String description, OtelUnit unit) {
+    return longGaugeBuilder(registry, gaugeName, description, unit).buildObserver();
+  }
+
+  ObservableDoubleMeasurement doubleMeasurement(
+      String registry, String gaugeName, String description, OtelUnit unit) {
+    return doubleGaugeBuilder(registry, gaugeName, description, unit).buildObserver();
+  }
+
   private LongGaugeBuilder longGaugeBuilder(
       String registry, String gaugeName, String description, OtelUnit unit) {
     LongGaugeBuilder builder =
@@ -386,6 +391,18 @@ public class SolrMetricManager {
     return builder;
   }
 
+  private DoubleGaugeBuilder doubleGaugeBuilder(
+      String registry, String gaugeName, String description, OtelUnit unit) {
+    DoubleGaugeBuilder builder =
+        meterProvider(registry)
+            .get(OTEL_SCOPE_NAME)
+            .gaugeBuilder(gaugeName)
+            .setDescription(description);
+    if (unit != null) builder.setUnit(unit.getSymbol());
+
+    return builder;
+  }
+
   // for unit tests
   public MetricRegistry.MetricSupplier<Counter> getCounterSupplier() {
     return counterSupplier;
diff --git a/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java b/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
index 14b6affa475..696ace6aa0d 100644
--- a/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
+++ b/solr/core/src/java/org/apache/solr/metrics/SolrMetricsContext.java
@@ -285,6 +285,15 @@ public class SolrMetricsContext {
     return metricManager.longMeasurement(registryName, metricName, description, unit);
   }
 
+  public ObservableDoubleMeasurement doubleMeasurement(String metricName, String description) {
+    return doubleMeasurement(metricName, description, null);
+  }
+
+  public ObservableDoubleMeasurement doubleMeasurement(
+      String metricName, String description, OtelUnit unit) {
+    return metricManager.doubleMeasurement(registryName, metricName, description, unit);
+  }
+
   public BatchCallback batchCallback(
       Runnable callback,
       ObservableMeasurement measurement,
diff --git a/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java b/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
index 46bc29685ca..a628b9263a7 100644
--- a/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
+++ b/solr/core/src/test/org/apache/solr/blockcache/BufferStoreTest.java
@@ -17,11 +17,10 @@
 package org.apache.solr.blockcache;
 
 import io.opentelemetry.api.common.Attributes;
-import java.math.BigDecimal;
-import java.util.Map;
+import io.prometheus.metrics.model.snapshots.GaugeSnapshot;
+import io.prometheus.metrics.model.snapshots.Labels;
 import org.apache.lucene.tests.util.TestUtil;
 import org.apache.solr.SolrTestCase;
-import org.apache.solr.metrics.MetricsMap;
 import org.apache.solr.metrics.SolrMetricManager;
 import org.apache.solr.metrics.SolrMetricsContext;
 import org.junit.After;
@@ -32,27 +31,19 @@ public class BufferStoreTest extends SolrTestCase {
   private static final int blockSize = 1024;
 
   private Metrics metrics;
-  private MetricsMap metricsMap;
+  private SolrMetricManager metricManager;
+  private String registry;
   private Store store;
 
   @Before
   public void setup() {
     metrics = new Metrics();
-    SolrMetricManager metricManager = new SolrMetricManager();
-    String registry = TestUtil.randomSimpleString(random(), 2, 10);
+    metricManager = new SolrMetricManager();
+    registry = TestUtil.randomSimpleString(random(), 2, 10);
     String scope = TestUtil.randomSimpleString(random(), 2, 10);
     SolrMetricsContext solrMetricsContext = new SolrMetricsContext(metricManager, registry, "foo");
-    // TODO SOLR-17458: Fix test later
     metrics.initializeMetrics(solrMetricsContext, Attributes.empty(), scope);
-    metricsMap =
-        (MetricsMap)
-            ((SolrMetricManager.GaugeWrapper)
-                    metricManager
-                        .registry(registry)
-                        .getMetrics()
-                        .get("CACHE." + scope + ".hdfsBlockCache"))
-                .getGauge();
     BufferStore.initNewBuffer(blockSize, blockSize, metrics);
     store = BufferStore.instance(blockSize);
   }
@@ -99,19 +90,35 @@ public class BufferStoreTest extends SolrTestCase {
    * @param lost whether buffers should have been lost since the last call
    */
   private void assertGaugeMetricsChanged(boolean allocated, boolean lost) {
-    Map<String, Object> stats = metricsMap.getValue();
-
-    assertEquals(
-        "Buffer allocation metric not updating correctly.",
-        allocated,
-        isMetricPositive(stats, "buffercache.allocations"));
+    var gauge =
+        metricManager.getPrometheusMetricReader(registry).collect().stream()
+            .filter(ms -> ms.getMetadata().getPrometheusName().equals("solr_buffer_cache_stats"))
+            .map(GaugeSnapshot.class::cast)
+            .findFirst()
+            .orElseThrow(() -> new AssertionError("Missing gauge metric solr_buffer_cache_stats"));
+
+    var actualAllocatedValue = getBufferCacheStatsValue(gauge, "allocations");
+    var actualLostValue = getBufferCacheStatsValue(gauge, "lost");
     assertEquals(
-        "Buffer lost metric not updating correctly.",
-        lost,
-        isMetricPositive(stats, "buffercache.lost"));
+        "Buffer allocation metric not updating correctly.", allocated, actualAllocatedValue > 0);
+    assertEquals("Buffer lost metric not updating correctly.", lost, actualLostValue > 0);
   }
 
-  private boolean isMetricPositive(Map<String, Object> stats, String metric) {
-    return new BigDecimal(stats.get(metric).toString()).compareTo(BigDecimal.ZERO) > 0;
+  private Double getBufferCacheStatsValue(GaugeSnapshot gaugeSnapshot, String type) {
+    return gaugeSnapshot.getDataPoints().stream()
+        .filter(
+            (dp) ->
+                dp.getLabels()
+                    .equals(
+                        Labels.of(
+                            "category",
+                            "CACHE",
+                            "otel_scope_name",
+                            "org.apache.solr",
+                            "type",
+                            type)))
+        .findFirst()
+        .orElseThrow(() -> new AssertionError("Missing type=" + type + " label on metric"))
+        .getValue();
   }
 }
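
For reviewers following the migration pattern above, here is a minimal, hypothetical sketch (not part of the commit) of how a component would register an observable measurement through the SolrMetricsContext API this change relies on. The class, variable, and metric names below are made up for illustration; only longMeasurement, batchCallback, and IOUtils.closeQuietly are taken from the diff itself.

// Illustrative sketch only -- ExampleQueueMetrics and solr_example_queue_size are invented names.
// It mirrors the pattern used in Metrics.java above: ask the SolrMetricsContext for an observable
// measurement, register one batch callback that records values at collection time, and close the
// returned handle when the component shuts down.
import io.opentelemetry.api.common.Attributes;
import java.util.Queue;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.metrics.SolrMetricsContext;

class ExampleQueueMetrics implements AutoCloseable {
  private final AutoCloseable toClose;

  ExampleQueueMetrics(SolrMetricsContext ctx, Attributes attributes, Queue<?> queue) {
    var queueSize = ctx.longMeasurement("solr_example_queue_size", "Current size of the queue");
    // The callback runs when the metrics reader collects, so the queue is only sampled on demand.
    this.toClose = ctx.batchCallback(() -> queueSize.record(queue.size(), attributes), queueSize);
  }

  @Override
  public void close() {
    IOUtils.closeQuietly(toClose); // same cleanup idiom as Metrics.close() in this commit
  }
}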
