This is an automated email from the ASF dual-hosted git repository. mlbiscoc pushed a commit to branch feature/SOLR-17458-rebased in repository https://gitbox.apache.org/repos/asf/solr.git
commit e551fa15e48292927b2d06315ed4e959c6f16610 Author: cugarte <[email protected]> AuthorDate: Mon Sep 8 09:40:07 2025 -0400 SOLR-17806: Migrate SolrIndexWriter metrics to OTEL (#3568) * SOLR-17806: Migrate SolrIndexWriter metrics to OTEL * Adding unit test, merged gauges, closed observables. * Reusing metric name by switching merge type to an attribute. * gradlew tidy --- .../org/apache/solr/update/SolrIndexWriter.java | 194 +++++++++++++------- .../collection1/conf/solrconfig-indexmetrics.xml | 2 + .../apache/solr/update/SolrIndexMetricsTest.java | 204 ++++++++++++++++----- 3 files changed, 289 insertions(+), 111 deletions(-) diff --git a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java index 7b215e74fab..7fd5e434d6f 100644 --- a/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java +++ b/solr/core/src/java/org/apache/solr/update/SolrIndexWriter.java @@ -16,9 +16,16 @@ */ package org.apache.solr.update; -import com.codahale.metrics.Counter; -import com.codahale.metrics.Meter; -import com.codahale.metrics.Timer; +import static org.apache.solr.metrics.SolrCoreMetricManager.COLLECTION_ATTR; +import static org.apache.solr.metrics.SolrCoreMetricManager.CORE_ATTR; +import static org.apache.solr.metrics.SolrCoreMetricManager.REPLICA_ATTR; +import static org.apache.solr.metrics.SolrCoreMetricManager.SHARD_ATTR; +import static org.apache.solr.metrics.SolrMetricProducer.CATEGORY_ATTR; +import static org.apache.solr.metrics.SolrMetricProducer.TYPE_ATTR; + +import io.opentelemetry.api.common.AttributeKey; +import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.ObservableLongGauge; import java.io.IOException; import java.lang.invoke.MethodHandles; import java.util.Collections; @@ -37,11 +44,15 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.util.InfoStream; import org.apache.solr.common.util.IOUtils; import org.apache.solr.common.util.SuppressForbidden; +import org.apache.solr.common.util.Utils; import org.apache.solr.core.DirectoryFactory; import org.apache.solr.core.DirectoryFactory.DirContext; import org.apache.solr.core.SolrCore; import org.apache.solr.core.SolrInfoBean; import org.apache.solr.metrics.SolrMetricsContext; +import org.apache.solr.metrics.otel.OtelUnit; +import org.apache.solr.metrics.otel.instruments.AttributedLongCounter; +import org.apache.solr.metrics.otel.instruments.AttributedLongTimer; import org.apache.solr.schema.IndexSchema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,6 +75,8 @@ public class SolrIndexWriter extends IndexWriter { public static final String COMMIT_COMMAND_VERSION = "commitCommandVer"; + public static final AttributeKey<String> MERGE_TYPE_ATTR = AttributeKey.stringKey("merge_type"); + private final Object CLOSE_LOCK = new Object(); String name; @@ -73,12 +86,12 @@ public class SolrIndexWriter extends IndexWriter { // metrics private long majorMergeDocs = 512 * 1024; - private Timer majorMerge; - private Timer minorMerge; - private Meter majorMergedDocs; - private Meter majorDeletedDocs; - private Counter mergeErrors; - private Meter flushMeter; // original counter is package-private in IndexWriter + private AttributedLongTimer majorMerge; + private AttributedLongTimer minorMerge; + private AttributedLongCounter majorMergedDocs; + private AttributedLongCounter majorDeletedDocs; + private AttributedLongCounter mergeErrors; + private AttributedLongCounter flushes; // original counter is package-private in IndexWriter private boolean mergeTotals = false; private boolean mergeDetails = false; private final AtomicInteger runningMajorMerges = new AtomicInteger(); @@ -87,6 +100,7 @@ public class SolrIndexWriter extends IndexWriter { private final AtomicInteger runningMinorMergesSegments = new AtomicInteger(); private final AtomicLong runningMajorMergesDocs = new AtomicLong(); private final AtomicLong runningMinorMergesDocs = new AtomicLong(); + private ObservableLongGauge mergeStats; private final SolrMetricsContext solrMetricsContext; // merge diagnostics. @@ -177,66 +191,113 @@ public class SolrIndexWriter extends IndexWriter { } else { mergeTotals = false; } + String coreName = core.getCoreDescriptor().getName(); + var baseAttributesBuilder = + Attributes.builder() + .put(CATEGORY_ATTR, SolrInfoBean.Category.INDEX.toString()) + .put(CORE_ATTR, coreName); + if (core.getCoreContainer().isZooKeeperAware()) { + String collectionName = core.getCoreDescriptor().getCollectionName(); + baseAttributesBuilder + .put(COLLECTION_ATTR, collectionName) + .put(SHARD_ATTR, core.getCoreDescriptor().getCloudDescriptor().getShardId()) + .put(REPLICA_ATTR, Utils.parseMetricsReplicaName(collectionName, coreName)); + } + var baseAttributes = baseAttributesBuilder.build(); if (mergeDetails) { mergeTotals = true; // override majorMergedDocs = - solrMetricsContext.meter( - "docs", SolrInfoBean.Category.INDEX.toString(), "merge", "major"); + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_indexwriter_major_merged_docs", + "Number of documents merged while merging segments above the majorMergeDocs threshold (" + + majorMergeDocs + + ")"), + baseAttributes); majorDeletedDocs = - solrMetricsContext.meter( - "deletedDocs", SolrInfoBean.Category.INDEX.toString(), "merge", "major"); + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_indexwriter_major_deleted_docs", + "Number of deleted documents that were expunged while merging segments above the majorMergeDocs threshold (" + + majorMergeDocs + + ")"), + baseAttributes); } if (mergeTotals) { minorMerge = - solrMetricsContext.timer("minor", SolrInfoBean.Category.INDEX.toString(), "merge"); + new AttributedLongTimer( + solrMetricsContext.longHistogram( + "solr_indexwriter_merge", + "Time spent merging segments below or equal to the majorMergeDocs threshold (" + + majorMergeDocs + + ")", + OtelUnit.MILLISECONDS), + baseAttributes.toBuilder().put(MERGE_TYPE_ATTR, "minor").build()); majorMerge = - solrMetricsContext.timer("major", SolrInfoBean.Category.INDEX.toString(), "merge"); + new AttributedLongTimer( + solrMetricsContext.longHistogram( + "solr_indexwriter_merge", + "Time spent merging segments above the majorMergeDocs threshold (" + + majorMergeDocs + + ")", + OtelUnit.MILLISECONDS), + baseAttributes.toBuilder().put(MERGE_TYPE_ATTR, "major").build()); mergeErrors = - solrMetricsContext.counter("errors", SolrInfoBean.Category.INDEX.toString(), "merge"); + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_indexwriter_merge_errors", "Number of merge errors"), + baseAttributes); String tag = core.getMetricTag(); - solrMetricsContext.gauge( - () -> runningMajorMerges.get(), - true, - "running", - SolrInfoBean.Category.INDEX.toString(), - "merge", - "major"); - solrMetricsContext.gauge( - () -> runningMinorMerges.get(), - true, - "running", - SolrInfoBean.Category.INDEX.toString(), - "merge", - "minor"); - solrMetricsContext.gauge( - () -> runningMajorMergesDocs.get(), - true, - "running.docs", - SolrInfoBean.Category.INDEX.toString(), - "merge", - "major"); - solrMetricsContext.gauge( - () -> runningMinorMergesDocs.get(), - true, - "running.docs", - SolrInfoBean.Category.INDEX.toString(), - "merge", - "minor"); - solrMetricsContext.gauge( - () -> runningMajorMergesSegments.get(), - true, - "running.segments", - SolrInfoBean.Category.INDEX.toString(), - "merge", - "major"); - solrMetricsContext.gauge( - () -> runningMinorMergesSegments.get(), - true, - "running.segments", - SolrInfoBean.Category.INDEX.toString(), - "merge", - "minor"); - flushMeter = solrMetricsContext.meter("flush", SolrInfoBean.Category.INDEX.toString()); + mergeStats = + solrMetricsContext.observableLongGauge( + "solr_indexwriter_merge_stats", + "Metrics around currently running segment merges; major := above the majorMergeDocs threshold (" + + majorMergeDocs + + "), minor := below or equal to the threshold", + (observableLongMeasurement -> { + observableLongMeasurement.record( + runningMajorMerges.get(), + baseAttributes.toBuilder() + .put(TYPE_ATTR, "running") + .put(MERGE_TYPE_ATTR, "major") + .build()); + observableLongMeasurement.record( + runningMajorMergesDocs.get(), + baseAttributes.toBuilder() + .put(TYPE_ATTR, "running_docs") + .put(MERGE_TYPE_ATTR, "major") + .build()); + observableLongMeasurement.record( + runningMajorMergesSegments.get(), + baseAttributes.toBuilder() + .put(TYPE_ATTR, "running_segments") + .put(MERGE_TYPE_ATTR, "major") + .build()); + observableLongMeasurement.record( + runningMinorMerges.get(), + baseAttributes.toBuilder() + .put(TYPE_ATTR, "running") + .put(MERGE_TYPE_ATTR, "minor") + .build()); + observableLongMeasurement.record( + runningMinorMergesDocs.get(), + baseAttributes.toBuilder() + .put(TYPE_ATTR, "running_docs") + .put(MERGE_TYPE_ATTR, "minor") + .build()); + observableLongMeasurement.record( + runningMinorMergesSegments.get(), + baseAttributes.toBuilder() + .put(TYPE_ATTR, "running_segments") + .put(MERGE_TYPE_ATTR, "minor") + .build()); + })); + flushes = + new AttributedLongCounter( + solrMetricsContext.longCounter( + "solr_indexwriter_flush", + "Number of times added/deleted documents have been flushed to the Directory"), + baseAttributes); } } } @@ -284,21 +345,21 @@ public class SolrIndexWriter extends IndexWriter { } boolean major = totalNumDocs > majorMergeDocs; int segmentsCount = merge.segments.size(); - Timer.Context context; + AttributedLongTimer.MetricTimer context; if (major) { runningMajorMerges.incrementAndGet(); runningMajorMergesDocs.addAndGet(totalNumDocs); runningMajorMergesSegments.addAndGet(segmentsCount); if (mergeDetails) { - majorMergedDocs.mark(totalNumDocs); - majorDeletedDocs.mark(deletedDocs); + majorMergedDocs.add(totalNumDocs); + majorDeletedDocs.add(deletedDocs); } - context = majorMerge.time(); + context = majorMerge.start(); } else { runningMinorMerges.incrementAndGet(); runningMinorMergesDocs.addAndGet(totalNumDocs); runningMinorMergesSegments.addAndGet(segmentsCount); - context = minorMerge.time(); + context = minorMerge.start(); } try { super.merge(merge); @@ -326,8 +387,8 @@ public class SolrIndexWriter extends IndexWriter { @Override protected void doAfterFlush() throws IOException { - if (flushMeter != null) { // this is null when writer is used only for snapshot cleanup - flushMeter.mark(); // or if mergeTotals == false + if (flushes != null) { // this is null when writer is used only for snapshot cleanup + flushes.inc(); // or if mergeTotals == false } super.doAfterFlush(); } @@ -416,6 +477,7 @@ public class SolrIndexWriter extends IndexWriter { if (directoryFactory != null) { directoryFactory.release(directory); } + IOUtils.closeQuietly(mergeStats); if (solrMetricsContext != null) { solrMetricsContext.unregister(); } diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig-indexmetrics.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig-indexmetrics.xml index 6238e7db021..ec5c9406fee 100644 --- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-indexmetrics.xml +++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-indexmetrics.xml @@ -31,6 +31,8 @@ <metrics> <bool name="merge">${solr.tests.metrics.merge:false}</bool> <bool name="mergeDetails">${solr.tests.metrics.mergeDetails:false}</bool> + <!-- majorMergeDocs default comes from the default of SolrIndexWriter.majorMergeDocs --> + <long name="majorMergeDocs">${solr.tests.metrics.majorMergeDocs:524288}</long> </metrics> <!-- intentionally set very low values here to trigger multiple flushes and merges. DO NOT USE THESE ABSURD VALUES IN PRODUCTION. --> diff --git a/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java b/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java index ecfd83c02d4..6b0e96e77af 100644 --- a/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java +++ b/solr/core/src/test/org/apache/solr/update/SolrIndexMetricsTest.java @@ -16,14 +16,14 @@ */ package org.apache.solr.update; -import com.codahale.metrics.Meter; -import com.codahale.metrics.Metric; -import com.codahale.metrics.MetricRegistry; -import com.codahale.metrics.Timer; -import java.util.Map; +import static org.apache.solr.metrics.SolrMetricProducer.CATEGORY_ATTR; +import static org.apache.solr.update.SolrIndexWriter.MERGE_TYPE_ATTR; + +import io.prometheus.metrics.model.snapshots.MetricSnapshots; import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.core.SolrCore; +import org.apache.solr.core.SolrInfoBean; import org.apache.solr.metrics.SolrMetricTestUtils; import org.apache.solr.request.SolrQueryRequest; import org.junit.After; @@ -61,28 +61,49 @@ public class SolrIndexMetricsTest extends SolrTestCaseJ4 { addDocs(); - MetricRegistry registry = - h.getCoreContainer() - .getMetricManager() - .registry(h.getCore().getCoreMetricManager().getRegistryName()); - assertNotNull(registry); - - Map<String, Metric> metrics = registry.getMetrics(); - - // NOCOMMIT: As we migrate more metrics to OTEL, this will need to migrate to check prometheus - // reader instead - assertEquals( - 10, metrics.entrySet().stream().filter(e -> e.getKey().startsWith("INDEX")).count()); - - // check basic index meters - Timer timer = (Timer) metrics.get("INDEX.merge.minor"); - assertTrue("minorMerge: " + timer.getCount(), timer.getCount() >= 3); - timer = (Timer) metrics.get("INDEX.merge.major"); - assertEquals("majorMerge: " + timer.getCount(), 0, timer.getCount()); - // check detailed meters - assertNull((Meter) metrics.get("INDEX.merge.major.docs")); - Meter meter = (Meter) metrics.get("INDEX.flush"); - assertTrue("flush: " + meter.getCount(), meter.getCount() > 10); + try (SolrCore core = h.getCoreContainer().getCore("collection1")) { + // check basic index meters + var minorMergeTimer = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_indexwriter_merge_milliseconds", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "minor") + .build()); + assertTrue("minorMerge: " + minorMergeTimer.getCount(), minorMergeTimer.getCount() >= 3); + var majorMergeTimer = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_indexwriter_merge_milliseconds", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "major") + .build()); + // major merge timer should have a value of 0, and because 0 values are not reported, no + // datapoint is available + assertNull("majorMergeTimer", majorMergeTimer); + + // check detailed meters + var majorMergeDocs = + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_indexwriter_major_merged_docs", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .build()); + // major merge docs should be null because mergeDetails is false + assertNull("majorMergeDocs", majorMergeDocs); + + var flushCounter = + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_indexwriter_flush", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .build()); + assertTrue("flush: " + flushCounter.getValue(), flushCounter.getValue() > 10); + } } @Test @@ -119,27 +140,120 @@ public class SolrIndexMetricsTest extends SolrTestCaseJ4 { addDocs(); - MetricRegistry registry = - h.getCoreContainer() - .getMetricManager() - .registry(h.getCore().getCoreMetricManager().getRegistryName()); - assertNotNull(registry); + try (SolrCore core = h.getCoreContainer().getCore("collection1")) { + var prometheusMetricReader = SolrMetricTestUtils.getPrometheusMetricReader(core); + assertNotNull(prometheusMetricReader); + MetricSnapshots otelMetrics = prometheusMetricReader.collect(); + assertTrue("Metrics count: " + otelMetrics.size(), otelMetrics.size() >= 19); + + // check basic index meters + var minorMergeTimer = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_indexwriter_merge_milliseconds", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "minor") + .build()); + assertTrue("minorMergeTimer: " + minorMergeTimer.getCount(), minorMergeTimer.getCount() >= 3); + var majorMergeTimer = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_indexwriter_merge_milliseconds", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "major") + .build()); + // major merge timer should have a value of 0, and because 0 values are not reported, no + // datapoint is available + assertNull("majorMergeTimer", majorMergeTimer); + + // check detailed meters + var majorMergeDocs = + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_indexwriter_major_merged_docs", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .build()); + // major merge docs should have a value of 0, and because 0 values are not reported, no + // datapoint is available + assertNull("majorMergeDocs", majorMergeDocs); + + var flushCounter = + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_indexwriter_flush", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .build()); + assertTrue("flush: " + flushCounter.getValue(), flushCounter.getValue() > 10); + } + } + + public void testIndexMetricsMajorAndMinorMergesWithDetails() throws Exception { + System.setProperty("solr.tests.metrics.merge", "false"); // test mergeDetails override too + System.setProperty("solr.tests.metrics.mergeDetails", "true"); + System.setProperty("solr.tests.metrics.majorMergeDocs", "450"); + initCore("solrconfig-indexmetrics.xml", "schema.xml"); + + addDocs(); + + try (SolrCore core = h.getCoreContainer().getCore("collection1")) { + var prometheusMetricReader = SolrMetricTestUtils.getPrometheusMetricReader(core); + assertNotNull(prometheusMetricReader); + MetricSnapshots otelMetrics = prometheusMetricReader.collect(); + assertTrue("Metrics count: " + otelMetrics.size(), otelMetrics.size() >= 18); - Map<String, Metric> metrics = registry.getMetrics(); + // addDocs() adds 1000 documents and then sends a commit. maxBufferedDocs==100, + // segmentsPerTier==3, + // maxMergeAtOnce==3 and majorMergeDocs==450. Thus, new documents form segments with 100 + // docs, merges are + // called for when there are 3 segments at the lowest tier, and the merges are as follows: + // 1. 100 + 100 + 100 ==> new 300 doc segment, below the 450 threshold ==> minor merge + // 2. 100 + 100 + 100 ==> new 300 doc segment, below the 450 threshold ==> minor merge + // 3. 300 + 100 + 100 ==> new 500 doc segment, above the 450 threshold ==> major merge + // 4. 300 + 100 + 100 ==> new 500 doc segment, above the 450 threshold ==> major merge - assertTrue( - metrics.entrySet().stream().filter(e -> e.getKey().startsWith("INDEX")).count() >= 12); + // check basic index meters + var minorMergeTimer = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_indexwriter_merge_milliseconds", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "minor") + .build()); + assertTrue("minorMergeTimer: " + minorMergeTimer.getCount(), minorMergeTimer.getCount() == 2); + var majorMergeTimer = + SolrMetricTestUtils.getHistogramDatapoint( + core, + "solr_indexwriter_merge_milliseconds", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .label(MERGE_TYPE_ATTR.toString(), "major") + .build()); + assertTrue("majorMergeTimer: " + majorMergeTimer.getCount(), majorMergeTimer.getCount() == 2); - // check basic index meters - Timer timer = (Timer) metrics.get("INDEX.merge.minor"); - assertTrue("minorMerge: " + timer.getCount(), timer.getCount() >= 3); - timer = (Timer) metrics.get("INDEX.merge.major"); - assertEquals("majorMerge: " + timer.getCount(), 0, timer.getCount()); - // check detailed meters - Meter meter = (Meter) metrics.get("INDEX.merge.major.docs"); - assertEquals("majorMergeDocs: " + meter.getCount(), 0, meter.getCount()); + // check detailed meters + var majorMergeDocs = + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_indexwriter_major_merged_docs", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .build()); + // majorMergeDocs is the total number of docs merged during major merge operations + assertTrue("majorMergeDocs: " + majorMergeDocs.getValue(), majorMergeDocs.getValue() == 1000); - meter = (Meter) metrics.get("INDEX.flush"); - assertTrue("flush: " + meter.getCount(), meter.getCount() > 10); + var flushCounter = + SolrMetricTestUtils.getCounterDatapoint( + core, + "solr_indexwriter_flush", + SolrMetricTestUtils.newStandaloneLabelsBuilder(core) + .label(CATEGORY_ATTR.toString(), SolrInfoBean.Category.INDEX.toString()) + .build()); + assertTrue("flush: " + flushCounter.getValue(), flushCounter.getValue() >= 10); + } } }
