This is an automated email from the ASF dual-hosted git repository.
rpuch pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/ignite-3.git
The following commit(s) were added to refs/heads/main by this push:
new be5df8031e6 IGNITE-27356 Add checkpoint metrics (#7250)
be5df8031e6 is described below
commit be5df8031e6b60ea976e724d4d6062df6e763dae
Author: Viacheslav Blinov <[email protected]>
AuthorDate: Tue Dec 30 17:26:38 2025 +0300
IGNITE-27356 Add checkpoint metrics (#7250)
Co-authored-by: Roman Puchkovskiy <[email protected]>
---
.../persistence/checkpoint/CheckpointManager.java | 5 +-
.../checkpoint/CheckpointReadWriteLock.java | 63 ++++++++---
.../checkpoint/CheckpointReadWriteLockMetrics.java | 120 +++++++++++++++++++++
.../checkpoint/CheckpointReadWriteLockTest.java | 6 +-
.../checkpoint/CheckpointTimeoutLockTest.java | 72 ++++++++++++-
.../checkpoint/CheckpointTestUtils.java | 8 +-
6 files changed, 254 insertions(+), 20 deletions(-)
diff --git
a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
index 526490a4cd9..b6e23dbb363 100644
---
a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
+++
b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointManager.java
@@ -124,9 +124,12 @@ public class CheckpointManager {
? new
ReentrantReadWriteLockWithTracking(Loggers.forClass(CheckpointReadWriteLock.class),
logReadLockThresholdTimeout)
: new ReentrantReadWriteLockWithTracking();
+ var readWriteLockMetrics = new
CheckpointReadWriteLockMetrics(checkpointMetricSource);
+
CheckpointReadWriteLock checkpointReadWriteLock = new
CheckpointReadWriteLock(
reentrantReadWriteLockWithTracking,
- commonExecutorService
+ commonExecutorService,
+ readWriteLockMetrics
);
checkpointWorkflow = new CheckpointWorkflow(
diff --git
a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
index eab912f45f2..290374878a2 100644
---
a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
+++
b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLock.java
@@ -17,8 +17,6 @@
package org.apache.ignite.internal.pagememory.persistence.checkpoint;
-import static
org.apache.ignite.internal.util.FastTimestamps.coarseCurrentTimeMillis;
-
import java.util.concurrent.Executor;
import java.util.concurrent.TimeUnit;
import org.apache.ignite.internal.lang.IgniteInternalException;
@@ -43,11 +41,15 @@ public class CheckpointReadWriteLock {
private final IgniteThrottledLogger log;
+ private final ThreadLocal<Long> checkpointReadLockAcquiredTime = new
ThreadLocal<>();
+
private final ThreadLocal<Integer> checkpointReadLockHoldCount =
ThreadLocal.withInitial(() -> 0);
/** Checkpoint lock. */
private final ReentrantReadWriteLockWithTracking checkpointLock;
+ private final CheckpointReadWriteLockMetrics metrics;
+
/** Current write lock holder thread. */
private volatile @Nullable Thread currentWriteLockHolder;
@@ -56,10 +58,16 @@ public class CheckpointReadWriteLock {
*
* @param checkpointLock Checkpoint lock.
* @param throttledLogExecutor Executor for the throttled logger.
+ * @param metrics Read/write lock metrics.
*/
- public CheckpointReadWriteLock(ReentrantReadWriteLockWithTracking
checkpointLock, Executor throttledLogExecutor) {
+ public CheckpointReadWriteLock(
+ ReentrantReadWriteLockWithTracking checkpointLock,
+ Executor throttledLogExecutor,
+ CheckpointReadWriteLockMetrics metrics
+ ) {
this.checkpointLock = checkpointLock;
this.log =
Loggers.toThrottledLogger(Loggers.forClass(CheckpointReadWriteLock.class),
throttledLogExecutor);
+ this.metrics = metrics;
}
/**
@@ -72,11 +80,12 @@ public class CheckpointReadWriteLock {
return;
}
- long start = coarseCurrentTimeMillis();
+ long startNanos = System.nanoTime();
+ metrics.incrementReadLockWaitingThreads();
checkpointLock.readLock().lock();
- onReadLock(start, true);
+ onReadLock(startNanos, true);
}
/**
@@ -91,11 +100,12 @@ public class CheckpointReadWriteLock {
return true;
}
- long start = coarseCurrentTimeMillis();
+ long startNanos = System.nanoTime();
+ metrics.incrementReadLockWaitingThreads();
boolean res = checkpointLock.readLock().tryLock(timeout, unit);
- onReadLock(start, res);
+ onReadLock(startNanos, res);
return res;
}
@@ -110,11 +120,12 @@ public class CheckpointReadWriteLock {
return true;
}
- long start = coarseCurrentTimeMillis();
+ long startNanos = System.nanoTime();
+ metrics.incrementReadLockWaitingThreads();
boolean res = checkpointLock.readLock().tryLock();
- onReadLock(start, res);
+ onReadLock(startNanos, res);
return res;
}
@@ -138,7 +149,7 @@ public class CheckpointReadWriteLock {
checkpointLock.readLock().unlock();
- checkpointReadLockHoldCount.set(checkpointReadLockHoldCount.get() - 1);
+ onReadUnlock();
}
/**
@@ -181,15 +192,37 @@ public class CheckpointReadWriteLock {
return checkpointLock.hasQueuedWriters();
}
- private void onReadLock(long start, boolean taken) {
- long elapsed = coarseCurrentTimeMillis() - start;
+ private void onReadLock(long startNanos, boolean taken) {
+ metrics.decrementReadLockWaitingThreads();
+
+ long currentNanos = System.nanoTime();
+ long elapsedNanos = currentNanos - startNanos;
if (taken) {
- checkpointReadLockHoldCount.set(checkpointReadLockHoldCount.get()
+ 1);
+ int newLockCount = checkpointReadLockHoldCount.get() + 1;
+ checkpointReadLockHoldCount.set(newLockCount);
+
+ // We only record acquisition time on first lock acquisition (not
on reentry).
+ if (newLockCount == 1) {
+ checkpointReadLockAcquiredTime.set(currentNanos);
+ }
+ metrics.recordReadLockAcquisitionTime(elapsedNanos);
}
- if (elapsed > LONG_LOCK_THRESHOLD_MILLIS) {
- log.warn(LONG_LOCK_THROTTLE_KEY, "Checkpoint read lock took {} ms
to acquire.", elapsed);
+ long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(elapsedNanos);
+ if (elapsedMillis > LONG_LOCK_THRESHOLD_MILLIS) {
+ log.warn(LONG_LOCK_THROTTLE_KEY, "Checkpoint read lock took {} ms
to acquire.", elapsedMillis);
+ }
+ }
+
+ private void onReadUnlock() {
+ int newLockCount = checkpointReadLockHoldCount.get() - 1;
+ checkpointReadLockHoldCount.set(newLockCount);
+ if (newLockCount == 0) {
+ // Fully unlocked - record hold duration.
+ Long acquiredTimeNanos = checkpointReadLockAcquiredTime.get();
+ long holdDurationNanos = System.nanoTime() - acquiredTimeNanos;
+ metrics.recordReadLockHoldDuration(holdDurationNanos);
}
}
}
diff --git
a/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockMetrics.java
b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockMetrics.java
new file mode 100644
index 00000000000..e1d63df4017
--- /dev/null
+++
b/modules/page-memory/src/main/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockMetrics.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.ignite.internal.pagememory.persistence.checkpoint;
+
+import org.apache.ignite.internal.metrics.DistributionMetric;
+import org.apache.ignite.internal.metrics.LongAdderMetric;
+
+/**
+ * Metrics for checkpoint read/write lock operations.
+ *
+ * <p>These metrics track the performance and contention characteristics of the
checkpoint read lock
+ * acquired by normal operations while the database is running.
+ */
+public class CheckpointReadWriteLockMetrics {
+ private static final long[] LOCK_ACQUISITION_BOUNDS_NANOS = {
+ 1_000, // 1µs - uncontended, fast path
+ 10_000, // 10µs - minor contention
+ 100_000, // 100µs - moderate contention
+ 1_000_000, // 1ms - high contention
+ 10_000_000, // 10ms - checkpoint in progress?
+ 100_000_000, // 100ms - severe contention, reported as warning
in logs
+ 1_000_000_000 // 1s - pathological case, shall be treated as
an emergency error
+ };
+
+ private static final long[] LOCK_HOLD_BOUNDS_NANOS = {
+ 1_000, // 1µs - very fast operation (single field
update)
+ 10_000, // 10µs - fast single-page operation
+ 100_000, // 100µs - multi-page operation
+ 1_000_000, // 1ms - complex operation
+ 10_000_000, // 10ms - batch operation
+ 100_000_000, // 100ms - large batch or slow I/O
+ 1_000_000_000 // 1s - pathologically long operation
+ };
+
+ private final DistributionMetric readLockAcquisitionTime = new
DistributionMetric(
+ "ReadLockAcquisitionTime",
+ "Time from requesting checkpoint read lock until acquisition in
nanoseconds.",
+ LOCK_ACQUISITION_BOUNDS_NANOS
+ );
+
+ private final DistributionMetric readLockHoldTime = new DistributionMetric(
+ "ReadLockHoldTime",
+ "Duration between checkpoint read lock acquisition and release in
nanoseconds.",
+ LOCK_HOLD_BOUNDS_NANOS
+ );
+
+ private final LongAdderMetric readLockWaitingThreads = new LongAdderMetric(
+ "ReadLockWaitingThreads",
+ "Current number of threads waiting for checkpoint read lock."
+ );
+
+ /**
+ * Constructor.
+ *
+ * @param metricSource Metric source to register metrics with.
+ */
+ public CheckpointReadWriteLockMetrics(CheckpointMetricSource metricSource)
{
+ metricSource.addMetric(readLockAcquisitionTime);
+ metricSource.addMetric(readLockHoldTime);
+ metricSource.addMetric(readLockWaitingThreads);
+ }
+
+ /**
+ * Records the duration of a lock acquisition in nanoseconds.
+ */
+ public void recordReadLockAcquisitionTime(long acquisitionDurationNanos) {
+ readLockAcquisitionTime.add(acquisitionDurationNanos);
+ }
+
+ /**
+ * Records the duration of a lock hold in nanoseconds.
+ */
+ public void recordReadLockHoldDuration(long lockHoldDurationNanos) {
+ readLockHoldTime.add(lockHoldDurationNanos);
+ }
+
+ /**
+ * Increments the count of threads waiting for the read lock.
+ */
+ public void incrementReadLockWaitingThreads() {
+ readLockWaitingThreads.increment();
+ }
+
+ /**
+ * Decrements the count of threads waiting for the read lock.
+ */
+ public void decrementReadLockWaitingThreads() {
+ readLockWaitingThreads.decrement();
+ }
+
+ /** Returns the read lock acquisition time metric. */
+ DistributionMetric readLockAcquisitionTime() {
+ return readLockAcquisitionTime;
+ }
+
+ /** Returns the read lock hold time metric. */
+ DistributionMetric readLockHoldTime() {
+ return readLockHoldTime;
+ }
+
+ /** Returns the read lock waiting threads metric. */
+ LongAdderMetric readLockWaitingThreads() {
+ return readLockWaitingThreads;
+ }
+}
diff --git
a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
index 43b264897e2..a7846208469 100644
---
a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
+++
b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointReadWriteLockTest.java
@@ -41,6 +41,10 @@ public class CheckpointReadWriteLockTest {
@InjectExecutorService
private ExecutorService executorService;
+ private final CheckpointReadWriteLockMetrics metrics = new
CheckpointReadWriteLockMetrics(
+ new CheckpointMetricSource("test")
+ );
+
@Test
void testReadLock() throws Exception {
CheckpointReadWriteLock lock0 = newReadWriteLock();
@@ -167,7 +171,7 @@ public class CheckpointReadWriteLockTest {
}
private CheckpointReadWriteLock newReadWriteLock() {
- return new CheckpointReadWriteLock(new
ReentrantReadWriteLockWithTracking(), executorService);
+ return new CheckpointReadWriteLock(new
ReentrantReadWriteLockWithTracking(), executorService, metrics);
}
@Test
diff --git
a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
index 7aa7021e44e..bca2ab77fa8 100644
---
a/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
+++
b/modules/page-memory/src/test/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTimeoutLockTest.java
@@ -28,6 +28,7 @@ import static
org.apache.ignite.internal.testframework.IgniteTestUtils.runAsync;
import static
org.apache.ignite.internal.testframework.matchers.CompletableFutureMatcher.willSucceedIn;
import static org.apache.ignite.internal.util.IgniteUtils.closeAll;
import static
org.apache.ignite.lang.ErrorGroups.CriticalWorkers.SYSTEM_CRITICAL_OPERATION_TIMEOUT_ERR;
+import static org.awaitility.Awaitility.await;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;
@@ -41,6 +42,7 @@ import static org.mockito.Mockito.doAnswer;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
+import java.util.Arrays;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
@@ -51,6 +53,7 @@ import java.util.concurrent.atomic.AtomicReference;
import org.apache.ignite.internal.failure.FailureManager;
import org.apache.ignite.internal.lang.IgniteInternalException;
import org.apache.ignite.internal.lang.NodeStoppingException;
+import org.apache.ignite.internal.metrics.DistributionMetric;
import org.apache.ignite.internal.pagememory.persistence.CheckpointUrgency;
import org.apache.ignite.internal.testframework.BaseIgniteAbstractTest;
import org.apache.ignite.internal.testframework.ExecutorServiceExtension;
@@ -71,6 +74,10 @@ public class CheckpointTimeoutLockTest extends
BaseIgniteAbstractTest {
@InjectExecutorService
private ExecutorService executorService;
+ private final CheckpointReadWriteLockMetrics dummyMetrics = new
CheckpointReadWriteLockMetrics(
+ new CheckpointMetricSource("test")
+ );
+
@AfterEach
void tearDown() {
if (timeoutLock != null) {
@@ -385,7 +392,11 @@ public class CheckpointTimeoutLockTest extends
BaseIgniteAbstractTest {
}
private CheckpointReadWriteLock newReadWriteLock() {
- return new CheckpointReadWriteLock(new
ReentrantReadWriteLockWithTracking(log, 5_000), executorService);
+ return newReadWriteLock(dummyMetrics);
+ }
+
+ private CheckpointReadWriteLock
newReadWriteLock(CheckpointReadWriteLockMetrics metrics) {
+ return new CheckpointReadWriteLock(new
ReentrantReadWriteLockWithTracking(log, 5_000), executorService, metrics);
}
private CheckpointProgress newCheckpointProgress(CompletableFuture<?>
future) {
@@ -407,4 +418,63 @@ public class CheckpointTimeoutLockTest extends
BaseIgniteAbstractTest {
return checkpointer;
}
+
+ @Test
+ void testCheckpointReadLockMetrics() {
+ CheckpointMetricSource metricSource = new
CheckpointMetricSource("test");
+ CheckpointReadWriteLockMetrics metrics = new
CheckpointReadWriteLockMetrics(metricSource);
+ CheckpointReadWriteLock readWriteLock = newReadWriteLock(metrics);
+
+ timeoutLock = new CheckpointTimeoutLock(
+ readWriteLock,
+ 10_000,
+ () -> NOT_REQUIRED,
+ mock(Checkpointer.class),
+ mock(FailureManager.class)
+ );
+
+ timeoutLock.start();
+
+ try {
+ // Verify metrics start at zero
+
assertDistributionMetricRecordsCount(metrics.readLockAcquisitionTime(), 0L);
+
+ // Acquire and immediately release the lock
+ timeoutLock.checkpointReadLock();
+ timeoutLock.checkpointReadUnlock();
+
+ // Verify acquisition was recorded
+
assertDistributionMetricRecordsCount(metrics.readLockAcquisitionTime(), 1L);
+
+ // Verify hold time distribution was recorded
+ assertDistributionMetricRecordsCount(metrics.readLockHoldTime(),
1L);
+
+ readWriteLock.writeLock();
+ runAsync(() -> {
+ timeoutLock.checkpointReadLock();
+ timeoutLock.checkpointReadUnlock();
+ });
+ await().untilAsserted(() ->
assertThat(metrics.readLockWaitingThreads().value(), is(1L)));
+ readWriteLock.writeUnlock();
+ await().untilAsserted(() ->
assertThat(metrics.readLockWaitingThreads().value(), is(0L)));
+ } finally {
+ timeoutLock.stop();
+ }
+ }
+
+ /**
+ * Verifies that the specified distribution metric has recorded the
expected total number of measurements.
+ *
+ * <p>
+ * Rather than checking individual histogram buckets, this method
sums the measurements recorded across every bucket
+ * and asserts that the total equals the expected number of
measurements.
+ */
+ private static void
assertDistributionMetricRecordsCount(DistributionMetric metric, long
expectedMeasuresCount) {
+ long totalMeasuresCount = Arrays.stream(metric.value()).sum();
+ assertThat(
+ "Unexpected total measures count in distribution metric " +
metric.name(),
+ totalMeasuresCount,
+ is(expectedMeasuresCount)
+ );
+ }
}
diff --git
a/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
b/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
index 8d2267ebd13..ee95c6a6b1c 100644
---
a/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
+++
b/modules/page-memory/src/testFixtures/java/org/apache/ignite/internal/pagememory/persistence/checkpoint/CheckpointTestUtils.java
@@ -42,14 +42,18 @@ import
org.apache.ignite.internal.pagememory.persistence.store.FilePageStoreMana
* Useful class for testing a checkpoint.
*/
public class CheckpointTestUtils {
+ private static final CheckpointReadWriteLockMetrics metrics = new
CheckpointReadWriteLockMetrics(
+ new CheckpointMetricSource("test")
+ );
+
/**
* Returns new instance of {@link CheckpointReadWriteLock}.
*
- * @param log Logger.
+ * @param log Logger.
* @param executorService Executor service.
*/
static CheckpointReadWriteLock newReadWriteLock(IgniteLogger log,
ExecutorService executorService) {
- return new CheckpointReadWriteLock(new
ReentrantReadWriteLockWithTracking(log, 5_000), executorService);
+ return new CheckpointReadWriteLock(new
ReentrantReadWriteLockWithTracking(log, 5_000), executorService, metrics);
}
/**