This is an automated email from the ASF dual-hosted git repository.
weichiu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 243be86725c HDDS-14768. Fix lock leak during snapshot cache cleanup
and handle eviction race appropriately. (#9869)
243be86725c is described below
commit 243be86725c53e8a9904d48d98c187731fde8917
Author: SaketaChalamchala <[email protected]>
AuthorDate: Wed Mar 18 21:47:28 2026 -0700
HDDS-14768. Fix lock leak during snapshot cache cleanup and handle eviction
race appropriately. (#9869)
Co-authored-by: Copilot Autofix powered by AI
<[email protected]>
---
.../hadoop/ozone/om/snapshot/SnapshotCache.java | 37 ++++--
.../ozone/om/snapshot/TestSnapshotCache.java | 135 +++++++++++++++++++++
2 files changed, 161 insertions(+), 11 deletions(-)
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotCache.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotCache.java
index 6ab588463c5..790150e218a 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotCache.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotCache.java
@@ -25,6 +25,7 @@
import com.google.common.annotations.VisibleForTesting;
import com.google.common.cache.CacheLoader;
import java.io.IOException;
+import java.util.Collections;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
@@ -135,6 +136,11 @@ ConcurrentHashMap<UUID, ReferenceCounted<OmSnapshot>>
getDbMap() {
return dbMap;
}
+ @VisibleForTesting
+ Set<UUID> getPendingEvictionQueue() {
+ return Collections.unmodifiableSet(pendingEvictionQueue);
+ }
+
/**
* @return number of DB instances currently held in cache.
*/
@@ -158,6 +164,7 @@ public void invalidate(UUID key) {
}
omMetrics.decNumSnapshotCacheSize();
}
+ pendingEvictionQueue.remove(k);
return null;
});
}
@@ -323,8 +330,17 @@ private UncheckedAutoCloseableSupplier<OMLockDetails>
lock(Supplier<OMLockDetail
AtomicReference<OMLockDetails> lockDetails = new
AtomicReference<>(emptyLockFunction.get());
if (lockDetails.get().isLockAcquired()) {
- if (!cleanupFunction.get()) {
- lockDetails.set(emptyUnlockFunction.get());
+ try {
+ if (!cleanupFunction.get()) {
+ throw new IllegalStateException("Failed to acquire lock as cleanup
did not drain the cache.");
+ }
+ } catch (Throwable t) {
+ try {
+ lockDetails.set(emptyUnlockFunction.get());
+ } catch (Throwable unlockThrowable) {
+ t.addSuppressed(unlockThrowable);
+ }
+ throw t;
}
}
@@ -377,26 +393,25 @@ private synchronized Void cleanup(UUID evictionKey,
boolean expectKeyToBePresent
}
dbMap.compute(evictionKey, (k, v) -> {
- pendingEvictionQueue.remove(k);
+ ReferenceCounted<OmSnapshot> result = null;
if (v == null) {
- throw new IllegalStateException("SnapshotId '" + k + "' does not exist
in cache. The RocksDB " +
- "instance of the Snapshot may not be closed properly.");
- }
-
- if (v.getTotalRefCount() > 0) {
+ LOG.info("SnapshotId '{}' does not exist in cache during cleanup; "
+ + "it may have already been invalidated, closed, and removed.", k);
+ } else if (v.getTotalRefCount() > 0) {
LOG.debug("SnapshotId {} is still being referenced ({}), skipping its
clean up.", k, v.getTotalRefCount());
- return v;
+ result = v;
} else {
LOG.debug("Closing SnapshotId {}. It is not being referenced
anymore.", k);
// Close the instance, which also closes its DB handle.
try {
v.get().close();
} catch (IOException ex) {
- throw new IllegalStateException("Error while closing snapshot DB.",
ex);
+ throw new IllegalStateException("Error while closing snapshot DB for
snapshotId " + k, ex);
}
omMetrics.decNumSnapshotCacheSize();
- return null;
}
+ pendingEvictionQueue.remove(k);
+ return result;
});
return null;
}
diff --git
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotCache.java
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotCache.java
index 4adf0011342..ebc78b26c1e 100644
---
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotCache.java
+++
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/snapshot/TestSnapshotCache.java
@@ -19,6 +19,7 @@
import static
org.apache.hadoop.ozone.om.lock.DAGLeveledResource.SNAPSHOT_DB_LOCK;
import static
org.apache.hadoop.ozone.om.lock.OzoneManagerLock.LeveledResource.VOLUME_LOCK;
+import static org.junit.jupiter.api.Assertions.assertDoesNotThrow;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
@@ -46,6 +47,7 @@
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import org.apache.hadoop.hdds.utils.db.DBStore;
import org.apache.hadoop.hdds.utils.db.Table;
import org.apache.hadoop.ozone.om.OMMetadataManager;
import org.apache.hadoop.ozone.om.OMMetrics;
@@ -511,4 +513,137 @@ void testSnapshotOperationsNotBlockedDuringCompaction()
throws IOException, Inte
verify(store1, times(1)).compactTable("table2");
verify(store1, times(0)).compactTable("keyTable");
}
+
+ private static IOzoneManagerLock newAcquiringLock() {
+ IOzoneManagerLock acquiringLock = mock(IOzoneManagerLock.class);
+ when(acquiringLock.acquireReadLock(eq(SNAPSHOT_DB_LOCK),
any(String[].class)))
+ .thenReturn(OMLockDetails.EMPTY_DETAILS_LOCK_ACQUIRED);
+ when(acquiringLock.releaseReadLock(eq(SNAPSHOT_DB_LOCK),
any(String[].class)))
+ .thenReturn(OMLockDetails.EMPTY_DETAILS_LOCK_NOT_ACQUIRED);
+ when(acquiringLock.acquireResourceWriteLock(eq(SNAPSHOT_DB_LOCK)))
+ .thenReturn(OMLockDetails.EMPTY_DETAILS_LOCK_ACQUIRED);
+ when(acquiringLock.releaseResourceWriteLock(eq(SNAPSHOT_DB_LOCK)))
+ .thenReturn(OMLockDetails.EMPTY_DETAILS_LOCK_NOT_ACQUIRED);
+ when(acquiringLock.acquireWriteLock(eq(SNAPSHOT_DB_LOCK),
any(String[].class)))
+ .thenReturn(OMLockDetails.EMPTY_DETAILS_LOCK_ACQUIRED);
+ when(acquiringLock.releaseWriteLock(eq(SNAPSHOT_DB_LOCK),
any(String[].class)))
+ .thenReturn(OMLockDetails.EMPTY_DETAILS_LOCK_NOT_ACQUIRED);
+ return acquiringLock;
+ }
+
+ private OmSnapshot mockSnapshot(UUID snapshotId) {
+ final OmSnapshot omSnapshot = mock(OmSnapshot.class);
+ when(omSnapshot.getSnapshotTableKey()).thenReturn(snapshotId.toString());
+ when(omSnapshot.getSnapshotID()).thenReturn(snapshotId);
+
+ return omSnapshot;
+ }
+
+ @Test
+ @DisplayName("Stale eviction key (invalidate + late close) is cleaned up
without throwing")
+ void testStaleEvictionKeyDuringCleanup() throws IOException {
+ snapshotCache = new SnapshotCache(cacheLoader, CACHE_SIZE_LIMIT,
omMetrics, 0, true, newAcquiringLock());
+ final UUID snapshotId = UUID.randomUUID();
+
+ // Acquire a snapshot handle so it is ref-counted in the cache.
+ UncheckedAutoCloseableSupplier<OmSnapshot> handle =
snapshotCache.get(snapshotId);
+ assertEquals(1, snapshotCache.size());
+
+ // Invalidate removes the dbMap entry. The handle still exists and will
later hit refcount=0.
+ snapshotCache.invalidate(snapshotId);
+ assertEquals(0, snapshotCache.size());
+
+ // Late close triggers ReferenceCounted callback which can re-add
snapshotId to pendingEvictionQueue.
+ handle.close();
+ assertTrue(snapshotCache.getPendingEvictionQueue().contains(snapshotId));
+
+ // cleanup(true) is invoked by lock(); it should remove the stale key and
not throw.
+ assertDoesNotThrow(() -> {
+ try (UncheckedAutoCloseableSupplier<OMLockDetails> lockDetails =
snapshotCache.lock()) {
+ assertTrue(lockDetails.get().isLockAcquired());
+ }
+ });
+ assertFalse(snapshotCache.getPendingEvictionQueue().contains(snapshotId));
+ }
+
+ @Test
+ @DisplayName("Close failure keeps snapshot in eviction queue for retry")
+ void testCloseFailureRetriesSnapshot() throws Exception {
+
+ IOzoneManagerLock acquiringLock = newAcquiringLock();
+ snapshotCache = new SnapshotCache(cacheLoader, CACHE_SIZE_LIMIT,
omMetrics, 0, true, acquiringLock);
+ final UUID snapshotId = UUID.randomUUID();
+
+ final AtomicBoolean failCloseOnce = new AtomicBoolean(true);
+ final OmSnapshot failingSnapshot = mockSnapshot(snapshotId);
+
+ OMMetadataManager metadataManager = mock(OMMetadataManager.class);
+ DBStore store = mock(DBStore.class);
+ when(failingSnapshot.getMetadataManager()).thenReturn(metadataManager);
+ when(metadataManager.getStore()).thenReturn(store);
+ when(store.listTables()).thenReturn(new ArrayList<>());
+
+ doAnswer(invocation -> {
+ if (failCloseOnce.getAndSet(false)) {
+ throw new IOException("close failed");
+ }
+ return null;
+ }).when(failingSnapshot).close();
+
+ when(cacheLoader.load(eq(snapshotId))).thenReturn(failingSnapshot);
+
+ // Load + close handle so refcount transitions to 0 and snapshotId is
queued for eviction.
+ try (UncheckedAutoCloseableSupplier<OmSnapshot> ignored =
snapshotCache.get(snapshotId)) {
+ assertEquals(1, snapshotCache.size());
+ assertEquals(1, omMetrics.getNumSnapshotCacheSize());
+ }
+ assertEquals(0L,
snapshotCache.getDbMap().get(snapshotId).getTotalRefCount());
+ assertTrue(snapshotCache.getPendingEvictionQueue().contains(snapshotId));
+
+ // First cleanup attempt fails to close; entry should remain in dbMap and
key should stay queued for retry.
+ assertThrows(IllegalStateException.class, () -> snapshotCache.lock());
+ verify(acquiringLock,
times(1)).acquireResourceWriteLock(eq(SNAPSHOT_DB_LOCK));
+ verify(acquiringLock,
times(1)).releaseResourceWriteLock(eq(SNAPSHOT_DB_LOCK));
+ assertTrue(snapshotCache.getDbMap().containsKey(snapshotId));
+ assertTrue(snapshotCache.getPendingEvictionQueue().contains(snapshotId));
+ assertEquals(1, omMetrics.getNumSnapshotCacheSize());
+
+ // Second cleanup attempt should succeed (close no longer throws),
removing entry and eviction key.
+ try (UncheckedAutoCloseableSupplier<OMLockDetails> lockDetails =
snapshotCache.lock()) {
+ assertTrue(lockDetails.get().isLockAcquired());
+ }
+ assertFalse(snapshotCache.getDbMap().containsKey(snapshotId));
+ assertFalse(snapshotCache.getPendingEvictionQueue().contains(snapshotId));
+ assertEquals(0, omMetrics.getNumSnapshotCacheSize());
+ }
+
+ @Test
+ @DisplayName("lock supplier releases write lock if cleanup throws an
exception")
+ void testLockSupplierReleasesWriteLockOnCleanupException() throws Exception {
+ IOzoneManagerLock acquiringLock = newAcquiringLock();
+ snapshotCache = new SnapshotCache(cacheLoader, CACHE_SIZE_LIMIT,
omMetrics, 0, true, acquiringLock);
+
+ final UUID snapshotId = UUID.randomUUID();
+ final OmSnapshot failingSnapshot = mockSnapshot(snapshotId);
+
+ OMMetadataManager metadataManager = mock(OMMetadataManager.class);
+ DBStore store = mock(DBStore.class);
+ when(failingSnapshot.getMetadataManager()).thenReturn(metadataManager);
+ when(metadataManager.getStore()).thenReturn(store);
+ // Trigger an unchecked exception during compaction, which is not caught
by cleanup().
+ when(store.listTables()).thenThrow(new RuntimeException("listTables
failed"));
+
+ when(cacheLoader.load(eq(snapshotId))).thenReturn(failingSnapshot);
+
+ // Load the snapshot and close so it is enqueued for eviction (refcount
reaches 0).
+ try (UncheckedAutoCloseableSupplier<OmSnapshot> ignored =
snapshotCache.get(snapshotId)) {
+ assertEquals(1, snapshotCache.size());
+ }
+ assertTrue(snapshotCache.getPendingEvictionQueue().contains(snapshotId));
+
+ // cleanup(true) will throw -> lock() should release the resource write
lock before rethrowing.
+ assertThrows(RuntimeException.class, () -> snapshotCache.lock());
+ verify(acquiringLock,
times(1)).acquireResourceWriteLock(eq(SNAPSHOT_DB_LOCK));
+ verify(acquiringLock,
times(1)).releaseResourceWriteLock(eq(SNAPSHOT_DB_LOCK));
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]