This is an automated email from the ASF dual-hosted git repository.
hemant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new f01e0a12ed HDDS-9199. Snapshot chain corruption should not fail OM
restart (#5262)
f01e0a12ed is described below
commit f01e0a12edfc5b28c0c65b3f7ee46fe908870bbf
Author: Hemant Kumar <[email protected]>
AuthorDate: Wed Oct 4 10:31:50 2023 -0700
HDDS-9199. Snapshot chain corruption should not fail OM restart (#5262)
---
.../ozone/om/TestOzoneManagerHASnapshot.java | 5 ++
.../hadoop/ozone/om/SnapshotChainManager.java | 66 +++++++++++++-----
.../request/snapshot/OMSnapshotCreateRequest.java | 2 +-
.../apache/hadoop/ozone/om/TestSnapshotChain.java | 78 ++++++++++++++++------
4 files changed, 115 insertions(+), 36 deletions(-)
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
index 7f071f1137..4dc0ab111b 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
@@ -50,6 +50,7 @@ import static
org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.DO
import static
org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.IN_PROGRESS;
import static org.awaitility.Awaitility.await;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -256,6 +257,10 @@ public class TestOzoneManagerHASnapshot {
await().atMost(Duration.ofSeconds(180))
.until(() -> cluster.getOMLeader() != null);
assertNotNull(cluster.getOMLeader());
+ OmMetadataManagerImpl metadataManager = (OmMetadataManagerImpl) cluster
+ .getOMLeader().getMetadataManager();
+ assertFalse(metadataManager.getSnapshotChainManager()
+ .isSnapshotChainCorrupted());
}
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
index f87383756e..9c0c1b602c 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
@@ -43,7 +43,7 @@ import java.util.concurrent.ConcurrentMap;
* ii.) Global snapshot chain, sequence of all snapshots created in order
* <p>
* On start, the snapshot chains are initialized from the on disk
- * SnapshotInfoTable from the om RocksDB.
+ * SnapshotInfoTable from the OM RocksDB.
*/
public class SnapshotChainManager {
private static final Logger LOG =
@@ -55,6 +55,7 @@ public class SnapshotChainManager {
private final ConcurrentMap<String, UUID> latestSnapshotIdByPath;
private final ConcurrentMap<UUID, String> snapshotIdToTableKey;
private UUID latestGlobalSnapshotId;
+ private final boolean snapshotChainCorrupted;
public SnapshotChainManager(OMMetadataManager metadataManager) {
globalSnapshotChain = Collections.synchronizedMap(new LinkedHashMap<>());
@@ -62,7 +63,7 @@ public class SnapshotChainManager {
latestSnapshotIdByPath = new ConcurrentHashMap<>();
snapshotIdToTableKey = new ConcurrentHashMap<>();
latestGlobalSnapshotId = null;
- loadFromSnapshotInfoTable(metadataManager);
+ snapshotChainCorrupted = !loadFromSnapshotInfoTable(metadataManager);
}
/**
@@ -163,7 +164,7 @@ public class SnapshotChainManager {
latestSnapshotIdByPath.put(snapshotPath, snapshotID);
}
- private boolean deleteSnapshotGlobal(UUID snapshotID) throws IOException {
+ private boolean deleteSnapshotGlobal(UUID snapshotID) {
if (globalSnapshotChain.containsKey(snapshotID)) {
// reset prev and next snapshot entries in chain ordered list
// for node removal
@@ -171,13 +172,15 @@ public class SnapshotChainManager {
UUID prev = globalSnapshotChain.get(snapshotID).getPreviousSnapshotId();
if (prev != null && !globalSnapshotChain.containsKey(prev)) {
- throw new IOException(String.format("Snapshot chain corruption. " +
+ throw new IllegalStateException(String.format(
+ "Global snapshot chain corruption. " +
"SnapshotId: %s to be deleted has previous snapshotId: %s " +
"but associated snapshot is not found in snapshot chain.",
snapshotID, prev));
}
if (next != null && !globalSnapshotChain.containsKey(next)) {
- throw new IOException(String.format("Snapshot chain corruption. " +
+ throw new IllegalStateException(String.format(
+ "Global snapshot chain corruption. " +
"SnapshotId: {%s} to be deleted has next snapshotId: %s " +
"but associated snapshot is not found in snapshot chain.",
snapshotID, next));
@@ -203,7 +206,7 @@ public class SnapshotChainManager {
}
private boolean deleteSnapshotPath(String snapshotPath,
- UUID snapshotId) throws IOException {
+ UUID snapshotId) {
if (snapshotChainByPath.containsKey(snapshotPath) &&
snapshotChainByPath.get(snapshotPath).containsKey(snapshotId)) {
// reset prev and next snapshot entries in chain ordered list
@@ -220,7 +223,8 @@ public class SnapshotChainManager {
if (previousSnapshotId != null &&
!snapshotChainByPath.get(snapshotPath)
.containsKey(previousSnapshotId)) {
- throw new IOException(String.format("Snapshot chain corruption. " +
+ throw new IllegalStateException(String.format(
+ "Path snapshot chain corruption. " +
"SnapshotId: %s at snapshotPath: %s to be deleted has " +
"previous snapshotId: %s but associated snapshot is not " +
"found in snapshot chain.", snapshotId, snapshotPath,
@@ -228,7 +232,8 @@ public class SnapshotChainManager {
}
if (nextSnapshotId != null && !snapshotChainByPath.get(snapshotPath)
.containsKey(nextSnapshotId)) {
- throw new IOException(String.format("Snapshot chain corruption. " +
+ throw new IllegalStateException(String.format(
+ "Path snapshot chain corruption. " +
"SnapshotId: %s at snapshotPath: %s to be deleted has next " +
"snapshotId: %s but associated snapshot is not found in " +
"snapshot chain.", snapshotId, snapshotPath,
@@ -267,9 +272,10 @@ public class SnapshotChainManager {
}
/**
- * Loads the snapshot chain from SnapshotInfo table.
+ * Loads the snapshot chain from SnapshotInfo table and return true if chain
+ * gets loaded successfully.
*/
- private void loadFromSnapshotInfoTable(OMMetadataManager metadataManager) {
+ private boolean loadFromSnapshotInfoTable(OMMetadataManager metadataManager)
{
// read from snapshotInfo table to populate
// snapshot chains - both global and local path
try (TableIterator<String, ? extends Table.KeyValue<String, SnapshotInfo>>
@@ -318,16 +324,18 @@ public class SnapshotChainManager {
"corruption. All snapshots have not been added to the " +
"snapshot chain. Last snapshot added to chain : %s", prev));
}
- } catch (IOException ioException) {
- // TODO: [SNAPSHOT] Fail gracefully.
- throw new RuntimeException(ioException);
+ } catch (IOException | IllegalStateException exception) {
+ LOG.error("Failure while loading snapshot chain.", exception);
+ return false;
}
+ return true;
}
/**
* Add snapshot to snapshot chain.
*/
public synchronized void addSnapshot(SnapshotInfo snapshotInfo) {
+ validateSnapshotChain();
addSnapshotGlobal(snapshotInfo.getSnapshotId(),
snapshotInfo.getGlobalPreviousSnapshotId());
addSnapshotPath(snapshotInfo.getSnapshotPath(),
@@ -341,8 +349,8 @@ public class SnapshotChainManager {
/**
* Delete snapshot from snapshot chain.
*/
- public synchronized boolean deleteSnapshot(SnapshotInfo snapshotInfo)
- throws IOException {
+ public synchronized boolean deleteSnapshot(SnapshotInfo snapshotInfo) {
+ validateSnapshotChain();
boolean status = deleteSnapshotGlobal(snapshotInfo.getSnapshotId()) &&
deleteSnapshotPath(snapshotInfo.getSnapshotPath(),
snapshotInfo.getSnapshotId());
@@ -356,6 +364,7 @@ public class SnapshotChainManager {
* Get latest global snapshot in snapshot chain.
*/
public UUID getLatestGlobalSnapshotId() {
+ validateSnapshotChain();
return latestGlobalSnapshotId;
}
@@ -363,6 +372,7 @@ public class SnapshotChainManager {
* Get latest path snapshot in snapshot chain.
*/
public UUID getLatestPathSnapshotId(String snapshotPath) {
+ validateSnapshotChain();
return latestSnapshotIdByPath.get(snapshotPath);
}
@@ -371,6 +381,7 @@ public class SnapshotChainManager {
* in the global snapshot chain.
*/
public boolean hasNextGlobalSnapshot(UUID snapshotId) {
+ validateSnapshotChain();
if (!globalSnapshotChain.containsKey(snapshotId)) {
LOG.error("No snapshot for provided snapshotId: {}", snapshotId);
throw new NoSuchElementException(String.format("SnapshotId: %s is not " +
@@ -383,6 +394,7 @@ public class SnapshotChainManager {
* Get next global snapshot in snapshot chain from given snapshot.
*/
public UUID nextGlobalSnapshot(UUID snapshotId) {
+ validateSnapshotChain();
if (!hasNextGlobalSnapshot(snapshotId)) {
LOG.error("No snapshot for provided snapshotId: {}", snapshotId);
throw new NoSuchElementException(String.format("SnapshotId: %s is not " +
@@ -396,6 +408,7 @@ public class SnapshotChainManager {
* entry in the global snapshot chain.
*/
public boolean hasPreviousGlobalSnapshot(UUID snapshotId) {
+ validateSnapshotChain();
if (!globalSnapshotChain.containsKey(snapshotId)) {
LOG.error("No snapshot found in snapshot chain for provided " +
"snapshotId: {}.", snapshotId);
@@ -410,6 +423,7 @@ public class SnapshotChainManager {
* Get previous global snapshot in snapshot chain from given snapshot.
*/
public UUID previousGlobalSnapshot(UUID snapshotId) {
+ validateSnapshotChain();
if (!hasPreviousGlobalSnapshot(snapshotId)) {
LOG.error("No preceding snapshot found in snapshot chain for provided " +
"snapshotId: {}.", snapshotId);
@@ -424,6 +438,7 @@ public class SnapshotChainManager {
* entry in the path snapshot chain.
*/
public boolean hasNextPathSnapshot(String snapshotPath, UUID snapshotId) {
+ validateSnapshotChain();
if (!snapshotChainByPath.containsKey(snapshotPath) ||
!snapshotChainByPath.get(snapshotPath).containsKey(snapshotId)) {
LOG.error("No snapshot found for provided snapshotId: {} and " +
@@ -443,6 +458,7 @@ public class SnapshotChainManager {
* Get next path snapshot in snapshot chain from given snapshot.
*/
public UUID nextPathSnapshot(String snapshotPath, UUID snapshotId) {
+ validateSnapshotChain();
if (!hasNextPathSnapshot(snapshotPath, snapshotId)) {
LOG.error("No following snapshot for provided snapshotId {} and " +
"snapshotPath {}.", snapshotId, snapshotPath);
@@ -461,6 +477,7 @@ public class SnapshotChainManager {
*/
public boolean hasPreviousPathSnapshot(String snapshotPath,
UUID snapshotId) {
+ validateSnapshotChain();
if (!snapshotChainByPath.containsKey(snapshotPath) ||
!snapshotChainByPath.get(snapshotPath).containsKey(snapshotId)) {
LOG.error("No snapshot found for provided snapshotId: {} and " +
@@ -479,6 +496,7 @@ public class SnapshotChainManager {
*/
public UUID previousPathSnapshot(String snapshotPath,
UUID snapshotId) {
+ validateSnapshotChain();
if (!hasPreviousPathSnapshot(snapshotPath, snapshotId)) {
LOG.error("No preceding snapshot for provided snapshotId: {} and " +
"snapshotPath: {}", snapshotId, snapshotPath);
@@ -498,17 +516,35 @@ public class SnapshotChainManager {
public LinkedHashMap<UUID, SnapshotChainInfo> getSnapshotChainPath(
String path) {
+ validateSnapshotChain();
return snapshotChainByPath.get(path);
}
@VisibleForTesting
public Map<UUID, SnapshotChainInfo> getGlobalSnapshotChain() {
+ validateSnapshotChain();
return globalSnapshotChain;
}
@VisibleForTesting
public Map<String,
LinkedHashMap<UUID, SnapshotChainInfo>> getSnapshotChainByPath() {
+ validateSnapshotChain();
return snapshotChainByPath;
}
+
+ /**
+ * Validate if snapshot chain is loaded without any error and throw
+ * IllegalStateException in case there was an issue while loading snapshot
+ * chain on OM start up.
+ */
+ private void validateSnapshotChain() {
+ if (snapshotChainCorrupted) {
+ throw new IllegalStateException("Snapshot chain is corrupted.");
+ }
+ }
+
+ public boolean isSnapshotChainCorrupted() {
+ return snapshotChainCorrupted;
+ }
}
diff --git
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
index f61ddcc419..59426a2bf2 100644
---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
+++
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
@@ -298,7 +298,7 @@ public class OMSnapshotCreateRequest extends
OMClientRequest {
) {
try {
snapshotChainManager.deleteSnapshot(info);
- } catch (IOException exception) {
+ } catch (IllegalStateException exception) {
LOG.error("Failed to remove snapshot: {} from SnapshotChainManager.",
info, exception);
}
diff --git
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
index fb473fb700..22fb03903d 100644
---
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
+++
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
@@ -21,7 +21,7 @@ import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.utils.db.Table;
import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
-import org.junit.jupiter.api.Assertions;
+import org.apache.hadoop.util.Time;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Named;
import org.junit.jupiter.api.Test;
@@ -32,7 +32,6 @@ import org.junit.jupiter.params.provider.MethodSource;
import org.junit.jupiter.params.provider.ValueSource;
import java.io.File;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
@@ -93,7 +92,7 @@ public class TestSnapshotChain {
.build();
}
- private void deleteSnapshot(UUID snapshotID) throws IOException {
+ private void deleteSnapshot(UUID snapshotID) {
SnapshotInfo sinfo = null;
final String snapshotPath = "vol1/bucket1";
// reset the next snapshotInfo.globalPreviousSnapshotID
@@ -263,22 +262,23 @@ public class TestSnapshotChain {
// add two snapshots to the snapshotInfo
UUID snapshotID1 = UUID.randomUUID();
UUID snapshotID2 = UUID.randomUUID();
-
- ArrayList<UUID> snapshotIDs = new ArrayList<>();
+ List<UUID> snapshotIDs = new ArrayList<>();
snapshotIDs.add(snapshotID1);
snapshotIDs.add(snapshotID2);
+ List<SnapshotInfo> snapshotInfoList = new ArrayList<>();
UUID prevSnapshotID = null;
long time = System.currentTimeMillis();
- // add 3 snapshots
for (UUID snapshotID : snapshotIDs) {
- snapshotInfo.put(snapshotID.toString(),
- createSnapshotInfo(snapshotID, prevSnapshotID, prevSnapshotID,
- increasingTIme ? time++ : time--));
+ SnapshotInfo snapInfo = createSnapshotInfo(snapshotID, prevSnapshotID,
+ prevSnapshotID, increasingTIme ? time++ : time--);
+ snapshotInfo.put(snapshotID.toString(), snapInfo);
prevSnapshotID = snapshotID;
+ snapshotInfoList.add(snapInfo);
}
chainManager = new SnapshotChainManager(omMetadataManager);
+ assertFalse(chainManager.isSnapshotChainCorrupted());
// check if snapshots loaded correctly from snapshotInfoTable
assertEquals(snapshotID2, chainManager.getLatestGlobalSnapshotId());
assertEquals(snapshotID2, chainManager.nextGlobalSnapshot(snapshotID1));
@@ -289,6 +289,16 @@ public class TestSnapshotChain {
assertThrows(NoSuchElementException.class,
() -> chainManager.previousPathSnapshot(String
.join("/", "vol1", "bucket1"), snapshotID1));
+
+ UUID snapshotID3 = UUID.randomUUID();
+ SnapshotInfo snapshotInfo3 = createSnapshotInfo(snapshotID3,
prevSnapshotID,
+ prevSnapshotID, Time.now());
+ // Add and delete snapshot to make sure snapshot chain is correct.
+ chainManager.addSnapshot(snapshotInfo3);
+ chainManager.deleteSnapshot(snapshotInfoList.get(0));
+ assertEquals(snapshotID3, chainManager.getLatestGlobalSnapshotId());
+ assertThrows(NoSuchElementException.class,
+ () -> chainManager.nextGlobalSnapshot(snapshotID1));
}
private static Stream<? extends Arguments> invalidSnapshotChain() {
@@ -334,12 +344,26 @@ public class TestSnapshotChain {
createSnapshotInfo(snapshotID, snapshotChain.get(snapshotID),
snapshotChain.get(snapshotID), System.currentTimeMillis()));
}
- IllegalStateException exception = Assertions.assertThrows(
- IllegalStateException.class,
- () -> new SnapshotChainManager(omMetadataManager));
- Assertions.assertTrue(exception.getMessage()
- .startsWith("Snapshot chain corruption. All snapshots have not been " +
- "added to the snapshot chain."));
+ chainManager = new SnapshotChainManager(omMetadataManager);
+
+ assertTrue(chainManager.isSnapshotChainCorrupted());
+
+ SnapshotInfo snapInfo = createSnapshotInfo(UUID.randomUUID(),
+ UUID.randomUUID(),
+ UUID.randomUUID(),
+ System.currentTimeMillis());
+ IllegalStateException createException =
+ assertThrows(IllegalStateException.class,
+ () -> chainManager.addSnapshot(snapInfo));
+ assertEquals("Snapshot chain is corrupted.", createException.getMessage());
+ if (!snapshotIDs.isEmpty()) {
+ IllegalStateException deleteException =
+ assertThrows(IllegalStateException.class,
+ () -> chainManager.deleteSnapshot(
+ snapshotInfo.get(snapshotIDs.get(0).toString())));
+ assertEquals("Snapshot chain is corrupted.",
+ deleteException.getMessage());
+ }
}
@ParameterizedTest
@@ -355,10 +379,24 @@ public class TestSnapshotChain {
prevSnapshotId, System.currentTimeMillis()));
prevSnapshotId = snapshotID;
}
- IllegalStateException exception = Assertions.assertThrows(
- IllegalStateException.class,
- () -> new SnapshotChainManager(omMetadataManager));
- Assertions.assertTrue(exception.getMessage()
- .startsWith("Path Snapshot chain corruption."));
+ chainManager = new SnapshotChainManager(omMetadataManager);
+ assertTrue(chainManager.isSnapshotChainCorrupted());
+
+ SnapshotInfo snapInfo = createSnapshotInfo(UUID.randomUUID(),
+ UUID.randomUUID(),
+ UUID.randomUUID(),
+ System.currentTimeMillis());
+ IllegalStateException createException =
+ assertThrows(IllegalStateException.class,
+ () -> chainManager.addSnapshot(snapInfo));
+ assertEquals("Snapshot chain is corrupted.", createException.getMessage());
+ if (!snapshotIDs.isEmpty()) {
+ IllegalStateException deleteException =
+ assertThrows(IllegalStateException.class,
+ () -> chainManager.deleteSnapshot(
+ snapshotInfo.get(snapshotIDs.get(0).toString())));
+ assertEquals("Snapshot chain is corrupted.",
+ deleteException.getMessage());
+ }
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]