This is an automated email from the ASF dual-hosted git repository.

hemant pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new f01e0a12ed  HDDS-9199. Snapshot chain corruption should not fail OM 
restart (#5262)
f01e0a12ed is described below

commit f01e0a12edfc5b28c0c65b3f7ee46fe908870bbf
Author: Hemant Kumar <[email protected]>
AuthorDate: Wed Oct 4 10:31:50 2023 -0700

     HDDS-9199. Snapshot chain corruption should not fail OM restart (#5262)
---
 .../ozone/om/TestOzoneManagerHASnapshot.java       |  5 ++
 .../hadoop/ozone/om/SnapshotChainManager.java      | 66 +++++++++++++-----
 .../request/snapshot/OMSnapshotCreateRequest.java  |  2 +-
 .../apache/hadoop/ozone/om/TestSnapshotChain.java  | 78 ++++++++++++++++------
 4 files changed, 115 insertions(+), 36 deletions(-)

diff --git 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
index 7f071f1137..4dc0ab111b 100644
--- 
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
+++ 
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHASnapshot.java
@@ -50,6 +50,7 @@ import static 
org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.DO
 import static 
org.apache.hadoop.ozone.snapshot.SnapshotDiffResponse.JobStatus.IN_PROGRESS;
 import static org.awaitility.Awaitility.await;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
@@ -256,6 +257,10 @@ public class TestOzoneManagerHASnapshot {
     await().atMost(Duration.ofSeconds(180))
         .until(() -> cluster.getOMLeader() != null);
     assertNotNull(cluster.getOMLeader());
+    OmMetadataManagerImpl metadataManager = (OmMetadataManagerImpl) cluster
+        .getOMLeader().getMetadataManager();
+    assertFalse(metadataManager.getSnapshotChainManager()
+        .isSnapshotChainCorrupted());
   }
 
 
diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
index f87383756e..9c0c1b602c 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/SnapshotChainManager.java
@@ -43,7 +43,7 @@ import java.util.concurrent.ConcurrentMap;
  * ii.) Global snapshot chain, sequence of all snapshots created in order
  * <p>
  * On start, the snapshot chains are initialized from the on disk
- * SnapshotInfoTable from the om RocksDB.
+ * SnapshotInfoTable from the OM RocksDB.
  */
 public class SnapshotChainManager {
   private static final Logger LOG =
@@ -55,6 +55,7 @@ public class SnapshotChainManager {
   private final ConcurrentMap<String, UUID> latestSnapshotIdByPath;
   private final ConcurrentMap<UUID, String> snapshotIdToTableKey;
   private UUID latestGlobalSnapshotId;
+  private final boolean snapshotChainCorrupted;
 
   public SnapshotChainManager(OMMetadataManager metadataManager) {
     globalSnapshotChain = Collections.synchronizedMap(new LinkedHashMap<>());
@@ -62,7 +63,7 @@ public class SnapshotChainManager {
     latestSnapshotIdByPath = new ConcurrentHashMap<>();
     snapshotIdToTableKey = new ConcurrentHashMap<>();
     latestGlobalSnapshotId = null;
-    loadFromSnapshotInfoTable(metadataManager);
+    snapshotChainCorrupted = !loadFromSnapshotInfoTable(metadataManager);
   }
 
   /**
@@ -163,7 +164,7 @@ public class SnapshotChainManager {
     latestSnapshotIdByPath.put(snapshotPath, snapshotID);
   }
 
-  private boolean deleteSnapshotGlobal(UUID snapshotID) throws IOException {
+  private boolean deleteSnapshotGlobal(UUID snapshotID) {
     if (globalSnapshotChain.containsKey(snapshotID)) {
       // reset prev and next snapshot entries in chain ordered list
       // for node removal
@@ -171,13 +172,15 @@ public class SnapshotChainManager {
       UUID prev = globalSnapshotChain.get(snapshotID).getPreviousSnapshotId();
 
       if (prev != null && !globalSnapshotChain.containsKey(prev)) {
-        throw new IOException(String.format("Snapshot chain corruption. " +
+        throw new IllegalStateException(String.format(
+            "Global snapshot chain corruption. " +
                 "SnapshotId: %s to be deleted has previous snapshotId: %s " +
                 "but associated snapshot is not found in snapshot chain.",
             snapshotID, prev));
       }
       if (next != null && !globalSnapshotChain.containsKey(next)) {
-        throw new IOException(String.format("Snapshot chain corruption. " +
+        throw new IllegalStateException(String.format(
+            "Global snapshot chain corruption. " +
                 "SnapshotId: {%s} to be deleted has next snapshotId: %s " +
                 "but associated snapshot is not found in snapshot chain.",
             snapshotID, next));
@@ -203,7 +206,7 @@ public class SnapshotChainManager {
   }
 
   private boolean deleteSnapshotPath(String snapshotPath,
-                                     UUID snapshotId) throws IOException {
+                                     UUID snapshotId) {
     if (snapshotChainByPath.containsKey(snapshotPath) &&
         snapshotChainByPath.get(snapshotPath).containsKey(snapshotId)) {
       // reset prev and next snapshot entries in chain ordered list
@@ -220,7 +223,8 @@ public class SnapshotChainManager {
       if (previousSnapshotId != null &&
           !snapshotChainByPath.get(snapshotPath)
               .containsKey(previousSnapshotId)) {
-        throw new IOException(String.format("Snapshot chain corruption. " +
+        throw new IllegalStateException(String.format(
+            "Path snapshot chain corruption. " +
                 "SnapshotId: %s at snapshotPath: %s to be deleted has " +
                 "previous snapshotId: %s but associated snapshot is not " +
                 "found in snapshot chain.", snapshotId, snapshotPath,
@@ -228,7 +232,8 @@ public class SnapshotChainManager {
       }
       if (nextSnapshotId != null && !snapshotChainByPath.get(snapshotPath)
           .containsKey(nextSnapshotId)) {
-        throw new IOException(String.format("Snapshot chain corruption. " +
+        throw new IllegalStateException(String.format(
+            "Path snapshot chain corruption. " +
                 "SnapshotId: %s at snapshotPath: %s to be deleted has next " +
                 "snapshotId: %s but associated snapshot is not found in " +
                 "snapshot chain.", snapshotId, snapshotPath,
@@ -267,9 +272,10 @@ public class SnapshotChainManager {
   }
 
   /**
-   * Loads the snapshot chain from SnapshotInfo table.
+   * Loads the snapshot chain from SnapshotInfo table and return true if chain
+   * gets loaded successfully.
    */
-  private void loadFromSnapshotInfoTable(OMMetadataManager metadataManager) {
+  private boolean loadFromSnapshotInfoTable(OMMetadataManager metadataManager) 
{
     // read from snapshotInfo table to populate
     // snapshot chains - both global and local path
     try (TableIterator<String, ? extends Table.KeyValue<String, SnapshotInfo>>
@@ -318,16 +324,18 @@ public class SnapshotChainManager {
             "corruption. All snapshots have not been added to the " +
             "snapshot chain. Last snapshot added to chain : %s", prev));
       }
-    } catch (IOException ioException) {
-      // TODO: [SNAPSHOT] Fail gracefully.
-      throw new RuntimeException(ioException);
+    } catch (IOException | IllegalStateException exception) {
+      LOG.error("Failure while loading snapshot chain.", exception);
+      return false;
     }
+    return true;
   }
 
   /**
    * Add snapshot to snapshot chain.
    */
   public synchronized void addSnapshot(SnapshotInfo snapshotInfo) {
+    validateSnapshotChain();
     addSnapshotGlobal(snapshotInfo.getSnapshotId(),
         snapshotInfo.getGlobalPreviousSnapshotId());
     addSnapshotPath(snapshotInfo.getSnapshotPath(),
@@ -341,8 +349,8 @@ public class SnapshotChainManager {
   /**
    * Delete snapshot from snapshot chain.
    */
-  public synchronized boolean deleteSnapshot(SnapshotInfo snapshotInfo)
-      throws IOException {
+  public synchronized boolean deleteSnapshot(SnapshotInfo snapshotInfo) {
+    validateSnapshotChain();
     boolean status = deleteSnapshotGlobal(snapshotInfo.getSnapshotId()) &&
         deleteSnapshotPath(snapshotInfo.getSnapshotPath(),
             snapshotInfo.getSnapshotId());
@@ -356,6 +364,7 @@ public class SnapshotChainManager {
    * Get latest global snapshot in snapshot chain.
    */
   public UUID getLatestGlobalSnapshotId() {
+    validateSnapshotChain();
     return latestGlobalSnapshotId;
   }
 
@@ -363,6 +372,7 @@ public class SnapshotChainManager {
    * Get latest path snapshot in snapshot chain.
    */
   public UUID getLatestPathSnapshotId(String snapshotPath) {
+    validateSnapshotChain();
     return latestSnapshotIdByPath.get(snapshotPath);
   }
 
@@ -371,6 +381,7 @@ public class SnapshotChainManager {
    * in the global snapshot chain.
    */
   public boolean hasNextGlobalSnapshot(UUID snapshotId) {
+    validateSnapshotChain();
     if (!globalSnapshotChain.containsKey(snapshotId)) {
       LOG.error("No snapshot for provided snapshotId: {}", snapshotId);
       throw new NoSuchElementException(String.format("SnapshotId: %s is not " +
@@ -383,6 +394,7 @@ public class SnapshotChainManager {
    * Get next global snapshot in snapshot chain from given snapshot.
    */
   public UUID nextGlobalSnapshot(UUID snapshotId) {
+    validateSnapshotChain();
     if (!hasNextGlobalSnapshot(snapshotId)) {
       LOG.error("No snapshot for provided snapshotId: {}", snapshotId);
       throw new NoSuchElementException(String.format("SnapshotId: %s is not " +
@@ -396,6 +408,7 @@ public class SnapshotChainManager {
    * entry in the global snapshot chain.
    */
   public boolean hasPreviousGlobalSnapshot(UUID snapshotId) {
+    validateSnapshotChain();
     if (!globalSnapshotChain.containsKey(snapshotId)) {
       LOG.error("No snapshot found in snapshot chain for provided " +
           "snapshotId: {}.", snapshotId);
@@ -410,6 +423,7 @@ public class SnapshotChainManager {
    * Get previous global snapshot in snapshot chain from given snapshot.
    */
   public UUID previousGlobalSnapshot(UUID snapshotId) {
+    validateSnapshotChain();
     if (!hasPreviousGlobalSnapshot(snapshotId)) {
       LOG.error("No preceding snapshot found in snapshot chain for provided " +
           "snapshotId: {}.", snapshotId);
@@ -424,6 +438,7 @@ public class SnapshotChainManager {
    * entry in the path snapshot chain.
    */
   public boolean hasNextPathSnapshot(String snapshotPath, UUID snapshotId) {
+    validateSnapshotChain();
     if (!snapshotChainByPath.containsKey(snapshotPath) ||
         !snapshotChainByPath.get(snapshotPath).containsKey(snapshotId)) {
       LOG.error("No snapshot found for provided snapshotId: {} and " +
@@ -443,6 +458,7 @@ public class SnapshotChainManager {
    * Get next path snapshot in snapshot chain from given snapshot.
    */
   public UUID nextPathSnapshot(String snapshotPath, UUID snapshotId) {
+    validateSnapshotChain();
     if (!hasNextPathSnapshot(snapshotPath, snapshotId)) {
       LOG.error("No following snapshot for provided snapshotId {} and " +
           "snapshotPath {}.", snapshotId, snapshotPath);
@@ -461,6 +477,7 @@ public class SnapshotChainManager {
    */
   public boolean hasPreviousPathSnapshot(String snapshotPath,
                                          UUID snapshotId) {
+    validateSnapshotChain();
     if (!snapshotChainByPath.containsKey(snapshotPath) ||
         !snapshotChainByPath.get(snapshotPath).containsKey(snapshotId)) {
       LOG.error("No snapshot found for provided snapshotId: {} and " +
@@ -479,6 +496,7 @@ public class SnapshotChainManager {
    */
   public UUID previousPathSnapshot(String snapshotPath,
                                    UUID snapshotId) {
+    validateSnapshotChain();
     if (!hasPreviousPathSnapshot(snapshotPath, snapshotId)) {
       LOG.error("No preceding snapshot for provided snapshotId: {} and " +
           "snapshotPath: {}", snapshotId, snapshotPath);
@@ -498,17 +516,35 @@ public class SnapshotChainManager {
 
   public LinkedHashMap<UUID, SnapshotChainInfo> getSnapshotChainPath(
       String path) {
+    validateSnapshotChain();
     return snapshotChainByPath.get(path);
   }
 
   @VisibleForTesting
   public Map<UUID, SnapshotChainInfo> getGlobalSnapshotChain() {
+    validateSnapshotChain();
     return globalSnapshotChain;
   }
 
   @VisibleForTesting
   public Map<String,
       LinkedHashMap<UUID, SnapshotChainInfo>> getSnapshotChainByPath() {
+    validateSnapshotChain();
     return snapshotChainByPath;
   }
+
+  /**
+   * Validate if snapshot chain is loaded without any error and throw
+   * IllegalStateException in case there was an issue while loading snapshot
+   * chain on OM start up.
+   */
+  private void validateSnapshotChain() {
+    if (snapshotChainCorrupted) {
+      throw new IllegalStateException("Snapshot chain is corrupted.");
+    }
+  }
+
+  public boolean isSnapshotChainCorrupted() {
+    return snapshotChainCorrupted;
+  }
 }
diff --git 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
index f61ddcc419..59426a2bf2 100644
--- 
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
+++ 
b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/request/snapshot/OMSnapshotCreateRequest.java
@@ -298,7 +298,7 @@ public class OMSnapshotCreateRequest extends 
OMClientRequest {
   ) {
     try {
       snapshotChainManager.deleteSnapshot(info);
-    } catch (IOException exception) {
+    } catch (IllegalStateException exception) {
       LOG.error("Failed to remove snapshot: {} from SnapshotChainManager.",
           info, exception);
     }
diff --git 
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
 
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
index fb473fb700..22fb03903d 100644
--- 
a/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
+++ 
b/hadoop-ozone/ozone-manager/src/test/java/org/apache/hadoop/ozone/om/TestSnapshotChain.java
@@ -21,7 +21,7 @@ import com.google.common.collect.ImmutableMap;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.utils.db.Table;
 import org.apache.hadoop.ozone.om.helpers.SnapshotInfo;
-import org.junit.jupiter.api.Assertions;
+import org.apache.hadoop.util.Time;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Named;
 import org.junit.jupiter.api.Test;
@@ -32,7 +32,6 @@ import org.junit.jupiter.params.provider.MethodSource;
 import org.junit.jupiter.params.provider.ValueSource;
 
 import java.io.File;
-import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -93,7 +92,7 @@ public class TestSnapshotChain {
         .build();
   }
 
-  private void deleteSnapshot(UUID snapshotID) throws IOException {
+  private void deleteSnapshot(UUID snapshotID) {
     SnapshotInfo sinfo = null;
     final String snapshotPath = "vol1/bucket1";
     // reset the next snapshotInfo.globalPreviousSnapshotID
@@ -263,22 +262,23 @@ public class TestSnapshotChain {
     // add two snapshots to the snapshotInfo
     UUID snapshotID1 = UUID.randomUUID();
     UUID snapshotID2 = UUID.randomUUID();
-
-    ArrayList<UUID> snapshotIDs = new ArrayList<>();
+    List<UUID> snapshotIDs = new ArrayList<>();
     snapshotIDs.add(snapshotID1);
     snapshotIDs.add(snapshotID2);
+    List<SnapshotInfo> snapshotInfoList = new ArrayList<>();
 
     UUID prevSnapshotID = null;
     long time = System.currentTimeMillis();
-    // add 3 snapshots
     for (UUID snapshotID : snapshotIDs) {
-      snapshotInfo.put(snapshotID.toString(),
-          createSnapshotInfo(snapshotID, prevSnapshotID, prevSnapshotID,
-          increasingTIme ? time++ : time--));
+      SnapshotInfo snapInfo = createSnapshotInfo(snapshotID, prevSnapshotID,
+          prevSnapshotID, increasingTIme ? time++ : time--);
+      snapshotInfo.put(snapshotID.toString(), snapInfo);
       prevSnapshotID = snapshotID;
+      snapshotInfoList.add(snapInfo);
     }
 
     chainManager = new SnapshotChainManager(omMetadataManager);
+    assertFalse(chainManager.isSnapshotChainCorrupted());
     // check if snapshots loaded correctly from snapshotInfoTable
     assertEquals(snapshotID2, chainManager.getLatestGlobalSnapshotId());
     assertEquals(snapshotID2, chainManager.nextGlobalSnapshot(snapshotID1));
@@ -289,6 +289,16 @@ public class TestSnapshotChain {
     assertThrows(NoSuchElementException.class,
         () -> chainManager.previousPathSnapshot(String
             .join("/", "vol1", "bucket1"), snapshotID1));
+
+    UUID snapshotID3 = UUID.randomUUID();
+    SnapshotInfo snapshotInfo3 = createSnapshotInfo(snapshotID3, 
prevSnapshotID,
+        prevSnapshotID, Time.now());
+    // Add and delete snapshot to make sure snapshot chain is correct.
+    chainManager.addSnapshot(snapshotInfo3);
+    chainManager.deleteSnapshot(snapshotInfoList.get(0));
+    assertEquals(snapshotID3, chainManager.getLatestGlobalSnapshotId());
+    assertThrows(NoSuchElementException.class,
+        () -> chainManager.nextGlobalSnapshot(snapshotID1));
   }
 
   private static Stream<? extends Arguments> invalidSnapshotChain() {
@@ -334,12 +344,26 @@ public class TestSnapshotChain {
           createSnapshotInfo(snapshotID, snapshotChain.get(snapshotID),
               snapshotChain.get(snapshotID), System.currentTimeMillis()));
     }
-    IllegalStateException exception = Assertions.assertThrows(
-        IllegalStateException.class,
-        () -> new SnapshotChainManager(omMetadataManager));
-    Assertions.assertTrue(exception.getMessage()
-        .startsWith("Snapshot chain corruption. All snapshots have not been " +
-            "added to the snapshot chain."));
+    chainManager = new SnapshotChainManager(omMetadataManager);
+
+    assertTrue(chainManager.isSnapshotChainCorrupted());
+
+    SnapshotInfo snapInfo = createSnapshotInfo(UUID.randomUUID(),
+        UUID.randomUUID(),
+        UUID.randomUUID(),
+        System.currentTimeMillis());
+    IllegalStateException createException =
+        assertThrows(IllegalStateException.class,
+            () -> chainManager.addSnapshot(snapInfo));
+    assertEquals("Snapshot chain is corrupted.", createException.getMessage());
+    if (!snapshotIDs.isEmpty()) {
+      IllegalStateException deleteException =
+          assertThrows(IllegalStateException.class,
+              () -> chainManager.deleteSnapshot(
+                  snapshotInfo.get(snapshotIDs.get(0).toString())));
+      assertEquals("Snapshot chain is corrupted.",
+          deleteException.getMessage());
+    }
   }
 
   @ParameterizedTest
@@ -355,10 +379,24 @@ public class TestSnapshotChain {
               prevSnapshotId, System.currentTimeMillis()));
       prevSnapshotId = snapshotID;
     }
-    IllegalStateException exception = Assertions.assertThrows(
-        IllegalStateException.class,
-        () -> new SnapshotChainManager(omMetadataManager));
-    Assertions.assertTrue(exception.getMessage()
-        .startsWith("Path Snapshot chain corruption."));
+    chainManager = new SnapshotChainManager(omMetadataManager);
+    assertTrue(chainManager.isSnapshotChainCorrupted());
+
+    SnapshotInfo snapInfo = createSnapshotInfo(UUID.randomUUID(),
+        UUID.randomUUID(),
+        UUID.randomUUID(),
+        System.currentTimeMillis());
+    IllegalStateException createException =
+        assertThrows(IllegalStateException.class,
+            () -> chainManager.addSnapshot(snapInfo));
+    assertEquals("Snapshot chain is corrupted.", createException.getMessage());
+    if (!snapshotIDs.isEmpty()) {
+      IllegalStateException deleteException =
+          assertThrows(IllegalStateException.class,
+              () -> chainManager.deleteSnapshot(
+                  snapshotInfo.get(snapshotIDs.get(0).toString())));
+      assertEquals("Snapshot chain is corrupted.",
+          deleteException.getMessage());
+    }
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to