Repository: hadoop
Updated Branches:
  refs/heads/branch-2 ab23acdfd -> ece01478c


YARN-4924. NM recovery race can lead to container not cleaned up. Contributed 
by sandflee
(cherry picked from commit 3150ae8108a1fc40a67926be6254824c1e37cb38)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/ece01478
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/ece01478
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/ece01478

Branch: refs/heads/branch-2
Commit: ece01478c5e7762a9037880bdc3c18d549d38b32
Parents: ab23acd
Author: Jason Lowe <jl...@apache.org>
Authored: Thu Apr 14 19:17:14 2016 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Thu Apr 14 19:19:46 2016 +0000

----------------------------------------------------------------------
 .../containermanager/ContainerManagerImpl.java  | 17 -----
 .../recovery/NMLeveldbStateStoreService.java    | 80 ++++++++++++--------
 .../recovery/NMNullStateStoreService.java       |  4 -
 .../recovery/NMStateStoreService.java           | 12 ---
 .../TestContainerManagerRecovery.java           |  4 +
 .../recovery/NMMemoryStateStoreService.java     | 10 ---
 .../TestNMLeveldbStateStoreService.java         | 10 +--
 7 files changed, 54 insertions(+), 83 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
index 8d09aa7..b8cca28 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
@@ -296,20 +296,8 @@ public class ContainerManagerImpl extends CompositeService 
implements
         if (LOG.isDebugEnabled()) {
           LOG.debug("Recovering container with state: " + rcs);
         }
-
         recoverContainer(rcs);
       }
-
-      String diagnostic = "Application marked finished during recovery";
-      for (ApplicationId appId : appsState.getFinishedApplications()) {
-
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("Application marked finished during recovery: " + appId);
-        }
-
-        dispatcher.getEventHandler().handle(
-            new ApplicationFinishEvent(appId, diagnostic));
-      }
     } else {
       LOG.info("Not a recoverable state store. Nothing to recover.");
     }
@@ -1332,11 +1320,6 @@ public class ContainerManagerImpl extends 
CompositeService implements
         } else if (appsFinishedEvent.getReason() == 
CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER) {
           diagnostic = "Application killed by ResourceManager";
         }
-        try {
-          this.context.getNMStateStore().storeFinishedApplication(appID);
-        } catch (IOException e) {
-          LOG.error("Unable to update application state in store", e);
-        }
         this.dispatcher.getEventHandler().handle(
             new ApplicationFinishEvent(appID,
                 diagnostic));

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
index 81d6c57..26dea2d 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
@@ -84,6 +84,7 @@ public class NMLeveldbStateStoreService extends 
NMStateStoreService {
 
   private static final String APPLICATIONS_KEY_PREFIX =
       "ContainerManager/applications/";
+  @Deprecated
   private static final String FINISHED_APPS_KEY_PREFIX =
       "ContainerManager/finishedApps/";
 
@@ -392,20 +393,6 @@ public class NMLeveldbStateStoreService extends 
NMStateStoreService {
         state.applications.add(
             ContainerManagerApplicationProto.parseFrom(entry.getValue()));
       }
-
-      state.finishedApplications = new ArrayList<ApplicationId>();
-      keyPrefix = FINISHED_APPS_KEY_PREFIX;
-      iter.seek(bytes(keyPrefix));
-      while (iter.hasNext()) {
-        Entry<byte[], byte[]> entry = iter.next();
-        String key = asString(entry.getKey());
-        if (!key.startsWith(keyPrefix)) {
-          break;
-        }
-        ApplicationId appId =
-            ConverterUtils.toApplicationId(key.substring(keyPrefix.length()));
-        state.finishedApplications.add(appId);
-      }
     } catch (DBException e) {
       throw new IOException(e);
     } finally {
@@ -414,6 +401,8 @@ public class NMLeveldbStateStoreService extends 
NMStateStoreService {
       }
     }
 
+    cleanupDeprecatedFinishedApps();
+
     return state;
   }
 
@@ -434,21 +423,6 @@ public class NMLeveldbStateStoreService extends 
NMStateStoreService {
   }
 
   @Override
-  public void storeFinishedApplication(ApplicationId appId)
-      throws IOException {
-    if (LOG.isDebugEnabled()) {
-      LOG.debug("storeFinishedApplication.appId: " + appId);
-    }
-
-    String key = FINISHED_APPS_KEY_PREFIX + appId;
-    try {
-      db.put(bytes(key), new byte[0]);
-    } catch (DBException e) {
-      throw new IOException(e);
-    }
-  }
-
-  @Override
   public void removeApplication(ApplicationId appId)
       throws IOException {
     if (LOG.isDebugEnabled()) {
@@ -460,8 +434,6 @@ public class NMLeveldbStateStoreService extends 
NMStateStoreService {
       try {
         String key = APPLICATIONS_KEY_PREFIX + appId;
         batch.delete(bytes(key));
-        key = FINISHED_APPS_KEY_PREFIX + appId;
-        batch.delete(bytes(key));
         db.write(batch);
       } finally {
         batch.close();
@@ -979,6 +951,52 @@ public class NMLeveldbStateStoreService extends 
NMStateStoreService {
     }
   }
 
+  @SuppressWarnings("deprecation")
+  private void cleanupDeprecatedFinishedApps() {
+    try {
+      cleanupKeysWithPrefix(FINISHED_APPS_KEY_PREFIX);
+    } catch (Exception e) {
+      LOG.warn("cleanup keys with prefix " + FINISHED_APPS_KEY_PREFIX +
+              " from leveldb failed", e);
+    }
+  }
+
+  private void cleanupKeysWithPrefix(String prefix) throws IOException {
+    WriteBatch batch = null;
+    LeveldbIterator iter = null;
+    try {
+      iter = new LeveldbIterator(db);
+      try {
+        batch = db.createWriteBatch();
+        iter.seek(bytes(prefix));
+        while (iter.hasNext()) {
+          byte[] key = iter.next().getKey();
+          String keyStr = asString(key);
+          if (!keyStr.startsWith(prefix)) {
+            break;
+          }
+          batch.delete(key);
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("cleanup " + keyStr + " from leveldb");
+          }
+        }
+        db.write(batch);
+      } catch (DBException e) {
+        throw new IOException(e);
+      } finally {
+        if (batch != null) {
+          batch.close();
+        }
+      }
+    } catch (DBException e) {
+      throw new IOException(e);
+    } finally {
+      if (iter != null) {
+        iter.close();
+      }
+    }
+  }
+
   private String getLogDeleterKey(ApplicationId appId) {
     return LOG_DELETER_KEY_PREFIX + appId;
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
index d5dce9b..a887e71 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
@@ -59,10 +59,6 @@ public class NMNullStateStoreService extends 
NMStateStoreService {
   }
 
   @Override
-  public void storeFinishedApplication(ApplicationId appId) {
-  }
-
-  @Override
   public void removeApplication(ApplicationId appId) throws IOException {
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
index 84c5aa9..463815e 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
@@ -52,15 +52,11 @@ public abstract class NMStateStoreService extends 
AbstractService {
 
   public static class RecoveredApplicationsState {
     List<ContainerManagerApplicationProto> applications;
-    List<ApplicationId> finishedApplications;
 
     public List<ContainerManagerApplicationProto> getApplications() {
       return applications;
     }
 
-    public List<ApplicationId> getFinishedApplications() {
-      return finishedApplications;
-    }
   }
 
   public enum RecoveredContainerStatus {
@@ -259,14 +255,6 @@ public abstract class NMStateStoreService extends 
AbstractService {
       ContainerManagerApplicationProto p) throws IOException;
 
   /**
-   * Record that an application has finished
-   * @param appId the application ID
-   * @throws IOException
-   */
-  public abstract void storeFinishedApplication(ApplicationId appId)
-      throws IOException;
-
-  /**
    * Remove records corresponding to an application
    * @param appId the application ID
    * @throws IOException

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
index 2e014de..9fa3fcc 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
@@ -259,6 +259,10 @@ public class TestContainerManagerRecovery extends 
BaseContainerManagerTest {
     assertEquals(1, context.getApplications().size());
     app = context.getApplications().get(appId);
     assertNotNull(app);
+    // no longer saving FINISH_APP event in NM stateStore,
+    // simulate by resending FINISH_APP event
+    cm.handle(new CMgrCompletedAppsEvent(finishedApps,
+        CMgrCompletedAppsEvent.Reason.BY_RESOURCEMANAGER));
     waitForAppState(app, ApplicationState.APPLICATION_RESOURCES_CLEANINGUP);
     assertTrue(context.getApplicationACLsManager().checkAccess(
         UserGroupInformation.createRemoteUser(modUser),

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
index a1c95ab..1279896 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
@@ -44,7 +44,6 @@ import 
org.apache.hadoop.yarn.server.api.records.impl.pb.MasterKeyPBImpl;
 
 public class NMMemoryStateStoreService extends NMStateStoreService {
   private Map<ApplicationId, ContainerManagerApplicationProto> apps;
-  private Set<ApplicationId> finishedApps;
   private Map<ContainerId, RecoveredContainerState> containerStates;
   private Map<TrackerKey, TrackerState> trackerStates;
   private Map<Integer, DeletionServiceDeleteTaskProto> deleteTasks;
@@ -59,7 +58,6 @@ public class NMMemoryStateStoreService extends 
NMStateStoreService {
   @Override
   protected void initStorage(Configuration conf) {
     apps = new HashMap<ApplicationId, ContainerManagerApplicationProto>();
-    finishedApps = new HashSet<ApplicationId>();
     containerStates = new HashMap<ContainerId, RecoveredContainerState>();
     nmTokenState = new RecoveredNMTokensState();
     nmTokenState.applicationMasterKeys =
@@ -86,7 +84,6 @@ public class NMMemoryStateStoreService extends 
NMStateStoreService {
     RecoveredApplicationsState state = new RecoveredApplicationsState();
     state.applications = new ArrayList<ContainerManagerApplicationProto>(
         apps.values());
-    state.finishedApplications = new ArrayList<ApplicationId>(finishedApps);
     return state;
   }
 
@@ -99,15 +96,9 @@ public class NMMemoryStateStoreService extends 
NMStateStoreService {
   }
 
   @Override
-  public synchronized void storeFinishedApplication(ApplicationId appId) {
-    finishedApps.add(appId);
-  }
-
-  @Override
   public synchronized void removeApplication(ApplicationId appId)
       throws IOException {
     apps.remove(appId);
-    finishedApps.remove(appId);
   }
 
   @Override
@@ -393,7 +384,6 @@ public class NMMemoryStateStoreService extends 
NMStateStoreService {
     logDeleterState.remove(appId);
   }
 
-
   private static class TrackerState {
     Map<Path, LocalResourceProto> inProgressMap =
         new HashMap<Path, LocalResourceProto>();

http://git-wip-us.apache.org/repos/asf/hadoop/blob/ece01478/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
----------------------------------------------------------------------
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
index 08b49e7..47468d6 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java
@@ -174,7 +174,6 @@ public class TestNMLeveldbStateStoreService {
     // test empty when no state
     RecoveredApplicationsState state = stateStore.loadApplicationsState();
     assertTrue(state.getApplications().isEmpty());
-    assertTrue(state.getFinishedApplications().isEmpty());
 
     // store an application and verify recovered
     final ApplicationId appId1 = ApplicationId.newInstance(1234, 1);
@@ -188,10 +187,8 @@ public class TestNMLeveldbStateStoreService {
     state = stateStore.loadApplicationsState();
     assertEquals(1, state.getApplications().size());
     assertEquals(appProto1, state.getApplications().get(0));
-    assertTrue(state.getFinishedApplications().isEmpty());
 
-    // finish an application and add a new one
-    stateStore.storeFinishedApplication(appId1);
+    // add a new app
     final ApplicationId appId2 = ApplicationId.newInstance(1234, 2);
     builder = ContainerManagerApplicationProto.newBuilder();
     builder.setId(((ApplicationIdPBImpl) appId2).getProto());
@@ -203,18 +200,13 @@ public class TestNMLeveldbStateStoreService {
     assertEquals(2, state.getApplications().size());
     assertTrue(state.getApplications().contains(appProto1));
     assertTrue(state.getApplications().contains(appProto2));
-    assertEquals(1, state.getFinishedApplications().size());
-    assertEquals(appId1, state.getFinishedApplications().get(0));
 
     // test removing an application
-    stateStore.storeFinishedApplication(appId2);
     stateStore.removeApplication(appId2);
     restartStateStore();
     state = stateStore.loadApplicationsState();
     assertEquals(1, state.getApplications().size());
     assertEquals(appProto1, state.getApplications().get(0));
-    assertEquals(1, state.getFinishedApplications().size());
-    assertEquals(appId1, state.getFinishedApplications().get(0));
   }
 
   @Test

Reply via email to