YARN-3850. NM fails to read files from full disks, which can lead to container logs being lost and other issues. Contributed by Varun Saxena. (cherry picked from commit 40b256949ad6f6e0dbdd248f2d257b05899f4332)
(cherry picked from commit 0221d19f4e398c386f4ca3990b0893562aa8dacf) (cherry picked from commit 87d2204f28f192a964c04a5fa1e2e31644d74b59) Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/fe5877a4 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/fe5877a4 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/fe5877a4 Branch: refs/heads/branch-2.6.1 Commit: fe5877a49e9a8c387a5be77edd1eb1448184271e Parents: 34739fc Author: Jason Lowe <[email protected]> Authored: Fri Jun 26 15:47:07 2015 +0000 Committer: Vinod Kumar Vavilapalli <[email protected]> Committed: Thu Sep 3 14:35:01 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 ++ .../nodemanager/LocalDirsHandlerService.java | 24 +++++++++ .../launcher/RecoveredContainerLaunch.java | 3 +- .../logaggregation/AppLogAggregatorImpl.java | 4 +- .../nodemanager/webapp/ContainerLogsUtils.java | 2 +- .../TestLogAggregationService.java | 54 +++++++++++++++----- .../webapp/TestContainerLogsPage.java | 22 +++++++- 7 files changed, 93 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index 1f4d851..98dc354 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -159,6 +159,9 @@ Release 2.6.1 - UNRELEASED YARN-3832. Resource Localization fails on a cluster due to existing cache directories (Brahma Reddy Battula via jlowe) + YARN-3850. 
NM fails to read files from full disks which can lead to + container logs being lost and other issues (Varun Saxena via jlowe) + Release 2.6.0 - 2014-11-18 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 7d1aa53..d6950a3 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -213,6 +213,18 @@ public class LocalDirsHandlerService extends AbstractService { } /** + * Function to get the local dirs which should be considered for reading + * existing files on disk. Contains the good local dirs and the local dirs + * that have reached the disk space limit + * + * @return the local dirs which should be considered for reading + */ + public List<String> getLocalDirsForRead() { + return DirectoryCollection.concat(localDirs.getGoodDirs(), + localDirs.getFullDirs()); + } + + /** * Function to get the local dirs which should be considered when cleaning up * resources. 
Contains the good local dirs and the local dirs that have reached * the disk space limit @@ -225,6 +237,18 @@ public class LocalDirsHandlerService extends AbstractService { } /** + * Function to get the log dirs which should be considered for reading + * existing files on disk. Contains the good log dirs and the log dirs that + * have reached the disk space limit + * + * @return the log dirs which should be considered for reading + */ + public List<String> getLogDirsForRead() { + return DirectoryCollection.concat(logDirs.getGoodDirs(), + logDirs.getFullDirs()); + } + + /** * Function to get the log dirs which should be considered when cleaning up * resources. Contains the good log dirs and the log dirs that have reached * the disk space limit http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java index 03a39aa..c662ecd 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java @@ -121,7 +121,8 @@ public class RecoveredContainerLaunch extends ContainerLaunch { 
private File locatePidFile(String appIdStr, String containerIdStr) { String pidSubpath= getPidFileSubpath(appIdStr, containerIdStr); - for (String dir : getContext().getLocalDirsHandler().getLocalDirs()) { + for (String dir : getContext().getLocalDirsHandler(). + getLocalDirsForRead()) { File pidFile = new File(dir, pidSubpath); if (pidFile.exists()) { return pidFile; http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java index 20887b6..65c5501 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java @@ -524,10 +524,10 @@ public class AppLogAggregatorImpl implements AppLogAggregator { public Set<Path> doContainerLogAggregation(LogWriter writer) { LOG.info("Uploading logs for container " + containerId + ". 
Current good log dirs are " - + StringUtils.join(",", dirsHandler.getLogDirs())); + + StringUtils.join(",", dirsHandler.getLogDirsForRead())); final LogKey logKey = new LogKey(containerId); final LogValue logValue = - new LogValue(dirsHandler.getLogDirs(), containerId, + new LogValue(dirsHandler.getLogDirsForRead(), containerId, userUgi.getShortUserName(), logAggregationContext, this.uploadedFileMeta); try { http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java index c588a89..319f49b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java @@ -74,7 +74,7 @@ public class ContainerLogsUtils { static List<File> getContainerLogDirs(ContainerId containerId, LocalDirsHandlerService dirsHandler) throws YarnException { - List<String> logDirs = dirsHandler.getLogDirs(); + List<String> logDirs = dirsHandler.getLogDirsForRead(); List<File> containerLogDirs = new ArrayList<File>(logDirs.size()); for (String logDir : logDirs) { logDir = new File(logDir).toURI().getPath(); 
http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java index 7d911e9..e2c45db 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java @@ -172,22 +172,11 @@ public class TestLogAggregationService extends BaseContainerManagerTest { dispatcher.close(); } - @Test - public void testLocalFileDeletionAfterUpload() throws Exception { - this.delSrvc = new DeletionService(createContainerExecutor()); - delSrvc = spy(delSrvc); - this.delSrvc.init(conf); - this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); - this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, - this.remoteRootLogDir.getAbsolutePath()); - - LogAggregationService logAggregationService = spy( - new LogAggregationService(dispatcher, this.context, this.delSrvc, - super.dirsHandler)); + private void verifyLocalFileDeletion( + LogAggregationService logAggregationService) throws Exception { logAggregationService.init(this.conf); 
logAggregationService.start(); - ApplicationId application1 = BuilderUtils.newApplicationId(1234, 1); // AppLogDir should be created @@ -247,11 +236,48 @@ public class TestLogAggregationService extends BaseContainerManagerTest { ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED) }; - checkEvents(appEventHandler, expectedEvents, true, "getType", "getApplicationID"); + checkEvents(appEventHandler, expectedEvents, true, "getType", + "getApplicationID"); dispatcher.stop(); } @Test + public void testLocalFileDeletionAfterUpload() throws Exception { + this.delSrvc = new DeletionService(createContainerExecutor()); + delSrvc = spy(delSrvc); + this.delSrvc.init(conf); + this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); + this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, + this.remoteRootLogDir.getAbsolutePath()); + + LogAggregationService logAggregationService = spy( + new LogAggregationService(dispatcher, this.context, this.delSrvc, + super.dirsHandler)); + verifyLocalFileDeletion(logAggregationService); + } + + @Test + public void testLocalFileDeletionOnDiskFull() throws Exception { + this.delSrvc = new DeletionService(createContainerExecutor()); + delSrvc = spy(delSrvc); + this.delSrvc.init(conf); + this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); + this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, + this.remoteRootLogDir.getAbsolutePath()); + List<String> logDirs = super.dirsHandler.getLogDirs(); + LocalDirsHandlerService dirsHandler = spy(super.dirsHandler); + // Simulate disk being full by returning no good log dirs but having a + // directory in full log dirs. 
+ when(dirsHandler.getLogDirs()).thenReturn(new ArrayList<String>()); + when(dirsHandler.getLogDirsForRead()).thenReturn(logDirs); + LogAggregationService logAggregationService = spy( + new LogAggregationService(dispatcher, this.context, this.delSrvc, + dirsHandler)); + verifyLocalFileDeletion(logAggregationService); + } + + + @Test public void testNoContainerOnNode() throws Exception { this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, http://git-wip-us.apache.org/repos/asf/hadoop/blob/fe5877a4/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java index b1d4397..39c52d9 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.webapp; import static org.junit.Assume.assumeTrue; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; import static org.mockito.Mockito.when; import static org.mockito.Mockito.verify; @@ -29,6 +30,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; +import 
java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -115,6 +117,24 @@ public class TestContainerLogsPage { Assert.assertNull(nmContext.getContainers().get(container1)); files = ContainerLogsUtils.getContainerLogDirs(container1, user, nmContext); Assert.assertTrue(!(files.get(0).toString().contains("file:"))); + + // Create a new context to check if correct container log dirs are fetched + // on full disk. + LocalDirsHandlerService dirsHandlerForFullDisk = spy(dirsHandler); + // good log dirs are empty and nm log dir is in the full log dir list. + when(dirsHandlerForFullDisk.getLogDirs()). + thenReturn(new ArrayList<String>()); + when(dirsHandlerForFullDisk.getLogDirsForRead()). + thenReturn(Arrays.asList(new String[] {absLogDir.getAbsolutePath()})); + nmContext = new NodeManager.NMContext(null, null, dirsHandlerForFullDisk, + new ApplicationACLsManager(conf), new NMNullStateStoreService()); + nmContext.getApplications().put(appId, app); + container.setState(ContainerState.RUNNING); + nmContext.getContainers().put(container1, container); + List<File> dirs = + ContainerLogsUtils.getContainerLogDirs(container1, user, nmContext); + File containerLogDir = new File(absLogDir, appId + "/" + container1); + Assert.assertTrue(dirs.contains(containerLogDir)); } @Test(timeout = 10000) @@ -224,7 +244,7 @@ public class TestContainerLogsPage { LocalDirsHandlerService localDirs = mock(LocalDirsHandlerService.class); List<String> logDirs = new ArrayList<String>(); logDirs.add("F:/nmlogs"); - when(localDirs.getLogDirs()).thenReturn(logDirs); + when(localDirs.getLogDirsForRead()).thenReturn(logDirs); ApplicationIdPBImpl appId = mock(ApplicationIdPBImpl.class); when(appId.toString()).thenReturn("app_id_1");
