YARN-3850. NM fails to read files from full disks which can lead to container logs being lost and other issues. Contributed by Varun Saxena
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/faf4a118 Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/faf4a118 Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/faf4a118 Branch: refs/heads/YARN-2928 Commit: faf4a11884ce6dee8e276c1c1f4f5f7152e0ecf0 Parents: 04fdf61 Author: Jason Lowe <[email protected]> Authored: Fri Jun 26 15:47:07 2015 +0000 Committer: Zhijie Shen <[email protected]> Committed: Mon Jun 29 10:28:27 2015 -0700 ---------------------------------------------------------------------- hadoop-yarn-project/CHANGES.txt | 3 ++ .../nodemanager/LocalDirsHandlerService.java | 24 +++++++++ .../launcher/RecoveredContainerLaunch.java | 3 +- .../logaggregation/AppLogAggregatorImpl.java | 4 +- .../nodemanager/webapp/ContainerLogsUtils.java | 2 +- .../TestLogAggregationService.java | 54 +++++++++++++++----- .../webapp/TestContainerLogsPage.java | 22 +++++++- 7 files changed, 93 insertions(+), 19 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/CHANGES.txt ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt index f093a2c..6e13999 100644 --- a/hadoop-yarn-project/CHANGES.txt +++ b/hadoop-yarn-project/CHANGES.txt @@ -819,6 +819,9 @@ Release 2.7.1 - UNRELEASED YARN-3832. Resource Localization fails on a cluster due to existing cache directories (Brahma Reddy Battula via jlowe) + YARN-3850. NM fails to read files from full disks which can lead to + container logs being lost and other issues (Varun Saxena via jlowe) + Release 2.7.0 - 2015-04-20 INCOMPATIBLE CHANGES http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java index 57d4395..0a61035 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java @@ -238,6 +238,18 @@ public class LocalDirsHandlerService extends AbstractService { } /** + * Function to get the local dirs which should be considered for reading + * existing files on disk. Contains the good local dirs and the local dirs + * that have reached the disk space limit + * + * @return the local dirs which should be considered for reading + */ + public List<String> getLocalDirsForRead() { + return DirectoryCollection.concat(localDirs.getGoodDirs(), + localDirs.getFullDirs()); + } + + /** * Function to get the local dirs which should be considered when cleaning up * resources. Contains the good local dirs and the local dirs that have reached * the disk space limit @@ -250,6 +262,18 @@ public class LocalDirsHandlerService extends AbstractService { } /** + * Function to get the log dirs which should be considered for reading + * existing files on disk. Contains the good log dirs and the log dirs that + * have reached the disk space limit + * + * @return the log dirs which should be considered for reading + */ + public List<String> getLogDirsForRead() { + return DirectoryCollection.concat(logDirs.getGoodDirs(), + logDirs.getFullDirs()); + } + + /** * Function to get the log dirs which should be considered when cleaning up * resources. Contains the good log dirs and the log dirs that have reached * the disk space limit http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java index fb10f22..d7b9ae2 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java @@ -126,7 +126,8 @@ public class RecoveredContainerLaunch extends ContainerLaunch { private File locatePidFile(String appIdStr, String containerIdStr) { String pidSubpath= getPidFileSubpath(appIdStr, containerIdStr); - for (String dir : getContext().getLocalDirsHandler().getLocalDirs()) { + for (String dir : getContext().getLocalDirsHandler(). + getLocalDirsForRead()) { File pidFile = new File(dir, pidSubpath); if (pidFile.exists()) { return pidFile; http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java index 81be813..4b95a03 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java @@ -595,10 +595,10 @@ public class AppLogAggregatorImpl implements AppLogAggregator { boolean appFinished) { LOG.info("Uploading logs for container " + containerId + ". Current good log dirs are " - + StringUtils.join(",", dirsHandler.getLogDirs())); + + StringUtils.join(",", dirsHandler.getLogDirsForRead())); final LogKey logKey = new LogKey(containerId); final LogValue logValue = - new LogValue(dirsHandler.getLogDirs(), containerId, + new LogValue(dirsHandler.getLogDirsForRead(), containerId, userUgi.getShortUserName(), logAggregationContext, this.uploadedFileMeta, appFinished); try { http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java index c588a89..319f49b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/webapp/ContainerLogsUtils.java @@ -74,7 +74,7 @@ public class ContainerLogsUtils { static List<File> getContainerLogDirs(ContainerId containerId, LocalDirsHandlerService dirsHandler) throws YarnException { - List<String> logDirs = dirsHandler.getLogDirs(); + List<String> logDirs = dirsHandler.getLogDirsForRead(); List<File> containerLogDirs = new ArrayList<File>(logDirs.size()); for (String logDir : logDirs) { logDir = new File(logDir).toURI().getPath(); http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java index eb0d055..fd97cef 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java @@ -177,22 +177,11 @@ public class TestLogAggregationService extends BaseContainerManagerTest { dispatcher.close(); } - @Test - public void testLocalFileDeletionAfterUpload() throws Exception { - this.delSrvc = new DeletionService(createContainerExecutor()); - delSrvc = spy(delSrvc); - this.delSrvc.init(conf); - this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); - this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, - this.remoteRootLogDir.getAbsolutePath()); - - LogAggregationService logAggregationService = spy( - new LogAggregationService(dispatcher, this.context, this.delSrvc, - super.dirsHandler)); + private void verifyLocalFileDeletion( + LogAggregationService logAggregationService) throws Exception { logAggregationService.init(this.conf); logAggregationService.start(); - ApplicationId application1 = BuilderUtils.newApplicationId(1234, 1); // AppLogDir should be created @@ -252,10 +241,47 @@ public class TestLogAggregationService extends BaseContainerManagerTest { ApplicationEventType.APPLICATION_LOG_HANDLING_FINISHED) }; - checkEvents(appEventHandler, expectedEvents, true, "getType", "getApplicationID"); + checkEvents(appEventHandler, expectedEvents, true, "getType", + "getApplicationID"); } @Test + public void testLocalFileDeletionAfterUpload() throws Exception { + this.delSrvc = new DeletionService(createContainerExecutor()); + delSrvc = spy(delSrvc); + this.delSrvc.init(conf); + this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); + this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, + this.remoteRootLogDir.getAbsolutePath()); + + LogAggregationService logAggregationService = spy( + new LogAggregationService(dispatcher, this.context, this.delSrvc, + super.dirsHandler)); + verifyLocalFileDeletion(logAggregationService); + } + + @Test + public void testLocalFileDeletionOnDiskFull() throws Exception { + this.delSrvc = new DeletionService(createContainerExecutor()); + delSrvc = spy(delSrvc); + this.delSrvc.init(conf); + this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); + this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, + this.remoteRootLogDir.getAbsolutePath()); + List<String> logDirs = super.dirsHandler.getLogDirs(); + LocalDirsHandlerService dirsHandler = spy(super.dirsHandler); + // Simulate disk being full by returning no good log dirs but having a + // directory in full log dirs. + when(dirsHandler.getLogDirs()).thenReturn(new ArrayList<String>()); + when(dirsHandler.getLogDirsForRead()).thenReturn(logDirs); + LogAggregationService logAggregationService = spy( + new LogAggregationService(dispatcher, this.context, this.delSrvc, + dirsHandler)); + verifyLocalFileDeletion(logAggregationService); + } + + + @Test public void testNoContainerOnNode() throws Exception { this.conf.set(YarnConfiguration.NM_LOG_DIRS, localLogDir.getAbsolutePath()); this.conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, http://git-wip-us.apache.org/repos/asf/hadoop/blob/faf4a118/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java ---------------------------------------------------------------------- diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java index b2788ca..4b1a20b 100644 --- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java +++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/webapp/TestContainerLogsPage.java @@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.webapp; import static org.junit.Assume.assumeTrue; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; import static org.mockito.Mockito.when; import static org.mockito.Mockito.verify; @@ -29,6 +30,7 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -122,6 +124,24 @@ public class TestContainerLogsPage { Assert.assertNull(nmContext.getContainers().get(container1)); files = ContainerLogsUtils.getContainerLogDirs(container1, user, nmContext); Assert.assertTrue(!(files.get(0).toString().contains("file:"))); + + // Create a new context to check if correct container log dirs are fetched + // on full disk. + LocalDirsHandlerService dirsHandlerForFullDisk = spy(dirsHandler); + // good log dirs are empty and nm log dir is in the full log dir list. + when(dirsHandlerForFullDisk.getLogDirs()). + thenReturn(new ArrayList<String>()); + when(dirsHandlerForFullDisk.getLogDirsForRead()). + thenReturn(Arrays.asList(new String[] {absLogDir.getAbsolutePath()})); + nmContext = new NodeManager.NMContext(null, null, dirsHandlerForFullDisk, + new ApplicationACLsManager(conf), new NMNullStateStoreService()); + nmContext.getApplications().put(appId, app); + container.setState(ContainerState.RUNNING); + nmContext.getContainers().put(container1, container); + List<File> dirs = + ContainerLogsUtils.getContainerLogDirs(container1, user, nmContext); + File containerLogDir = new File(absLogDir, appId + "/" + container1); + Assert.assertTrue(dirs.contains(containerLogDir)); } @Test(timeout = 10000) @@ -231,7 +251,7 @@ public class TestContainerLogsPage { LocalDirsHandlerService localDirs = mock(LocalDirsHandlerService.class); List<String> logDirs = new ArrayList<String>(); logDirs.add("F:/nmlogs"); - when(localDirs.getLogDirs()).thenReturn(logDirs); + when(localDirs.getLogDirsForRead()).thenReturn(logDirs); ApplicationIdPBImpl appId = mock(ApplicationIdPBImpl.class); when(appId.toString()).thenReturn("app_id_1");
