This is an automated email from the ASF dual-hosted git repository.

bteke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 0780710f25a YARN-11567 - Aggregate container launch debug artifacts on error (#6053)
0780710f25a is described below

commit 0780710f25a36f4471942edfe7a7f396cacb226d
Author: K0K0V0K <109747532+k0k0...@users.noreply.github.com>
AuthorDate: Fri Sep 22 15:09:17 2023 +0200

    YARN-11567 - Aggregate container launch debug artifacts on error (#6053)
---
 .../apache/hadoop/yarn/conf/YarnConfiguration.java |  4 ++
 .../src/main/resources/yarn-default.xml            | 15 ++++++
 .../yarn/server/nodemanager/ContainerExecutor.java | 25 +++++++---
 .../launcher/TestContainerLaunch.java              | 57 ++++++++++++++++++++++
 4 files changed, 95 insertions(+), 6 deletions(-)

diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index ef06299fcfd..bbb1ed6f8a7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -150,7 +150,11 @@ public class YarnConfiguration extends Configuration {
   public static final String NM_LOG_CONTAINER_DEBUG_INFO =
       YarnConfiguration.NM_PREFIX + "log-container-debug-info.enabled";
 
+  public static final String NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR =
+      YarnConfiguration.NM_PREFIX + "log-container-debug-info-on-error.enabled";
+
   public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO = true;
+  public static final boolean DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR = false;
 
   ////////////////////////////////
   // IPC Configs
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index 9697f7aa88c..9fa600db4b0 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -1656,6 +1656,21 @@
     <value>true</value>
   </property>
 
+  <property>
+    <description>Generate additional logs about container launches
+      if the container exits with a non-zero exit code.
+      Currently, this creates a copy of the launch script and lists the
+      directory contents of the container work dir. When listing directory
+      contents, we follow symlinks to a max-depth of 5 (including symlinks
+      which point to outside the container work dir), which may slow
+      down container launches.
+      If yarn.nodemanager.log-container-debug-info.enabled is true,
+      this property has no effect.
+    </description>
+    <name>yarn.nodemanager.log-container-debug-info-on-error.enabled</name>
+    <value>false</value>
+  </property>
+
   <property>
     <description>Amount of physical memory, in MB, that can be allocated 
     for containers. If set to -1 and
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
index 65e8183f699..3d0dca622c1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/ContainerExecutor.java
@@ -102,6 +102,7 @@ public abstract class ContainerExecutor implements Configurable {
   private String[] whitelistVars;
   private int exitCodeFileTimeout =
       YarnConfiguration.DEFAULT_NM_CONTAINER_EXECUTOR_EXIT_FILE_TIMEOUT;
+  private int containerExitCode;
 
   @Override
   public void setConf(Configuration conf) {
@@ -303,7 +304,7 @@ public abstract class ContainerExecutor implements Configurable {
 
     if (pidPath == null) {
       LOG.warn("{} is not active, returning terminated error", containerId);
-
+      containerExitCode = ExitCode.TERMINATED.getExitCode();
       return ExitCode.TERMINATED.getExitCode();
     }
 
@@ -335,7 +336,7 @@ public abstract class ContainerExecutor implements Configurable {
     while (!file.exists() && msecLeft >= 0) {
       if (!isContainerActive(containerId)) {
         LOG.info("{} was deactivated", containerId);
-
+        containerExitCode = ExitCode.TERMINATED.getExitCode();
         return ExitCode.TERMINATED.getExitCode();
       }
 
@@ -350,7 +351,9 @@ public abstract class ContainerExecutor implements Configurable {
     }
 
     try {
-      return Integer.parseInt(FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
+      containerExitCode = Integer.parseInt(
+          FileUtils.readFileToString(file, StandardCharsets.UTF_8).trim());
+      return containerExitCode;
     } catch (NumberFormatException e) {
       throw new IOException("Error parsing exit code from pid " + pid, e);
     }
@@ -453,9 +456,7 @@ public abstract class ContainerExecutor implements Configurable {
     }
 
     // dump debugging information if configured
-    if (getConf() != null &&
-        getConf().getBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
-        YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO)) {
+    if (shouldWriteDebugInformation(getConf())) {
       sb.echo("Copying debugging information");
       sb.copyDebugInformation(new Path(outFilename),
           new Path(logDir, outFilename));
@@ -488,6 +489,18 @@ public abstract class ContainerExecutor implements Configurable {
     return new File(dir.toString()).listFiles();
   }
 
+  private boolean shouldWriteDebugInformation(Configuration config) {
+    return config != null && (
+            config.getBoolean(
+                YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO,
+                YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO
+            ) || (
+            config.getBoolean(
+                YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR,
+                YarnConfiguration.DEFAULT_NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR
+            ) && containerExitCode != 0));
+  }
+
   /**
    * The container exit code.
    */
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
index bd135ff5193..6971d34b9d8 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/TestContainerLaunch.java
@@ -1844,6 +1844,63 @@ public class TestContainerLaunch extends BaseContainerManagerTest {
     }
   }
 
+  @Test
+  public void testDebuggingInformationOnError() throws IOException {
+    File shellFile = null;
+    File tempFile = null;
+    Configuration conf = new YarnConfiguration();
+    try {
+      shellFile = Shell.appendScriptExtension(tmpDir, "hello");
+      tempFile = Shell.appendScriptExtension(tmpDir, "temp");
+      String testCommand = Shell.WINDOWS ? "@echo \"hello\"" : "echo \"hello\"";
+      PrintWriter writer = new PrintWriter(new FileOutputStream(shellFile));
+      FileUtil.setExecutable(shellFile, true);
+      writer.println(testCommand);
+      writer.close();
+      Map<Path, List<String>> resources = new HashMap<>();
+      Map<String, String> env = new HashMap<>();
+      List<String> commands = new ArrayList<>();
+      if (Shell.WINDOWS) {
+        commands.add("cmd");
+        commands.add("/c");
+        commands.add("\"" + shellFile.getAbsolutePath() + "\"");
+      } else {
+        commands.add("/bin/sh \\\"" + shellFile.getAbsolutePath() + "\\\"");
+      }
+      conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO, false);
+      conf.setBoolean(YarnConfiguration.NM_LOG_CONTAINER_DEBUG_INFO_ON_ERROR, true);
+      FileOutputStream fos = new FileOutputStream(tempFile);
+      ContainerExecutor exec = new DefaultContainerExecutor();
+      exec.setConf(conf);
+      LinkedHashSet<String> nmVars = new LinkedHashSet<>();
+      exec.writeLaunchEnv(fos, env, resources, commands,
+          new Path(localLogDir.getAbsolutePath()), "user",
+          tempFile.getName(), nmVars);
+      fos.flush();
+      fos.close();
+      FileUtil.setExecutable(tempFile, true);
+      Shell.ShellCommandExecutor shexc = new Shell.ShellCommandExecutor(
+          new String[]{tempFile.getAbsolutePath()}, tmpDir);
+      shexc.execute();
+      assertThat(shexc.getExitCode()).isZero();
+      File directorInfo =
+          new File(localLogDir, ContainerExecutor.DIRECTORY_CONTENTS);
+      File scriptCopy = new File(localLogDir, tempFile.getName());
+      Assert.assertFalse("Directory info file missing",
+          directorInfo.exists());
+      Assert.assertFalse("Copy of launch script missing",
+          scriptCopy.exists());
+    } finally {
+      // cleanup
+      if (shellFile != null && shellFile.exists()) {
+        shellFile.delete();
+      }
+      if (tempFile != null && tempFile.exists()) {
+        tempFile.delete();
+      }
+    }
+  }
+
   /**
    * Test container launch fault.
    * @throws Exception


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to