This is an automated email from the ASF dual-hosted git repository.

bteke pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/trunk by this push:
     new 4d4b099309e YARN-11534. Fixed exception handling when container 
signalling is interrupted (#5864)
4d4b099309e is described below

commit 4d4b099309ee266c5038adaf5a4690e7c2805cf8
Author: Peter Szucs <116345192+p-sz...@users.noreply.github.com>
AuthorDate: Fri Jul 21 12:30:55 2023 +0200

    YARN-11534. Fixed exception handling when container signalling is 
interrupted (#5864)
---
 .../server/nodemanager/LinuxContainerExecutor.java | 16 ++++++--
 .../launcher/RecoverPausedContainerLaunch.java     |  6 +--
 .../launcher/RecoveredContainerLaunch.java         |  6 +--
 .../runtime/ContainerExecutionException.java       |  4 ++
 .../nodemanager/TestLinuxContainerExecutor.java    | 44 ++++++++++++++++++++++
 5 files changed, 67 insertions(+), 9 deletions(-)

diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
index ea4595dffc4..a28a6fc4110 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.classification.VisibleForTesting;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -787,9 +788,18 @@ public class LinuxContainerExecutor extends 
ContainerExecutor {
       LOG.warn("Error in signalling container {} with {}; exit = {}",
           pid, signal, retCode, e);
       logOutput(e.getOutput());
-      throw new IOException("Problem signalling container " + pid + " with "
-          + signal + "; output: " + e.getOutput() + " and exitCode: "
-          + retCode, e);
+
+      // In ContainerExecutionException -1 is the default value for the exit 
code.
+      // If it remained unset, we can treat the signalling as interrupted.
+      if (retCode == ContainerExecutionException.getDefaultExitCode()) {
+        throw new InterruptedIOException("Signalling container " + pid + " 
with "
+            + signal + " is interrupted; output: " + e.getOutput() + " and 
exitCode: "
+            + retCode);
+      } else {
+        throw new IOException("Problem signalling container " + pid + " with "
+            + signal + "; output: " + e.getOutput() + " and exitCode: "
+            + retCode, e);
+      }
     }
     return true;
   }
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java
index c678c91860d..a56a4531de4 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoverPausedContainerLaunch.java
@@ -68,7 +68,7 @@ public class RecoverPausedContainerLaunch extends 
ContainerLaunch {
 
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
         ContainerEventType.RECOVER_PAUSED_CONTAINER));
-    boolean notInterrupted = true;
+    boolean interrupted = false;
     try {
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       if (pidFile != null) {
@@ -87,11 +87,11 @@ public class RecoverPausedContainerLaunch extends 
ContainerLaunch {
 
     } catch (InterruptedException | InterruptedIOException e) {
       LOG.warn("Interrupted while waiting for exit code from " + containerId);
-      notInterrupted = false;
+      interrupted = true;
     } catch (IOException e) {
       LOG.error("Unable to kill the paused container " + containerIdStr, e);
     } finally {
-      if (notInterrupted) {
+      if (!interrupted) {
         this.completed.set(true);
         exec.deactivateContainer(containerId);
         try {
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
index 17ddd77857f..0dcffe6afe5 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/launcher/RecoveredContainerLaunch.java
@@ -74,7 +74,7 @@ public class RecoveredContainerLaunch extends ContainerLaunch 
{
     dispatcher.getEventHandler().handle(new ContainerEvent(containerId,
         ContainerEventType.CONTAINER_LAUNCHED));
 
-    boolean notInterrupted = true;
+    boolean interrupted = false;
     try {
       File pidFile = locatePidFile(appIdStr, containerIdStr);
       if (pidFile != null) {
@@ -92,11 +92,11 @@ public class RecoveredContainerLaunch extends 
ContainerLaunch {
       }
     } catch (InterruptedException | InterruptedIOException e) {
       LOG.warn("Interrupted while waiting for exit code from " + containerId);
-      notInterrupted = false;
+      interrupted = true;
     } catch (IOException e) {
       LOG.error("Unable to recover container " + containerIdStr, e);
     } finally {
-      if (notInterrupted) {
+      if (!interrupted) {
         this.completed.set(true);
         exec.deactivateContainer(containerId);
         try {
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java
index 735db1f6082..dbb2b5138f4 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/runtime/ContainerExecutionException.java
@@ -88,4 +88,8 @@ public class ContainerExecutionException extends 
YarnException {
     return errorOutput;
   }
 
+  public static int getDefaultExitCode() {
+    return EXIT_CODE_UNSET;
+  }
+
 }
\ No newline at end of file
diff --git 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index 8918588bf37..69e7b3bc008 100644
--- 
a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ 
b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -25,7 +25,10 @@ import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.doNothing;
+import static org.mockito.Mockito.doThrow;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.times;
@@ -33,6 +36,8 @@ import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
 import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntime;
+import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException;
+import 
org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeContext;
 import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerExecContext;
 import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerReapContext;
 import org.slf4j.Logger;
@@ -41,6 +46,7 @@ import org.slf4j.LoggerFactory;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.InterruptedIOException;
 import java.io.PrintWriter;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
@@ -725,6 +731,44 @@ public class TestLinuxContainerExecutor {
     verify(lce, times(1)).getLocalResources(container);
   }
 
+  @Test
+  public void testSignalContainerFailureWhenExitCodeIsPresentInTheException()
+      throws ContainerExecutionException {
+    LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class);
+    LinuxContainerExecutor containerExecutor = spy(new LinuxContainerExecutor(
+        containerRuntime));
+    ContainerSignalContext signalContext = new 
ContainerSignalContext.Builder().build();
+    ContainerExecutionException testException =
+        new ContainerExecutionException("exceptionWithExitCode", 123);
+
+    doNothing().when(containerExecutor).verifyUsernamePattern(any());
+    doThrow(testException)
+        .when(containerRuntime)
+        .signalContainer(any(ContainerRuntimeContext.class));
+
+    assertThrows(IOException.class,
+        () -> containerExecutor.signalContainer(signalContext));
+  }
+
+  @Test
+  public void 
testSignalContainerFailureWhenExitCodeIsNotPresentInTheException()
+      throws ContainerExecutionException {
+    LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class);
+    LinuxContainerExecutor containerExecutor = spy(new LinuxContainerExecutor(
+        containerRuntime));
+    ContainerSignalContext signalContext = new 
ContainerSignalContext.Builder().build();
+    ContainerExecutionException testException =
+        new ContainerExecutionException("exceptionWithoutExitCode");
+
+    doNothing().when(containerExecutor).verifyUsernamePattern(any());
+    doThrow(testException)
+        .when(containerRuntime)
+        .signalContainer(any(ContainerRuntimeContext.class));
+
+    assertThrows(InterruptedIOException.class,
+        () -> containerExecutor.signalContainer(signalContext));
+  }
+
   @Deprecated
   private static class TestResourceHandler implements LCEResourcesHandler {
     static Set<ContainerId> postExecContainers = new HashSet<ContainerId>();


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-commits-h...@hadoop.apache.org

Reply via email to