tillrohrmann commented on a change in pull request #10089: [FLINK-12342][yarn] Remove container requests in order to reduce excess containers URL: https://github.com/apache/flink/pull/10089#discussion_r343226103
########## File path: flink-yarn/src/main/java/org/apache/flink/yarn/YarnResourceManager.java ########## @@ -370,45 +370,24 @@ public void onContainersCompleted(final List<ContainerStatus> statuses) { @Override public void onContainersAllocated(List<Container> containers) { runAsync(() -> { + log.info("Received {} containers with {} pending container requests.", containers.size(), numPendingContainerRequests); final Collection<AMRMClient.ContainerRequest> pendingRequests = getPendingRequests(); final Iterator<AMRMClient.ContainerRequest> pendingRequestsIterator = pendingRequests.iterator(); - for (Container container : containers) { - log.info( - "Received new container: {} - Remaining pending container requests: {}", - container.getId(), - numPendingContainerRequests); + final int numAcceptedContainers = Math.min(containers.size(), numPendingContainerRequests); + final List<Container> requiredContainers = containers.subList(0, numAcceptedContainers); + final List<Container> excessContainers = containers.subList(numAcceptedContainers, containers.size()); - if (numPendingContainerRequests > 0) { - removeContainerRequest(pendingRequestsIterator.next()); - - final String containerIdStr = container.getId().toString(); - final ResourceID resourceId = new ResourceID(containerIdStr); - - workerNodeMap.put(resourceId, new YarnWorkerNode(container)); - - try { - // Context information used to start a TaskExecutor Java process - ContainerLaunchContext taskExecutorLaunchContext = createTaskExecutorLaunchContext( - container.getResource(), - containerIdStr, - container.getNodeId().getHost()); + for (int i = 0; i < requiredContainers.size(); i++) { + removeContainerRequest(pendingRequestsIterator.next()); + } - nodeManagerClient.startContainer(container, taskExecutorLaunchContext); - } catch (Throwable t) { - log.error("Could not start TaskManager in container {}.", container.getId(), t); + for (Container excessContainer : excessContainers) { + 
returnExcessContainer(excessContainer); + } - // release the failed container - workerNodeMap.remove(resourceId); - resourceManagerClient.releaseAssignedContainer(container.getId()); - // and ask for a new one - requestYarnContainerIfRequired(); - } - } else { - // return the excessive containers - log.info("Returning excess container {}.", container.getId()); - resourceManagerClient.releaseAssignedContainer(container.getId()); - } + for (Container requiredContainer : requiredContainers) { + startTaskExecutorInContainer(requiredContainer); Review comment: I don't think that this happens too often. If this should become a problem, then I would suggest applying this optimization. At the moment it would only complicate the implementation. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services