[ 
https://issues.apache.org/jira/browse/GOBBLIN-2052?focusedWorklogId=916080&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-916080
 ]

ASF GitHub Bot logged work on GOBBLIN-2052:
-------------------------------------------

                Author: ASF GitHub Bot
            Created on: 23/Apr/24 16:54
            Start Date: 23/Apr/24 16:54
    Worklog Time Spent: 10m 
      Work Description: pradeepppc commented on code in PR #3932:
URL: https://github.com/apache/gobblin/pull/3932#discussion_r1576581493


##########
gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnAutoScalingManager.java:
##########
@@ -299,27 +325,65 @@ void runInternal() {
           // Remove this instance if existed in the tracking map.
           instanceIdleSince.remove(participant);
         }
+
+        if(instancesInInitState.contains(participant)) {
+          instanceInitStateSince.putIfAbsent(participant, 
System.currentTimeMillis());
+          if (!isInstanceStuckInInitState(participant)) {
+            // release the corresponding container as the helix task is stuck 
in INIT state for a long time
+            log.info("Instance {} is stuck in INIT state for a long time, 
releasing the container", participant);
+            // get containerInfo of the helix participant
+            YarnService.ContainerInfo containerInfo = 
yarnService.getContainerInfoGivenHelixParticipant(participant);
+            if(containerInfo != null) {
+              containersToRelease.add(containerInfo.getContainer());
+              instanceInitStateSince.remove(participant);
+              inUseInstances.remove(participant);
+            } else {
+              log.warn("ContainerInfo for participant {} is not found", 
participant);
+            }
+          }
+        } else {
+          instanceInitStateSince.remove(participant);
+        }
+      }
+
+      // release the containers
+      if(!containersToRelease.isEmpty()) {
+        this.yarnService.getEventBus().post(new 
ContainerReleaseRequest(containersToRelease, true));
       }
+
       slidingWindowReservoir.add(yarnContainerRequestBundle);
 
+
       log.debug("There are {} containers being requested in total, tag-count 
map {}, tag-resource map {}",
           yarnContainerRequestBundle.getTotalContainers(), 
yarnContainerRequestBundle.getHelixTagContainerCountMap(),
           yarnContainerRequestBundle.getHelixTagResourceMap());
 
       
this.yarnService.requestTargetNumberOfContainers(slidingWindowReservoir.getMax(),
 inUseInstances);
     }
 
-    @VisibleForTesting
     /**
      * Return true is the condition for tagging an instance as "unused" holds.
      * The condition, by default is that if an instance went back to
      * active (having partition running on it) within {@link 
#maxIdleTimeInMinutesBeforeScalingDown} minutes, we will
      * not tag that instance as "unused" and have that as the candidate for 
scaling down.
      */
+    @VisibleForTesting
     boolean isInstanceUnused(String participant){
       return System.currentTimeMillis() - instanceIdleSince.get(participant) >
           TimeUnit.MINUTES.toMillis(maxIdleTimeInMinutesBeforeScalingDown);
     }
+
+    /**
+     * Return true if the condition for tagging an instance as stuck in INIT 
state holds.
+     * The condition, by default, is that the instance has been tracked in INIT 
state for more than
+     * {@link #maxIdleTimeInMinutesBeforeScalingDown} minutes; otherwise we will
+     * not tag that instance as stuck and the container will not be scaled 
down.
+     */
+    @VisibleForTesting
+    boolean isInstanceStuckInInitState(String participant) {
+      return System.currentTimeMillis() - 
instanceInitStateSince.get(participant) >
+          TimeUnit.MINUTES.toMillis(maxIdleTimeInMinutesBeforeScalingDown);

Review Comment:
   Added a new variable.





Issue Time Tracking
-------------------

    Worklog Id:     (was: 916080)
    Time Spent: 1h 10m  (was: 1h)

> Release container which is running yarn task that is stuck in INIT state
> ------------------------------------------------------------------------
>
>                 Key: GOBBLIN-2052
>                 URL: https://issues.apache.org/jira/browse/GOBBLIN-2052
>             Project: Apache Gobblin
>          Issue Type: Improvement
>          Components: gobblin-yarn
>            Reporter: pradeep pallikila
>            Assignee: Abhishek Tiwari
>            Priority: Major
>          Time Spent: 1h 10m
>  Remaining Estimate: 0h
>




--
This message was sent by Atlassian Jira
(v8.20.10#820010)

Reply via email to