[
https://issues.apache.org/jira/browse/GOBBLIN-2052?focusedWorklogId=916080&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-916080
]
ASF GitHub Bot logged work on GOBBLIN-2052:
-------------------------------------------
Author: ASF GitHub Bot
Created on: 23/Apr/24 16:54
Start Date: 23/Apr/24 16:54
Worklog Time Spent: 10m
Work Description: pradeepppc commented on code in PR #3932:
URL: https://github.com/apache/gobblin/pull/3932#discussion_r1576581493
##########
gobblin-yarn/src/main/java/org/apache/gobblin/yarn/YarnAutoScalingManager.java:
##########
@@ -299,27 +325,65 @@ void runInternal() {
// Remove this instance if existed in the tracking map.
instanceIdleSince.remove(participant);
}
+
+ if(instancesInInitState.contains(participant)) {
+ instanceInitStateSince.putIfAbsent(participant,
System.currentTimeMillis());
+ if (!isInstanceStuckInInitState(participant)) {
+ // release the corresponding container as the helix task is stuck
in INIT state for a long time
+ log.info("Instance {} is stuck in INIT state for a long time,
releasing the container", participant);
+ // get containerInfo of the helix participant
+ YarnService.ContainerInfo containerInfo =
yarnService.getContainerInfoGivenHelixParticipant(participant);
+ if(containerInfo != null) {
+ containersToRelease.add(containerInfo.getContainer());
+ instanceInitStateSince.remove(participant);
+ inUseInstances.remove(participant);
+ } else {
+ log.warn("ContainerInfo for participant {} is not found",
participant);
+ }
+ }
+ } else {
+ instanceInitStateSince.remove(participant);
+ }
+ }
+
+ // release the containers
+ if(!containersToRelease.isEmpty()) {
+ this.yarnService.getEventBus().post(new
ContainerReleaseRequest(containersToRelease, true));
}
+
slidingWindowReservoir.add(yarnContainerRequestBundle);
+
log.debug("There are {} containers being requested in total, tag-count
map {}, tag-resource map {}",
yarnContainerRequestBundle.getTotalContainers(),
yarnContainerRequestBundle.getHelixTagContainerCountMap(),
yarnContainerRequestBundle.getHelixTagResourceMap());
this.yarnService.requestTargetNumberOfContainers(slidingWindowReservoir.getMax(),
inUseInstances);
}
- @VisibleForTesting
/**
* Return true if the condition for tagging an instance as "unused" holds.
* The condition, by default is that if an instance went back to
* active (having partition running on it) within {@link
#maxIdleTimeInMinutesBeforeScalingDown} minutes, we will
* not tag that instance as "unused" and have that as the candidate for
scaling down.
*/
+ @VisibleForTesting
boolean isInstanceUnused(String participant){
return System.currentTimeMillis() - instanceIdleSince.get(participant) >
TimeUnit.MINUTES.toMillis(maxIdleTimeInMinutesBeforeScalingDown);
}
+
+ /**
+ * Return true if the condition for tagging an instance as stuck in INIT
state holds.
+ * The condition, by default is that if an instance went back to
+ * active (having partition running on it) within {@link
#maxIdleTimeInMinutesBeforeScalingDown} minutes, we will
+ * not tag that instance as stuck and the container will not be scaled
down.
+ */
+ @VisibleForTesting
+ boolean isInstanceStuckInInitState(String participant) {
+ return System.currentTimeMillis() -
instanceInitStateSince.get(participant) >
+ TimeUnit.MINUTES.toMillis(maxIdleTimeInMinutesBeforeScalingDown);
Review Comment:
added new variable
Issue Time Tracking
-------------------
Worklog Id: (was: 916080)
Time Spent: 1h 10m (was: 1h)
> Release container which is running yarn task that is stuck in INIT state
> ------------------------------------------------------------------------
>
> Key: GOBBLIN-2052
> URL: https://issues.apache.org/jira/browse/GOBBLIN-2052
> Project: Apache Gobblin
> Issue Type: Improvement
> Components: gobblin-yarn
> Reporter: pradeep pallikila
> Assignee: Abhishek Tiwari
> Priority: Major
> Time Spent: 1h 10m
> Remaining Estimate: 0h
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)