AMBARI-20593. EU/RU Auto-Retry does not reschedule task when host is not heartbeating before task is scheduled and doesn't have a start time (alejandro)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/4690fe7c
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/4690fe7c
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/4690fe7c

Branch: refs/heads/branch-dev-logsearch
Commit: 4690fe7c655ade9af91cfea7189e2efd98885149
Parents: 165ec70
Author: Alejandro Fernandez <afernan...@hortonworks.com>
Authored: Mon Mar 27 19:14:13 2017 -0700
Committer: Alejandro Fernandez <afernan...@hortonworks.com>
Committed: Tue Mar 28 10:15:44 2017 -0700

----------------------------------------------------------------------
 .../server/actionmanager/HostRoleCommand.java   |  6 +-
 .../services/RetryUpgradeActionService.java     | 60 ++++++++++++++++----
 .../services/RetryUpgradeActionServiceTest.java | 28 +++++++--
 3 files changed, 74 insertions(+), 20 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/ambari/blob/4690fe7c/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
index 85c8e9f..651eb24 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/actionmanager/HostRoleCommand.java
@@ -168,10 +168,10 @@ public class HostRoleCommand {
     errorLog = hostRoleCommandEntity.getErrorLog();
     structuredOut = hostRoleCommandEntity.getStructuredOut() != null ? new String(hostRoleCommandEntity.getStructuredOut()) : "";
     exitCode = hostRoleCommandEntity.getExitcode();
-    startTime = hostRoleCommandEntity.getStartTime();
-    originalStartTime = hostRoleCommandEntity.getOriginalStartTime();
+    startTime = hostRoleCommandEntity.getStartTime() != null ? hostRoleCommandEntity.getStartTime() : -1L;
+    originalStartTime = hostRoleCommandEntity.getOriginalStartTime() != null ? hostRoleCommandEntity.getOriginalStartTime() : -1L;
     endTime = hostRoleCommandEntity.getEndTime() != null ? hostRoleCommandEntity.getEndTime() : -1L;
-    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime();
+    lastAttemptTime = hostRoleCommandEntity.getLastAttemptTime() != null ? hostRoleCommandEntity.getLastAttemptTime() : -1L;
     attemptCount = hostRoleCommandEntity.getAttemptCount();
     retryAllowed = hostRoleCommandEntity.isRetryAllowed();
     autoSkipFailure = hostRoleCommandEntity.isFailureAutoSkipped();
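
A note on the HostRoleCommand change above: the entity getters return nullable java.lang.Long values, and assigning one straight into a primitive long field auto-unboxes, which throws a NullPointerException when the value is null, presumably why the patch defaults these to -1L. The sketch below reproduces that failure mode and the defaulting idiom; the class and method names are illustrative stand-ins, not Ambari code.

public class LongUnboxingSketch {
  // Stand-in for HostRoleCommandEntity.getStartTime(), which returns a
  // nullable Long when the task has never been scheduled on a host.
  static Long startTimeFromDb() {
    return null;
  }

  public static void main(String[] args) {
    long startTime;
    try {
      // Assigning a null Long to a primitive long auto-unboxes and throws.
      startTime = startTimeFromDb();
    } catch (NullPointerException e) {
      System.out.println("auto-unboxing a null Long throws NPE");
    }

    // The defaulting idiom from the patch: fall back to -1L, the sentinel
    // the codebase already uses for "not set", before unboxing.
    Long fromDb = startTimeFromDb();
    startTime = fromDb != null ? fromDb : -1L;
    System.out.println("startTime = " + startTime); // prints startTime = -1
  }
}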
http://git-wip-us.apache.org/repos/asf/ambari/blob/4690fe7c/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
index a92aa04..6d960c3 100644
--- a/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
+++ b/ambari-server/src/main/java/org/apache/ambari/server/state/services/RetryUpgradeActionService.java
@@ -199,24 +199,59 @@ public class RetryUpgradeActionService extends AbstractScheduledService {
     List<HostRoleCommandEntity> holdingCommands = m_hostRoleCommandDAO.findByRequestIdAndStatuses(requestId, HOLDING_STATUSES);
     if (holdingCommands.size() > 0) {
       for (HostRoleCommandEntity hrc : holdingCommands) {
-        LOG.debug("Comparing taskId: {}, original start time: {}, now: {}",
-            hrc.getTaskId(), hrc.getOriginalStartTime(), now);
+        LOG.debug("Comparing taskId: {}, attempt count: {}, original start time: {}, now: {}",
+            hrc.getTaskId(), hrc.getAttemptCount(), hrc.getOriginalStartTime(), now);

         /*
+        Use-Case 1:
+        If the command has been sent to the host before because it was heartbeating, then it does have
+        an original start time, so we can attempt to retry on this host even if it is no longer heartbeating.
+        If the host does heartbeat again within the time interval, the command will actually be scheduled by the host.
+
+        Use-Case 2:
+        If the host is not heartbeating and the command is scheduled to be run on it, then all of the
+        following are true:
+        - it does not have an original start time
+        - it does not have a start time
+        - its attempt count is 0
+        - its status will be HOLDING_TIMEDOUT
+        When the host does start heartbeating, we need to schedule this command by changing its state back to PENDING.
+
+        Notes: While testing, you can update the original_start_time of records in the host_role_command
+        table to the current epoch time. E.g., in postgres:
+        SELECT CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000;
+        UPDATE host_role_command SET attempt_count = 1, status = 'HOLDING_FAILED', original_start_time = (CAST(EXTRACT(EPOCH FROM NOW()) AS BIGINT) * 1000) WHERE task_id IN (x, y, z);
-        */
-        if (canRetryCommand(hrc) && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
-          Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
-          Long deltaMS = retryTimeWindow - now;
-
-          if (deltaMS > 0) {
-            String originalStartTimeString = m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
-            String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
-            LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
-                hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+        */
+
+        if (canRetryCommand(hrc)) {
+
+          boolean allowRetry = false;
+          // Use-Case 1
+          if (hrc.getOriginalStartTime() != null && hrc.getOriginalStartTime() > 0 && hrc.getOriginalStartTime() < now) {
+            Long retryTimeWindow = hrc.getOriginalStartTime() + MAX_TIMEOUT_MS;
+            Long deltaMS = retryTimeWindow - now;
+
+            if (deltaMS > 0) {
+              String originalStartTimeString = m_fullDateFormat.format(new Date(hrc.getOriginalStartTime()));
+              String deltaString = m_deltaDateFormat.format(new Date(deltaMS));
+              LOG.info("Retrying task with id: {}, attempts: {}, original start time: {}, time til timeout: {}",
+                  hrc.getTaskId(), hrc.getAttemptCount(), originalStartTimeString, deltaString);
+              allowRetry = true;
+            }
+          }
+
+          // Use-Case 2
+          if ((hrc.getOriginalStartTime() == null || hrc.getOriginalStartTime() == -1L) &&
+              (hrc.getStartTime() == null || hrc.getStartTime() == -1L) &&
+              hrc.getAttemptCount() == 0) {
+            LOG.info("Re-scheduling task with id: {} since it has 0 attempts, and null start_time and " +
+                "original_start_time, which likely means the host was not heartbeating when the command was supposed to be scheduled.",
+                hrc.getTaskId());
+            allowRetry = true;
+          }
+
+          if (allowRetry) {
             retryHostRoleCommand(hrc);
           }
         }
@@ -262,6 +297,7 @@ public class RetryUpgradeActionService extends AbstractScheduledService {
     hrc.setStatus(HostRoleStatus.PENDING);
     hrc.setStartTime(-1L);
     // Don't change the original start time.
+    hrc.setEndTime(-1L);
     hrc.setLastAttemptTime(-1L);

     // This will invalidate the cache, as expected.
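
Condensed, the retry gate added above reduces to a pure predicate over the command's timestamps and attempt count. The sketch below restates that logic outside of Ambari's classes for readability; the method name, the bare Long parameters, and the window constant are illustrative assumptions, and the canRetryCommand() checks (retry allowed, non-critical task) are omitted.

public final class RetryGateSketch {
  // Stand-in for RetryUpgradeActionService's retry window.
  static final long MAX_TIMEOUT_MS = 10 * 60 * 1000L;

  static boolean allowRetry(Long originalStartTime, Long startTime,
                            int attemptCount, long now) {
    // Use-Case 1: the command was sent to the host before (it was
    // heartbeating), so retry only while still inside the window
    // measured from the original start time.
    if (originalStartTime != null && originalStartTime > 0 && originalStartTime < now) {
      return (originalStartTime + MAX_TIMEOUT_MS) - now > 0;
    }
    // Use-Case 2: the command was never scheduled at all (host was not
    // heartbeating): both timestamps unset and zero attempts.
    boolean noOriginalStart = originalStartTime == null || originalStartTime == -1L;
    boolean noStart = startTime == null || startTime == -1L;
    return noOriginalStart && noStart && attemptCount == 0;
  }

  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    System.out.println(allowRetry(now - 60_000L, now - 60_000L, 1, now));   // true: inside window
    System.out.println(allowRetry(null, null, 0, now));                     // true: never scheduled
    System.out.println(allowRetry(now - MAX_TIMEOUT_MS - 1, null, 1, now)); // false: expired
  }
}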
http://git-wip-us.apache.org/repos/asf/ambari/blob/4690fe7c/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
----------------------------------------------------------------------
diff --git a/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
index e699e49..e2ce6e7 100644
--- a/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
+++ b/ambari-server/src/test/java/org/apache/ambari/server/state/services/RetryUpgradeActionServiceTest.java
@@ -109,10 +109,12 @@ public class RetryUpgradeActionServiceTest {
    * Case 4: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
    *   does NOT meet conditions to be retried => no-op
    * Case 5: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
-   *   DOES meet conditions to be retried => retries the task
-   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
+   *   DOES meet conditions to be retried and has values for start time and original start time => retries the task
+   * Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_TIMEDOUT that
+   *   DOES meet conditions to be retried and does not have values for start time or original start time => retries the task
+   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that
    *   was already retried and has now expired => no-op
-   * Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+   * Case 8: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
    *   during Finalize Cluster, which should not be retried => no-op
    * @throws Exception
    */
@@ -185,7 +187,23 @@ public class RetryUpgradeActionServiceTest {
     // Ensure that task 2 transitioned from HOLDING_FAILED to PENDING
     Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());

-    // Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
+    // Case 6: Cluster with an active upgrade that contains a failed task in HOLDING_TIMEDOUT that DOES meet conditions to be retried.
+    hrc2.setStatus(HostRoleStatus.HOLDING_TIMEDOUT);
+    hrc2.setRetryAllowed(true);
+    hrc2.setOriginalStartTime(-1L);
+    hrc2.setStartTime(-1L);
+    hrc2.setLastAttemptTime(-1L);
+    hrc2.setEndTime(-1L);
+    hrc2.setAttemptCount((short) 0);
+    hostRoleCommandDAO.merge(hrc2);
+
+    // Run the service
+    service.runOneIteration();
+
+    // Ensure that task 2 transitioned from HOLDING_TIMEDOUT to PENDING
+    Assert.assertEquals(HostRoleStatus.PENDING, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());
+
+    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED that was already retried and has now expired.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now - (timeoutMins * 60000) - 1);
     hrc2.setStatus(HostRoleStatus.HOLDING_FAILED);
@@ -196,7 +214,7 @@ public class RetryUpgradeActionServiceTest {

     Assert.assertEquals(HostRoleStatus.HOLDING_FAILED, hostRoleCommandDAO.findByPK(hrc2.getTaskId()).getStatus());

-    // Case 7: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
+    // Case 8: Cluster with an active upgrade that contains a failed task in HOLDING_FAILED, but it is a critical task
     // during Finalize Cluster, which should not be retried.
     now = System.currentTimeMillis();
     hrc2.setOriginalStartTime(now);
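
As a closing aside, Case 7's setup forces the expiry branch purely by arithmetic: it backdates original_start_time to one millisecond past the retry window, so the computed delta is negative and the task stays in HOLDING_FAILED. A minimal worked example of that window check, with an assumed 10-minute window standing in for the configured timeout:

public class RetryWindowSketch {
  public static void main(String[] args) {
    long timeoutMins = 10;                   // assumed window length
    long maxTimeoutMs = timeoutMins * 60000; // analogue of MAX_TIMEOUT_MS
    long now = System.currentTimeMillis();

    // Case 7 setup: one millisecond older than the retry window allows.
    long originalStartTime = now - (timeoutMins * 60000) - 1;

    long retryTimeWindow = originalStartTime + maxTimeoutMs;
    long deltaMs = retryTimeWindow - now; // -1: window closed, no retry
    System.out.println("deltaMs = " + deltaMs + ", retry allowed: " + (deltaMs > 0));
  }
}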