AMBARI-21142. Log more info about heartbeat message/response when server - agent communication gets out of sync. (stoader)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/b7101f78 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/b7101f78 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/b7101f78 Branch: refs/heads/branch-feature-AMBARI-20859 Commit: b7101f782be9a1291de589262f01083c70dfc935 Parents: c3c06ea Author: Toader, Sebastian <stoa...@hortonworks.com> Authored: Fri Jun 2 23:09:56 2017 +0200 Committer: Toader, Sebastian <stoa...@hortonworks.com> Committed: Fri Jun 2 23:12:46 2017 +0200 ---------------------------------------------------------------------- .../src/main/python/ambari_agent/Controller.py | 6 +++++- .../ambari/server/agent/HeartBeatHandler.java | 18 ++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/b7101f78/ambari-agent/src/main/python/ambari_agent/Controller.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/Controller.py b/ambari-agent/src/main/python/ambari_agent/Controller.py index 0297f74..bc923c3 100644 --- a/ambari-agent/src/main/python/ambari_agent/Controller.py +++ b/ambari-agent/src/main/python/ambari_agent/Controller.py @@ -321,6 +321,7 @@ class Controller(threading.Thread): logger.log(logging_level, "Sending Heartbeat (id = %s)", self.responseId) response = self.sendRequest(self.heartbeatUrl, data) + exitStatus = 0 if 'exitstatus' in response.keys(): exitStatus = int(response['exitstatus']) @@ -366,7 +367,9 @@ class Controller(threading.Thread): self.restartAgent() if serverId != self.responseId + 1: - logger.error("Error in responseId sequence - restarting") + logger.error("Error in responseId sequence - received responseId={0} from server while expecting {1} - restarting..." 
+ .format(serverId, self.responseId + 1)) + self.restartAgent() else: self.responseId = serverId @@ -465,6 +468,7 @@ class Controller(threading.Thread): #randomize the heartbeat delay = randint(0, self.max_reconnect_retry_delay) + logger.info("Waiting {0} seconds before reconnecting to {1}".format(delay, self.heartbeatUrl)) time.sleep(delay) # Sleep for some time http://git-wip-us.apache.org/repos/asf/ambari/blob/b7101f78/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java index d800bc5..fc6e7a7 100644 --- a/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java +++ b/ambari-server/src/main/java/org/apache/ambari/server/agent/HeartBeatHandler.java @@ -161,10 +161,20 @@ public class HeartBeatHandler { + ", receivedResponseId=" + heartbeat.getResponseId()); if (heartbeat.getResponseId() == currentResponseId - 1) { - LOG.warn("Old responseId received - response was lost - returning cached response"); - return hostResponses.get(hostname); + HeartBeatResponse heartBeatResponse = hostResponses.get(hostname); + + LOG.warn("Old responseId={} received form host {} - response was lost - returning cached response with responseId={}", + heartbeat.getResponseId(), + hostname, + heartBeatResponse.getResponseId()); + + return heartBeatResponse; } else if (heartbeat.getResponseId() != currentResponseId) { - LOG.error("Error in responseId sequence - sending agent restart command"); + LOG.error("Error in responseId sequence - received responseId={} from host {} - sending agent restart command with responseId={}", + heartbeat.getResponseId(), + hostname, + currentResponseId); + return createRestartCommand(currentResponseId); } @@ -186,7 +196,7 @@ public class HeartBeatHandler { if 
(hostObject.getState().equals(HostState.HEARTBEAT_LOST)) { // After losing heartbeat agent should reregister - LOG.warn("Host is in HEARTBEAT_LOST state - sending register command"); + LOG.warn("Host {} is in HEARTBEAT_LOST state - sending register command", hostname); return createRegisterCommand(); }