AMBARI-13913. Express Upgrade: didn't finalize HDFS, improve robustness for HA (alejandro)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/644d8ba4 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/644d8ba4 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/644d8ba4 Branch: refs/heads/branch-dev-patch-upgrade Commit: 644d8ba4bb654bd6c8cdaafc4f906bfae7b0a523 Parents: 55d0b18 Author: Alejandro Fernandez <afernan...@hortonworks.com> Authored: Fri Nov 13 13:23:29 2015 -0800 Committer: Alejandro Fernandez <afernan...@hortonworks.com> Committed: Mon Nov 16 17:26:49 2015 -0800 ---------------------------------------------------------------------- .../HDFS/2.1.0.2.0/package/scripts/namenode.py | 9 ++++- .../package/scripts/namenode_upgrade.py | 38 ++++++++++---------- .../HDP/2.1/upgrades/nonrolling-upgrade-2.3.xml | 2 +- .../HDP/2.2/upgrades/nonrolling-upgrade-2.2.xml | 2 +- .../HDP/2.2/upgrades/nonrolling-upgrade-2.3.xml | 2 +- .../HDP/2.3/upgrades/nonrolling-upgrade-2.3.xml | 2 +- 6 files changed, 31 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/644d8ba4/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py index 1fada76..2d27724 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode.py @@ -178,6 +178,7 @@ class NameNodeDefault(NameNode): """ During NonRolling (aka Express Upgrade), after starting NameNode, which is still in safemode, and then starting all of the DataNodes, we need for NameNode to receive all of the block reports and leave safemode. + If HA is present, then this command will run individually on each NameNode, which checks for its own address. """ import params @@ -190,7 +191,13 @@ class NameNodeDefault(NameNode): try: hdfs_binary = self.get_hdfs_binary() # Note, this fails if namenode_address isn't prefixed with "params." - is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs {params.namenode_address} -safemode get | grep 'Safe mode is OFF'") + + is_namenode_safe_mode_off = "" + if params.dfs_ha_enabled: + is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs hdfs://{params.namenode_rpc} -safemode get | grep 'Safe mode is OFF'") + else: + is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs {params.namenode_address} -safemode get | grep 'Safe mode is OFF'") + # Wait up to 30 mins Execute(is_namenode_safe_mode_off, tries=180, http://git-wip-us.apache.org/repos/asf/ambari/blob/644d8ba4/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode_upgrade.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode_upgrade.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode_upgrade.py index f8a327f..4873b47 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode_upgrade.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/namenode_upgrade.py @@ -76,10 +76,11 @@ def prepare_upgrade_enter_safe_mode(hdfs_binary): # Safe to call if already in Safe Mode desired_state = SafeMode.ON safemode_transition_successful, original_state = reach_safemode_state(params.hdfs_user, desired_state, params.dfs_ha_enabled, hdfs_binary) + Logger.info("Transition successful: {0}, original state: {1}".format(str(safemode_transition_successful), str(original_state))) if not safemode_transition_successful: raise Fail("Could not transition to safemode state %s. Please check logs to make sure namenode is up." % str(desired_state)) except Exception, e: - message = format("Could not enter safemode. As the HDFS user, call this command: {safe_mode_enter_cmd}") + message = "Could not enter safemode. Error: {0}. As the HDFS user, call this command: {1}".format(str(e), safe_mode_enter_cmd) Logger.error(message) raise Fail(message) @@ -95,7 +96,7 @@ def prepare_upgrade_save_namespace(hdfs_binary): Logger.info("Checkpoint the current namespace.") as_user(save_namespace_cmd, params.hdfs_user, env={'PATH': params.hadoop_bin_dir}) except Exception, e: - message = format("Could save the NameSpace. As the HDFS user, call this command: {save_namespace_cmd}") + message = format("Could not save the NameSpace. As the HDFS user, call this command: {save_namespace_cmd}") Logger.error(message) raise Fail(message) @@ -166,16 +167,22 @@ def reach_safemode_state(user, safemode_state, in_ha, hdfs_binary): import params original_state = SafeMode.UNKNOWN - hostname = params.hostname - safemode_check = format("{hdfs_binary} dfsadmin -safemode get") + safemode_base_command = "" + if params.dfs_ha_enabled: + safemode_base_command = format("{hdfs_binary} dfsadmin -fs hdfs://{params.namenode_rpc} -safemode ") + else: + safemode_base_command = format("{hdfs_binary} dfsadmin -fs {params.namenode_address} -safemode ") + safemode_check_cmd = safemode_base_command + " get" + + grep_pattern = format("Safe mode is {safemode_state}") + safemode_check_with_grep = format("{safemode_check_cmd} | grep '{grep_pattern}'") - grep_pattern = format("Safe mode is {safemode_state} in {hostname}") if in_ha else format("Safe mode is {safemode_state}") - safemode_check_with_grep = format("hdfs dfsadmin -safemode get | grep '{grep_pattern}'") - code, out = shell.call(safemode_check, user=user) - Logger.info("Command: %s\nCode: %d." % (safemode_check, code)) + code, out = shell.call(safemode_check_cmd, user=user, logoutput=True) + Logger.info("Command: %s\nCode: %d." % (safemode_check_cmd, code)) if code == 0 and out is not None: Logger.info(out) - re_pattern = r"Safe mode is (\S*) in " + hostname.replace(".", "\\.") if in_ha else r"Safe mode is (\S*)" + re_pattern = r"Safe mode is (\S*)" + Logger.info("Pattern to search: {0}".format(re_pattern)) m = re.search(re_pattern, out, re.IGNORECASE) if m and len(m.groups()) >= 1: original_state = m.group(1).upper() @@ -184,7 +191,7 @@ def reach_safemode_state(user, safemode_state, in_ha, hdfs_binary): return (True, original_state) else: # Make a transition - command = "{0} dfsadmin -safemode {1}".format(hdfs_binary, safemode_to_instruction[safemode_state]) + command = safemode_base_command + safemode_to_instruction[safemode_state] Execute(command, user=user, logoutput=True, @@ -248,15 +255,8 @@ def finalize_upgrade(upgrade_type, hdfs_binary): kinit_command = format("{params.kinit_path_local} -kt {params.hdfs_user_keytab} {params.hdfs_principal_name}") Execute(kinit_command, user=params.hdfs_user, logoutput=True) - finalize_cmd = "" - query_cmd = "" - if upgrade_type == "rolling": - finalize_cmd = format("{hdfs_binary} dfsadmin -rollingUpgrade finalize") - query_cmd = format("{hdfs_binary} dfsadmin -rollingUpgrade query") - - elif upgrade_type == "nonrolling": - finalize_cmd = format("{hdfs_binary} dfsadmin -finalizeUpgrade") - query_cmd = format("{hdfs_binary} dfsadmin -rollingUpgrade query") + finalize_cmd = format("{hdfs_binary} dfsadmin -rollingUpgrade finalize") + query_cmd = format("{hdfs_binary} dfsadmin -rollingUpgrade query") Execute(query_cmd, user=params.hdfs_user, http://git-wip-us.apache.org/repos/asf/ambari/blob/644d8ba4/ambari-server/src/main/resources/stacks/HDP/2.1/upgrades/nonrolling-upgrade-2.3.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.1/upgrades/nonrolling-upgrade-2.3.xml b/ambari-server/src/main/resources/stacks/HDP/2.1/upgrades/nonrolling-upgrade-2.3.xml index efc3753..c2e9df4 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.1/upgrades/nonrolling-upgrade-2.3.xml +++ b/ambari-server/src/main/resources/stacks/HDP/2.1/upgrades/nonrolling-upgrade-2.3.xml @@ -373,7 +373,7 @@ <direction>UPGRADE</direction> <execute-stage service="HDFS" component="NAMENODE" title="Wait to leave Safemode"> - <task xsi:type="execute" hosts="master" summary="Wait for NameNode to leave Safemode"> + <task xsi:type="execute" hosts="all" summary="Wait for NameNode to leave Safemode"> <script>scripts/namenode.py</script> <function>wait_for_safemode_off</function> </task> http://git-wip-us.apache.org/repos/asf/ambari/blob/644d8ba4/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.2.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.2.xml b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.2.xml index fa69e72..950ece1 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.2.xml +++ b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.2.xml @@ -295,7 +295,7 @@ <direction>UPGRADE</direction> <execute-stage service="HDFS" component="NAMENODE" title="Wait to leave Safemode"> - <task xsi:type="execute" hosts="master" summary="Wait for NameNode to leave Safemode"> + <task xsi:type="execute" hosts="all" summary="Wait for NameNode to leave Safemode"> <script>scripts/namenode.py</script> <function>wait_for_safemode_off</function> </task> http://git-wip-us.apache.org/repos/asf/ambari/blob/644d8ba4/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.3.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.3.xml b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.3.xml index 6282fdc..160f0b8 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.3.xml +++ b/ambari-server/src/main/resources/stacks/HDP/2.2/upgrades/nonrolling-upgrade-2.3.xml @@ -552,7 +552,7 @@ <direction>UPGRADE</direction> <execute-stage service="HDFS" component="NAMENODE" title="Wait to leave Safemode"> - <task xsi:type="execute" hosts="master" summary="Wait for NameNode to leave Safemode"> + <task xsi:type="execute" hosts="all" summary="Wait for NameNode to leave Safemode"> <script>scripts/namenode.py</script> <function>wait_for_safemode_off</function> </task> http://git-wip-us.apache.org/repos/asf/ambari/blob/644d8ba4/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/nonrolling-upgrade-2.3.xml ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/nonrolling-upgrade-2.3.xml b/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/nonrolling-upgrade-2.3.xml index 798c895..94fe413 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/nonrolling-upgrade-2.3.xml +++ b/ambari-server/src/main/resources/stacks/HDP/2.3/upgrades/nonrolling-upgrade-2.3.xml @@ -336,7 +336,7 @@ <direction>UPGRADE</direction> <execute-stage service="HDFS" component="NAMENODE" title="Wait to leave Safemode"> - <task xsi:type="execute" hosts="master" summary="Wait for NameNode to leave Safemode"> + <task xsi:type="execute" hosts="all" summary="Wait for NameNode to leave Safemode"> <script>scripts/namenode.py</script> <function>wait_for_safemode_off</function> </task>