AMBARI-13856. Sometimes when HA is enabled NameNode does not wait to leave safe mode on start (aonishuk)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/4a989be6 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/4a989be6 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/4a989be6 Branch: refs/heads/branch-2.1 Commit: 4a989be67ada86e3e99a39c0915005c64cf103d2 Parents: 408f7b7 Author: Andrew Onishuk <aonis...@hortonworks.com> Authored: Thu Nov 12 14:56:13 2015 +0200 Committer: Andrew Onishuk <aonis...@hortonworks.com> Committed: Thu Nov 12 14:56:13 2015 +0200 ---------------------------------------------------------------------- .../2.1.0.2.0/package/scripts/hdfs_namenode.py | 43 ++++++++------------ .../python/stacks/2.0.6/HDFS/test_namenode.py | 17 ++++---- 2 files changed, 25 insertions(+), 35 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/4a989be6/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py index f944b8d..d6a0a41 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/scripts/hdfs_namenode.py @@ -115,10 +115,11 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e Execute(format("{kinit_path_local} -kt {hdfs_user_keytab} {hdfs_principal_name}"), user = params.hdfs_user) - is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'") if params.dfs_ha_enabled: + is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs hdfs://{namenode_rpc} -safemode get | grep 'Safe mode is OFF'") is_active_namenode_cmd = as_user(format("{hdfs_binary} --config {hadoop_conf_dir} haadmin -getServiceState {namenode_id} | grep active"), params.hdfs_user, env={'PATH':params.hadoop_bin_dir}) else: + is_namenode_safe_mode_off = format("{hdfs_binary} dfsadmin -fs {namenode_address} -safemode get | grep 'Safe mode is OFF'") is_active_namenode_cmd = True # During NonRolling Upgrade, both NameNodes are initially down, @@ -129,30 +130,21 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e # ___Scenario___________|_Expected safemode state__|_Wait for safemode OFF____| # no-HA | ON -> OFF | Yes | # HA and active | ON -> OFF | Yes | - # HA and standby | no change | no check | + # HA and standby | ON -> OFF | Yes | # RU with HA on active | ON -> OFF | Yes | # RU with HA on standby | ON -> OFF | Yes | # EU with HA on active | no change | no check | # EU with HA on standby | no change | no check | # EU non-HA | no change | no check | - check_for_safemode_off = False msg = "" if params.dfs_ha_enabled: if upgrade_type is not None: - check_for_safemode_off = True msg = "Must wait to leave safemode since High Availability is enabled during a Stack Upgrade" else: - # During normal operations, the NameNode is expected to be up. - code, out = shell.call(is_active_namenode_cmd, logoutput=True) # If active NN, code will be 0 - if code == 0: # active - check_for_safemode_off = True - msg = "Must wait to leave safemode since High Availability is enabled and this is the Active NameNode." - else: - msg = "Will remain in the current safemode state." + msg = "Must wait to leave safemode since High Availability is enabled." else: msg = "Must wait to leave safemode since High Availability is not enabled." - check_for_safemode_off = True Logger.info(msg) @@ -161,20 +153,19 @@ def namenode(action=None, hdfs_binary=None, do_format=True, upgrade_type=None, e if upgrade_type == "nonrolling": stay_in_safe_mode = True - if check_for_safemode_off: - Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode)) - if not stay_in_safe_mode: - Logger.info("Wait to leafe safemode since must transition from ON to OFF.") - try: - # Wait up to 30 mins - Execute(is_namenode_safe_mode_off, - tries=180, - try_sleep=10, - user=params.hdfs_user, - logoutput=True - ) - except Fail: - Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.") + Logger.info("Stay in safe mode: {0}".format(stay_in_safe_mode)) + if not stay_in_safe_mode: + Logger.info("Wait to leafe safemode since must transition from ON to OFF.") + try: + # Wait up to 30 mins + Execute(is_namenode_safe_mode_off, + tries=180, + try_sleep=10, + user=params.hdfs_user, + logoutput=True + ) + except Fail: + Logger.error("NameNode is still in safemode, please be careful with commands that need safemode OFF.") # Always run this on non-HA, or active NameNode during HA. create_hdfs_directories(is_active_namenode_cmd) http://git-wip-us.apache.org/repos/asf/ambari/blob/4a989be6/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py index f18d501..74dc577 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py @@ -416,7 +416,7 @@ class TestNamenode(RMFTestCase): environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'}, not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid", ) - self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'", + self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'", tries=180, try_sleep=10, user="hdfs", @@ -507,7 +507,7 @@ class TestNamenode(RMFTestCase): self.assertResourceCalled('Execute', '/usr/bin/kinit -kt /etc/security/keytabs/hdfs.headless.keytab hdfs', user = 'hdfs', ) - self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'", + self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'", tries=180, try_sleep=10, user="hdfs", @@ -607,7 +607,7 @@ class TestNamenode(RMFTestCase): environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'}, not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid", ) - self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'", + self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6401.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'", tries=180, try_sleep=10, user="hdfs", @@ -706,7 +706,7 @@ class TestNamenode(RMFTestCase): environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'}, not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid", ) - self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'", + self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6402.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'", tries=180, try_sleep=10, user="hdfs", @@ -759,10 +759,10 @@ class TestNamenode(RMFTestCase): ) self.assertNoMoreResources() self.assertTrue(call_mocks.called) - self.assertEqual(2, call_mocks.call_count) + self.assertEqual(1, call_mocks.call_count) calls = [ call('hdfs namenode -bootstrapStandby -nonInteractive', logoutput=False, user=u'hdfs'), - call("ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'", logoutput=True)] + ] call_mocks.assert_has_calls(calls, any_order=False) # tests namenode start command when NameNode HA is enabled, and @@ -813,7 +813,7 @@ class TestNamenode(RMFTestCase): environment = {'HADOOP_LIBEXEC_DIR': '/usr/lib/hadoop/libexec'}, not_if = "ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E test -f /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid && ambari-sudo.sh [RMF_ENV_PLACEHOLDER] -H -E pgrep -F /var/run/hadoop/hdfs/hadoop-hdfs-namenode.pid", ) - self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://ns1 -safemode get | grep 'Safe mode is OFF'", + self.assertResourceCalled('Execute', "hdfs dfsadmin -fs hdfs://c6402.ambari.apache.org:8020 -safemode get | grep 'Safe mode is OFF'", tries=180, try_sleep=10, user="hdfs", @@ -866,9 +866,8 @@ class TestNamenode(RMFTestCase): ) self.assertNoMoreResources() self.assertTrue(call_mocks.called) - self.assertEqual(3, call_mocks.call_count) + self.assertEqual(2, call_mocks.call_count) calls = [ - call("ambari-sudo.sh su hdfs -l -s /bin/bash -c 'export PATH=/bin:/usr/bin ; hdfs --config /etc/hadoop/conf haadmin -getServiceState nn2 | grep active'", logoutput=True), call('hdfs namenode -bootstrapStandby -nonInteractive -force', logoutput=False, user=u'hdfs'), call('hdfs namenode -bootstrapStandby -nonInteractive -force', logoutput=False, user=u'hdfs')] call_mocks.assert_has_calls(calls, any_order=True)