Repository: ambari Updated Branches: refs/heads/branch-2.1 61bf60bb7 -> 39b8e5fa4
AMBARI-13427: NAMENODE START failed with both NN's being passive (jluniya) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/39b8e5fa Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/39b8e5fa Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/39b8e5fa Branch: refs/heads/branch-2.1 Commit: 39b8e5fa46d29a375981fd225b3c9102d5d6dade Parents: 61bf60b Author: Jayush Luniya <jlun...@hortonworks.com> Authored: Fri Oct 16 11:30:38 2015 -0700 Committer: Jayush Luniya <jlun...@hortonworks.com> Committed: Fri Oct 16 11:31:44 2015 -0700 ---------------------------------------------------------------------- .../libraries/functions/decorator.py | 5 ++-- .../libraries/functions/namenode_ha_utils.py | 27 ++++++++++++++++++-- .../python/stacks/2.0.6/HDFS/test_namenode.py | 17 +++++++++++- 3 files changed, 44 insertions(+), 5 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/39b8e5fa/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py ---------------------------------------------------------------------- diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py b/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py index cd653e5..1b45981 100644 --- a/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py +++ b/ambari-common/src/main/python/resource_management/libraries/functions/decorator.py @@ -26,7 +26,7 @@ __all__ = ['retry', ] from resource_management.core.logger import Logger -def retry(times=3, sleep_time=1, backoff_factor=1, err_class=Exception): +def retry(times=3, sleep_time=1, max_sleep_time=8, backoff_factor=1, err_class=Exception): """ Retry decorator for improved robustness of functions. :param times: Number of times to attempt to call the function. @@ -44,12 +44,13 @@ def retry(times=3, sleep_time=1, backoff_factor=1, err_class=Exception): while _times > 1: _times -= 1 - _sleep_time *= _backoff_factor try: return function(*args, **kwargs) except _err_class, err: Logger.info("Will retry %d time(s), caught exception: %s. Sleeping for %d sec(s)" % (_times, str(err), _sleep_time)) time.sleep(_sleep_time) + if(_sleep_time * _backoff_factor <= max_sleep_time): + _sleep_time *= _backoff_factor return function(*args, **kwargs) return wrapper http://git-wip-us.apache.org/repos/asf/ambari/blob/39b8e5fa/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py ---------------------------------------------------------------------- diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py b/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py index 99f90b8..0920e85 100644 --- a/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py +++ b/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py @@ -23,6 +23,8 @@ from resource_management.libraries.functions.format import format from resource_management.libraries.functions.jmx import get_value_from_jmx from resource_management.core.base import Fail from resource_management.core import shell +from resource_management.core.logger import Logger +from resource_management.libraries.functions.decorator import retry __all__ = ["get_namenode_states", "get_active_namenode", "get_property_for_active_namenode"] @@ -32,8 +34,29 @@ HDFS_NN_STATE_STANDBY = 'standby' NAMENODE_HTTP_FRAGMENT = 'dfs.namenode.http-address.{0}.{1}' NAMENODE_HTTPS_FRAGMENT = 'dfs.namenode.https-address.{0}.{1}' JMX_URI_FRAGMENT = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem" - -def get_namenode_states(hdfs_site, security_enabled, run_user): + +def get_namenode_states(hdfs_site, security_enabled, run_user, times=10, sleep_time=1, backoff_factor=2): + """ + return format [('nn1', 'hdfs://hostname1:port1'), ('nn2', 'hdfs://hostname2:port2')] , [....], [....] + """ + @retry(times=times, sleep_time=sleep_time, backoff_factor=backoff_factor, err_class=Fail) + def doRetries(hdfs_site, security_enabled, run_user): + doRetries.attempt += 1 + active_namenodes, standby_namenodes, unknown_namenodes = get_namenode_states_noretries(hdfs_site, security_enabled, run_user) + Logger.info( + "NameNode HA states: active_namenodes = {0}, standby_namenodes = {1}, unknown_namenodes = {2}".format( + active_namenodes, standby_namenodes, unknown_namenodes)) + if active_namenodes: + return active_namenodes, standby_namenodes, unknown_namenodes + elif doRetries.attempt == times: + Logger.warning("No active NameNode was found after {0} retries. Will return current NameNode HA states".format(times)) + return active_namenodes, standby_namenodes, unknown_namenodes + raise Fail('No active NameNode was found.') + + doRetries.attempt = 0 + return doRetries(hdfs_site, security_enabled, run_user) + +def get_namenode_states_noretries(hdfs_site, security_enabled, run_user): """ return format [('nn1', 'hdfs://hostname1:port1'), ('nn2', 'hdfs://hostname2:port2')] , [....], [....] """ http://git-wip-us.apache.org/repos/asf/ambari/blob/39b8e5fa/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py index b242c87..68d7d62 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_namenode.py @@ -1251,10 +1251,16 @@ class TestNamenode(RMFTestCase): put_structured_out_mock.assert_called_with({"securityState": "UNSECURED"}) - def test_upgrade_restart(self): + @patch("utils.get_namenode_states") + def test_upgrade_restart(self, get_namenode_states_mock): # Execution of nn_ru_lzo invokes a code path that invokes lzo installation, which # was failing in RU case. See hdfs.py and the lzo_enabled check that is in it. # Just executing the script is enough to test the fix + active_namenodes = [('nn1', 'c6401.ambari.apache.org:50070')] + standby_namenodes = [('nn2', 'c6402.ambari.apache.org:50070')] + unknown_namenodes = [] + + get_namenode_states_mock.return_value = active_namenodes, standby_namenodes, unknown_namenodes self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py", classname = "NameNode", command = "restart", @@ -1262,6 +1268,15 @@ class TestNamenode(RMFTestCase): hdp_stack_version = self.STACK_VERSION, target = RMFTestCase.TARGET_COMMON_SERVICES) + unknown_namenodes = active_namenodes + active_namenodes = [] + get_namenode_states_mock.return_value = active_namenodes, standby_namenodes, unknown_namenodes + self.executeScript(self.COMMON_SERVICES_PACKAGE_DIR + "/scripts/namenode.py", + classname = "NameNode", + command = "restart", + config_file = "nn_ru_lzo.json", + hdp_stack_version = self.STACK_VERSION, + target = RMFTestCase.TARGET_COMMON_SERVICES) def test_pre_rolling_restart(self): config_file = self.get_src_folder()+"/test/python/stacks/2.0.6/configs/default.json"