Repository: ambari
Updated Branches:
  refs/heads/trunk 6609364a0 -> 00fe3df41
AMBARI-5108. HBaseRegionServer requires multiple retries to be stopped during reassigning NameNode after EnablingHA (aonishuk)

Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/00fe3df4
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/00fe3df4
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/00fe3df4

Branch: refs/heads/trunk
Commit: 00fe3df41b1f75c063d4a8f84102ec97f4f98559
Parents: 6609364
Author: Andrew Onischuk <[email protected]>
Authored: Sun Mar 16 10:59:25 2014 -0700
Committer: Andrew Onischuk <[email protected]>
Committed: Sun Mar 16 10:59:25 2014 -0700

----------------------------------------------------------------------
 .../core/providers/system.py                 | 11 +++++++++-
 .../core/resources/system.py                 |  6 ++++++
 .../python/resource_management/core/shell.py | 21 +++++++++++++-------
 .../HBASE/package/scripts/hbase_service.py   | 21 ++++++++++++--------
 .../stacks/2.0.6/HBASE/test_hbase_master.py  | 16 +++++++++++----
 .../2.0.6/HBASE/test_hbase_regionserver.py   | 16 +++++++++++----
 6 files changed, 67 insertions(+), 24 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-agent/src/main/python/resource_management/core/providers/system.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/resource_management/core/providers/system.py b/ambari-agent/src/main/python/resource_management/core/providers/system.py
index a37ba85..fee24be 100644
--- a/ambari-agent/src/main/python/resource_management/core/providers/system.py
+++ b/ambari-agent/src/main/python/resource_management/core/providers/system.py
@@ -27,6 +27,7 @@ import os
 import pwd
 import time
 import shutil
+from subprocess import TimeoutExpired
 from resource_management.core import shell
 from resource_management.core.base import Fail
 from resource_management.core.providers import
Provider @@ -231,7 +232,7 @@ class ExecuteProvider(Provider): shell.checked_call(self.resource.command, logoutput=self.resource.logoutput, cwd=self.resource.cwd, env=self.resource.environment, preexec_fn=_preexec_fn(self.resource), user=self.resource.user, - wait_for_finish=self.resource.wait_for_finish) + wait_for_finish=self.resource.wait_for_finish, timeout=self.resource.timeout) break except Fail as ex: if i == self.resource.tries-1: # last try @@ -239,6 +240,14 @@ class ExecuteProvider(Provider): else: Logger.info("Retrying after %d seconds. Reason: %s" % (self.resource.try_sleep, str(ex))) time.sleep(self.resource.try_sleep) + except TimeoutExpired: + err_msg = ("Execution of '%s' was killed due timeout after %d seconds") % (self.resource.command, self.resource.timeout) + + if self.resource.on_timeout: + Logger.info("Executing '%s'. Reason: %s" % (self.resource.on_timeout, err_msg)) + shell.checked_call(self.resource.on_timeout) + else: + raise Fail(err_msg) class ExecuteScriptProvider(Provider): http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-agent/src/main/python/resource_management/core/resources/system.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/resource_management/core/resources/system.py b/ambari-agent/src/main/python/resource_management/core/resources/system.py index 45d7a60..0952c48 100644 --- a/ambari-agent/src/main/python/resource_management/core/resources/system.py +++ b/ambari-agent/src/main/python/resource_management/core/resources/system.py @@ -85,6 +85,12 @@ class Execute(Resource): actions = Resource.actions + ["run"] logoutput = BooleanArgument(default=False) """ + if on_timeout is not set leads to failing after x seconds, + otherwise calls on_timeout + """ + timeout = ResourceArgument() # seconds + on_timeout = ResourceArgument() + """ Wait for command to finish or not. 
NOTE: http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-agent/src/main/python/resource_management/core/shell.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/resource_management/core/shell.py b/ambari-agent/src/main/python/resource_management/core/shell.py index 68d3f7b..77f2a9c 100644 --- a/ambari-agent/src/main/python/resource_management/core/shell.py +++ b/ambari-agent/src/main/python/resource_management/core/shell.py @@ -24,20 +24,21 @@ __all__ = ["checked_call", "call"] import subprocess import pipes +from subprocess import TimeoutExpired from exceptions import Fail from resource_management.core.logger import Logger def checked_call(command, logoutput=False, - cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True): - return _call(command, logoutput, True, cwd, env, preexec_fn, user, wait_for_finish) + cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None): + return _call(command, logoutput, True, cwd, env, preexec_fn, user, wait_for_finish, timeout) def call(command, logoutput=False, - cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True): - return _call(command, logoutput, False, cwd, env, preexec_fn, user, wait_for_finish) + cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None): + return _call(command, logoutput, False, cwd, env, preexec_fn, user, wait_for_finish, timeout) def _call(command, logoutput=False, throw_on_failure=True, - cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True): + cwd=None, env=None, preexec_fn=None, user=None, wait_for_finish=True, timeout=None): """ Execute shell command @@ -63,8 +64,14 @@ def _call(command, logoutput=False, throw_on_failure=True, if not wait_for_finish: return None, None + - out = proc.communicate()[0].strip('\n') + try: + out = proc.communicate(timeout=timeout)[0].strip('\n') + except TimeoutExpired as ex: + proc.terminate() + 
raise ex + code = proc.returncode if logoutput and out: @@ -74,4 +81,4 @@ def _call(command, logoutput=False, throw_on_failure=True, err_msg = ("Execution of '%s' returned %d. %s") % (command[-1], code, out) raise Fail(err_msg) - return code, out + return code, out \ No newline at end of file http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py index 17f0056..d0a6b50 100644 --- a/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py +++ b/ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HBASE/package/scripts/hbase_service.py @@ -29,18 +29,23 @@ def hbase_service( role = name cmd = format("{daemon_script} --config {conf_dir}") pid_file = format("{pid_dir}/hbase-{hbase_user}-{role}.pid") - - daemon_cmd = None - no_op_test = None + no_op_test = format("ls {pid_file} >/dev/null 2>&1 && ps `cat {pid_file}` >/dev/null 2>&1") if action == 'start': daemon_cmd = format("{cmd} start {role}") - no_op_test = format("ls {pid_file} >/dev/null 2>&1 && ps `cat {pid_file}` >/dev/null 2>&1") - elif action == 'stop': - daemon_cmd = format("{cmd} stop {role} && rm -f {pid_file}") - - if daemon_cmd is not None: + Execute ( daemon_cmd, not_if = no_op_test, user = params.hbase_user ) + elif action == 'stop': + daemon_cmd = format("{cmd} stop {role}") + + Execute ( daemon_cmd, + user = params.hbase_user, + # BUGFIX: hbase regionserver sometimes hangs when nn is in safemode + timeout = 30, + on_timeout = format("{no_op_test} && kill -9 `cat {pid_file}`") + ) + + Execute (format("rm -f {pid_file}")) \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py index 1084ed3..2d77e99 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_master.py @@ -53,9 +53,13 @@ class TestHBaseMaster(RMFTestCase): config_file="default.json" ) - self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master && rm -f /var/run/hbase/hbase-hbase-master.pid', - not_if = None, + self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master', user = 'hbase', + on_timeout = 'ls /var/run/hbase/hbase-hbase-master.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-master.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-master.pid`', + timeout = 30, + ) + + self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-master.pid', ) self.assertNoMoreResources() @@ -136,9 +140,13 @@ class TestHBaseMaster(RMFTestCase): config_file="secured.json" ) - self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master && rm -f /var/run/hbase/hbase-hbase-master.pid', - not_if = None, + self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop master', user = 'hbase', + on_timeout = 'ls /var/run/hbase/hbase-hbase-master.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-master.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-master.pid`', + timeout = 30, + ) + + self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-master.pid', ) self.assertNoMoreResources() 
http://git-wip-us.apache.org/repos/asf/ambari/blob/00fe3df4/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py index 4ced781..920312a 100644 --- a/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py +++ b/ambari-server/src/test/python/stacks/2.0.6/HBASE/test_hbase_regionserver.py @@ -53,9 +53,13 @@ class TestHbaseRegionServer(RMFTestCase): config_file="default.json" ) - self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver && rm -f /var/run/hbase/hbase-hbase-regionserver.pid', - not_if = None, + self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver', user = 'hbase', + on_timeout = 'ls /var/run/hbase/hbase-hbase-regionserver.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-regionserver.pid`', + timeout = 30, + ) + + self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-regionserver.pid', ) self.assertNoMoreResources() @@ -90,9 +94,13 @@ class TestHbaseRegionServer(RMFTestCase): config_file="secured.json" ) - self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver && rm -f /var/run/hbase/hbase-hbase-regionserver.pid', - not_if = None, + self.assertResourceCalled('Execute', '/usr/lib/hbase/bin/hbase-daemon.sh --config /etc/hbase/conf stop regionserver', user = 'hbase', + on_timeout = 'ls /var/run/hbase/hbase-hbase-regionserver.pid >/dev/null 2>&1 && ps `cat /var/run/hbase/hbase-hbase-regionserver.pid` >/dev/null 2>&1 && kill -9 `cat /var/run/hbase/hbase-hbase-regionserver.pid`', + timeout = 30, + ) + + 
self.assertResourceCalled('Execute', 'rm -f /var/run/hbase/hbase-hbase-regionserver.pid', ) self.assertNoMoreResources()
