Repository: ambari Updated Branches: refs/heads/branch-2.0.maint d0f32d236 -> 138b8ec86
AMBARI-10464 - Ambari Agent holding socket open on 50070 prevents NN from starting (jonathanhurley) Project: http://git-wip-us.apache.org/repos/asf/ambari/repo Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/138b8ec8 Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/138b8ec8 Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/138b8ec8 Branch: refs/heads/branch-2.0.maint Commit: 138b8ec86c74d841bd6a7381aea727d0a05ea7c4 Parents: d0f32d2 Author: Jonathan Hurley <jhur...@hortonworks.com> Authored: Tue Apr 14 11:00:59 2015 -0400 Committer: Jonathan Hurley <jhur...@hortonworks.com> Committed: Tue Apr 21 08:30:33 2015 -0400 ---------------------------------------------------------------------- .../src/main/python/ambari_agent/alerts/metric_alert.py | 4 +++- .../HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py | 4 +++- .../HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py | 4 +++- .../HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py | 5 +++-- .../YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py | 4 +++- .../2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py | 6 ++++-- .../0.8/services/HDFS/package/files/alert_checkpoint_time.py | 4 +++- .../services/HDFS/package/files/alert_ha_namenode_health.py | 4 +++- .../0.8/services/WEBHCAT/package/files/alert_webhcat_server.py | 5 +++-- .../services/YARN/package/files/alert_nodemanager_health.py | 4 +++- 10 files changed, 31 insertions(+), 13 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py ---------------------------------------------------------------------- diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py index 8b5f15d..33f7508 100644 --- a/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py +++ b/ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py @@ -31,6 +31,8 @@ from resource_management.libraries.functions.get_port_from_url import get_port_f logger = logging.getLogger() +CONNECTION_TIMEOUT = 5.0 + class MetricAlert(BaseAlert): def __init__(self, alert_meta, alert_source_meta): @@ -157,7 +159,7 @@ class MetricAlert(BaseAlert): response = None try: url_opener = urllib2.build_opener(RefreshHeaderProcessor()) - response = url_opener.open(url) + response = url_opener.open(url, timeout=CONNECTION_TIMEOUT) content = response.read() finally: # explicitely close the connection as we've seen python hold onto these http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py index 032310d..2455d3b 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py @@ -36,6 +36,8 @@ PERCENT_CRITICAL = 200 CHECKPOINT_TX_DEFAULT = 1000000 CHECKPOINT_PERIOD_DEFAULT = 21600 +CONNECTION_TIMEOUT = 5.0 + def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -133,7 +135,7 @@ def get_value_from_jmx(query, jmx_property): response = None try: - response = urllib2.urlopen(query) + response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py index 058b7b2..2066d46 100644 --- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py +++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py @@ -35,6 +35,8 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' +CONNECTION_TIMEOUT = 5.0 + def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -163,7 +165,7 @@ def get_value_from_jmx(query, jmx_property): response = None try: - response = urllib2.urlopen(query) + response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py index e991f53..dd20be4 100644 --- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py +++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py @@ -53,7 +53,8 @@ KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}} WEBHCAT_OK_RESPONSE = 'ok' WEBHCAT_PORT_DEFAULT = 50111 -CURL_CONNECTION_TIMEOUT = '10' +CURL_CONNECTION_TIMEOUT = '5' +CONNECTION_TIMEOUT = 5.0 def get_tokens(): """ @@ -177,7 +178,7 @@ def execute(parameters=None, host_name=None): try: # execute the query for the JSON that includes WebHCat status start_time = time.time() - url_response = urllib2.urlopen(query_url) + url_response = urllib2.urlopen(query_url, timeout=CONNECTION_TIMEOUT) total_time = time.time() - start_time json_response = json.loads(url_response.read()) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py index 8c72f4c..516d858 100644 --- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py +++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py @@ -40,6 +40,8 @@ CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager hea NODEMANAGER_DEFAULT_PORT = 8042 +CONNECTION_TIMEOUT = 5.0 + def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -106,7 +108,7 @@ def execute(parameters=None, host_name=None): try: # execute the query for the JSON that includes templeton status - url_response = urllib2.urlopen(query) + url_response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) except urllib2.HTTPError, httpError: label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query, str(httpError)) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py index b297b0c..7c00625 100644 --- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py +++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py @@ -29,6 +29,8 @@ OK_LABEL = 'All NodeManagers are healthy' NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.address}}' NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.https.address}}' YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}' + +CONNECTION_TIMEOUT = 5.0 def get_tokens(): """ @@ -99,7 +101,7 @@ def execute(parameters=None, host_name=None): label = str(e) result_code = 'UNKNOWN' - return ((result_code, [label])) + return (result_code, [label]) def get_value_from_jmx(query, jmx_property): @@ -109,7 +111,7 @@ def get_value_from_jmx(query, jmx_property): # use a customer header process that will look for the non-standard # "Refresh" header and attempt to follow the redirect url_opener = urllib2.build_opener(RefreshHeaderProcessor()) - response = url_opener.open(query) + response = url_opener.open(query, timeout=CONNECTION_TIMEOUT) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py index 032310d..2455d3b 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py @@ -36,6 +36,8 @@ PERCENT_CRITICAL = 200 CHECKPOINT_TX_DEFAULT = 1000000 CHECKPOINT_PERIOD_DEFAULT = 21600 +CONNECTION_TIMEOUT = 5.0 + def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -133,7 +135,7 @@ def get_value_from_jmx(query, jmx_property): response = None try: - response = urllib2.urlopen(query) + response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py index 058b7b2..2066d46 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py @@ -35,6 +35,8 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}' NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}' DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}' +CONNECTION_TIMEOUT = 5.0 + def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -163,7 +165,7 @@ def get_value_from_jmx(query, jmx_property): response = None try: - response = urllib2.urlopen(query) + response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) data = response.read() data_dict = json.loads(data) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py index e991f53..dd20be4 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py @@ -53,7 +53,8 @@ KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}} WEBHCAT_OK_RESPONSE = 'ok' WEBHCAT_PORT_DEFAULT = 50111 -CURL_CONNECTION_TIMEOUT = '10' +CURL_CONNECTION_TIMEOUT = '5' +CONNECTION_TIMEOUT = 5.0 def get_tokens(): """ @@ -177,7 +178,7 @@ def execute(parameters=None, host_name=None): try: # execute the query for the JSON that includes WebHCat status start_time = time.time() - url_response = urllib2.urlopen(query_url) + url_response = urllib2.urlopen(query_url, timeout=CONNECTION_TIMEOUT) total_time = time.time() - start_time json_response = json.loads(url_response.read()) http://git-wip-us.apache.org/repos/asf/ambari/blob/138b8ec8/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py ---------------------------------------------------------------------- diff --git a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py index 8c72f4c..516d858 100644 --- a/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py +++ b/ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py @@ -40,6 +40,8 @@ CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager hea NODEMANAGER_DEFAULT_PORT = 8042 +CONNECTION_TIMEOUT = 5.0 + def get_tokens(): """ Returns a tuple of tokens in the format {{site/property}} that will be used @@ -106,7 +108,7 @@ def execute(parameters=None, host_name=None): try: # execute the query for the JSON that includes templeton status - url_response = urllib2.urlopen(query) + url_response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT) except urllib2.HTTPError, httpError: label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query, str(httpError))