Filippo Giunchedi has submitted this change and it was merged.

Change subject: elasticsearch: add percent-based shard check
......................................................................


elasticsearch: add percent-based shard check

One of the problems that have been observed is the cluster going temporarily
red while elasticsearch shuffles shards around and the nagios check firing a
false positive.

This plugin adds functionality to check non-active shards (i.e. everything that
ES reports as not in its place) and, critically, the ability to check them as a
percentage of the total shards.

This should reduce the number of false positives. The plan is to extend the
plugin with more in-depth checks on the index/shard/etc status and so on.

By default the plugin also checks the cluster status as a whole and reports a
problem if it is non-green; however, that check can be disabled to avoid false
positives for the reasons explained above.

Change-Id: Icf4240e08dd612ab43825db63b0eae3608b73225
---
A modules/elasticsearch/files/nagios/check_elasticsearch.py
A modules/elasticsearch/files/nagios/check_elasticsearch_test.py
2 files changed, 182 insertions(+), 0 deletions(-)

Approvals:
  Chad: Looks good to me, but someone else must approve
  Filippo Giunchedi: Verified; Looks good to me, approved
  Manybubbles: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/modules/elasticsearch/files/nagios/check_elasticsearch.py 
b/modules/elasticsearch/files/nagios/check_elasticsearch.py
new file mode 100644
index 0000000..de59922
--- /dev/null
+++ b/modules/elasticsearch/files/nagios/check_elasticsearch.py
@@ -0,0 +1,158 @@
+#!/usr/bin/python
+
+# Author: Filippo Giunchedi <fili...@wikimedia.org>
+# Copyright 2014 Wikimedia Foundation
+# Copyright 2014 Filippo Giunchedi
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+import argparse
+import re
+import operator
+import sys
+
+import requests
+
+
+EX_OK = 0  # check passed; the EX_* values are what main() returns to sys.exit()
+EX_WARNING = 1  # defined but not returned anywhere in this file
+EX_CRITICAL = 2  # returned on fetch/JSON errors, shard breach or non-green status
+EX_UNKNOWN = 3  # defined but not returned anywhere in this file
+
+
+class Threshold(object):  # parses strings such as '>=0.1%' into predicate + number
+    '''Implement a simple threshold parser/checker with common predicates and
+    percentages.'''
+
+    PREDICATES = {  # operator text -> comparison function from `operator`
+        '<=': operator.le,
+        '>=': operator.ge,
+        '>': operator.gt,
+        '<': operator.lt,
+        '==': operator.eq,
+        }
+
+    def __init__(self, threshold):  # threshold: string; ValueError if unparsable
+        self.threshold_string = threshold  # kept verbatim for error messages
+        self.predicate = None  # set by _parse(): one of the PREDICATES values
+        self.threshold = None  # set by _parse(): numeric part as a float
+        self.percent = None  # set by _parse(): '%' when present, else None
+        self.FORMAT_RE = re.compile(  # [operator] number [%]; anchored at start only
+            r'^(%s)?\s*([\d.]+)\s*(%%)?' % '|'.join(self.PREDICATES))
+        self._parse(threshold)  # NOTE(review): no '$' above, so trailing text is ignored
+
+    def breach(self, value, total=None):  # True when predicate(value, threshold) holds
+        if total is None and self.percent is not None:  # '%' threshold needs a total
+            raise ValueError('threshold %r has percent but no total provided' %
+                             self.threshold_string)
+        if total is not None:  # NOTE(review): applies to non-percent thresholds too
+            value = float(value) / total  # 0..1 ratio, not x100; total == 0 raises
+        return self.predicate(value, self.threshold)
+
+    def _parse(self, threshold):  # fills predicate/threshold/percent from the string
+        m = self.FORMAT_RE.match(threshold)
+        if not m:
+            raise ValueError('unable to parse threshold: %r' % threshold)
+        predicate, value, percent = m.groups()  # predicate and percent may be None
+        try:
+            value = float(value)  # [\d.]+ also admits e.g. '1.2.3', rejected here
+        except ValueError, e:  # Python 2 syntax; `e` is unused
+            raise ValueError('unable to parse as float: %r' % value)
+        self.predicate = self.PREDICATES.get(predicate, operator.eq)  # default '=='
+        self.threshold = value
+        self.percent = percent
+
+
+def check_status(health):  # health: mapping with a 'status' key; returns an EX_* code
+    if health['status'] != 'green':  # any value other than 'green' is critical
+        return EX_CRITICAL  # NOTE: nothing is printed on this path
+    return EX_OK
+
+
+def log_critical(log):  # log: message text appended after the fixed prefix
+    print 'CRITICAL - elasticsearch %s' % log  # Python 2 print statement, to stdout
+
+
+def log_ok(log):  # log: message text appended after the fixed prefix
+    print 'OK - elasticsearch %s' % log  # Python 2 print statement, to stdout
+
+
+def check_shards_inactive(health, threshold):  # returns EX_OK or EX_CRITICAL
+    total_shards = 0
+    inactive_shards = 0
+    for s in 'relocating', 'initializing', 'unassigned':  # states counted as inactive
+        inactive_shards += health['%s_shards' % s]  # KeyError if the key is missing
+        total_shards += health['%s_shards' % s]
+    total_shards += health['active_shards']  # total = inactive + active
+    t = Threshold(threshold)  # raises ValueError on an unparsable threshold string
+    if not t.breach(inactive_shards, total_shards):  # total always passed: ratio compare
+        return EX_OK
+
+    log_critical('inactive shards %s threshold %s breach: %r' % (
+                 inactive_shards, threshold, health))  # whole health mapping is dumped
+    return EX_CRITICAL
+
+
+def check_elasticsearch(options):  # options: parsed argparse namespace; returns EX_*
+    try:
+        cluster_health_url = options.url + '/_cluster/health'
+        response = requests.get(cluster_health_url,
+                                timeout=options.timeout)
+        response.raise_for_status()  # turn HTTP error statuses into an exception
+    except requests.exceptions.RequestException, e:  # Python 2 except syntax
+        log_critical('%s error while fetching: %s' % (cluster_health_url, e))
+        return EX_CRITICAL
+
+    try:
+        cluster_health = response.json()
+    except ValueError, e:  # body was not valid JSON
+        log_critical('%s error while decoding json: %s' % (
+            cluster_health_url, e))
+        return EX_CRITICAL
+
+    r = check_shards_inactive(cluster_health, options.shards_inactive)
+    if r != EX_OK:  # shard check already printed its own CRITICAL line
+        return r
+
+    if not options.ignore_status:  # skipped when --ignore-status is given
+        r = check_status(cluster_health)
+        if r != EX_OK:  # NOTE(review): check_status() prints nothing: silent CRITICAL
+            return r
+
+    log_ok('status %s: %r' % (cluster_health['cluster_name'], cluster_health))
+    return EX_OK
+
+
+def main():  # parse command-line options, run the check, return its EX_* code
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)  # show defaults in --help
+    parser.add_argument('--url', default='http://localhost:9200',
+                        help='Elasticsearch endpoint')  # '/_cluster/health' is appended
+    parser.add_argument('--timeout', default=2, type=int, metavar='SECONDS',
+                        help='Timeout for the request to complete')  # passed to requests.get
+    parser.add_argument('--shards-inactive', default='>=0.1%',
+                        dest='shards_inactive', metavar='THRESHOLD',
+                        help='Threshold to check for inactive shards '
+                             '(i.e. initializing/relocating/unassigned)')  # Threshold syntax
+    parser.add_argument('--ignore-status', default=False, action='store_true',
+                        dest='ignore_status',
+                        help='Do not check elasticsearch cluster status')  # skips check_status()
+    options = parser.parse_args()
+
+    return check_elasticsearch(options)
+
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+    sys.exit(main())  # main()'s EX_* return value becomes the process exit status
diff --git a/modules/elasticsearch/files/nagios/check_elasticsearch_test.py 
b/modules/elasticsearch/files/nagios/check_elasticsearch_test.py
new file mode 100644
index 0000000..89214ba
--- /dev/null
+++ b/modules/elasticsearch/files/nagios/check_elasticsearch_test.py
@@ -0,0 +1,24 @@
+import unittest
+
+from check_elasticsearch import Threshold
+
+
+class ThresholdTest(unittest.TestCase):  # exercises Threshold parsing and breach()
+    def testBasicThreshold(self):  # plain numbers, no total passed
+        self.assertFalse(self._breach('>0', 0))
+        self.assertTrue(self._breach('0', 0))  # no operator given: '==' is used
+        self.assertFalse(self._breach('>=2', 0))
+        self.assertFalse(self._breach('2', 0))
+
+    def testInvalidThreshold(self):  # Threshold() raises before breach() is reached
+        self.assertRaises(ValueError, self._breach, '')
+        self.assertRaises(ValueError, self._breach, '>')  # operator without a number
+        self.assertRaises(ValueError, self._breach, '%123')  # '%' must follow the number
+
+    def testPercentThreshold(self):
+        self.assertRaises(ValueError, self._breach, '>0%', 0)  # percent but no total
+        self.assertTrue(self._breach('>=0.34%', 42, 123))  # compares 42/123 with 0.34
+        self.assertFalse(self._breach('1%', 1, 100))  # compares 1/100 with 1, not 1%
+
+    def _breach(self, threshold, *args):  # helper: build a Threshold and call breach()
+        return Threshold(threshold).breach(*args)

-- 
To view, visit https://gerrit.wikimedia.org/r/154786
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Icf4240e08dd612ab43825db63b0eae3608b73225
Gerrit-PatchSet: 2
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: Chad <ch...@wikimedia.org>
Gerrit-Reviewer: Filippo Giunchedi <fgiunch...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to