Jcrespo has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/355246 )
Change subject: raid-check: optionally return critical when not in a write
policy
......................................................................
raid-check: optionally return critical when not in a write policy
Falling back to a different write policy happens silently in megacli
checks (for example, if BBU is flat, damaged, too hot, etc.). In
some hosts (databases), a policy change means horrible
performance, so bad that it can cause an outage due to the heavy
IO now being as slow as a spinning disk.
This script now detects the current active policy, and returns a
critical alert (not a warning) if the write policy specified does
not match the one requested. By default, it conserves the previous
behaviour (not caring about that), but it can be enabled for hosts
that want, for example, a WriteBack or a WriteThrough policy.
It will also catch a manual misconfiguration (BBU is ok, but has
been configured incorrectly).
Bug: T166108
Change-Id: I6f1c74ec2e4f2982a0cf83b52b566c78f3858133
---
M modules/raid/files/check-raid.py
1 file changed, 47 insertions(+), 11 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/46/355246/1
diff --git a/modules/raid/files/check-raid.py b/modules/raid/files/check-raid.py
index 6868fed..d3974b9 100644
--- a/modules/raid/files/check-raid.py
+++ b/modules/raid/files/check-raid.py
@@ -1,5 +1,6 @@
#!/usr/bin/python
+import argparse
import os
import os.path
import re
@@ -9,14 +10,11 @@
def main():
- try:
- argv_driver = sys.argv[1]
- except:
- argv_driver = None
-
+
+ options = parse_args()
osName = os.uname()[0]
- if argv_driver:
- driver = argv_driver
+ if options.driver:
+ driver = options.driver
elif osName == 'SunOS':
driver = 'zpool'
elif osName == 'Linux':
@@ -35,7 +33,7 @@
elif driver == 'twe':
status = check3ware()
elif driver == 'megacli':
- status = checkMegaSas()
+ status = checkMegaSas(options.policy)
elif driver == 'zpool':
status = checkZfs()
elif driver == 'mpt':
@@ -54,6 +52,23 @@
if status == 0:
print 'OK'
sys.exit(status)
+
+
+def parse_args():
+ """Parse command line arguments"""
+
+ parser = argparse.ArgumentParser(
+ description=('Checks the state of the raid, trying to autodetect'
+ 'of all detected MegaRAID controllers'))
+ parser.add_argument(
+ 'driver', nargs='?', default=None,
+ help='Optional argument indicating the driver to use.')
+ parser.add_argument(
+ '-p', '--policy', default=None,
+ help=('Check that the given cache write policy is currently applied '
+ '(for example WriteBack or WriteThrough)'))
+
+ return parser.parse_args()
def autoDetectDriver():
@@ -262,7 +277,7 @@
return 0
-def checkMegaSas():
+def checkMegaSas(policy=None):
try:
proc = subprocess.Popen(['/usr/sbin/megacli',
'-LDInfo', '-LALL', '-aALL', '-NoLog'],
@@ -275,8 +290,11 @@
stateRegex = re.compile('^State\s*:\s*([^\n]*)')
drivesRegex = re.compile('^Number Of Drives( per span)?\s*:\s*([^\n]*)')
configuredRegex = re.compile('^Adapter \d+: No Virtual Drive Configured')
- numPD = numLD = failedLD = 0
+ writePolicyRegex = re.compile('^Current Cache Policy\s*:\s*([^,]*)')
+
+ numPD = numLD = failedLD = wrongPolicyLD = 0
states = []
+ currentWrongPolicies = []
lines = 0
match = False
@@ -305,6 +323,16 @@
match = True
continue
+ if policy is not None:
+ m = writePolicyRegex.match(line)
+ if m is not None:
+ match = True
+ currentPolicy = m.group(1)
+ if currentPolicy != policy:
+ wrongPolicyLD += 1
+ currentWrongPolicies.append(currentPolicy)
+ continue
+
ret = proc.wait()
if ret != 0:
print 'WARNING: megacli returned exit status %d' % (ret)
@@ -326,7 +354,15 @@
print 'CRITICAL: %d failed LD(s) (%s)' % (failedLD, ", ".join(states))
return 2
- print 'OK: optimal, %d logical, %d physical' % (numLD, numPD)
+ if wrongPolicyLD > 0:
+ print 'CRITICAL: %d LD(s) not in %s policy (%s)' % (
+ wrongPolicyLD, policy, ", ".join(currentWrongPolicies))
+ return 2
+
+ if policy is None:
+ print 'OK: optimal, %d logical, %d physical' % (numLD, numPD)
+ else:
+ print 'OK: optimal, %d logical, %d physical, %s policy' % (numLD,
numPD, policy)
return 0
--
To view, visit https://gerrit.wikimedia.org/r/355246
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I6f1c74ec2e4f2982a0cf83b52b566c78f3858133
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Jcrespo <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits