coren has uploaded a new change for review.
https://gerrit.wikimedia.org/r/230556
Change subject: Redo "nrpe: Merge check_systemd_unit_lastrun into _state"
......................................................................
Redo "nrpe: Merge check_systemd_unit_lastrun into _state"
This reverts commit 0d171b748a0e39592267be3f001b8dca7158e2f3.
Change-Id: I91e58d2add83c5eaded426b9dd9d14c9bec356ad
---
M modules/labstore/manifests/fileserver/replicate.pp
D modules/nrpe/files/plugins/check_systemd_unit_lastrun
M modules/nrpe/files/plugins/check_systemd_unit_state
D modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
M modules/nrpe/manifests/monitor_systemd_unit_state.pp
M modules/nrpe/manifests/systemd_scripts.pp
6 files changed, 111 insertions(+), 221 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/56/230556/1
diff --git a/modules/labstore/manifests/fileserver/replicate.pp
b/modules/labstore/manifests/fileserver/replicate.pp
index 903d833..c516f6e 100644
--- a/modules/labstore/manifests/fileserver/replicate.pp
+++ b/modules/labstore/manifests/fileserver/replicate.pp
@@ -13,14 +13,8 @@
declare_service => false,
}
- nrpe::monitor_systemd_unit_lastrun { "replicate-${title}":
- description => "Last backup of the ${title} filesystem",
- warn_secs => 60*60*1,
- crit_secs => 60*60*2,
- }
-
nrpe::monitor_systemd_unit_state { "replicate-${title}":
- description => "Backup of ${title} filesystem",
- expected_state => "success",
+ description => "Last backup of the ${title} filesystem",
+ expected_state => "periodic 3600",
}
}
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_lastrun
b/modules/nrpe/files/plugins/check_systemd_unit_lastrun
deleted file mode 100755
index 201a992..0000000
--- a/modules/nrpe/files/plugins/check_systemd_unit_lastrun
+++ /dev/null
@@ -1,100 +0,0 @@
-#! /usr/bin/python3
-# -*- coding: utf-8 -*-
-#
-# Copyright © 2015 Marc-André Pelletier <[email protected]>
-#
-# Permission to use, copy, modify, and/or distribute this software for any
-# purpose with or without fee is hereby granted, provided that the above
-# copyright notice and this permission notice appear in all copies.
-#
-# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-#
-# THIS FILE IS MANAGED BY PUPPET
-#
-# Source: modules/labstore/storage-replicate
-# From: modules/labstore/manifests/fileserve.rpp
-#
-
-"""
-check_systemd_unit_lastrun
-
-usage: check_systemd_unit_lastrun <unit> <warn> <crit>
-
-Checks that the systemd unit has been run recently
-enough. Warns if the last start/stop activity is older
-than warn seconds, and criticals if it is older than
-crit seconds.
-"""
-
-import argparse
-import time
-import datetime
-import subprocess
-import logging
-import json
-import sys
-
-def main():
-
- parser = argparse.ArgumentParser()
- parser.add_argument('unit', help='Systemd unit to check')
- parser.add_argument('warn', help='Number of seconds past which a warning
should be emitted')
- parser.add_argument('crit', help='Number of seconds past which a critical
should be emitted')
- args = parser.parse_args()
-
- warn = datetime.timedelta(seconds=int(args.warn))
- crit = datetime.timedelta(seconds=int(args.crit))
-
- logging.basicConfig(level=logging.INFO, format='%(message)s')
-
- log = []
-
- try:
- raw = subprocess.check_output(
- ['/bin/journalctl', '--output=json', '--reverse', '--unit',
args.unit],
- stderr=subprocess.STDOUT).decode()
- for entry in raw.splitlines():
- log.append(json.loads(entry))
- except subprocess.CalledProcessError:
- print('LASTRUN UNKNOWN - Unable to get systemd journal for unit "%s"'
% args.unit)
- sys.exit(3)
- except ValueError:
- print('LASTRUN UNKNOWN - Unable to parse systemd journal for unit
"%s"' % args.unit)
- sys.exit(3)
-
- lastrun = None
- for entry in log:
- try:
- if entry['CODE_FUNCTION'] ==
'unit_status_log_starting_stopping_reloading':
- lastrun = int(float(entry['__REALTIME_TIMESTAMP'])/1000000) #
because microseconds
- break
- except (KeyError):
- pass
-
- if not lastrun:
- print('LASTRUN UNKNOWN - No start/stop information for unit "%s"' %
args.unit)
- sys.exit(3)
-
- age = datetime.timedelta(seconds=int(time.time()) - lastrun)
-
- if age > crit:
- print('LASTRUN CRITICAL - Last run more than %s ago' % crit)
- sys.exit(2)
-
- if age > warn:
- print('LASTRUN WARNING - Last run more than %s ago' % warn)
- sys.exit(1)
-
- print('LASTRUN OK - Last run %s ago' % age)
- sys.exit(0)
-
-if __name__ == "__main__":
- main()
-
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_state
b/modules/nrpe/files/plugins/check_systemd_unit_state
index bffe462..cea8908 100755
--- a/modules/nrpe/files/plugins/check_systemd_unit_state
+++ b/modules/nrpe/files/plugins/check_systemd_unit_state
@@ -1,85 +1,126 @@
-#!/usr/bin/perl
-
-# Copyright 2015 Giuseppe Lavagetto
-# Copyright 2015 Wikimedia Foundation, Inc.
+#! /usr/bin/python3
+# -*- coding: utf-8 -*-
#
-# This nagios plugin is free software, and comes with ABSOLUTELY NO WARRANTY.
-# It may be used, redistributed and/or modified under the terms of the GNU
-# General Public Licence (see http://www.fsf.org/licensing/licenses/gpl.txt).
+# Copyright © 2015 Marc-André Pelletier <[email protected]>
#
-# Example usage:
-# check_systemd_unit_state -s apache2 [ -e <active|inactive|success> ]
+# Permission to use, copy, modify, and/or distribute this software for any
+# purpose with or without fee is hereby granted, provided that the above
+# copyright notice and this permission notice appear in all copies.
#
-# Checks the state of a systemd unit and raises an error unless:
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
-# active: the unit is currently running
-# inactive: the unit is not currently running
-# success: the unit is currently running OR its last result is success
+#
+# THIS FILE IS MANAGED BY PUPPET
+#
+# Source: modules/nrpe/files/plugins/check_systemd_unit_state
+# From: modules/nrpe/manifests/systemd_scripts.pp
+#
-use strict;
-use Nagios::Plugin;
+"""
+check_systemd_unit_state
-sub setup{
- my $np = Nagios::Plugin->new(usage => "Usage: %s -s <service> -e
<active|inactive>");
- $np->add_arg(
- spec => 'service|s=s',
- help => '-s SERVICE',
- required => 1,
- );
- $np->add_arg(
- spec => 'expect|e=s',
- help => '-e active|inactive|success',
- default => 'active',
- );
- $np->getopts;
- my @expected = ('active', 'inactive', 'success');
+usage: check_systemd_unit_state <unit> <expect> [<lastrun>]
- $np->nagios_exit(UNKNOWN, "Valid expected states are 'active', 'inactive'
or 'success'")
- unless (grep {$_ eq $np->opts->expect} @expected);
+Checks that the systemd unit <unit> is in the correct state according
+to <expect>:
- return $np;
-}
+ active - Ok if the unit is active and running
+ inactive - Ok if the unit is inactive and dead
+ periodic - Ok if the unit is either:
+ (a) active and running
+ (b) inactive, dead and the last result was success
+ In addition, if <lastrun> is specified, an inactive
+ unit is Ok only if it was last started no more than
+ <lastrun> seconds ago (this information is only
+ valid when a timer exists for the unit)
+"""
-sub get_data {
- my $service = shift;
- open(STATUS, "/bin/systemctl show '$service' |");
+import time
+import datetime
+import subprocess
+import re
+import sys
- my %service_data = {};
- # Parse the systemctl output.
- # Yes this is going to be crude.
- while (<STATUS>) {
- next if /^#/;
- chomp;
- my ($k, $v) = split /=/, $_, 2;
- $service_data{$k} = $v;
- }
- return \%service_data;
-}
+def unknown(msg):
+ print("UNKNOWN - %s" % msg)
+ sys.exit(3)
-my $plugin = setup();
-my $service = $plugin->opts->service;
-my $expect = $plugin->opts->expect;
-my %expected_substates = ( 'active' => 'running', 'inactive' => 'dead' );
+def crit(msg):
+ print("CRITICAL - %s" % msg)
+ sys.exit(2)
-my $service_status = get_data($service);
+def ok(msg):
+ print("OK - %s" % msg)
+ sys.exit(0)
-$plugin->nagios_exit(UNKNOWN, "Service $service is not loaded") unless
$service_status->{LoadState} eq 'loaded';
+def main():
-if ($service_status->{SubState} ne
$expected_substates{$service_status->{ActiveState}}) {
- $plugin->nagios_exit(CRITICAL, "Service $service is
$service_status->{ActiveState}, but its last recorded state is:
$service_status->{SubState}");
-}
+ try:
+ lastrun = None
+ unit = sys.argv[1]
+ expect = sys.argv[2]
+ if expect not in ['active', 'inactive', 'periodic']:
+ unknown("Must expect one of 'active', 'inactive', or 'periodic'")
+ if expect == 'periodic' and len(sys.argv) > 3:
+ lastrun = datetime.timedelta(seconds=int(sys.argv[3]))
+ except (IndexError, ValueError):
+ unknown("Bad arguments to %s (%s)" % (sys.argv[0], ",
".join(sys.argv[1:])))
-if ($service_status->{ActiveState} eq 'active') {
- $plugin->nagios_exit(CRITICAL, "Service $service is active (expected
inactive)") if $expect eq 'inactive';
- $plugin->nagios_exit(OK, "Service $service is active (expected success)")
if $expect eq 'success';
-} else {
- $plugin->nagios_exit(CRITICAL, "Service $service is inactive (expected
active)") if $expect eq 'active';
- if($expect eq 'success') {
- $plugin->nagios_exit(OK, "Last run of service $service was succesful")
if $service_status->{Result} eq 'success';
- $plugin->nagios_exit(CRITICAL, "Service $service failed
($service_status->{Result})");
- }
-}
+ state = {}
+ try:
+ raw = subprocess.check_output(['/bin/systemctl', 'show', unit],
stderr=subprocess.STDOUT).decode()
+ for entry in raw.splitlines():
+ kv = entry.split('=', 1)
+ state[kv[0]] = kv[1]
+ except IndexError:
+ unknown("Unable to parse status of unit %s" % unit)
-$plugin->nagios_exit(OK, "Service $service is in the desired state ($expect)");
+ if expect == 'active':
+
+ if state['ActiveState'] != 'active':
+ crit("Expecting active but unit is %s" % state['ActiveState'])
+ if state['SubState'] != 'running':
+ crit("Unit is active but reported %s'" % state['SubState'])
+ ok("%s is active" % unit)
+
+ elif expect == 'inactive':
+
+ if state['ActiveState'] != 'inactive':
+ crit("Expecting inactive but unit is %s" % state['ActiveState'])
+ if state['SubState'] != 'dead':
+ crit("Unit is inactive but reported %s'" % state['SubState'])
+ ok("%s is inactive" % unit)
+
+ # Otherwise expect is 'periodic' (the branches above always exit)
+
+ if state['ActiveState'] == 'active':
+ ok("Unit is currently active")
+ if state['ActiveState'] != 'inactive':
+ crit("Unit is in state %s" % state['ActiveState'])
+ if state['Result'] != 'success':
+ crit("Last run result was %s" % state['Result'])
+
+ if lastrun:
+ try:
+ # Timestamps in systemctl show are in format 'Thu 2015-07-30
16:56:59 UTC'
+ started =
datetime.datetime.strptime(state['ExecMainStartTimestamp'], '%a %Y-%m-%d
%H:%M:%S %Z')
+ age = datetime.datetime.fromtimestamp(int(time.time())) - started
+
+ if age > lastrun:
+ crit("Last run was over %s ago" % lastrun)
+
+ except (KeyError, ValueError):
+ unknown("Unit has no usable last run information (not a timer?)")
+
+ ok("Last run successful")
+
+if __name__ == "__main__":
+ main()
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
b/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
deleted file mode 100644
index 7a6fe41..0000000
--- a/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
+++ /dev/null
@@ -1,36 +0,0 @@
-# === Define: nrpe::monitor_systemd_unit_lastrun
-#
-# Installs a check for last run time of a systemd unit using journalctl
-define nrpe::monitor_systemd_unit_lastrun(
- $unit = $title,
- $description = "${unit} last run",
- $contact_group = 'admins',
- $retries = 3,
- $timeout = 10,
- $critical = false,
- $ensure = 'present',
- $warn_secs = 60*60*25,
- $crit_secs = 60*60*49,
- ){
-
- if $::initsystem != 'systemd' {
- fail('nrpe::monitor_systemd_unit_lastrun can only work on
systemd-enabled systems')
- }
- require nrpe::systemd_scripts
-
- # Temporary hack until we fix the downstream modules
- if $critical {
- $nagios_critical = 'true'
- } else {
- $nagios_critical = 'false'
- }
-
- nrpe::monitor_service { "${unit}-lastrun":
- ensure => $ensure,
- description => $description,
- nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_lastrun
'${unit}' ${warn_secs} ${crit_secs}",
- retries => $retries,
- timeout => $timeout,
- critical => $nagios_critical,
- }
-}
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_state.pp
b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
index 5f1d855..66de6a5 100644
--- a/modules/nrpe/manifests/monitor_systemd_unit_state.pp
+++ b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
@@ -27,7 +27,7 @@
nrpe::monitor_service { "${unit}-state":
ensure => $ensure,
description => $description,
- nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state -s
'${unit}' -e ${expected_state}",
+ nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state
'${unit}' ${expected_state}",
retries => $retries,
timeout => $timeout,
critical => $nagios_critical,
diff --git a/modules/nrpe/manifests/systemd_scripts.pp
b/modules/nrpe/manifests/systemd_scripts.pp
index 0f355b5..0f2ee2c 100644
--- a/modules/nrpe/manifests/systemd_scripts.pp
+++ b/modules/nrpe/manifests/systemd_scripts.pp
@@ -4,20 +4,11 @@
#
class nrpe::systemd_scripts {
- require_package 'libnagios-plugin-perl'
# These scripts allows monitoring of systemd services
file { '/usr/local/bin/nrpe_check_systemd_unit_state':
ensure => present,
source => 'puppet:///modules/nrpe/plugins/check_systemd_unit_state',
- owner => 'root',
- group => 'root',
- mode => '0555',
- }
-
- file { '/usr/local/bin/nrpe_check_systemd_unit_lastrun':
- ensure => present,
- source => 'puppet:///modules/nrpe/plugins/check_systemd_unit_lastrun',
owner => 'root',
group => 'root',
mode => '0555',
--
To view, visit https://gerrit.wikimedia.org/r/230556
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I91e58d2add83c5eaded426b9dd9d14c9bec356ad
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: coren <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits