coren has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/230556

Change subject: Redo "nrpe: Merge check_systemd_unit_lastrun into _state"
......................................................................

Redo "nrpe: Merge check_systemd_unit_lastrun into _state"

This reverts commit 0d171b748a0e39592267be3f001b8dca7158e2f3.

Change-Id: I91e58d2add83c5eaded426b9dd9d14c9bec356ad
---
M modules/labstore/manifests/fileserver/replicate.pp
D modules/nrpe/files/plugins/check_systemd_unit_lastrun
M modules/nrpe/files/plugins/check_systemd_unit_state
D modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
M modules/nrpe/manifests/monitor_systemd_unit_state.pp
M modules/nrpe/manifests/systemd_scripts.pp
6 files changed, 111 insertions(+), 221 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/56/230556/1

diff --git a/modules/labstore/manifests/fileserver/replicate.pp 
b/modules/labstore/manifests/fileserver/replicate.pp
index 903d833..c516f6e 100644
--- a/modules/labstore/manifests/fileserver/replicate.pp
+++ b/modules/labstore/manifests/fileserver/replicate.pp
@@ -13,14 +13,8 @@
         declare_service => false,
     }
 
-    nrpe::monitor_systemd_unit_lastrun { "replicate-${title}":
-        description => "Last backup of the ${title} filesystem",
-        warn_secs   => 60*60*1,
-        crit_secs   => 60*60*2,
-    }
-
     nrpe::monitor_systemd_unit_state { "replicate-${title}":
-        description    => "Backup of ${title} filesystem",
-        expected_state => "success",
+        description    => "Last backup of the ${title} filesystem",
+        expected_state => "periodic 3600",
     }
 }
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_lastrun 
b/modules/nrpe/files/plugins/check_systemd_unit_lastrun
deleted file mode 100755
index 201a992..0000000
--- a/modules/nrpe/files/plugins/check_systemd_unit_lastrun
+++ /dev/null
@@ -1,100 +0,0 @@
-#! /usr/bin/python3
-# -*- coding: utf-8 -*-
-#
-#  Copyright © 2015 Marc-André Pelletier <[email protected]>
-#
-#  Permission to use, copy, modify, and/or distribute this software for any
-#  purpose with or without fee is hereby granted, provided that the above
-#  copyright notice and this permission notice appear in all copies.
-#
-#  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
-#  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
-#  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
-#  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
-#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
-#  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
-#  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-#
-#
-#  THIS FILE IS MANAGED BY PUPPET
-#
-#  Source: modules/labstore/storage-replicate
-#  From:   modules/labstore/manifests/fileserve.rpp
-#
-
-"""
-check_systemd_unit_lastrun
-
-usage: check_systemd_unit_lastrun <unit> <warn> <crit>
-
-Checks that the systemd unit has been run recently
-enough.  Warns if the last start/stop activity is older
-than warn seconds, and criticals if it is older than
-crit seconds.
-"""
-
-import argparse
-import time
-import datetime
-import subprocess
-import logging
-import json
-import sys
-
-def main():
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument('unit', help='Systemd unit to check')
-    parser.add_argument('warn', help='Number of seconds past which a warning 
should be emitted')
-    parser.add_argument('crit', help='Number of seconds past which a critical 
should be emitted')
-    args = parser.parse_args()
-
-    warn = datetime.timedelta(seconds=int(args.warn))
-    crit = datetime.timedelta(seconds=int(args.crit))
-
-    logging.basicConfig(level=logging.INFO, format='%(message)s')
-
-    log = []
-
-    try:
-        raw = subprocess.check_output(
-            ['/bin/journalctl', '--output=json', '--reverse', '--unit', 
args.unit],
-            stderr=subprocess.STDOUT).decode()
-        for entry in raw.splitlines():
-            log.append(json.loads(entry))
-    except subprocess.CalledProcessError:
-        print('LASTRUN UNKNOWN - Unable to get systemd journal for unit "%s"' 
% args.unit)
-        sys.exit(3)
-    except ValueError:
-        print('LASTRUN UNKNOWN - Unable to parse systemd journal for unit 
"%s"' % args.unit)
-        sys.exit(3)
-
-    lastrun = None
-    for entry in log:
-        try:
-            if entry['CODE_FUNCTION'] == 
'unit_status_log_starting_stopping_reloading':
-                lastrun = int(float(entry['__REALTIME_TIMESTAMP'])/1000000) # 
because microseconds
-                break
-        except (KeyError):
-            pass
-
-    if not lastrun:
-        print('LASTRUN UNKNOWN - No start/stop information for unit "%s"' % 
args.unit)
-        sys.exit(3)
-
-    age = datetime.timedelta(seconds=int(time.time()) - lastrun)
-
-    if age > crit:
-        print('LASTRUN CRITICAL - Last run more than %s ago' % crit)
-        sys.exit(2)
-
-    if age > warn:
-        print('LASTRUN WARNING - Last run more than %s ago' % warn)
-        sys.exit(1)
-
-    print('LASTRUN OK - Last run %s ago' % age)
-    sys.exit(0)
-
-if __name__ == "__main__":
-    main()
-
diff --git a/modules/nrpe/files/plugins/check_systemd_unit_state 
b/modules/nrpe/files/plugins/check_systemd_unit_state
index bffe462..cea8908 100755
--- a/modules/nrpe/files/plugins/check_systemd_unit_state
+++ b/modules/nrpe/files/plugins/check_systemd_unit_state
@@ -1,85 +1,126 @@
-#!/usr/bin/perl
-
-# Copyright 2015 Giuseppe Lavagetto
-# Copyright 2015 Wikimedia Foundation, Inc.
+#! /usr/bin/python3
+# -*- coding: utf-8 -*-
 #
-# This nagios plugin is free software, and comes with ABSOLUTELY NO WARRANTY.
-# It may be used, redistributed and/or modified under the terms of the GNU
-# General Public Licence (see http://www.fsf.org/licensing/licenses/gpl.txt).
+#  Copyright © 2015 Marc-André Pelletier <[email protected]>
 #
-# Example usage:
-#   check_systemd_unit_state -s apache2 [ -e <active|inactive|success> ]
+#  Permission to use, copy, modify, and/or distribute this software for any
+#  purpose with or without fee is hereby granted, provided that the above
+#  copyright notice and this permission notice appear in all copies.
 #
-# Checks the state of a systemd unit and raises an error unless:
+#  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+#  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+#  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+#  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+#  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+#  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+#  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 #
-#    active: the unit is currently running
-#  inactive: the unit is not currently running
-#   success: the unit is currently running OR its last result is success
+#
+#  THIS FILE IS MANAGED BY PUPPET
+#
+#  Source: modules/nrpe/files/plugins/check_systemd_unit_state
+#  From:   modules/nrpe/manifests/systemd_scripts.pp
+#
 
-use strict;
-use Nagios::Plugin;
+"""
+check_systemd_unit_state
 
-sub setup{
-    my $np = Nagios::Plugin->new(usage => "Usage: %s -s <service> -e 
<active|inactive>");
-    $np->add_arg(
-                 spec     => 'service|s=s',
-                 help     => '-s SERVICE',
-                 required => 1,
-    );
-    $np->add_arg(
-                 spec    => 'expect|e=s',
-                 help    => '-e active|inactive|success',
-                 default => 'active',
-    );
-    $np->getopts;
-    my @expected = ('active', 'inactive', 'success');
+usage: check_systemd_unit_state <unit> <expect> [<lastrun>]
 
-    $np->nagios_exit(UNKNOWN, "Valid expected states are 'active', 'inactive' 
or 'success'")
-            unless (grep {$_ eq $np->opts->expect} @expected);
+Checks that the systemd unit <unit> is in the correct state according
+to <expect>:
 
-    return $np;
-}
+    active   - Ok if the unit is active and running
+    inactive - Ok if the unit is inactive and dead
+    periodic - Ok if the unit is either:
+                 (a) active and running
+                 (b) inactive, dead and the last result was success
+               In addition, if <lastrun> is specified, the check
+               returns Ok iff the unit was started no more than
+               <lastrun> seconds ago (and this information is only
+               valid when a timer exists for the unit)
+"""
 
-sub get_data {
-    my $service = shift;
-    open(STATUS, "/bin/systemctl show '$service' |");
+import time
+import datetime
+import subprocess
+import re
+import sys
 
-    my %service_data = {};
 
-    # Parse the systemctl output.
-    # Yes this is going to be crude.
-    while (<STATUS>) {
-        next if /^#/;
-        chomp;
-        my ($k, $v) = split /=/, $_, 2;
-        $service_data{$k} = $v;
-    }
-    return \%service_data;
-}
+def unknown(msg):
+    print("UNKNOWN - %s" % msg)
+    sys.exit(3)
 
-my $plugin = setup();
-my $service = $plugin->opts->service;
-my $expect = $plugin->opts->expect;
-my %expected_substates = ( 'active' => 'running', 'inactive' => 'dead' );
+def crit(msg):
+    print("CRITICAL - %s" % msg)
+    sys.exit(2)
 
-my $service_status = get_data($service);
+def ok(msg):
+    print("OK - %s" % msg)
+    sys.exit(0)
 
-$plugin->nagios_exit(UNKNOWN, "Service $service is not loaded") unless 
$service_status->{LoadState} eq 'loaded';
+def main():
 
-if ($service_status->{SubState} ne 
$expected_substates{$service_status->{ActiveState}}) {
-    $plugin->nagios_exit(CRITICAL, "Service $service is 
$service_status->{ActiveState}, but its last recorded state is: 
$service_status->{SubState}");
-}
+    try:
+        lastrun = None
+        unit = sys.argv[1]
+        expect = sys.argv[2]
+        if expect not in ['active', 'inactive', 'periodic']:
+            unknown("Must expect one of 'active', 'inactive', or 'periodic'")
+        if expect == 'periodic' and len(sys.argv) > 3:
+            lastrun = datetime.timedelta(seconds=int(sys.argv[3]))
+    except (IndexError, ValueError):
+        unknown("Bad arguments to %s (%s)" % (sys.argv[0], ", 
".join(sys.argv[1:])))
 
-if ($service_status->{ActiveState} eq 'active') {
-    $plugin->nagios_exit(CRITICAL, "Service $service is active (expected 
inactive)") if $expect eq 'inactive';
-    $plugin->nagios_exit(OK, "Service $service is active (expected success)") 
if $expect eq 'success';
-} else {
-    $plugin->nagios_exit(CRITICAL, "Service $service is inactive (expected 
active)") if $expect eq 'active';
-    if($expect eq 'success') {
-        $plugin->nagios_exit(OK, "Last run of service $service was succesful") 
if $service_status->{Result} eq 'success';
-        $plugin->nagios_exit(CRITICAL, "Service $service failed 
($service_status->{Result})");
-    }
-}
+    state = {}
+    try:
+        raw = subprocess.check_output(['/bin/systemctl', 'show', unit], 
stderr=subprocess.STDOUT).decode()
+        for entry in raw.splitlines():
+            kv = entry.split('=', 1)
+            state[kv[0]] = kv[1]
+    except IndexError:
+        unknown("Unable to parse status of unit %s" % unit)
 
-$plugin->nagios_exit(OK, "Service $service is in the desired state ($expect)");
+    if expect == 'active':
+
+        if state['ActiveState'] != 'active':
+            crit("Expecting active but unit is %s" % state['ActiveState'])
+        if state['SubState'] != 'running':
+            crit("Unit is active but reported %s'" % state['SubState'])
+        ok("%s is active" % unit)
+
+    elif expect == 'inactive':
+
+        if state['ActiveState'] != 'inactive':
+            crit("Expecting inactive but unit is %s" % state['ActiveState'])
+        if state['SubState'] != 'dead':
+            crit("Unit is inactive but reported %s'" % state['SubState'])
+        ok("%s is inactive" % unit)
+
+    # else periodic
+
+    if state['ActiveState'] == 'active':
+        ok("Unit is currently active")
+    if state['ActiveState'] != 'inactive':
+        crit("Unit is in state %s" % state['ActiveState'])
+    if state['Result'] != 'success':
+        crit("Last run result was %s" % state['Result'])
+
+    if lastrun:
+        try:
+            # Timestamps in systemctl show are in format 'Thu 2015-07-30 
16:56:59 UTC'
+            started = 
datetime.datetime.strptime(state['ExecMainStartTimestamp'], '%a %Y-%m-%d 
%H:%M:%S %Z')
+            age = datetime.datetime.fromtimestamp(int(time.time())) - started
+
+            if age > lastrun:
+                crit("Last run was over %s ago" % lastrun)
+
+        except (KeyError, ValueError):
+            unknown("Unit has no usable last run information (not a timer?)")
+
+    ok("Last run successful")
+
+if __name__ == "__main__":
+    main()
 
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp 
b/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
deleted file mode 100644
index 7a6fe41..0000000
--- a/modules/nrpe/manifests/monitor_systemd_unit_lastrun.pp
+++ /dev/null
@@ -1,36 +0,0 @@
-# === Define: nrpe::monitor_systemd_unit_lastrun
-#
-# Installs a check for last run time of a systemd unit using journalctl
-define nrpe::monitor_systemd_unit_lastrun(
-    $unit = $title,
-    $description = "${unit} last run",
-    $contact_group = 'admins',
-    $retries = 3,
-    $timeout = 10,
-    $critical = false,
-    $ensure = 'present',
-    $warn_secs = 60*60*25,
-    $crit_secs = 60*60*49,
-    ){
-
-    if $::initsystem != 'systemd' {
-        fail('nrpe::monitor_systemd_unit_lastrun can only work on 
systemd-enabled systems')
-    }
-    require nrpe::systemd_scripts
-
-    # Temporary hack until we fix the downstream modules
-    if $critical {
-        $nagios_critical = 'true'
-    } else {
-        $nagios_critical = 'false'
-    }
-
-    nrpe::monitor_service { "${unit}-lastrun":
-        ensure       => $ensure,
-        description  => $description,
-        nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_lastrun 
'${unit}' ${warn_secs} ${crit_secs}",
-        retries      => $retries,
-        timeout      => $timeout,
-        critical     => $nagios_critical,
-    }
-}
diff --git a/modules/nrpe/manifests/monitor_systemd_unit_state.pp 
b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
index 5f1d855..66de6a5 100644
--- a/modules/nrpe/manifests/monitor_systemd_unit_state.pp
+++ b/modules/nrpe/manifests/monitor_systemd_unit_state.pp
@@ -27,7 +27,7 @@
     nrpe::monitor_service { "${unit}-state":
         ensure       => $ensure,
         description  => $description,
-        nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state -s 
'${unit}' -e ${expected_state}",
+        nrpe_command => "/usr/local/bin/nrpe_check_systemd_unit_state 
'${unit}' ${expected_state}",
         retries      => $retries,
         timeout      => $timeout,
         critical     => $nagios_critical,
diff --git a/modules/nrpe/manifests/systemd_scripts.pp 
b/modules/nrpe/manifests/systemd_scripts.pp
index 0f355b5..0f2ee2c 100644
--- a/modules/nrpe/manifests/systemd_scripts.pp
+++ b/modules/nrpe/manifests/systemd_scripts.pp
@@ -4,20 +4,11 @@
 #
 
 class nrpe::systemd_scripts {
-    require_package 'libnagios-plugin-perl'
 
     # These scripts allows monitoring of systemd services
     file { '/usr/local/bin/nrpe_check_systemd_unit_state':
         ensure => present,
         source => 'puppet:///modules/nrpe/plugins/check_systemd_unit_state',
-        owner  => 'root',
-        group  => 'root',
-        mode   => '0555',
-    }
-
-    file { '/usr/local/bin/nrpe_check_systemd_unit_lastrun':
-        ensure => present,
-        source => 'puppet:///modules/nrpe/plugins/check_systemd_unit_lastrun',
         owner  => 'root',
         group  => 'root',
         mode   => '0555',

-- 
To view, visit https://gerrit.wikimedia.org/r/230556
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I91e58d2add83c5eaded426b9dd9d14c9bec356ad
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: coren <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to