Filippo Giunchedi has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/378039 )

Change subject: [WIP] smart: new module
......................................................................

[WIP] smart: new module

Report smart attributes as Prometheus metrics

Bug: T86552
Change-Id: I324f23acb64f5b7c4e8250e9aba3374c5ceba22b
---
A modules/smart/files/20logger
A modules/smart/files/smart-data-dump
A modules/smart/manifests/init.pp
3 files changed, 298 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/39/378039/1

diff --git a/modules/smart/files/20logger b/modules/smart/files/20logger
new file mode 100644
index 0000000..4ddc291
--- /dev/null
+++ b/modules/smart/files/20logger
@@ -0,0 +1,13 @@
+#!/bin/bash -e
+
+# Send smartd warnings/errors via syslog instead of mail.
+#
+# Usage: 20logger FILE [ARG...]
+#   The contents of FILE are logged with tag "smart_failure" at priority
+#   local0.warning; any further arguments are ignored.
+
+if ! [ -x /usr/bin/logger ]; then
+    echo "Your system does not have /usr/bin/logger.  Install the bsdutils package."
+    exit 1
+fi
+
+# Fail with a message instead of letting "shift" abort silently under -e.
+if [ "$#" -lt 1 ]; then
+    echo "Usage: $0 FILE [ARG...]" >&2
+    exit 1
+fi
+
+input=$1
+shift
+
+# Quote the path so whitespace or glob characters cannot split the redirect.
+/usr/bin/logger -p local0.warning -t smart_failure < "$input"
diff --git a/modules/smart/files/smart-data-dump b/modules/smart/files/smart-data-dump
new file mode 100644
index 0000000..c56ed54
--- /dev/null
+++ b/modules/smart/files/smart-data-dump
@@ -0,0 +1,251 @@
+#!/usr/bin/python3
+
+import argparse
+import collections
+import json
+import logging
+import re
+import subprocess
+import sys
+
+from prometheus_client import CollectorRegistry, Gauge, write_to_textfile
+from prometheus_client.exposition import generate_latest
+
+# Module-level logger; main() sets the level through logging.basicConfig().
+log = logging.getLogger(__name__)
+# One physical disk to query:
+#   driver     - name of the discovery handler that produced the entry
+#   smart_args - smartctl arguments that select the disk
+#   disk_id    - identifier used as the "device" label on every metric
+PD = collections.namedtuple('PD', ['driver', 'smart_args', 'disk_id'])
+# Lower-cased SMART attribute names that are exported as gauges. Attributes
+# reported by smartctl but missing from this list are only logged.
+REPORT_ATTRIBUTES = [
+    'airflow_temperature_cel',
+    'command_timeout',
+    'current_pending_sector',
+    'end_to_end_error',
+    'erase_fail_count',
+    'g_sense_error_rate',
+    'hardware_ecc_recovered',
+    'host_reads_mib',
+    'host_reads_32mib',
+    'host_writes_mib',
+    'host_writes_32mib',
+    'load_cycle_count',
+    'media_wearout_indicator',
+    'nand_writes_1gib',
+    'offline_uncorrectable',
+    'power_cycle_count',
+    'power_on_hours',
+    'program_fail_count',
+    'raw_read_error_rate',
+    'reallocated_sector_ct',
+    'reported_uncorrect',
+    'sata_downshift_count',
+    'spin_retry_count',
+    'spin_up_time',
+    'start_stop_count',
+    'temperature_celsius',
+    'total_lbas_read',
+    'total_lbas_written',
+    'udma_crc_error_count',
+    'unsafe_shutdown_count',
+    'workld_host_reads_perc',
+    'workld_media_wear_indic',
+    'workload_minutes',
+]
+
+
+def get_fact(fact_name):
+    """Return the value of one fact as reported by facter.
+
+    Runs ``/usr/bin/facter --puppet --json <fact_name>`` and looks the fact
+    up in the JSON object it prints.
+
+    Returns the fact value, or None when the output is not valid JSON, is
+    not a JSON object, or does not contain the fact.
+
+    Raises subprocess.CalledProcessError if facter exits non-zero.
+    """
+    # Ask for str output: json.loads() only accepts bytes from Python 3.6 on
+    # and raises TypeError (not ValueError) for bytes on older versions.
+    raw_output = subprocess.check_output(
+        ['/usr/bin/facter', '--puppet', '--json', fact_name],
+        universal_newlines=True)
+    try:
+        facts = json.loads(raw_output)
+    except ValueError:
+        return None
+
+    # Valid JSON that is not an object (e.g. "null") has no .get().
+    if not isinstance(facts, dict):
+        return None
+
+    fact_value = facts.get(fact_name)
+    log.debug('Fact "%s" discovered: %r', fact_name, fact_value)
+    return fact_value
+
+
+def megaraid_list_pd():
+    """Yield a PD for every megaraid device listed by smartctl --scan-open.
+
+    Only lines containing ``megaraid,`` are considered. On such a line the
+    first whitespace-separated field is used as the device node and the
+    third as the ``-d`` device type (also used as the disk id).
+
+    Raises subprocess.CalledProcessError if the scan exits non-zero.
+    """
+    # universal_newlines=True makes check_output() return str; with the
+    # default bytes output the substring test below raises TypeError.
+    raw_output = subprocess.check_output(
+        ['/usr/bin/timeout', '30', '/usr/sbin/smartctl', '--scan-open'],
+        universal_newlines=True)
+    for line in raw_output.splitlines():
+        if 'megaraid,' not in line:
+            continue
+        fields = line.split()
+        if len(fields) < 3:
+            # Not a "<bus> -d <device>" line, nothing usable to report.
+            continue
+        bus, device = fields[0], fields[2]
+        yield PD(driver='megaraid', smart_args=['-d', device, bus],
+                 disk_id=device)
+
+
+def hpsa_list_pd():
+    """Yield a PD for every physical drive listed by hpssacli.
+
+    Parses ``hpssacli controller all show config``. Drives are numbered
+    from 0 in the order they appear after a "Smart Array ... in Slot N"
+    line, and the numbering restarts at each such line.
+
+    Raises subprocess.CalledProcessError if hpssacli exits non-zero.
+    """
+    # universal_newlines=True makes check_output() return str; matching a
+    # str pattern against bytes raises TypeError.
+    raw_output = subprocess.check_output(
+        ['/usr/bin/timeout', '30', '/usr/sbin/hpssacli', 'controller',
+         'all', 'show', 'config'],
+        universal_newlines=True)
+    in_controller = False
+    disk_id = 0
+    for line in raw_output.splitlines():
+        # Raw strings keep \d and \s as regex escapes, not string escapes.
+        if re.match(r'^Smart Array .* in Slot (\d+)', line):
+            in_controller = True
+            disk_id = 0
+
+        if in_controller and re.match(r'^\s+physicaldrive', line):
+            device = 'cciss,%s' % disk_id
+            # TODO(filippo) assumes /dev/sda
+            yield PD(driver='cciss', disk_id=device,
+                     smart_args=['-d', device, '/dev/sda'])
+            disk_id += 1
+
+
+def noraid_list_pd():
+    """Yield a PD for every block device that lsblk reports as type "disk".
+
+    Each disk is queried through ``/dev/<name>`` with smartctl's ``-d auto``
+    and its lsblk name is used as the disk id.
+
+    Raises subprocess.CalledProcessError if lsblk exits non-zero.
+    """
+    # starting with stretch, lsblk has --json but not on trusty/jessie
+    # universal_newlines=True makes check_output() return str; splitting
+    # bytes on a str separator raises TypeError.
+    raw_output = subprocess.check_output(
+        ['/usr/bin/timeout', '30', '/bin/lsblk', '--noheadings',
+         '--output', 'NAME,TYPE', '--raw'],
+        universal_newlines=True)
+    for line in raw_output.splitlines():
+        fields = line.split()
+        # Skip anything that is not exactly "<name> <type>".
+        if len(fields) != 2:
+            continue
+        name, disk_type = fields
+        if disk_type != 'disk':
+            continue
+        yield PD(driver='noraid', smart_args=['-d', 'auto', '/dev/%s' % name],
+                 disk_id=name)
+
+
+def _run_smartctl(args, timeout=30):
+    """Run smartctl under /usr/bin/timeout and return its output as str.
+
+    args is the list of smartctl arguments; timeout is the number of
+    seconds passed to /usr/bin/timeout.
+
+    A non-zero exit status does not raise: whatever was printed before the
+    command exited is returned so the caller can still parse it.
+    """
+    cmd_args = ['/usr/bin/timeout', str(timeout), '/usr/sbin/smartctl']
+    cmd_args.extend(args)
+    log.debug('Running: %s', ' '.join(cmd_args))
+    try:
+        # universal_newlines=True yields str; the parsers compare lines
+        # against str literals, which raises TypeError on bytes.
+        return subprocess.check_output(cmd_args, universal_newlines=True)
+    except subprocess.CalledProcessError as e:
+        # TODO(filippo) handle non-fatal exit codes
+        return e.output
+
+
+def collect_smart_metrics(disks, registry):
+    """Register the SMART gauges in registry and fill them for each disk.
+
+    For every entry of disks smartctl is run twice: once with
+    ``--info --health`` for the health and info gauges, and once with
+    ``--attributes`` for the per-attribute gauges.
+    """
+    namespace = 'device_smart'
+
+    health_gauge = Gauge('healthy', 'SMART health', namespace=namespace,
+                         registry=registry, labelnames=['device'])
+    info_gauge = Gauge('info', 'Disk info', namespace=namespace,
+                       registry=registry,
+                       labelnames=['device', 'model', 'firmware'])
+
+    # One gauge per reported attribute, created in REPORT_ATTRIBUTES order.
+    attribute_gauges = {}
+    for name in REPORT_ATTRIBUTES:
+        attribute_gauges[name] = Gauge(
+            name, 'SMART attribute %s' % name, namespace=namespace,
+            registry=registry, labelnames=['device'])
+
+    for disk in disks:
+        info_output = _run_smartctl(
+            ['--info', '--health'] + list(disk.smart_args))
+        _parse_smart_info(info_output, disk, health_gauge, info_gauge)
+
+        attributes_output = _run_smartctl(
+            ['--attributes'] + list(disk.smart_args))
+        _parse_smart_attributes(attributes_output, disk, attribute_gauges)
+
+
+def _parse_smart_attributes(output, disk, attributes):
+    """Set the attribute gauges of disk from ``smartctl --attributes`` output.
+
+    output is the smartctl output (str, or bytes which are decoded).
+    attributes maps lower-cased attribute names to gauges; rows whose name
+    is not a key are logged and skipped. Rows are read after the header
+    line starting with "ID#", and the first space-separated token of the
+    raw value column is used as the metric value.
+    """
+    # subprocess returns bytes unless told otherwise; the comparisons below
+    # use str literals and would raise TypeError on bytes.
+    if isinstance(output, bytes):
+        output = output.decode('utf-8', 'replace')
+
+    in_attributes = False
+
+    for line in output.splitlines():
+        if line.startswith('ID#'):
+            in_attributes = True
+            continue
+
+        if not in_attributes or not line:
+            continue
+
+        try:
+            # Ten columns; only the name and the raw value are used.
+            (_id, name, _flag, _value, _worst, _thresh, _type,
+             _updated, _when_failed, raw_value) = \
+                re.split(' +', line.strip(), 9)
+        except ValueError as e:
+            log.error('Unparseable line from smartctl: %r %r', e, line)
+            continue
+
+        metric_name = name.lower()
+        if metric_name not in attributes:
+            log.info('Unreported attribute %r: %r', metric_name, line)
+            continue
+
+        try:
+            # Raw values may carry a suffix such as "34 (Min/Max 20/45)".
+            metric_value = float(raw_value.split(' ')[0])
+        except ValueError:
+            log.error('Unparseable %r', line)
+            continue
+        attributes[metric_name].labels(disk.disk_id).set(metric_value)
+
+
+def _parse_smart_info(output, disk, smart_healthy, device_info):
+    """Set the health and info gauges of disk from smartctl output.
+
+    output is the ``smartctl --info --health`` output (str, or bytes which
+    are decoded). The health gauge is set to 0 first and raised to 1 only
+    when a "SMART ... health" line reports "OK" or "PASSED". The info gauge
+    is set to 1 with the model and firmware found in the output, or 'NA'
+    for whichever is missing.
+    """
+    # subprocess returns bytes unless told otherwise; the comparisons below
+    # use str literals and would raise TypeError on bytes.
+    if isinstance(output, bytes):
+        output = output.decode('utf-8', 'replace')
+
+    smart_healthy.labels(disk.disk_id).set(0)
+    model, firmware = None, None
+
+    for line in output.splitlines():
+        key, separator, value = line.partition(':')
+        if not separator:
+            continue
+        key = key.lower()
+        value = value.strip()
+
+        if key in ('product', 'device model'):
+            model = value
+        # Exact comparison: "key in ('firmware version')" tests against a
+        # plain string and so matches any substring such as "version".
+        elif key == 'firmware version':
+            firmware = value
+        elif (re.match('^smart (overall-)?health', key)
+                and value.lower() in ('ok', 'passed')):
+            smart_healthy.labels(disk.disk_id).set(1)
+
+    device_info.labels(disk.disk_id, model or 'NA', firmware or 'NA').set(1)
+
+
+# limitations:
+# - one controller of one type per machine
+
+# Maps a value of the "raid" fact to the generator that lists the physical
+# disks behind that controller; main() skips drivers without an entry.
+DRIVER_HANDLERS = {
+    'megaraid': megaraid_list_pd,
+    'hpsa': hpsa_list_pd,
+}
+
+
+def main():
+    """Command-line entry point: discover disks and dump SMART metrics.
+
+    Metrics are written to --outfile when given (the name must end in
+    ".prom"), otherwise to stdout.
+
+    Returns 1 when the "raid" fact cannot be read and None otherwise.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--outfile', metavar='FILE.prom',
+                        help='Output file (stdout)')
+    parser.add_argument('-d', '--debug', action='store_true', default=False,
+                        help='Enable debug logging (%(default)s)')
+    args = parser.parse_args()
+
+    log_level = logging.DEBUG if args.debug else logging.WARNING
+    logging.basicConfig(level=log_level)
+
+    if args.outfile and not args.outfile.endswith('.prom'):
+        parser.error('Output file does not end with .prom')
+
+    raid_drivers = get_fact('raid')
+    if raid_drivers is None:
+        log.error('Invalid value for "raid" fact: %r', raid_drivers)
+        return 1
+
+    physical_disks = []
+    for driver in raid_drivers:
+        handler = DRIVER_HANDLERS.get(driver)
+        if handler:
+            physical_disks.extend(handler())
+
+    # Plain block devices are listed only when no RAID driver was reported
+    # or when the only one is 'md'.
+    if not raid_drivers or raid_drivers == ['md']:
+        physical_disks.extend(noraid_list_pd())
+
+    log.debug('Gathering SMART data from physical disks: %r',
+              [x.disk_id for x in physical_disks])
+
+    registry = CollectorRegistry()
+    collect_smart_metrics(physical_disks, registry)
+
+    if not args.outfile:
+        sys.stdout.write(generate_latest(registry).decode('utf-8'))
+    else:
+        write_to_textfile(args.outfile, registry)
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/modules/smart/manifests/init.pp b/modules/smart/manifests/init.pp
new file mode 100644
index 0000000..6a7fb98
--- /dev/null
+++ b/modules/smart/manifests/init.pp
@@ -0,0 +1,34 @@
+# Installs smartmontools, routes smartd alerts to syslog instead of mail and
+# ships the smart-data-dump script.
+#
+# NOTE(review): init.pp conventionally holds the class named after the module
+# ("smart"); confirm that "smart::init" is the intended class name.
+class smart::init {
+    if $facts['is_virtual'] == true {
+        fail('smart module is not supported on virtual hosts')
+    }
+
+    # Prefer smartmontools version from backports (if any) because of newer
+    # smart drivedb.
+    package { 'smartmontools':
+        ensure          => installed,
+        install_options => ['-t', "${::lsbdistcodename}-backports"],
+    }
+
+    # Make sure we send smart alerts from smartd via syslog and not email.
+    file { '/etc/smartmontools/run.d/10mail':
+        ensure  => absent,
+        require => Package['smartmontools'],
+    }
+
+    # The run.d directory comes with the package, so install the hook only
+    # after the package is present.
+    file { '/etc/smartmontools/run.d/20logger':
+        ensure  => present,
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0544',
+        source  => "puppet:///modules/${module_name}/20logger",
+        require => Package['smartmontools'],
+    }
+
+    file { '/usr/local/sbin/smart-data-dump':
+        ensure => present,
+        owner  => 'root',
+        group  => 'root',
+        mode   => '0544',
+        source => "puppet:///modules/${module_name}/smart-data-dump",
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/378039
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I324f23acb64f5b7c4e8250e9aba3374c5ceba22b
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: Filippo Giunchedi <fgiunch...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to