#!/usr/bin/python3
"""Report SMART data of physical disks as Prometheus metrics.

The script reads the "raid" fact through facter, asks the lister registered
for each reported RAID driver (see DRIVER_HANDLERS) for the physical disks
behind it, and falls back to the block devices reported by lsblk when the
fact is empty or only contains "md".  For every disk it runs smartctl twice
(--info --health, then --attributes) and turns the output into gauges in the
"device_smart" namespace.  The result is written to the file given with
--outfile (which must end in ".prom") or printed to stdout.

Provenance and the companion files of the change this script was taken from
are recorded in the comments at the end of the file.
"""

import argparse
import collections
import json
import logging
import re
import subprocess
import sys

from prometheus_client import CollectorRegistry, Gauge, write_to_textfile
from prometheus_client.exposition import generate_latest

log = logging.getLogger(__name__)

# One physical disk: the driver it was discovered through, the arguments that
# make smartctl address it, and the value used for the "device" metric label.
PD = collections.namedtuple('PD', ['driver', 'smart_args', 'disk_id'])

# Seconds granted (via /usr/bin/timeout) to each discovery/smartctl command.
COMMAND_TIMEOUT = 30

# Lower-cased smartctl attribute names that are exported; every other
# attribute is only logged at INFO level.
REPORT_ATTRIBUTES = [
    'airflow_temperature_cel',
    'command_timeout',
    'current_pending_sector',
    'end_to_end_error',
    'erase_fail_count',
    'g_sense_error_rate',
    'hardware_ecc_recovered',
    'host_reads_mib',
    'host_reads_32mib',
    'host_writes_mib',
    'host_writes_32mib',
    'load_cycle_count',
    'media_wearout_indicator',
    'nand_writes_1gib',
    'offline_uncorrectable',
    'power_cycle_count',
    'power_on_hours',
    'program_fail_count',
    'raw_read_error_rate',
    'reallocated_sector_ct',
    'reported_uncorrect',
    'sata_downshift_count',
    'spin_retry_count',
    'spin_up_time',
    'start_stop_count',
    'temperature_celsius',
    'total_lbas_read',
    'total_lbas_written',
    'udma_crc_error_count',
    'unsafe_shutdown_count',
    'workld_host_reads_perc',
    'workld_media_wear_indic',
    'workload_minutes',
]

# Patterns are raw strings (they contain \d and \s) and compiled once.
_HPSA_CONTROLLER_RE = re.compile(r'^Smart Array .* in Slot (\d+)')
_HPSA_DRIVE_RE = re.compile(r'^\s+physicaldrive')
_HEALTH_KEY_RE = re.compile(r'^smart (overall-)?health')

# Number of whitespace-separated columns expected in a smartctl attribute row.
_ATTRIBUTE_COLUMNS = 10


def _command_output(cmd, timeout=None):
    """Run *cmd* and return its standard output as text.

    The output is decoded (universal_newlines=True) because all callers
    search and split it with str patterns; on Python 3 the undecoded bytes
    would make each of those operations raise TypeError.

    Args:
        cmd: command and arguments, as a list of strings.
        timeout: when not None, the command is wrapped in
            "/usr/bin/timeout <timeout>".

    Raises:
        subprocess.CalledProcessError: the command exited non-zero.
    """
    cmd = list(cmd)
    if timeout is not None:
        cmd = ['/usr/bin/timeout', str(timeout)] + cmd
    log.debug('Running: %s', ' '.join(cmd))
    return subprocess.check_output(cmd, universal_newlines=True)


def get_fact(fact_name):
    """Return the value of fact *fact_name* as reported by facter.

    Returns None when facter's output is not valid JSON or does not contain
    the fact.  A failing facter invocation raises CalledProcessError.
    """
    raw_output = _command_output(['/usr/bin/facter', '--puppet', '--json',
                                  fact_name])
    try:
        fact_value = json.loads(raw_output).get(fact_name)
    except ValueError:
        return None

    log.debug('Fact "%s" discovered: %r', fact_name, fact_value)
    return fact_value


def megaraid_list_pd():
    """Yield a PD for every megaraid disk found by "smartctl --scan-open".

    Only lines containing "megaraid," are considered.  The first
    space-separated field is passed to smartctl as the device path and the
    third one as its "-d" argument; the latter is also the disk id.
    """
    raw_output = _command_output(['/usr/sbin/smartctl', '--scan-open'],
                                 timeout=COMMAND_TIMEOUT)
    for line in raw_output.splitlines():
        if 'megaraid,' not in line:
            continue
        fields = line.split(' ', 3)
        if len(fields) < 3:
            # Too short to carry both a bus and a device type.
            log.error('Unparseable line from smartctl: %r', line)
            continue
        bus, device = fields[0], fields[2]
        yield PD(driver='megaraid', smart_args=['-d', device, bus],
                 disk_id=device)


def hpsa_list_pd():
    """Yield a PD for every physical drive listed by hpssacli.

    Drives are numbered from 0 in the order their "physicaldrive" lines
    appear after a "Smart Array ... in Slot N" header (the counter restarts
    at each header) and are addressed as "cciss,<number>".
    """
    raw_output = _command_output(['/usr/sbin/hpssacli', 'controller', 'all',
                                  'show', 'config'],
                                 timeout=COMMAND_TIMEOUT)
    in_controller = False
    disk_index = 0
    for line in raw_output.splitlines():
        if _HPSA_CONTROLLER_RE.match(line):
            in_controller = True
            disk_index = 0
            continue

        if in_controller and _HPSA_DRIVE_RE.match(line):
            device = 'cciss,%s' % disk_index
            # TODO(filippo) assumes /dev/sda
            yield PD(driver='cciss', disk_id=device,
                     smart_args=['-d', device, '/dev/sda'])
            disk_index += 1


def noraid_list_pd():
    """Yield a PD for every block device that lsblk reports with type "disk"."""
    # starting with stretch, lsblk has --json but not on trusty/jessie
    raw_output = _command_output(['/bin/lsblk', '--noheadings', '--output',
                                  'NAME,TYPE', '--raw'],
                                 timeout=COMMAND_TIMEOUT)
    for line in raw_output.splitlines():
        fields = line.split(' ', 1)
        if len(fields) != 2:
            log.error('Unparseable line from lsblk: %r', line)
            continue
        name, disk_type = fields
        if disk_type != 'disk':
            continue
        yield PD(driver='noraid', smart_args=['-d', 'auto', '/dev/%s' % name],
                 disk_id=name)


def _run_smartctl(args, timeout=COMMAND_TIMEOUT):
    """Run smartctl with *args* and return whatever it printed, as text.

    A non-zero exit status is not treated as fatal: the output captured up
    to that point is returned instead of raising.
    """
    cmd_args = ['/usr/sbin/smartctl'] + list(args)
    try:
        return _command_output(cmd_args, timeout=timeout)
    except subprocess.CalledProcessError as e:
        # TODO(filippo) handle non-fatal exit codes
        return e.output or ''


def collect_smart_metrics(disks, registry):
    """Query every disk in *disks* and record its SMART data in *registry*.

    Registers the gauges device_smart_healthy (label: device),
    device_smart_info (labels: device, model, firmware) and one
    device_smart_<attribute> gauge per entry of REPORT_ATTRIBUTES.
    """
    smart_healthy = Gauge('healthy', 'SMART health', namespace='device_smart',
                          registry=registry, labelnames=['device'])

    device_info = Gauge('info', 'Disk info', namespace='device_smart',
                        registry=registry,
                        labelnames=['device', 'model', 'firmware'])

    smart_attributes = {
        attribute: Gauge(attribute, 'SMART attribute %s' % attribute,
                         namespace='device_smart', registry=registry,
                         labelnames=['device'])
        for attribute in REPORT_ATTRIBUTES
    }

    for disk in disks:
        info_args = ['--info', '--health'] + list(disk.smart_args)
        _parse_smart_info(_run_smartctl(info_args), disk, smart_healthy,
                          device_info)

        attribute_args = ['--attributes'] + list(disk.smart_args)
        _parse_smart_attributes(_run_smartctl(attribute_args), disk,
                                smart_attributes)


def _parse_smart_attributes(output, disk, attributes):
    """Set the gauges in *attributes* from "smartctl --attributes" output.

    Rows are read after the header line starting with "ID#".  The second
    column, lower-cased, selects the gauge; the first space-separated token
    of the tenth column is its value.  Rows that do not have ten columns or
    whose value is not a number are logged and skipped.
    """
    in_attributes = False

    for line in output.splitlines():
        if line.startswith('ID#'):
            in_attributes = True
            continue

        if not in_attributes or not line.strip():
            continue

        fields = re.split(' +', line.strip(), _ATTRIBUTE_COLUMNS - 1)
        if len(fields) != _ATTRIBUTE_COLUMNS:
            log.error('Unparseable line from smartctl: %r', line)
            continue
        name, raw_value = fields[1], fields[-1]

        metric_name = name.lower()
        if metric_name not in attributes:
            log.info('Unreported attribute %r: %r', metric_name, line)
            continue

        # Convert before touching the gauge: labels() creates the child
        # metric, which would otherwise be exported with its default value
        # even though the reading could not be parsed.
        try:
            metric_value = float(raw_value.split(' ')[0])
        except ValueError:
            log.error('Unparseable %r', line)
            continue
        attributes[metric_name].labels(disk.disk_id).set(metric_value)


def _parse_smart_info(output, disk, smart_healthy, device_info):
    """Set the health and info gauges from "smartctl --info --health" output.

    The disk is reported unhealthy (0) unless a line whose key starts with
    "smart health" or "smart overall-health" has the value "ok" or "passed"
    (case-insensitive).  Model and firmware default to "NA" when absent.
    """
    smart_healthy.labels(disk.disk_id).set(0)
    model, firmware = None, None

    for line in output.splitlines():
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        key = key.strip().lower()
        value = value.strip()

        if key in ('product', 'device model'):
            model = value
        # Plain equality: "key in ('firmware version')" would be a substring
        # test against a string, not a tuple membership test.
        if key == 'firmware version':
            firmware = value
        if _HEALTH_KEY_RE.match(key) and value.lower() in ('ok', 'passed'):
            smart_healthy.labels(disk.disk_id).set(1)

    device_info.labels(disk.disk_id, model or 'NA', firmware or 'NA').set(1)


# limitations:
# - one controller of one type per machine

# Maps a driver name from the "raid" fact to the generator listing its disks.
DRIVER_HANDLERS = {
    'megaraid': megaraid_list_pd,
    'hpsa': hpsa_list_pd,
}


def main():
    """Parse the command line, collect the metrics and emit them.

    Returns:
        The process exit status: 1 when the "raid" fact could not be read,
        0 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--outfile', metavar='FILE.prom',
                        help='Output file (stdout)')
    parser.add_argument('-d', '--debug', action='store_true', default=False,
                        help='Enable debug logging (%(default)s)')
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.debug else logging.WARNING)

    if args.outfile and not args.outfile.endswith('.prom'):
        parser.error('Output file does not end with .prom')

    raid_drivers = get_fact('raid')
    if raid_drivers is None:
        log.error('Invalid value for "raid" fact: %r', raid_drivers)
        return 1

    physical_disks = []
    for driver in raid_drivers:
        handler = DRIVER_HANDLERS.get(driver)
        if handler is not None:
            physical_disks.extend(handler())

    # No hardware RAID (or software RAID only): disks are directly visible.
    if not raid_drivers or raid_drivers == ['md']:
        physical_disks.extend(noraid_list_pd())

    log.debug('Gathering SMART data from physical disks: %r',
              [pd.disk_id for pd in physical_disks])

    registry = CollectorRegistry()
    collect_smart_metrics(physical_disks, registry)

    if args.outfile:
        write_to_textfile(args.outfile, registry)
    else:
        sys.stdout.write(generate_latest(registry).decode('utf-8'))
    return 0


if __name__ == '__main__':
    sys.exit(main())


# ---------------------------------------------------------------------------
# Provenance
# ---------------------------------------------------------------------------
# Filippo Giunchedi has uploaded a new change for review.
# ( https://gerrit.wikimedia.org/r/378039 )
#
# Change subject: [WIP] smart: new module
#
#   [WIP] smart: new module
#
#   Report smart attributes as Prometheus metrics
#
#   Bug: T86552
#   Change-Id: I324f23acb64f5b7c4e8250e9aba3374c5ceba22b
#
#   A modules/smart/files/20logger
#   A modules/smart/files/smart-data-dump
#   A modules/smart/manifests/init.pp
#   3 files changed, 298 insertions(+), 0 deletions(-)
#
#   git pull ssh://gerrit.wikimedia.org:29418/operations/puppet refs/changes/39/378039/1
#
# This script is modules/smart/files/smart-data-dump from that change
# (new file mode 100644, index 0000000..c56ed54, 251 lines as uploaded),
# with the review fixes described in the code comments above applied.
#
# To view, visit https://gerrit.wikimedia.org/r/378039
# To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
# Gerrit-MessageType: newchange
# Gerrit-Change-Id: I324f23acb64f5b7c4e8250e9aba3374c5ceba22b
# Gerrit-PatchSet: 1
# Gerrit-Project: operations/puppet
# Gerrit-Branch: production
# Gerrit-Owner: Filippo Giunchedi <fgiunch...@wikimedia.org>
# MediaWiki-commits mailing list, MediaWiki-commits@lists.wikimedia.org
# https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
#
# ---------------------------------------------------------------------------
# Companion files from the same change (line breaks and indentation
# reconstructed; content otherwise unchanged)
# ---------------------------------------------------------------------------
# NOTE(review): in 20logger, $input is expanded unquoted in the redirect.
# NOTE(review): init.pp declares "class smart::init"; the conventional name
#   for the class in a module's init.pp is the bare module name - confirm
#   that this autoloads as intended.
#
# modules/smart/files/20logger
# (new file mode 100644, index 0000000..4ddc291, 13 lines)
#
#   #!/bin/bash -e
#
#   # Send smartd warnings/errors via syslog instead of mail
#
#   if ! [ -x /usr/bin/logger ]; then
#       echo "Your system does not have /usr/bin/logger. Install the bsdutils package."
#       exit 1
#   fi
#
#   input=$1
#   shift
#
#   /usr/bin/logger -p local0.warning -t smart_failure < $input
#
# modules/smart/manifests/init.pp
# (new file mode 100644, index 0000000..6a7fb98, 34 lines)
#
#   class smart::init {
#       if $facts['is_virtual'] == true {
#           fail('smart module is not supported on virtual hosts')
#       }
#
#       # Prefer smartmontools version from backports (if any) because of newer
#       # smart drivedb.
#       package { 'smartmontools':
#           ensure => installed,
#           install_options => ['-t', "${::lsbdistcodename}-backports"],
#       }
#
#       # Make sure we send smart alerts from smartd via syslog and not email.
#       file { '/etc/smartmontools/run.d/10mail':
#           ensure => absent,
#           require => Package['smartmontools'],
#       }
#
#       file { '/etc/smartmontools/run.d/20logger':
#           ensure => present,
#           owner => 'root',
#           group => 'root',
#           mode => '0544',
#           source => "puppet:///modules/${module_name}/20logger",
#       }
#
#       file { '/usr/local/sbin/smart-data-dump':
#           ensure => present,
#           owner => 'root',
#           group => 'root',
#           mode => '0544',
#           source => "puppet:///modules/${module_name}/smart-data-dump",
#       }
#   }