ArielGlenn has submitted this change and it was merged.
Change subject: turn files auditor into salt module
......................................................................
turn files auditor into salt module
remotefileauditor invokes the salt module on the minions
retentionaudit is the module itself, a small interface to
the local audit class
localfilesaudit handles the actual audit on the local host
next up: convert logs audit in the same fashion
Change-Id: I0cd978a33cd2508197247355355e2141aadcab73
---
M dataretention/data_auditor.py
A dataretention/retention/localfileaudit.py
A dataretention/retention/remotefileauditor.py
A dataretention/retention/retentionaudit.py
M dataretention/retention/runner.py
5 files changed, 1,217 insertions(+), 24 deletions(-)
Approvals:
ArielGlenn: Verified; Looks good to me, approved
diff --git a/dataretention/data_auditor.py b/dataretention/data_auditor.py
index 739da38..67d9335 100644
--- a/dataretention/data_auditor.py
+++ b/dataretention/data_auditor.py
@@ -3,7 +3,8 @@
sys.path.append('/srv/audits/retention/scripts/')
from retention.cli import CommandLine
-from retention.auditor import FilesAuditor, LogsAuditor, HomesAuditor
+from retention.auditor import LogsAuditor, HomesAuditor
+from retention.remotefileauditor import RemoteFilesAuditor
from retention.examiner import FileExaminer, DirExaminer
def usage(message=None):
@@ -256,11 +257,11 @@
cmdline.run(report, ignored)
elif audit_type == 'root':
- filesaudit = FilesAuditor(hosts_expr, audit_type, prettyprint,
- show_sample_content, dirsizes,
- summary_report,
- depth, files_to_check, ignore_also,
- timeout, maxfiles, store_filepath, verbose)
+ filesaudit = RemoteFilesAuditor(hosts_expr, audit_type, prettyprint,
+ show_sample_content, dirsizes,
+ summary_report,
+ depth, files_to_check, ignore_also,
+ timeout, maxfiles, store_filepath,
verbose)
report, ignored = filesaudit.audit_hosts()
if interactive:
cmdline = CommandLine(store_filepath, timeout, audit_type,
hosts_expr)
diff --git a/dataretention/retention/localfileaudit.py
b/dataretention/retention/localfileaudit.py
new file mode 100644
index 0000000..7bdb705
--- /dev/null
+++ b/dataretention/retention/localfileaudit.py
@@ -0,0 +1,660 @@
+import os
+import sys
+import time
+import re
+import socket
+import runpy
+import stat
+import locale
+import logging
+
+sys.path.append('/srv/audits/retention/scripts/')
+
+import retention.utils
+import retention.magic
+from retention.rule import Rule
+from retention.config import Config
+from retention.fileinfo import FileInfo
+
+log = logging.getLogger(__name__)
+
+class LocalFilesAuditor(object):
+ '''
+ audit files on the local host
+ in a specified set of directories
+ '''
+ def __init__(self, audit_type,
+ show_content=False, dirsizes=False,
+ depth=2, to_check=None, ignore_also=None,
+ timeout=60, maxfiles=None):
+ '''
+ audit_type: type of audit e.g. 'logs', 'homes'
+ show_content: show the first line or so from problematic files
+ dirsizes: show only directories which have too many files to
+ audit properly, don't report on files at all
+ depth: the auditor will give up if a directory has too any files
+ it (saves it form dying on someone's 25gb homedir).
+ this option tells it how far down the tree to go from
+ the top dir of the audit, before starting to count.
+ e.g. do we count in /home/ariel or separately in
+ /home/ariel/* or in /home/ariel/*/*, etc.
+ to_check: comma-separated list of dirs (must end in '/') and/or
+ files that will be checked; if this is None then
+ all dirs/files will be checked
+ ignore_also: comma-separated list of dirs (must end in '/') and/or
+ files that will be skipped in addition to the ones
+ in the config, rules, etc.
+ timeout: salt timeout for running remote commands
+ maxfiles: how many files in a directory tree is too many to audit
+ (at which point we warn about that and move on)
+ '''
+
+ self.audit_type = audit_type
+ self.locations = audit_type + "_locations"
+ self.show_sample_content = show_content
+ self.dirsizes = dirsizes
+ self.depth = depth + 1 # actually count of path separators in dirname
+ self.to_check = to_check
+
+ self.filenames_to_check = None
+ self.dirs_to_check = None
+ self.set_up_to_check()
+
+ self.ignore_also = ignore_also
+ if self.ignore_also is not None:
+ self.ignore_also = self.ignore_also.split(',')
+ self.timeout = timeout
+
+ self.ignored = {}
+ self.set_up_ignored()
+
+ self.hostname = socket.getfqdn()
+
+ self.cutoff = Config.cf['cutoff']
+
+ self.perhost_rules_from_store = None
+ self.perhost_rules_from_file = None
+ self.set_up_perhost_rules()
+
+ self.today = time.time()
+ self.magic = retention.magic.magic_open(retention.magic.MAGIC_NONE)
+ self.magic.load()
+ self.summary = None
+ self.display_from_dict = FileInfo.display_from_dict
+ self.MAX_FILES = maxfiles
+ self.set_up_max_files()
+
+ def set_up_max_files(self):
+ '''
+ more than this many files in a subdir we won't process,
+ we'll just try to name top offenders
+
+ if we've been asked only to report dir trees that are
+ too large in this manner, we can set defaults mich
+ higher, since we don't stat files, open them to guess
+ their filetype, etc; processing then goes much quicker
+ '''
+
+ if self.MAX_FILES is None:
+ if self.dirsizes:
+ self.MAX_FILES = 1000
+ else:
+ self.MAX_FILES = 100
+
+ def set_up_to_check(self):
+ '''
+ turn the to_check arg into lists of dirs and files to check
+ '''
+ if self.to_check is not None:
+ check_list = self.to_check.split(',')
+ self.filenames_to_check = [fname for fname in check_list
+ if not fname.startswith(os.sep)]
+ if not len(self.filenames_to_check):
+ self.filenames_to_check = None
+ self.dirs_to_check = [d.rstrip(os.path.sep) for d in check_list
+ if d.startswith(os.sep)]
+
+ def set_up_perhost_rules(self):
+ self.perhost_rules_from_store = runpy.run_path(
+ '/srv/audits/retention/configs/%s_store.cf' %
self.hostname)['rules']
+ self.perhost_rules_from_file = runpy.run_path(
+ '/srv/audits/retention/configs/allhosts_file.cf')['perhostcf']
+
+ if self.perhost_rules_from_store is not None:
+ self.add_perhost_rules_to_ignored()
+
+ if (self.perhost_rules_from_file is not None and
+ 'ignored_dirs' in self.perhost_rules_from_file):
+ if '/' not in self.ignored['dirs']:
+ self.ignored['dirs']['/'] = []
+ if self.hostname in self.perhost_rules_from_file['ignored_dirs']:
+ for path in self.perhost_rules_from_file[
+ 'ignored_dirs'][self.hostname]:
+ if path.startswith('/'):
+ self.ignored['dirs']['/'].append(path)
+ if '*' in self.perhost_rules_from_file['ignored_dirs']:
+ for path in self.perhost_rules_from_file[
+ 'ignored_dirs'][self.hostname]:
+ if path.startswith('/'):
+ self.ignored['dirs']['/'].append(path)
+
+ def set_up_ignored(self):
+ '''
+ collect up initial list of files/dirs to skip during audit
+ '''
+ self.ignored['files'] = Config.cf['ignored_files']
+ self.ignored['dirs'] = Config.cf['ignored_dirs']
+ self.ignored['prefixes'] = Config.cf['ignored_prefixes']
+ self.ignored['extensions'] = Config.cf['ignored_extensions']
+
+ if self.ignore_also is not None:
+ # silently skip paths that are not absolute
+ for path in self.ignore_also:
+ if path.startswith('/'):
+ if path.endswith('/'):
+ if '/' not in self.ignored['dirs']:
+ self.ignored['dirs']['/'] = []
+ self.ignored['dirs']['/'].append(path[:-1])
+ else:
+ if '/' not in self.ignored['files']:
+ self.ignored['files']['/'] = []
+ self.ignored['files']['/'].append(path)
+
+ def add_perhost_rules_to_ignored(self):
+ '''
+ add dirs/files to be skipped during audit based
+ on rules in the rule store db
+ '''
+ if '/' not in self.ignored['dirs']:
+ self.ignored['dirs']['/'] = []
+ if '/' not in self.ignored['files']:
+ self.ignored['files']['/'] = []
+ for host in self.perhost_rules_from_store:
+ if host == self.hostname:
+ for rule in self.perhost_rules_from_store[host]:
+ path = os.path.join(rule['basedir'], rule['name'])
+ if rule['status'] == 'good':
+ if Rule.entrytype_to_text(rule['type']) == 'dir':
+ if path not in self.ignored['dirs']['/']:
+ self.ignored['dirs']['/'].append(path)
+ elif Rule.entrytype_to_text(rule['type']) == 'file':
+ if path not in self.ignored['files']['/']:
+ self.ignored['files']['/'].append(path)
+ else:
+ # some other random type, don't care
+ continue
+ break
+
+ @staticmethod
+ def startswith(string_arg, list_arg):
+ '''
+ check if the string arg starts with any elt in
+ the list_arg
+ '''
+ for elt in list_arg:
+ if string_arg.startswith(elt):
+ return True
+ return False
+
+ def contains(self, string_arg, list_arg):
+ '''
+ check if the string arg cotains any elt in
+ the list_arg
+ '''
+ for elt in list_arg:
+ if elt in string_arg:
+ return True
+ return False
+
+ @staticmethod
+ def endswith(string_arg, list_arg):
+ '''
+ check if the string arg ends with any elt in
+ the list_arg
+ '''
+ for elt in list_arg:
+ if string_arg.endswith(elt):
+ return True
+ return False
+
+ @staticmethod
+ def startswithpath(string_arg, list_arg):
+ '''
+ check if the string arg starts with any elt in
+ the list_arg and the next character, if any,
+ is the os dir separator
+ '''
+
+ for elt in list_arg:
+ if string_arg == elt or string_arg.startswith(elt + "/"):
+ return True
+ return False
+
+ @staticmethod
+ def subdir_check(dirname, directories):
+ '''
+ check if one of the directories listed is the
+ specified dirname or the dirname is somewhere in
+ a subtree of one of the listed directories,
+ returning True if so and fFalse otherwise
+ '''
+
+ # fixme test this
+ # also see if this needs to replace dirtree_checkeverywhere or not
+ for dname in directories:
+ if dname == dirname or dirname.startswith(dname + "/"):
+ return True
+ return False
+
+ @staticmethod
+ def dirtree_check(dirname, directories):
+ '''
+ check if the dirname is either a directory at or above one of
+ the the directories specified in the tree or vice versa, returning
+ True if so and fFalse otherwise
+ '''
+
+ for dname in directories:
+ if dirname == dname or dirname.startswith(dname + "/"):
+ return True
+ if dname.startswith(dirname + "/"):
+ return True
+ return False
+
+ @staticmethod
+ def expand_ignored_dirs(basedir, ignored):
+ '''
+ find dirs to ignore relative to the specified
+ basedir, in Config entry. Fall back to wildcard spec
+ if there is not entry for the basedir. Dirs in
+ Config entry may have one * in the path, this
+ will be treated as a wildcard for the purposes
+ of checking directories against the entry.
+
+ args: absolute path of basedir being crawled
+ hash of ignored dirs, file, etc
+ returns: list of absolute paths of dirs to ignore,
+ plus separate list of abslute paths containing '*',
+ also to ignore, or the empty list if there are none
+ '''
+
+ dirs = []
+ wildcard_dirs = []
+
+ to_expand = []
+ if 'dirs' in ignored:
+ if '*' in ignored['dirs']:
+ to_expand.extend(ignored['dirs']['*'])
+
+ if '/' in ignored['dirs']:
+ to_expand.extend(ignored['dirs']['/'])
+
+ if basedir in ignored['dirs']:
+ to_expand.extend(ignored['dirs'][basedir])
+
+ for dname in to_expand:
+ if '*' in dname:
+ wildcard_dirs.append(os.path.join(basedir, dname))
+ else:
+ dirs.append(os.path.join(basedir, dname))
+
+ return dirs, wildcard_dirs
+
+ @staticmethod
+ def wildcard_matches(dirname, wildcard_dirs, exact=True):
+ '''given a list of absolute paths with exactly one '*'
+ in each entry, see if the passed dirname matches
+ any of the list entries'''
+ for dname in wildcard_dirs:
+ if len(dirname) + 1 < len(dname):
+ continue
+
+ left, right = dname.split('*', 1)
+ if dirname.startswith(left):
+ if dirname.endswith(right):
+ return True
+ elif (not exact and
+ dirname.rfind(right + "/", len(left)) != -1):
+ return True
+ else:
+ continue
+ return False
+
+ def normalize(self, fname):
+ '''
+ subclasses may want to do something different, see
+ LogsAuditor for an example
+ '''
+ return fname
+
+ @staticmethod
+ def file_is_ignored(fname, basedir, ignored):
+ '''
+ pass normalized name (abs path), basedir (location audited),
+ hash of ignored files, dirs, prefixes, extensions
+ get back True if the file is to be ignored and
+ False otherwise
+ '''
+
+ basename = os.path.basename(fname)
+
+ if 'prefixes' in ignored:
+ if LocalFilesAuditor.startswith(basename, ignored['prefixes']):
+ return True
+
+ if 'extensions' in ignored:
+ if '*' in ignored['extensions']:
+ if LocalFilesAuditor.endswith(basename,
ignored['extensions']['*']):
+ return True
+ if basedir in ignored['extensions']:
+ if LocalFilesAuditor.endswith(
+ basename, ignored['extensions'][basedir]):
+ return True
+
+ if 'files' in ignored:
+ if basename in ignored['files']:
+ return True
+ if '*' in ignored['files']:
+ if LocalFilesAuditor.endswith(basename, ignored['files']['*']):
+ return True
+
+ if '/' in ignored['files']:
+ if fname in ignored['files']['/']:
+ return True
+ if LocalFilesAuditor.wildcard_matches(
+ fname, [w for w in ignored['files']['/'] if '*' in w]):
+ return True
+
+ if basedir in ignored['files']:
+ if LocalFilesAuditor.endswith(basename,
ignored['files'][basedir]):
+ return True
+ return False
+
+ def file_is_wanted(self, fname, basedir):
+ '''
+ decide if we want to audit the specific file or not
+ (is it ignored, or in an ignored directory, or of a type
+ we skip)
+ args: fname - the abs path to the file / dir
+
+ returns True if wanted or False if not
+ '''
+ fname = self.normalize(fname)
+
+ if LocalFilesAuditor.file_is_ignored(fname, basedir, self.ignored):
+ return False
+
+ if (self.filenames_to_check is not None and
+ fname not in self.filenames_to_check):
+ return False
+
+ return True
+
+ @staticmethod
+ def dir_is_ignored(dirname, ignored):
+ expanded_dirs, wildcard_dirs = LocalFilesAuditor.expand_ignored_dirs(
+ os.path.dirname(dirname), ignored)
+ if dirname in expanded_dirs:
+ return True
+ if LocalFilesAuditor.wildcard_matches(dirname, wildcard_dirs):
+ return True
+ return False
+
+ @staticmethod
+ def dir_is_wrong_type(dirname):
+ try:
+ dirstat = os.lstat(dirname)
+ except:
+ return True
+ if stat.S_ISLNK(dirstat.st_mode):
+ return True
+ if not stat.S_ISDIR(dirstat.st_mode):
+ return True
+ return False
+
+    def get_subdirs_to_do(self, dirname, dirname_depth, todo):
+        '''
+        recursively collect, into the depth-keyed hash todo, the
+        subdirectories of dirname that should be audited, stopping at
+        self.depth (both depths are counts of path separators);
+        returns the updated todo hash
+        '''
+
+        # NOTE(review): sets collation from the environment, presumably
+        # for consistent listdir ordering -- confirm why this lives here
+        locale.setlocale(locale.LC_ALL, '')
+        if LocalFilesAuditor.dir_is_ignored(dirname, self.ignored):
+            return todo
+        if LocalFilesAuditor.dir_is_wrong_type(dirname):
+            return todo
+
+        if self.depth < dirname_depth:
+            return todo
+
+        if dirname_depth not in todo:
+            todo[dirname_depth] = []
+
+        if self.dirs_to_check is not None:
+            # only record dirs inside the requested subtrees
+            if LocalFilesAuditor.subdir_check(dirname, self.dirs_to_check):
+                todo[dirname_depth].append(dirname)
+        else:
+            todo[dirname_depth].append(dirname)
+
+        if self.depth == dirname_depth:
+            # don't read below the depth level
+            return todo
+
+        dirs = [os.path.join(dirname, d)
+                for d in os.listdir(dirname)]
+        if self.dirs_to_check is not None:
+            dirs = [d for d in dirs if LocalFilesAuditor.dirtree_check(
+                d, self.dirs_to_check)]
+
+        for dname in dirs:
+            todo = self.get_subdirs_to_do(dname, dirname_depth + 1, todo)
+        return todo
+
+ def get_dirs_to_do(self, dirname):
+ if (self.dirs_to_check is not None and
+ not LocalFilesAuditor.dirtree_check(dirname,
self.dirs_to_check)):
+ return {}
+
+ todo = {}
+ depth_of_dirname = dirname.count(os.path.sep)
+ todo = self.get_subdirs_to_do(dirname, depth_of_dirname, todo)
+ return todo
+
+ def process_files_from_path(self, location, base, files, count,
+ results, checklink=True):
+ '''
+ arguments:
+ location: the location being checked
+ base: directory containing the files to be checked
+ files: files to be checked
+ count: number of files in result set so far for this location
+ results: the result set
+ '''
+
+ for fname, st in files:
+ path = os.path.join(base, fname)
+ if self.file_is_wanted(path, location):
+ count += 1
+ if count > self.MAX_FILES:
+ if self.dirsizes:
+ self.warn_dirsize(base)
+ else:
+ self.warn_too_many_files(base)
+ return count
+ # for dirsizes option we don't collect or report files
+ if not self.dirsizes:
+ results.append((path, st))
+ return count
+
+ def walk_nolinks(self, top):
+ '''replaces (and is stolen from) os.walk, checks for and skips
+ links, returns base, paths, files but it's guaranteed that
+ files really are regular files and base/paths are not symlinks
+ the files list is a list of filename, stat of that filename,
+ because we have to do the stat on it anyways to ensure it's a file
+ and not a dir, so the caller might as well get that info'''
+
+ try:
+ names = os.listdir(top)
+ except os.error, err:
+ return
+
+ dirs, files = [], []
+ for name in names:
+ try:
+ filestat = os.lstat(os.path.join(top, name))
+ except:
+ continue
+ if stat.S_ISLNK(filestat.st_mode):
+ continue
+ if stat.S_ISDIR(filestat.st_mode):
+ dirs.append(name)
+ elif stat.S_ISREG(filestat.st_mode):
+ files.append((name, filestat))
+ else:
+ continue
+
+ yield top, dirs, files
+
+ for name in dirs:
+ new_path = os.path.join(top, name)
+ for x in self.walk_nolinks(new_path):
+ yield x
+
+ def process_one_dir(self, location, subdirpath, depth, results):
+ '''
+ arguments:
+ location: the location being checked
+ subdirpath: the path to the subdirectory being checked
+ depth: the depth of the directory being checked (starting at 1)
+ results: the result set
+ '''
+ if self.dirs_to_check is not None:
+ if not LocalFilesAuditor.dirtree_check(subdirpath,
self.dirs_to_check):
+ return
+
+ if LocalFilesAuditor.dir_is_ignored(subdirpath, self.ignored):
+ return True
+
+ count = 0
+
+ # doing a directory higher up in the tree than our depth cutoff,
+ # only do the files in it, because we have the full list of dirs
+ # up to our cutoff we do them one by one
+ if depth < self.depth:
+ filenames = os.listdir(subdirpath)
+ files = []
+ for fname in filenames:
+ try:
+ filestat = os.stat(os.path.join(subdirpath, fname))
+ except:
+ continue
+ if (not stat.S_ISLNK(filestat.st_mode) and
+ stat.S_ISREG(filestat.st_mode)):
+ files.append((fname, filestat))
+ self.process_files_from_path(location, subdirpath,
+ files, count, results)
+ return
+
+ # doing a directory at our cutoff depth, walk it,
+ # because anything below the depth
+ # cutoff won't be in our list
+ temp_results = []
+ for base, paths, files in self.walk_nolinks(subdirpath):
+ expanded_dirs, wildcard_dirs =
LocalFilesAuditor.expand_ignored_dirs(
+ base, self.ignored)
+ if self.dirs_to_check is not None:
+ paths[:] = [p for p in paths
+ if
LocalFilesAuditor.dirtree_check(os.path.join(base, p),
+
self.dirs_to_check)]
+ paths[:] = [p for p in paths if
+ (not LocalFilesAuditor.startswithpath(os.path.join(
+ base, p), expanded_dirs) and
+ not LocalFilesAuditor.wildcard_matches(os.path.join(
+ base, p), wildcard_dirs, exact=False))]
+ count = self.process_files_from_path(location, base, files,
+ count, temp_results,
+ checklink=False)
+ if count > self.MAX_FILES:
+ return
+
+ results.extend(temp_results)
+
+ def find_all_files(self):
+ results = []
+ for location in Config.cf[self.locations]:
+ dirs_to_do = self.get_dirs_to_do(location)
+ if location.count(os.path.sep) >= self.depth + 1:
+ # do the run at least once
+ upper_end = location.count(os.path.sep) + 1
+ else:
+ upper_end = self.depth + 1
+ for depth in range(location.count(os.path.sep), upper_end):
+ if depth in dirs_to_do:
+ for dname in dirs_to_do[depth]:
+ self.process_one_dir(location, dname, depth, results)
+ return results
+
+ @staticmethod
+ def get_open_files():
+ '''
+ scrounge /proc/nnn/fd and collect all open files
+ '''
+ open_files = set()
+ dirs = os.listdir("/proc")
+ for dname in dirs:
+ if not re.match('^[0-9]+$', dname):
+ continue
+ try:
+ links = os.listdir(os.path.join("/proc", dname, "fd"))
+ except:
+ # process may have gone away
+ continue
+ # must follow sym link for all of these, yuck
+ files = set()
+ for link in links:
+ try:
+ files.add(os.readlink(os.path.join("/proc", dname,
+ "fd", link)))
+ except:
+ continue
+ open_files |= files
+ return open_files
+
+    def warn_too_many_files(self, path=None):
+        '''
+        print a warning that there are too many files to audit, naming
+        the directory (truncated to the audit depth) when one is given
+        '''
+        # py2 trailing-comma print: stays on the same line so the
+        # directory name below continues this message
+        print "WARNING: too many files to audit",
+        if path is not None:
+            fields = path.split(os.path.sep)
+            print "in directory %s" % os.path.sep.join(fields[:self.depth + 1])
+
+    def warn_dirsize(self, path):
+        '''
+        print a warning that the directory (truncated to the audit
+        depth) holds more than MAX_FILES files
+        '''
+        fields = path.split(os.path.sep)
+        print ("WARNING: directory %s has more than %d files"
+               % (os.path.sep.join(fields[:self.depth + 1]), self.MAX_FILES))
+
+    def do_local_audit(self):
+        '''
+        run the file audit on this host: collect candidate files and
+        their metadata, filter out ignored filetypes and empty files,
+        and return the formatted report as one newline-joined string
+        '''
+        open_files = LocalFilesAuditor.get_open_files()
+
+        all_files = {}
+        files = self.find_all_files()
+
+        count = 0
+        for (f, st) in files:
+            # debugging aid: show the first few results found
+            if count < 10:
+                print "got", f, st
+            count += 1
+            all_files[f] = FileInfo(f, self.magic, st)
+            all_files[f].load_file_info(self.today, self.cutoff, open_files)
+
+        all_files_sorted = sorted(all_files, key=lambda f: all_files[f].path)
+        result = []
+
+        # pad names to the longest path for aligned report columns
+        if all_files:
+            max_name_length = max([len(all_files[fname].path)
+                                   for fname in all_files]) + 2
+
+        for fname in all_files_sorted:
+            if (not self.contains(all_files[fname].filetype,
+                                  Config.cf['ignored_types'])
+                and not all_files[fname].is_empty):
+                result.append(all_files[fname].format_output(
+                    self.show_sample_content, False,
+                    max_name_length))
+        output = "\n".join(result) + "\n"
+        return output
diff --git a/dataretention/retention/remotefileauditor.py
b/dataretention/retention/remotefileauditor.py
new file mode 100644
index 0000000..3762bfa
--- /dev/null
+++ b/dataretention/retention/remotefileauditor.py
@@ -0,0 +1,516 @@
+import os
+import sys
+import time
+import json
+import socket
+import runpy
+
+sys.path.append('/srv/audits/retention/scripts/')
+
+import retention.utils
+import retention.magic
+from retention.status import Status
+from retention.saltclientplus import LocalClientPlus
+from retention.rule import Rule, RuleStore
+from retention.config import Config
+from retention.fileinfo import FileInfo
+from retention.utils import JsonHelper
+from retention.runner import Runner
+from retention.localfileaudit import LocalFilesAuditor
+
+# NOTE(review): snapshot of this module's global names taken at import
+# time; no consumer is visible in this chunk -- presumably used later to
+# tell runtime-added globals apart from the originals. Confirm before
+# removing.
+global_keys = [key for key, value_unused in
+               sys.modules[__name__].__dict__.items()]
+
+def get_dirs_toexamine(host_report):
+ '''
+ given full report output from host (list of
+ json entries), return the list
+ of directories with at least one possibly old file
+ and the list of directories skipped due to too
+ many entries
+ '''
+ dirs_problem = set()
+ dirs_skipped = set()
+ lines = host_report.split("\n")
+ for json_entry in lines:
+ if json_entry == "":
+ continue
+
+ if json_entry.startswith("WARNING:"):
+ bad_dir = RemoteFilesAuditor.get_dirname_from_warning(json_entry)
+ if bad_dir is not None:
+ dirs_skipped.add(bad_dir)
+ continue
+
+ if (json_entry.startswith("WARNING:") or
+ json_entry.startswith("INFO:")):
+ print json_entry
+ continue
+
+ try:
+ entry = json.loads(json_entry,
+ object_hook=JsonHelper.decode_dict)
+ except:
+ print "WARNING: failed to load json for", json_entry
+ continue
+ if 'empty' in entry:
+ empty = FileInfo.string_to_bool(entry['empty'])
+ if empty:
+ continue
+ if 'old' in entry:
+ old = FileInfo.string_to_bool(entry['old'])
+ if old is None or old:
+ if os.path.dirname(entry['path']) not in dirs_problem:
+ dirs_problem.add(os.path.dirname(entry['path']))
+ return sorted(list(dirs_problem)), sorted(list(dirs_skipped))
+
+
+class RemoteFilesAuditor(object):
+ '''
+ audit files across a set of remote hosts,
+ in a specified set of directories
+ '''
+ def __init__(self, hosts_expr, audit_type, prettyprint=False,
+ show_content=False, dirsizes=False, summary_report=False,
+ depth=2, to_check=None, ignore_also=None,
+ timeout=60, maxfiles=None,
+ store_filepath=None,
+ verbose=False):
+ '''
+ hosts_expr: list or grain-based or wildcard expr for hosts
+ to be audited
+ audit_type: type of audit e.g. 'logs', 'homes'
+ prettyprint: nicely format the output display
+ show_content: show the first line or so from problematic files
+ dirsizes: show only directories which have too many files to
+ audit properly, don't report on files at all
+ summary_report: do a summary of results instead of detailed
+ this means different thiings depending on the audit
+ type
+ depth: the auditor will give up if a directory has too any files
+ it (saves it form dying on someone's 25gb homedir).
+ this option tells it how far down the tree to go from
+ the top dir of the audit, before starting to count.
+ e.g. do we count in /home/ariel or separately in
+ /home/ariel/* or in /home/ariel/*/*, etc.
+ to_check: comma-separated list of dirs (must end in '/') and/or
+ files that will be checked; if this is None then
+ all dirs/files will be checked
+ ignore_also: comma-separated list of dirs (must end in '/') and/or
+ files that will be skipped in addition to the ones
+ in the config, rules, etc.
+ timeout: salt timeout for running remote commands
+ maxfiles: how many files in a directory tree is too many to audit
+ (at which point we warn about that and move on)
+ store_filepath: full path to rule store (sqlite3 db)
+ verbose: show informative messages during processing
+ '''
+
+ global rules
+
+ self.hosts_expr = hosts_expr
+ self.audit_type = audit_type
+ self.locations = audit_type + "_locations"
+ self.prettyprint = prettyprint
+ self.show_sample_content = show_content
+ self.dirsizes = dirsizes
+ self.show_summary = summary_report
+ self.depth = depth + 1 # actually count of path separators in dirname
+ self.to_check = to_check
+
+ self.ignore_also = ignore_also
+ if self.ignore_also is not None:
+ self.ignore_also = self.ignore_also.split(',')
+ self.timeout = timeout
+ self.store_filepath = store_filepath
+ self.verbose = verbose
+
+ self.set_up_ignored()
+
+ # need this for locally running jobs
+ self.hostname = socket.getfqdn()
+
+ self.cutoff = Config.cf['cutoff']
+
+ client = LocalClientPlus()
+ hosts, expr_type = Runner.get_hosts_expr_type(self.hosts_expr)
+ self.expanded_hosts = client.cmd_expandminions(
+ hosts, "test.ping", expr_form=expr_type)
+
+ self.set_up_max_files(maxfiles)
+ fileaudit_args = [self.show_sample_content,
+ self.dirsizes,
+ self.depth - 1,
+ self.to_check,
+ ",".join(self.ignore_also) if self.ignore_also is
not None else None,
+ self.timeout,
+ self.MAX_FILES]
+
+ self.runner = Runner(hosts_expr,
+ self.expanded_hosts,
+ self.audit_type,
+ fileaudit_args,
+ self.show_sample_content,
+ self.to_check,
+ self.timeout,
+ self.verbose)
+
+ self.perhost_raw = None
+ if
os.path.exists('/srv/audits/retention/scripts/audit_files_perhost_config.py'):
+ try:
+ self.perhost_rules_from_file = runpy.run_path(
+
'/srv/audits/retention/scripts/audit_files_perhost_config.py')['perhostcf']
+ self.perhost_raw = open(
+
'/srv/audits/retention/scripts/audit_files_perhost_config.py').read()
+ except:
+ pass
+
+ self.write_rules_for_minion()
+
+ self.cdb = RuleStore(self.store_filepath)
+ self.cdb.store_db_init(self.expanded_hosts)
+ self.set_up_and_export_rule_store()
+
+ self.show_ignored(Config.cf[self.locations])
+
+ self.today = time.time()
+ self.magic = retention.magic.magic_open(retention.magic.MAGIC_NONE)
+ self.magic.load()
+ self.summary = None
+ self.display_from_dict = FileInfo.display_from_dict
+
+ def set_up_max_files(self, maxfiles):
+ '''
+ more than this many files in a subdir we won't process,
+ we'll just try to name top offenders
+
+ if we've been asked only to report dir trees that are
+ too large in this manner, we can set defaults mich
+ higher, since we don't stat files, open them to guess
+ their filetype, etc; processing then goes much quicker
+ '''
+
+ if maxfiles is None:
+ if self.dirsizes:
+ self.MAX_FILES = 1000
+ else:
+ self.MAX_FILES = 100
+ else:
+ self.MAX_FILES = maxfiles
+
+ def set_up_and_export_rule_store(self):
+ hosts = self.cdb.store_db_list_all_hosts()
+ where_to_put = os.path.join(os.path.dirname(self.store_filepath),
+ "data_retention.d")
+ if not os.path.isdir(where_to_put):
+ os.makedirs(where_to_put, 0755)
+ for host in hosts:
+ nicepath = os.path.join(where_to_put, host + ".conf")
+ Rule.export_rules(self.cdb, nicepath, host)
+
+ def set_up_ignored(self):
+ '''
+ collect up initial list of files/dirs to skip during audit
+ '''
+ self.ignored = {}
+ self.ignored['files'] = Config.cf['ignored_files']
+ self.ignored['dirs'] = Config.cf['ignored_dirs']
+ self.ignored['prefixes'] = Config.cf['ignored_prefixes']
+ self.ignored['extensions'] = Config.cf['ignored_extensions']
+
+ if self.ignore_also is not None:
+ # silently skip paths that are not absolute
+ for path in self.ignore_also:
+ if path.startswith('/'):
+ if path.endswith('/'):
+ if '/' not in self.ignored['dirs']:
+ self.ignored['dirs']['/'] = []
+ self.ignored['dirs']['/'].append(path[:-1])
+ else:
+ if '/' not in self.ignored['files']:
+ self.ignored['files']['/'] = []
+ self.ignored['files']['/'].append(path)
+
+    def get_perhost_rules_as_json(self):
+        '''
+        this reads from the data_retention.d directory files for the minions
+        on which the audit will be run, converts each host's rules to json
+        strings, and returns a hash of rules where keys are the hostname and
+        values are the list of rules on that host
+        '''
+        where_to_get = os.path.join(os.path.dirname(self.store_filepath),
+                                    "data_retention.d")
+        if not os.path.isdir(where_to_get):
+            os.mkdir(where_to_get, 0755)
+        # really? or just read each file and be done with it?
+        # also I would like to check the syntax cause paranoid.
+        rules = {}
+        self.cdb = RuleStore(self.store_filepath)
+        self.cdb.store_db_init(self.expanded_hosts)
+        for host in self.expanded_hosts:
+            rules[host] = []
+            nicepath = os.path.join(where_to_get, host + ".conf")
+            if os.path.exists(nicepath):
+                dir_rules = None
+                try:
+                    text = open(nicepath)
+                    # HACK/security note: executes the per-host conf file
+                    # to populate dir_rules; the file is locally generated
+                    # by this tool, but exec of config remains risky --
+                    # flagged for review rather than changed here
+                    exec(text)
+                except:
+                    continue
+                if dir_rules is not None:
+                    for status in Status.status_cf:
+                        if status in dir_rules:
+                            for entry in dir_rules[status]:
+                                if entry[0] != os.path.sep:
+                                    print ("WARNING: relative path in rule,"
+                                           "skipping:", entry)
+                                    continue
+                                if entry[-1] == os.path.sep:
+                                    entry = entry[:-1]
+                                    entry_type = Rule.text_to_entrytype('dir')
+                                else:
+                                    entry_type = Rule.text_to_entrytype('file')
+                                rule = Rule.get_rule_as_json(
+                                    entry, entry_type, status)
+                                rules[host].append(rule)
+        return rules
+
+ def write_perhost_rules_normal_code(self, indent):
+ rules = self.get_perhost_rules_as_json()
+
+ for host in rules:
+ rulescode = "rules = {}\n\n"
+ rulescode += "rules['%s'] = [\n" % host
+ rulescode += (indent +
+ (",\n%s" % (indent + indent)).join(rules[host]) + "\n")
+ rulescode += "]\n"
+
+ with open("/srv/salt/audits/retention/configs/%s_store.py" % host,
"w+") as fp:
+ fp.write(rulescode)
+ fp.close()
+
+ def write_rules_for_minion(self):
+ indent = " "
+ self.write_perhost_rules_normal_code(indent)
+ if self.perhost_raw is not None:
+ with open("/srv/salt/audits/retention/configs/allhosts_file.py",
"w+") as fp:
+ fp.write(self.perhost_raw)
+ fp.close()
+
+ def show_ignored(self, basedirs):
+ if self.verbose:
+ sys.stderr.write(
+ "INFO: The below does not include per-host rules\n")
+ sys.stderr.write(
+ "INFO: or rules derived from the directory status entries.\n")
+
+ sys.stderr.write("INFO: Ignoring the following directories:\n")
+
+ for basedir in self.ignored['dirs']:
+ if basedir in basedirs or basedir == '*' or basedir == '/':
+ sys.stderr.write(
+ "INFO: " + ','.join(self.ignored['dirs'][basedir])
+ + " in " + basedir + '\n')
+
+ sys.stderr.write("INFO: Ignoring the following files:\n")
+ for basedir in self.ignored['files']:
+ if basedir in basedirs or basedir == '*' or basedir == '/':
+ sys.stderr.write(
+ "INFO: " + ','.join(self.ignored['files'][basedir])
+ + " in " + basedir + '\n')
+
+ sys.stderr.write(
+ "INFO: Ignoring files starting with the following:\n")
+ sys.stderr.write(
+ "INFO: " + ','.join(self.ignored['prefixes']) + '\n')
+
+ sys.stderr.write(
+ "INFO: Ignoring files ending with the following:\n")
+ for basedir in self.ignored['extensions']:
+ if basedir in basedirs or basedir == '*':
+ sys.stderr.write("INFO: " + ','.join(
+ self.ignored['extensions'][basedir])
+ + " in " + basedir + '\n')
+
+ def contains(self, string_arg, list_arg):
+ '''
+ check if the string arg cotains any elt in
+ the list_arg
+ '''
+ for elt in list_arg:
+ if elt in string_arg:
+ return True
+ return False
+
+ def normalize(self, fname):
+ '''
+ subclasses may want to do something different, see
+ LogsAuditor for an example
+ '''
+ return fname
+
+ @staticmethod
+ def get_dirname_from_warning(warning):
+ '''
+ some audit output lines warn about directory trees
+ having too many files to audit; grab the dirname
+ out of such a line and return it
+ '''
+ start = "WARNING: directory "
+ if warning.startswith(start):
+ # WARNING: directory %s has more than %d files
+ rindex = warning.rfind(" has more than")
+ if not rindex:
+ return None
+ else:
+ return warning[len(start):rindex]
+
+ start = "WARNING: too many files to audit in directory "
+ if warning.startswith(start):
+ return warning[len(start):]
+
+ return None
+
+ def add_stats(self, item, summary):
+ '''
+ gather stats on how many files/dirs
+ may be problematic; summary is where the results
+ are collected, item is the item to include in
+ the summary if needed
+ '''
+ dirname = os.path.dirname(item['path'])
+
+ if dirname not in summary:
+ summary[dirname] = {
+ 'binary': {'old': 0, 'maybe_old': 0, 'nonroot': 0},
+ 'text': {'old': 0, 'maybe_old': 0, 'nonroot': 0}
+ }
+ if item['binary'] is True:
+ group = 'binary'
+ else:
+ group = 'text'
+
+ if item['old'] == 'T':
+ summary[dirname][group]['old'] += 1
+ elif item['old'] == '-':
+ summary[dirname][group]['maybe_old'] += 1
+ if item['owner'] != 0:
+ summary[dirname][group]['nonroot'] += 1
+ return summary
+
+ def display_host_summary(self):
+ if self.summary is not None:
+ paths = sorted(self.summary.keys())
+ for path in paths:
+ for group in self.summary[path]:
+ if (self.summary[path][group]['old'] > 0 or
+ self.summary[path][group]['maybe_old'] > 0 or
+ self.summary[path][group]['nonroot'] > 0):
+ print ("in directory %s, (%s), %d old,"
+ " %d maybe old, %d with non root owner"
+ % (path, group,
self.summary[path][group]['old'],
+ self.summary[path][group]['maybe_old'],
+ self.summary[path][group]['nonroot']))
+
+ def display_summary(self, result):
+ for host in result:
+ self.summary = {}
+ print "host:", host
+
+ if result[host]:
+ self.summary = {}
+ try:
+ lines = result[host].split('\n')
+ for line in lines:
+ if line == '':
+ continue
+ if (line.startswith("WARNING:") or
+ line.startswith("INFO:")):
+ print line
+ continue
+ else:
+ try:
+ item = json.loads(
+ line, object_hook=JsonHelper.decode_dict)
+ if item['empty'] is not True:
+ self.add_stats(item, self.summary)
+ except:
+ print "WARNING: failed to json load from host",
+ print host, "this line:", line
+ self.display_host_summary()
+ except:
+ print "WARNING: failed to process output from host"
+ else:
+ if self.verbose:
+ print "WARNING: no output from host", host
+
+ def display_remote_host(self, result):
+ try:
+ lines = result.split('\n')
+ files = []
+ for line in lines:
+ if line == "":
+ continue
+ elif line.startswith("WARNING:") or line.startswith("INFO:"):
+ print line
+ else:
+ files.append(json.loads(line,
object_hook=JsonHelper.decode_dict))
+
+ if files == []:
+ return
+ path_justify = max([len(finfo['path']) for finfo in files]) + 2
+ for finfo in files:
+ self.display_from_dict(finfo, self.show_sample_content,
path_justify)
+ except:
+ print "WARNING: failed to load json from host"
+
+ def audit_hosts(self):
+ result = self.runner.run_remotely()
+ if result is None:
+ print "WARNING: failed to get output from audit script on any host"
+ elif self.show_summary:
+ self.display_summary(result)
+ else:
+ for host in result:
+ print "host:", host
+ if result[host]:
+ self.display_remote_host(result[host])
+ else:
+ if self.verbose:
+ print "no output from host", host
+ # add some results to rule store
+ self.update_status_rules_from_report(result)
+ return result, self.ignored
+
+ def update_status_rules_from_report(self, report):
+ hostlist = report.keys()
+ for host in hostlist:
+ try:
+ problem_rules = Rule.get_rules(self.cdb, host,
Status.text_to_status('problem'))
+ except:
+ print 'WARNING: problem retrieving problem rules for host',
host
+ problem_rules = None
+ if problem_rules is not None:
+ existing_problems = [rule['path'] for rule in problem_rules]
+ else:
+ existing_problems = []
+
+ dirs_problem, dirs_skipped = get_dirs_toexamine(report[host])
+ if dirs_problem is not None:
+ dirs_problem = list(set(dirs_problem))
+ for dirname in dirs_problem:
+ Rule.do_add_rule(self.cdb, dirname,
+ Rule.text_to_entrytype('dir'),
+ Status.text_to_status('problem'), host)
+
+ if dirs_skipped is not None:
+ dirs_skipped = list(set(dirs_skipped))
+ for dirname in dirs_skipped:
+ if dirname in dirs_problem or dirname in existing_problems:
+ # problem report overrides 'too many to audit'
+ continue
+ Rule.do_add_rule(self.cdb, dirname,
+ Rule.text_to_entrytype('dir'),
+ Status.text_to_status('unreviewed'), host)
+
+
diff --git a/dataretention/retention/retentionaudit.py
b/dataretention/retention/retentionaudit.py
new file mode 100644
index 0000000..b7fefc5
--- /dev/null
+++ b/dataretention/retention/retentionaudit.py
@@ -0,0 +1,17 @@
+# salt module
+import sys
+import logging
+
+sys.path.append('/srv/audits/retention/scripts/')
+
+from retention.localfileaudit import LocalFilesAuditor
+
def fileaudit_host(show_content, dirsizes, depth,
                   to_check, ignore_also, timeout,
                   maxfiles):
    '''
    salt module entry point: run the local files audit of type
    'root' on this minion and return its result

    arguments are passed straight through to LocalFilesAuditor
    '''
    auditor = LocalFilesAuditor('root', show_content,
                                dirsizes, depth, to_check,
                                ignore_also, timeout, maxfiles)
    return auditor.do_local_audit()
diff --git a/dataretention/retention/runner.py
b/dataretention/retention/runner.py
index 5a03930..aef8867 100644
--- a/dataretention/retention/runner.py
+++ b/dataretention/retention/runner.py
@@ -11,7 +11,7 @@
'''
def __init__(self, hosts_expr, expanded_hosts,
- audit_type, generate_executor,
+ audit_type, auditor_args,
show_sample_content=False, to_check=None,
timeout=30, verbose=False):
self.hosts_expr = hosts_expr
@@ -19,22 +19,21 @@
self.hosts, self.hosts_expr_type = Runner.get_hosts_expr_type(
self.hosts_expr)
self.audit_type = audit_type
- self.generate_executor = generate_executor
+ self.auditmodule_args = auditor_args
self.show_sample_content = show_sample_content
self.to_check = to_check
self.timeout = timeout
self.verbose = verbose
- @staticmethod
- def running_locally(hosts_expr):
- '''
- determine whether this script is to run on the local
- host or on one or more remote hosts
- '''
- if hosts_expr == "127.0.0.1" or hosts_expr == "localhost":
- return True
+ def get_auditfunction_name(self):
+ if self.audit_type == 'root':
+ return 'fileaudit_host'
+ elif self.audit_type == 'logs':
+ return 'logaudit_host'
+ elif self.audit_type == 'homes':
+ return 'homeaudit_host'
else:
- return False
+ return None
def run_remotely(self):
'''
@@ -46,10 +45,9 @@
if self.expanded_hosts is None:
self.expanded_hosts = client.cmd_expandminions(
self.hosts, "test.ping", expr_form=self.hosts_expr_type)
- code = "# -*- coding: utf-8 -*-\n"
- code += self.generate_executor()
- with open('/srv/audits/retention/scripts/data_auditor.py', 'r') as fp_:
- code += fp_.read()
+
+ # fixme instead of this we call the right salt module based on the
+ # audit type and with the self.auditmodule_args which is a list
hostbatches = [self.expanded_hosts[i: i + Config.cf['batchsize']]
for i in range(0, len(self.expanded_hosts),
@@ -72,14 +70,15 @@
'template=jinja'],
expr_form='list')
# fixme only copy if exists, check returns
# fixme this content should be ordered by host instead of by
ignore-list type
- # and split into separate files just as the previous files are
+ # and split into separate files just as the previous files are,
and actually be in one file
+ # with one copy total per client
new_result = client.cmd_full_return(hosts, 'cp.get_file',
['salt://audits/retention/configs/allhosts_file.py',
"/srv/audits/retention/configs/allhosts_file.cf",
'template=jinja'],
expr_form='list')
- print "salt-copy (2):", new_result
- new_result = client.cmd(hosts, "cmd.exec_code", ["python2", code],
+ # step two: run the appropriate salt audit module function
+ new_result = client.cmd(hosts, "retentionaudit.%s" %
self.get_auditfunction_name(), self.auditmodule_args,
expr_form='list', timeout=self.timeout)
if new_result is not None:
--
To view, visit https://gerrit.wikimedia.org/r/233453
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I0cd978a33cd2508197247355355e2141aadcab73
Gerrit-PatchSet: 2
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits