Merlijn van Deen has submitted this change and it was merged. Change subject: Add report that verifies view definitions ......................................................................
Add report that verifies view definitions - Diff the Table and Column objects from greylisted/whitelisted files vs reality and report on things that are missing - Use a combination of regex (to clean up) and pyparsing to parse the SQL used to define a view and convert that into Table and Column objects - Fix greylisted.yaml to match reality - mostly human errors that were made when transcribing from the perl file - Fully paranthesize all SQL expressions in greylisted.yaml since that is how MySQL seems to give them back when asking for view definitions - Add diff_iters and common_iters helpers to do set operations on lists. We want to keep them as lists since if we output them as sets to YAML then YAML doesn't output them cleanly and adds a python set decorator Bug: T85473 Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c --- M auditor/audit.py M auditor/models.py M auditor/reports/databases.py M auditor/reports/tables.py A auditor/reports/viewdiffs.py R auditor/utils.py M greylisted.yaml M requirements.txt 8 files changed, 257 insertions(+), 99 deletions(-) Approvals: Merlijn van Deen: Verified; Looks good to me, approved diff --git a/auditor/audit.py b/auditor/audit.py index 958dda5..c3d6a6b 100644 --- a/auditor/audit.py +++ b/auditor/audit.py @@ -33,6 +33,7 @@ from reports.databases import databases_report from reports.tables import extra_tables_report +from reports.viewdiffs import views_schema_diff_report argparser = argparse.ArgumentParser() @@ -57,6 +58,7 @@ rr.register_report(databases_report) rr.register_report(extra_tables_report) +rr.register_report(views_schema_diff_report) logging.info('Starting report generation') reports = rr.run() diff --git a/auditor/models.py b/auditor/models.py index 38d37b4..af4131c 100644 --- a/auditor/models.py +++ b/auditor/models.py @@ -17,6 +17,8 @@ import yaml import os +from utils import diff_iters + class Column(object): """ @@ -87,7 +89,11 @@ else: # Greylisted table! for colname, coldata in tabledata['columns'].items(): - table.add_column(Column(colname, coldata.get('whitelisted'), coldata.get('condition'), coldata.get('replacewith'))) + col = Column(colname, + whitelisted=coldata.get('whitelisted'), + condition=coldata.get('condition'), + replacewith=coldata.get('replacewith')) + table.add_column(col) return table @@ -127,6 +133,6 @@ with open(all_dblist_path) as all_file, open(private_dblist_path) as priv_file: all_dbs = [l.strip() for l in all_file.readlines()] priv_dbs = [l.strip() for l in priv_file.readlines()] - dbs = list(set(all_dbs) - (set(priv_dbs))) + dbs, _ = diff_iters(all_dbs, priv_dbs) return cls(dbs, tables) diff --git a/auditor/reports/databases.py b/auditor/reports/databases.py index c2ee418..4e64f4a 100644 --- a/auditor/reports/databases.py +++ b/auditor/reports/databases.py @@ -13,7 +13,7 @@ # limitations under the License. import re -from ..dbutils import get_databases +from ..utils import get_databases def databases_report(config, model, conn): diff --git a/auditor/reports/tables.py b/auditor/reports/tables.py index e4309d6..ab35bce 100644 --- a/auditor/reports/tables.py +++ b/auditor/reports/tables.py @@ -14,7 +14,7 @@ import logging import MySQLdb -from ..dbutils import get_tables +from ..utils import get_tables, diff_iters def _get_extra_tables(model, conn, dbs): @@ -22,7 +22,7 @@ for db in dbs: logging.debug('Finding extra tables on %s', db) try: - diff = set(get_tables(conn, db)) - set(model.tables.keys()) + extra_tables, _ = diff_iters(get_tables(conn, db), model.tables) except MySQLdb.OperationalError as e: # Ignore missing database errors (cod 1049). # Not using MySQLdb.constants.ER because it is stupid and needs to be explicitly imported to use. @@ -31,8 +31,8 @@ continue # We have other reports taking care of unknown dbs else: raise - if diff: - for tablename in diff: + if extra_tables: + for tablename in extra_tables: if tablename in extra_tables: extra_tables[tablename].append(db) else: diff --git a/auditor/reports/viewdiffs.py b/auditor/reports/viewdiffs.py new file mode 100644 index 0000000..e9e91d0 --- /dev/null +++ b/auditor/reports/viewdiffs.py @@ -0,0 +1,136 @@ +# Copyright 2015 Yuvi Panda <yuvipa...@gmail.com> +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import logging +from pyparsing import OneOrMore, Optional, Word, SkipTo, StringEnd, alphanums + +from ..models import Column, Table +from ..utils import get_tables, diff_iters, common_iters + + +def _diff(expected, actual, fields): + diff = {} + for field in fields: + a = expected.__getattribute__(field) + b = actual.__getattribute__(field) + if a != b: + diff[field] = { + 'expected': a, + 'found': b + } + return diff + + +def diff_columns(expected, actual): + return _diff(expected, actual, ('name', 'whitelisted', 'condition', 'replacewith')) + + +def diff_tables(expected, actual): + table_diff = _diff(expected, actual, ('name', 'where', 'table_name')) + missing_cols, extra_cols = diff_iters(expected.columns, actual.columns) + common_columns = common_iters(actual.columns, expected.columns) + column_diffs = {} + for name in common_columns: + col_diff = diff_columns(expected.columns[name], actual.columns[name]) + if col_diff: + column_diffs[name] = col_diff + + column_info = {} + if missing_cols: + column_info['missing'] = missing_cols + if extra_cols: + column_info['extra'] = extra_cols + if column_diffs: + column_info['diff'] = column_diffs + if column_info: + table_diff['columns'] = column_info + return table_diff if table_diff else None + + +EXTRACT_VIEWDEF_RE = re.compile(r"DEFINER VIEW `([^`]+)` AS (.*)") + + +def _cleanup_viewdefiner(full_sql, db, table): + """ + Clean up the view definer SQL to make it easier to parse + + Picks out only the SELECT statement from the definer. + Also reduces all references of form dbname.tablename.column to just column + Removes backticks as well + """ + definer_sql = EXTRACT_VIEWDEF_RE.search(full_sql).groups()[1] + private_db_name = db.replace('_p', '') + sql = definer_sql.replace('`', '') \ + .replace('%s.%s.' % (private_db_name, table), '') + + return sql + +# We have to parse a specific subset of SQL, that is used to define views +# The Grammar is: +# +# sql = "SELECT" { column-defintions } "FROM" wikiname.tablename [ "WHERE" cond ] +# column-definition = column-name "AS" column-name +# | "NULL" "AS" column-name +# | "IF(" cond ", NULL, " column-name ")" "AS" column-name +# +# Where cond is a valid SQL relational expression, column-name, wikiname, tablename are strings + +identifier = alphanums + "_" +if_definition = "if(" + SkipTo(",")("condition") + "," + Word(identifier) + "," + Word(identifier) + ")" +column_definition = (if_definition ^ Word(identifier)('expression')) + "AS" + Word(identifier)("name") + Optional(",") +sql_definition = "select" + OneOrMore(column_definition)("columns") + \ + "from" + Word(identifier) + "." + Word(identifier)("tablename") + \ + Optional("where" + SkipTo(StringEnd())("where")) + StringEnd() + + +def _table_from_definer(sql, viewname): + """ + Build a Table object given a cleaned up SQL statement that defines the view + """ + res = sql_definition.parseString(sql) + table = Table(viewname, {}, res.where if res.where else None, res.tablename) + for tokens, start, end in column_definition.scanString(sql): + table.add_column(Column(tokens.name, + whitelisted=tokens.condition == '' and tokens.expression != 'NULL', + condition=tokens.condition if tokens.condition else None)) + + return table + + +def views_schema_diff_report(config, model, conn): + """ + Diff between schema of views in the public db and how they should be + """ + report = {} + cur = conn.cursor() + for db in model.public_dbs: + report_db = {} + all_tables = get_tables(conn, db) + # We want tables that are present in this db and in our model + # because missing / extra tables are reported from tables.py + table_names = common_iters(all_tables, model.tables) + for name in table_names: + table = model.tables[name] + cur.execute('SHOW CREATE VIEW %s.%s' % (db, name)) + sql = _cleanup_viewdefiner(cur.fetchone()[1], db, table.table_name) + t = _table_from_definer(sql, name) + diff = diff_tables(table, t) + if diff: + report_db[name] = diff + if report_db: + report[db] = report_db + logging.info("Differences found in DB %s - %s tables with differences", db, len(report_db)) + else: + logging.info("No differences found in DB %s", db) + return report diff --git a/auditor/dbutils.py b/auditor/utils.py similarity index 76% rename from auditor/dbutils.py rename to auditor/utils.py index 1a4c63c..ddd771f 100644 --- a/auditor/dbutils.py +++ b/auditor/utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """ -Collection of utilities for dealing with databases +Collection of utilities """ @@ -40,3 +40,19 @@ tables = [r[0] for r in cur.fetchall()] cur.close() return tables + + +def diff_iters(iter1, iter2): + """ + Returns (iter1 - iter2, iter2 - iter1), but as lists + """ + return list(set(iter1) - set(iter2)), list(set(iter2) - set(iter1)) + + +def common_iters(iter1, iter2): + """ + Returns set(iter1) intersection set(iter2) + + :return: Items in iter1 *and* iter2 + """ + return set(iter1).intersection(iter2) diff --git a/greylisted.yaml b/greylisted.yaml index 8492f5c..8b0af0b 100644 --- a/greylisted.yaml +++ b/greylisted.yaml @@ -3,7 +3,7 @@ af_actions: whitelisted: true af_comments: - condition: af_hidden != NULL + condition: af_hidden whitelisted: false af_deleted: whitelisted: true @@ -20,7 +20,7 @@ af_id: whitelisted: true af_pattern: - condition: af_hidden != NULL + condition: af_hidden whitelisted: false af_public_comments: whitelisted: true @@ -153,7 +153,7 @@ ar_id: whitelisted: true ar_len: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_minor_edit: whitelisted: true @@ -166,22 +166,22 @@ ar_rev_id: whitelisted: true ar_sha1: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_text: whitelisted: false ar_text_id: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_timestamp: whitelisted: true ar_title: whitelisted: true ar_user: - condition: ar_deleted & 4 + condition: (ar_deleted & 4) whitelisted: false ar_user_text: - condition: ar_deleted & 4 + condition: (ar_deleted & 4) whitelisted: false archive_userindex: table_name: archive @@ -195,7 +195,7 @@ ar_id: whitelisted: true ar_len: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_minor_edit: whitelisted: true @@ -208,12 +208,12 @@ ar_rev_id: whitelisted: true ar_sha1: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_text: whitelisted: false ar_text_id: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_timestamp: whitelisted: true @@ -223,13 +223,13 @@ whitelisted: true ar_user_text: whitelisted: true - where: ar_deleted & 4 = 0 + where: ((ar_deleted & 4) = 0) filearchive: columns: fa_archive_name: whitelisted: true fa_bits: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_deleted: whitelisted: true @@ -240,32 +240,32 @@ fa_deleted_user: whitelisted: true fa_description: - condition: fa_deleted & 2 + condition: (fa_deleted & 2) whitelisted: false fa_height: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_id: whitelisted: true fa_major_mime: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_media_type: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_metadata: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_minor_mime: - whitelisted: true - fa_name: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false + fa_name: + whitelisted: true fa_sha1: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_size: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_storage_group: whitelisted: true @@ -274,13 +274,13 @@ fa_timestamp: whitelisted: true fa_user: - condition: fa_deleted & 4 + condition: (fa_deleted & 4) whitelisted: false fa_user_text: - condition: fa_deleted & 4 + condition: (fa_deleted & 4) whitelisted: false fa_width: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false filearchive_userindex: table_name: filearchive @@ -288,7 +288,7 @@ fa_archive_name: whitelisted: true fa_bits: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_deleted: whitelisted: true @@ -299,32 +299,32 @@ fa_deleted_user: whitelisted: true fa_description: - condition: fa_deleted & 2 + condition: (fa_deleted & 2) whitelisted: false fa_height: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_id: whitelisted: true fa_major_mime: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_media_type: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_metadata: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_minor_mime: - whitelisted: true - fa_name: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false + fa_name: + whitelisted: true fa_sha1: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_size: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_storage_group: whitelisted: true @@ -337,13 +337,13 @@ fa_user_text: whitelisted: true fa_width: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false - where: fa_deleted & 4 = 0 + where: ((fa_deleted & 4) = 0) ipblocks: columns: ipb_address: - condition: ipb_auto != 0 + condition: (ipb_auto <> 0) whitelisted: false ipb_allow_usertalk: whitelisted: true @@ -360,8 +360,7 @@ ipb_create_account: whitelisted: true ipb_deleted: - replacewith: 0 - whitelisted: false + whitelisted: true ipb_enable_autoblock: whitelisted: true ipb_expiry: @@ -371,10 +370,10 @@ ipb_parent_block_id: whitelisted: true ipb_range_end: - condition: ipb_auto != 0 + condition: (ipb_auto <> 0) whitelisted: false ipb_range_start: - condition: ipb_auto != 0 + condition: (ipb_auto <> 0) whitelisted: false ipb_reason: whitelisted: true @@ -382,7 +381,7 @@ whitelisted: true ipb_user: whitelisted: true - where: ipb_deleted = 0 + where: (ipb_deleted = 0) ipblocks_ipindex: table_name: ipblocks columns: @@ -403,8 +402,7 @@ ipb_create_account: whitelisted: true ipb_deleted: - replacewith: 0 - whitelisted: false + whitelisted: true ipb_enable_autoblock: whitelisted: true ipb_expiry: @@ -423,24 +421,24 @@ whitelisted: true ipb_user: whitelisted: true - where: ipb_deleted = 0 AND ipb_auto = 0 + where: ((ipb_deleted = 0) and (ipb_auto = 0)) logging: columns: log_action: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_comment: - condition: log_deleted & 2 + condition: (log_deleted & 2) whitelisted: false log_deleted: whitelisted: true log_id: whitelisted: true log_namespace: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_page: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_params: condition: log_deleted @@ -448,24 +446,24 @@ log_timestamp: whitelisted: true log_title: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_type: whitelisted: true log_user: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false log_user_text: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false - where: logtype != 'suppress' + where: (log_type <> 'suppress') logging_logindex: table_name: logging columns: log_action: whitelisted: true log_comment: - condition: log_deleted & 2 + condition: (log_deleted & 2) whitelisted: false log_deleted: whitelisted: true @@ -485,30 +483,30 @@ log_type: whitelisted: true log_user: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false log_user_text: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false - where: (log_deleted & 1) = 0 AND logtype != 'suppress' + where: (((log_deleted & 1) = 0) and (log_type <> 'suppress')) logging_userindex: table_name: logging columns: log_action: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_comment: - condition: log_deleted & 2 + condition: (log_deleted & 2) whitelisted: false log_deleted: whitelisted: true log_id: whitelisted: true log_namespace: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_page: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_params: condition: log_deleted @@ -516,7 +514,7 @@ log_timestamp: whitelisted: true log_title: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_type: whitelisted: true @@ -524,7 +522,7 @@ whitelisted: true log_user_text: whitelisted: true - where: log_deleted & 4 = 0 AND log_type != 'suppress' + where: (((log_deleted & 4) = 0) and (log_type <> 'suppress')) mark_as_helpful: columns: mah_id: @@ -558,7 +556,7 @@ oi_deleted: whitelisted: true oi_description: - condition: oi_deleted & 2 + condition: (oi_deleted & 2) whitelisted: false oi_height: whitelisted: true @@ -579,10 +577,10 @@ oi_timestamp: whitelisted: true oi_user: - condition: oi_deleted & 4 + condition: (oi_deleted & 4) whitelisted: false oi_user_text: - condition: oi_deleted & 4 + condition: (oi_deleted & 4) whitelisted: false oi_width: whitelisted: true @@ -596,7 +594,7 @@ oi_deleted: whitelisted: true oi_description: - condition: oi_deleted & 2 + condition: (oi_deleted & 2) whitelisted: false oi_height: whitelisted: true @@ -622,13 +620,13 @@ whitelisted: true oi_width: whitelisted: true - where: oi_deleted & 4 = 0 + where: ((oi_deleted & 4) = 0) recentchanges: columns: rc_bot: whitelisted: true rc_comment: - condition: rc_deleted & 2 + condition: (rc_deleted & 2) whitelisted: false rc_cur_id: whitelisted: true @@ -639,7 +637,7 @@ rc_id: whitelisted: true rc_ip: - whitelisted: true + whitelisted: false rc_last_oldid: whitelisted: true rc_log_action: @@ -673,10 +671,10 @@ rc_type: whitelisted: true rc_user: - condition: rc_deleted & 4 + condition: (rc_deleted & 4) whitelisted: false rc_user_text: - condition: rc_deleted & 4 + condition: (rc_deleted & 4) whitelisted: false recentchanges_userindex: table_name: recentchanges @@ -684,7 +682,7 @@ rc_bot: whitelisted: true rc_comment: - condition: rc_deleted & 2 + condition: (rc_deleted & 2) whitelisted: false rc_cur_id: whitelisted: true @@ -695,7 +693,7 @@ rc_id: whitelisted: true rc_ip: - whitelisted: true + whitelisted: false rc_last_oldid: whitelisted: true rc_log_action: @@ -732,11 +730,11 @@ whitelisted: true rc_user_text: whitelisted: true - where: rc_deleted & 4 = 0 + where: ((rc_deleted & 4) = 0) revision: columns: rev_comment: - condition: rev_deleted & 2 + condition: (rev_deleted & 2) whitelisted: false rev_content_format: whitelisted: true @@ -747,7 +745,7 @@ rev_id: whitelisted: true rev_len: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_minor_edit: whitelisted: true @@ -756,24 +754,24 @@ rev_parent_id: whitelisted: true rev_sha1: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_text_id: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_timestamp: whitelisted: true rev_user: - condition: rev_deleted & 4 + condition: (rev_deleted & 4) whitelisted: false rev_user_text: - condition: rev_deleted & 4 + condition: (rev_deleted & 4) whitelisted: false revision_userindex: table_name: revision columns: rev_comment: - condition: rev_deleted & 2 + condition: (rev_deleted & 2) whitelisted: false rev_content_format: whitelisted: true @@ -784,7 +782,7 @@ rev_id: whitelisted: true rev_len: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_minor_edit: whitelisted: true @@ -793,10 +791,10 @@ rev_parent_id: whitelisted: true rev_sha1: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_text_id: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_timestamp: whitelisted: true @@ -804,7 +802,7 @@ whitelisted: true rev_user_text: whitelisted: true - where: rev_deleted & 4 = 0 + where: ((rev_deleted & 4) = 0) user: columns: user_editcount: @@ -834,7 +832,7 @@ user_real_name: whitelisted: true user_registration: - whitelisted: false + whitelisted: true user_token: whitelisted: false user_touched: @@ -869,5 +867,4 @@ whitelisted: true up_value: whitelisted: true - where: up_property in ('disablemail', 'fancysig', 'gender', 'language', 'nickname', - 'skin', 'timecorrection', 'variant') + where: (up_property in ('disablemail','fancysig','gender','language','nickname','skin','timecorrection','variant')) diff --git a/requirements.txt b/requirements.txt index d6714a7..97156b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ +pyparsing==2.0.3 MySQL-python==1.2.5 PyYAML==3.11 -- To view, visit https://gerrit.wikimedia.org/r/182848 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c Gerrit-PatchSet: 8 Gerrit-Project: operations/software/labsdb-auditor Gerrit-Branch: master Gerrit-Owner: Yuvipanda <yuvipa...@gmail.com> Gerrit-Reviewer: Legoktm <legoktm.wikipe...@gmail.com> Gerrit-Reviewer: Merlijn van Deen <valhall...@arctus.nl> Gerrit-Reviewer: Yuvipanda <yuvipa...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits