Yuvipanda has uploaded a new change for review. https://gerrit.wikimedia.org/r/182848
Change subject: Add report that verifies view definitions ...................................................................... Add report that verifies view definitions - Fix greylisted.yaml to match reality - mostly human errors that were made when transcribing from the perl file - Fully paranthesize all SQL expressions in greylisted.yaml since that is how MySQL seems to give them back when asking for view definitions - Use a combination of regex (to clean up) and pyparsing to parse the SQL used to define a view and convert that into Table and Column objects - Diff the Table and Column objects from greylisted/whitelisted files vs reality and report on things that are missing Bug: T85473 Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c --- M auditor/audit.py A auditor/reports/viewdiffs.py M greylisted.yaml M requirements.txt 4 files changed, 212 insertions(+), 93 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/software/labsdb-auditor refs/changes/48/182848/1 diff --git a/auditor/audit.py b/auditor/audit.py index 958dda5..10b5371 100644 --- a/auditor/audit.py +++ b/auditor/audit.py @@ -33,6 +33,7 @@ from reports.databases import databases_report from reports.tables import extra_tables_report +from reports.viewdiffs import views_schema_diff_report argparser = argparse.ArgumentParser() @@ -55,8 +56,9 @@ rr = ReportRunner(config) -rr.register_report(databases_report) -rr.register_report(extra_tables_report) +# rr.register_report(databases_report) +# rr.register_report(extra_tables_report) +rr.register_report(views_schema_diff_report) logging.info('Starting report generation') reports = rr.run() diff --git a/auditor/reports/viewdiffs.py b/auditor/reports/viewdiffs.py new file mode 100644 index 0000000..69415a5 --- /dev/null +++ b/auditor/reports/viewdiffs.py @@ -0,0 +1,119 @@ +# Copyright 2015 Yuvi Panda <yuvipa...@gmail.com> +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +from pyparsing import OneOrMore, Optional, Word, SkipTo, StringEnd, alphanums + +from ..models import Column, Table +from ..dbutils import get_tables + + +def _diff(expected, actual, fields): + diff = {} + for field in fields: + a = expected.__getattribute__(field) + b = actual.__getattribute__(field) + if a != b: + diff[field] = { + 'expected': a, + 'found': b + } + return diff + +def diff_columns(expected, actual): + return _diff(expected, actual, ('name', 'whitelisted', 'condition', 'replacewith')) + +def diff_tables(expected, actual): + table_diff = _diff(expected, actual, ('name', 'where', 'table_name')) + missing_cols = list(set(expected.columns.keys()) - set(actual.columns.keys())) + extra_cols = list(set(actual.columns.keys()) - set(expected.columns.keys())) + common_columns = set(actual.columns.keys()).intersection(expected.columns.keys()) + column_diffs = {} + for name in common_columns: + col_diff = diff_columns(expected.columns[name], actual.columns[name]) + if col_diff: + column_diffs[name] = col_diff + + column_info = {} + if missing_cols: + column_info['missing'] = missing_cols + if extra_cols: + column_info['extra'] = extra_cols + if column_diffs: + column_info['diff'] = column_diffs + if column_info: + table_diff['columns'] = column_info + return table_diff if table_diff else None + +EXTRACT_VIEWDEF_RE = re.compile(r"DEFINER VIEW `([^`]+)` AS (.*)") +def _cleanup_viewdefiner(full_sql, db, table): + definer_sql = EXTRACT_VIEWDEF_RE.search(full_sql).groups()[1] + private_db_name = db.replace('_p', '') + sql = definer_sql.replace('`', '') \ + .replace('%s.%s.' % (private_db_name, table), '') + + return sql + +# We have to parse a specific subset of SQL, that is used to define views +# The Grammar is: +# +# sql = "SELECT" { column-defintions } "FROM" wikiname.tablename [ "WHERE" cond ] +# column-definition = column-name "AS" column-name +# | "NULL" "AS" column-name +# | "IF(" cond ", NULL, " column-name ")" "AS" column-name +# +# Where cond is a valid SQL relational expression, column-name, wikiname, tablename are strings + +identifier = alphanums + "_" +if_definition = "if(" + SkipTo(",")("condition") + "," + Word(identifier) + "," + Word(identifier) + ")" +column_definition = (if_definition ^ Word(identifier)('expression')) + "AS" + Word(identifier)("name") + Optional(",") +sql_definition = "select" + OneOrMore(column_definition)("columns") + \ + "from" + Word(identifier) + "." + Word(identifier)("tablename") + \ + Optional("where" + SkipTo(StringEnd())("where")) + StringEnd() + +def _table_from_definer(sql, tablename): + """ + Build a Table object given a cleaned up SQL statement that defines the view + """ + res = sql_definition.parseString(sql) + table = Table(tablename, {}, res.where if res.where else None, res.tablename) + for tokens, start, end in column_definition.scanString(sql): + table.add_column(Column(tokens.name, + whitelisted=tokens.condition == '' and tokens.expression != 'NULL', + condition=tokens.condition if tokens.condition else None)) + + + return table + + +def views_schema_diff_report(config, model, conn): + """ + Diff between schema of views in the public db and how they should be + """ + report = {} + cur = conn.cursor() + for db in model.public_dbs: + report_db = {} + all_tables = get_tables(conn, db) + # We want tables that are present in this db and in our model + tables = {name: model.tables[name] for name in set(all_tables).intersection(set(model.tables.keys()))} + for name, table in tables.items(): + cur.execute('SHOW CREATE VIEW %s.%s' % (db, name)) + sql = _cleanup_viewdefiner(cur.fetchone()[1], db, table.table_name) + t = _table_from_definer(sql, name) + diff = diff_tables(table, t) + if diff: + report_db[name] = diff + if report_db: + report[db] = report_db + return report diff --git a/greylisted.yaml b/greylisted.yaml index 8492f5c..8b0af0b 100644 --- a/greylisted.yaml +++ b/greylisted.yaml @@ -3,7 +3,7 @@ af_actions: whitelisted: true af_comments: - condition: af_hidden != NULL + condition: af_hidden whitelisted: false af_deleted: whitelisted: true @@ -20,7 +20,7 @@ af_id: whitelisted: true af_pattern: - condition: af_hidden != NULL + condition: af_hidden whitelisted: false af_public_comments: whitelisted: true @@ -153,7 +153,7 @@ ar_id: whitelisted: true ar_len: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_minor_edit: whitelisted: true @@ -166,22 +166,22 @@ ar_rev_id: whitelisted: true ar_sha1: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_text: whitelisted: false ar_text_id: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_timestamp: whitelisted: true ar_title: whitelisted: true ar_user: - condition: ar_deleted & 4 + condition: (ar_deleted & 4) whitelisted: false ar_user_text: - condition: ar_deleted & 4 + condition: (ar_deleted & 4) whitelisted: false archive_userindex: table_name: archive @@ -195,7 +195,7 @@ ar_id: whitelisted: true ar_len: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_minor_edit: whitelisted: true @@ -208,12 +208,12 @@ ar_rev_id: whitelisted: true ar_sha1: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_text: whitelisted: false ar_text_id: - condition: ar_deleted & 1 + condition: (ar_deleted & 1) whitelisted: false ar_timestamp: whitelisted: true @@ -223,13 +223,13 @@ whitelisted: true ar_user_text: whitelisted: true - where: ar_deleted & 4 = 0 + where: ((ar_deleted & 4) = 0) filearchive: columns: fa_archive_name: whitelisted: true fa_bits: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_deleted: whitelisted: true @@ -240,32 +240,32 @@ fa_deleted_user: whitelisted: true fa_description: - condition: fa_deleted & 2 + condition: (fa_deleted & 2) whitelisted: false fa_height: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_id: whitelisted: true fa_major_mime: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_media_type: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_metadata: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_minor_mime: - whitelisted: true - fa_name: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false + fa_name: + whitelisted: true fa_sha1: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_size: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_storage_group: whitelisted: true @@ -274,13 +274,13 @@ fa_timestamp: whitelisted: true fa_user: - condition: fa_deleted & 4 + condition: (fa_deleted & 4) whitelisted: false fa_user_text: - condition: fa_deleted & 4 + condition: (fa_deleted & 4) whitelisted: false fa_width: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false filearchive_userindex: table_name: filearchive @@ -288,7 +288,7 @@ fa_archive_name: whitelisted: true fa_bits: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_deleted: whitelisted: true @@ -299,32 +299,32 @@ fa_deleted_user: whitelisted: true fa_description: - condition: fa_deleted & 2 + condition: (fa_deleted & 2) whitelisted: false fa_height: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_id: whitelisted: true fa_major_mime: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_media_type: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_metadata: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_minor_mime: - whitelisted: true - fa_name: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false + fa_name: + whitelisted: true fa_sha1: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_size: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false fa_storage_group: whitelisted: true @@ -337,13 +337,13 @@ fa_user_text: whitelisted: true fa_width: - condition: fa_deleted & 1 + condition: (fa_deleted & 1) whitelisted: false - where: fa_deleted & 4 = 0 + where: ((fa_deleted & 4) = 0) ipblocks: columns: ipb_address: - condition: ipb_auto != 0 + condition: (ipb_auto <> 0) whitelisted: false ipb_allow_usertalk: whitelisted: true @@ -360,8 +360,7 @@ ipb_create_account: whitelisted: true ipb_deleted: - replacewith: 0 - whitelisted: false + whitelisted: true ipb_enable_autoblock: whitelisted: true ipb_expiry: @@ -371,10 +370,10 @@ ipb_parent_block_id: whitelisted: true ipb_range_end: - condition: ipb_auto != 0 + condition: (ipb_auto <> 0) whitelisted: false ipb_range_start: - condition: ipb_auto != 0 + condition: (ipb_auto <> 0) whitelisted: false ipb_reason: whitelisted: true @@ -382,7 +381,7 @@ whitelisted: true ipb_user: whitelisted: true - where: ipb_deleted = 0 + where: (ipb_deleted = 0) ipblocks_ipindex: table_name: ipblocks columns: @@ -403,8 +402,7 @@ ipb_create_account: whitelisted: true ipb_deleted: - replacewith: 0 - whitelisted: false + whitelisted: true ipb_enable_autoblock: whitelisted: true ipb_expiry: @@ -423,24 +421,24 @@ whitelisted: true ipb_user: whitelisted: true - where: ipb_deleted = 0 AND ipb_auto = 0 + where: ((ipb_deleted = 0) and (ipb_auto = 0)) logging: columns: log_action: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_comment: - condition: log_deleted & 2 + condition: (log_deleted & 2) whitelisted: false log_deleted: whitelisted: true log_id: whitelisted: true log_namespace: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_page: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_params: condition: log_deleted @@ -448,24 +446,24 @@ log_timestamp: whitelisted: true log_title: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_type: whitelisted: true log_user: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false log_user_text: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false - where: logtype != 'suppress' + where: (log_type <> 'suppress') logging_logindex: table_name: logging columns: log_action: whitelisted: true log_comment: - condition: log_deleted & 2 + condition: (log_deleted & 2) whitelisted: false log_deleted: whitelisted: true @@ -485,30 +483,30 @@ log_type: whitelisted: true log_user: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false log_user_text: - condition: log_deleted & 4 + condition: (log_deleted & 4) whitelisted: false - where: (log_deleted & 1) = 0 AND logtype != 'suppress' + where: (((log_deleted & 1) = 0) and (log_type <> 'suppress')) logging_userindex: table_name: logging columns: log_action: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_comment: - condition: log_deleted & 2 + condition: (log_deleted & 2) whitelisted: false log_deleted: whitelisted: true log_id: whitelisted: true log_namespace: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_page: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_params: condition: log_deleted @@ -516,7 +514,7 @@ log_timestamp: whitelisted: true log_title: - condition: log_deleted & 1 + condition: (log_deleted & 1) whitelisted: false log_type: whitelisted: true @@ -524,7 +522,7 @@ whitelisted: true log_user_text: whitelisted: true - where: log_deleted & 4 = 0 AND log_type != 'suppress' + where: (((log_deleted & 4) = 0) and (log_type <> 'suppress')) mark_as_helpful: columns: mah_id: @@ -558,7 +556,7 @@ oi_deleted: whitelisted: true oi_description: - condition: oi_deleted & 2 + condition: (oi_deleted & 2) whitelisted: false oi_height: whitelisted: true @@ -579,10 +577,10 @@ oi_timestamp: whitelisted: true oi_user: - condition: oi_deleted & 4 + condition: (oi_deleted & 4) whitelisted: false oi_user_text: - condition: oi_deleted & 4 + condition: (oi_deleted & 4) whitelisted: false oi_width: whitelisted: true @@ -596,7 +594,7 @@ oi_deleted: whitelisted: true oi_description: - condition: oi_deleted & 2 + condition: (oi_deleted & 2) whitelisted: false oi_height: whitelisted: true @@ -622,13 +620,13 @@ whitelisted: true oi_width: whitelisted: true - where: oi_deleted & 4 = 0 + where: ((oi_deleted & 4) = 0) recentchanges: columns: rc_bot: whitelisted: true rc_comment: - condition: rc_deleted & 2 + condition: (rc_deleted & 2) whitelisted: false rc_cur_id: whitelisted: true @@ -639,7 +637,7 @@ rc_id: whitelisted: true rc_ip: - whitelisted: true + whitelisted: false rc_last_oldid: whitelisted: true rc_log_action: @@ -673,10 +671,10 @@ rc_type: whitelisted: true rc_user: - condition: rc_deleted & 4 + condition: (rc_deleted & 4) whitelisted: false rc_user_text: - condition: rc_deleted & 4 + condition: (rc_deleted & 4) whitelisted: false recentchanges_userindex: table_name: recentchanges @@ -684,7 +682,7 @@ rc_bot: whitelisted: true rc_comment: - condition: rc_deleted & 2 + condition: (rc_deleted & 2) whitelisted: false rc_cur_id: whitelisted: true @@ -695,7 +693,7 @@ rc_id: whitelisted: true rc_ip: - whitelisted: true + whitelisted: false rc_last_oldid: whitelisted: true rc_log_action: @@ -732,11 +730,11 @@ whitelisted: true rc_user_text: whitelisted: true - where: rc_deleted & 4 = 0 + where: ((rc_deleted & 4) = 0) revision: columns: rev_comment: - condition: rev_deleted & 2 + condition: (rev_deleted & 2) whitelisted: false rev_content_format: whitelisted: true @@ -747,7 +745,7 @@ rev_id: whitelisted: true rev_len: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_minor_edit: whitelisted: true @@ -756,24 +754,24 @@ rev_parent_id: whitelisted: true rev_sha1: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_text_id: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_timestamp: whitelisted: true rev_user: - condition: rev_deleted & 4 + condition: (rev_deleted & 4) whitelisted: false rev_user_text: - condition: rev_deleted & 4 + condition: (rev_deleted & 4) whitelisted: false revision_userindex: table_name: revision columns: rev_comment: - condition: rev_deleted & 2 + condition: (rev_deleted & 2) whitelisted: false rev_content_format: whitelisted: true @@ -784,7 +782,7 @@ rev_id: whitelisted: true rev_len: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_minor_edit: whitelisted: true @@ -793,10 +791,10 @@ rev_parent_id: whitelisted: true rev_sha1: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_text_id: - condition: rev_deleted & 1 + condition: (rev_deleted & 1) whitelisted: false rev_timestamp: whitelisted: true @@ -804,7 +802,7 @@ whitelisted: true rev_user_text: whitelisted: true - where: rev_deleted & 4 = 0 + where: ((rev_deleted & 4) = 0) user: columns: user_editcount: @@ -834,7 +832,7 @@ user_real_name: whitelisted: true user_registration: - whitelisted: false + whitelisted: true user_token: whitelisted: false user_touched: @@ -869,5 +867,4 @@ whitelisted: true up_value: whitelisted: true - where: up_property in ('disablemail', 'fancysig', 'gender', 'language', 'nickname', - 'skin', 'timecorrection', 'variant') + where: (up_property in ('disablemail','fancysig','gender','language','nickname','skin','timecorrection','variant')) diff --git a/requirements.txt b/requirements.txt index d6714a7..97156b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ +pyparsing==2.0.3 MySQL-python==1.2.5 PyYAML==3.11 -- To view, visit https://gerrit.wikimedia.org/r/182848 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c Gerrit-PatchSet: 1 Gerrit-Project: operations/software/labsdb-auditor Gerrit-Branch: master Gerrit-Owner: Yuvipanda <yuvipa...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits