Yuvipanda has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/182848

Change subject: Add report that verifies view definitions
......................................................................

Add report that verifies view definitions

- Fix greylisted.yaml to match reality - mostly human errors
  that were made when transcribing from the perl file
- Fully paranthesize all SQL expressions in greylisted.yaml
  since that is how MySQL seems to give them back when asking
  for view definitions
- Use a combination of regex (to clean up) and pyparsing to
  parse the SQL used to define a view and convert that into
  Table and Column objects
- Diff the Table and Column objects from greylisted/whitelisted
  files vs reality and report on things that are missing

Bug: T85473

Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c
---
M auditor/audit.py
A auditor/reports/viewdiffs.py
M greylisted.yaml
M requirements.txt
4 files changed, 212 insertions(+), 93 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/software/labsdb-auditor 
refs/changes/48/182848/1

diff --git a/auditor/audit.py b/auditor/audit.py
index 958dda5..10b5371 100644
--- a/auditor/audit.py
+++ b/auditor/audit.py
@@ -33,6 +33,7 @@
 
 from reports.databases import databases_report
 from reports.tables import extra_tables_report
+from reports.viewdiffs import views_schema_diff_report
 
 
 argparser = argparse.ArgumentParser()
@@ -55,8 +56,9 @@
 
 rr = ReportRunner(config)
 
-rr.register_report(databases_report)
-rr.register_report(extra_tables_report)
+# rr.register_report(databases_report)
+# rr.register_report(extra_tables_report)
+rr.register_report(views_schema_diff_report)
 
 logging.info('Starting report generation')
 reports = rr.run()
diff --git a/auditor/reports/viewdiffs.py b/auditor/reports/viewdiffs.py
new file mode 100644
index 0000000..69415a5
--- /dev/null
+++ b/auditor/reports/viewdiffs.py
@@ -0,0 +1,119 @@
+# Copyright 2015 Yuvi Panda <yuvipa...@gmail.com>
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from pyparsing import OneOrMore, Optional, Word, SkipTo, StringEnd, alphanums
+
+from ..models import Column, Table
+from ..dbutils import get_tables
+
+
+def _diff(expected, actual, fields):
+    diff = {}
+    for field in fields:
+        a = expected.__getattribute__(field)
+        b = actual.__getattribute__(field)
+        if a != b:
+            diff[field] = {
+                'expected': a,
+                'found': b
+            }
+    return diff
+
+def diff_columns(expected, actual):
+    return _diff(expected, actual, ('name', 'whitelisted', 'condition', 
'replacewith'))
+
+def diff_tables(expected, actual):
+    table_diff = _diff(expected, actual, ('name', 'where', 'table_name'))
+    missing_cols = list(set(expected.columns.keys()) - 
set(actual.columns.keys()))
+    extra_cols = list(set(actual.columns.keys()) - 
set(expected.columns.keys()))
+    common_columns = 
set(actual.columns.keys()).intersection(expected.columns.keys())
+    column_diffs = {}
+    for name in common_columns:
+        col_diff = diff_columns(expected.columns[name], actual.columns[name])
+        if col_diff:
+            column_diffs[name] = col_diff
+
+    column_info = {}
+    if missing_cols:
+        column_info['missing'] = missing_cols
+    if extra_cols:
+        column_info['extra'] = extra_cols
+    if column_diffs:
+        column_info['diff'] = column_diffs
+    if column_info:
+        table_diff['columns'] = column_info
+    return table_diff if table_diff else None
+
+EXTRACT_VIEWDEF_RE = re.compile(r"DEFINER VIEW `([^`]+)` AS (.*)")
+def _cleanup_viewdefiner(full_sql, db, table):
+    definer_sql = EXTRACT_VIEWDEF_RE.search(full_sql).groups()[1]
+    private_db_name = db.replace('_p', '')
+    sql = definer_sql.replace('`', '') \
+        .replace('%s.%s.' % (private_db_name, table), '')
+
+    return sql
+
+# We have to parse a specific subset of SQL, that is used to define views
+# The Grammar is:
+#
+# sql = "SELECT" { column-defintions } "FROM" wikiname.tablename [ "WHERE" 
cond ]
+# column-definition = column-name "AS" column-name
+#                     | "NULL" "AS" column-name
+#                     | "IF(" cond ", NULL, " column-name ")" "AS" column-name
+#
+# Where cond is a valid SQL relational expression, column-name, wikiname, 
tablename are strings
+
+identifier = alphanums + "_"
+if_definition = "if(" + SkipTo(",")("condition") + "," + Word(identifier) + 
"," + Word(identifier) + ")"
+column_definition = (if_definition ^ Word(identifier)('expression')) + "AS" + 
Word(identifier)("name") + Optional(",")
+sql_definition = "select" + OneOrMore(column_definition)("columns") + \
+                 "from" + Word(identifier) + "." + 
Word(identifier)("tablename") + \
+                 Optional("where" + SkipTo(StringEnd())("where")) + StringEnd()
+
+def _table_from_definer(sql, tablename):
+    """
+    Build a Table object given a cleaned up SQL statement that defines the view
+    """
+    res = sql_definition.parseString(sql)
+    table = Table(tablename, {}, res.where if res.where else None, 
res.tablename)
+    for tokens, start, end in column_definition.scanString(sql):
+        table.add_column(Column(tokens.name,
+                                whitelisted=tokens.condition == '' and 
tokens.expression != 'NULL',
+                                condition=tokens.condition if tokens.condition 
else None))
+
+
+    return table
+
+
+def views_schema_diff_report(config, model, conn):
+    """
+    Diff between schema of views in the public db and how they should be
+    """
+    report = {}
+    cur = conn.cursor()
+    for db in model.public_dbs:
+        report_db = {}
+        all_tables = get_tables(conn, db)
+        # We want tables that are present in this db and in our model
+        tables = {name: model.tables[name] for name in 
set(all_tables).intersection(set(model.tables.keys()))}
+        for name, table in tables.items():
+            cur.execute('SHOW CREATE VIEW %s.%s' % (db, name))
+            sql = _cleanup_viewdefiner(cur.fetchone()[1], db, table.table_name)
+            t = _table_from_definer(sql, name)
+            diff = diff_tables(table, t)
+            if diff:
+                report_db[name] = diff
+        if report_db:
+            report[db] = report_db
+    return report
diff --git a/greylisted.yaml b/greylisted.yaml
index 8492f5c..8b0af0b 100644
--- a/greylisted.yaml
+++ b/greylisted.yaml
@@ -3,7 +3,7 @@
     af_actions:
       whitelisted: true
     af_comments:
-      condition: af_hidden != NULL
+      condition: af_hidden
       whitelisted: false
     af_deleted:
       whitelisted: true
@@ -20,7 +20,7 @@
     af_id:
       whitelisted: true
     af_pattern:
-      condition: af_hidden != NULL
+      condition: af_hidden
       whitelisted: false
     af_public_comments:
       whitelisted: true
@@ -153,7 +153,7 @@
     ar_id:
       whitelisted: true
     ar_len:
-      condition: ar_deleted & 1
+      condition: (ar_deleted & 1)
       whitelisted: false
     ar_minor_edit:
       whitelisted: true
@@ -166,22 +166,22 @@
     ar_rev_id:
       whitelisted: true
     ar_sha1:
-      condition: ar_deleted & 1
+      condition: (ar_deleted & 1)
       whitelisted: false
     ar_text:
       whitelisted: false
     ar_text_id:
-      condition: ar_deleted & 1
+      condition: (ar_deleted & 1)
       whitelisted: false
     ar_timestamp:
       whitelisted: true
     ar_title:
       whitelisted: true
     ar_user:
-      condition: ar_deleted & 4
+      condition: (ar_deleted & 4)
       whitelisted: false
     ar_user_text:
-      condition: ar_deleted & 4
+      condition: (ar_deleted & 4)
       whitelisted: false
 archive_userindex:
   table_name: archive
@@ -195,7 +195,7 @@
     ar_id:
       whitelisted: true
     ar_len:
-      condition: ar_deleted & 1
+      condition: (ar_deleted & 1)
       whitelisted: false
     ar_minor_edit:
       whitelisted: true
@@ -208,12 +208,12 @@
     ar_rev_id:
       whitelisted: true
     ar_sha1:
-      condition: ar_deleted & 1
+      condition: (ar_deleted & 1)
       whitelisted: false
     ar_text:
       whitelisted: false
     ar_text_id:
-      condition: ar_deleted & 1
+      condition: (ar_deleted & 1)
       whitelisted: false
     ar_timestamp:
       whitelisted: true
@@ -223,13 +223,13 @@
       whitelisted: true
     ar_user_text:
       whitelisted: true
-  where: ar_deleted & 4 = 0
+  where: ((ar_deleted & 4) = 0)
 filearchive:
   columns:
     fa_archive_name:
       whitelisted: true
     fa_bits:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_deleted:
       whitelisted: true
@@ -240,32 +240,32 @@
     fa_deleted_user:
       whitelisted: true
     fa_description:
-      condition: fa_deleted & 2
+      condition: (fa_deleted & 2)
       whitelisted: false
     fa_height:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_id:
       whitelisted: true
     fa_major_mime:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_media_type:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_metadata:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_minor_mime:
-      whitelisted: true
-    fa_name:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
+    fa_name:
+      whitelisted: true
     fa_sha1:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_size:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_storage_group:
       whitelisted: true
@@ -274,13 +274,13 @@
     fa_timestamp:
       whitelisted: true
     fa_user:
-      condition: fa_deleted & 4
+      condition: (fa_deleted & 4)
       whitelisted: false
     fa_user_text:
-      condition: fa_deleted & 4
+      condition: (fa_deleted & 4)
       whitelisted: false
     fa_width:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
 filearchive_userindex:
   table_name: filearchive
@@ -288,7 +288,7 @@
     fa_archive_name:
       whitelisted: true
     fa_bits:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_deleted:
       whitelisted: true
@@ -299,32 +299,32 @@
     fa_deleted_user:
       whitelisted: true
     fa_description:
-      condition: fa_deleted & 2
+      condition: (fa_deleted & 2)
       whitelisted: false
     fa_height:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_id:
       whitelisted: true
     fa_major_mime:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_media_type:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_metadata:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_minor_mime:
-      whitelisted: true
-    fa_name:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
+    fa_name:
+      whitelisted: true
     fa_sha1:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_size:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
     fa_storage_group:
       whitelisted: true
@@ -337,13 +337,13 @@
     fa_user_text:
       whitelisted: true
     fa_width:
-      condition: fa_deleted & 1
+      condition: (fa_deleted & 1)
       whitelisted: false
-  where: fa_deleted & 4 = 0
+  where: ((fa_deleted & 4) = 0)
 ipblocks:
   columns:
     ipb_address:
-      condition: ipb_auto != 0
+      condition: (ipb_auto <> 0)
       whitelisted: false
     ipb_allow_usertalk:
       whitelisted: true
@@ -360,8 +360,7 @@
     ipb_create_account:
       whitelisted: true
     ipb_deleted:
-      replacewith: 0
-      whitelisted: false
+      whitelisted: true
     ipb_enable_autoblock:
       whitelisted: true
     ipb_expiry:
@@ -371,10 +370,10 @@
     ipb_parent_block_id:
       whitelisted: true
     ipb_range_end:
-      condition: ipb_auto != 0
+      condition: (ipb_auto <> 0)
       whitelisted: false
     ipb_range_start:
-      condition: ipb_auto != 0
+      condition: (ipb_auto <> 0)
       whitelisted: false
     ipb_reason:
       whitelisted: true
@@ -382,7 +381,7 @@
       whitelisted: true
     ipb_user:
       whitelisted: true
-  where: ipb_deleted = 0
+  where: (ipb_deleted = 0)
 ipblocks_ipindex:
   table_name: ipblocks
   columns:
@@ -403,8 +402,7 @@
     ipb_create_account:
       whitelisted: true
     ipb_deleted:
-      replacewith: 0
-      whitelisted: false
+      whitelisted: true
     ipb_enable_autoblock:
       whitelisted: true
     ipb_expiry:
@@ -423,24 +421,24 @@
       whitelisted: true
     ipb_user:
       whitelisted: true
-  where: ipb_deleted = 0 AND ipb_auto = 0
+  where: ((ipb_deleted = 0) and (ipb_auto = 0))
 logging:
   columns:
     log_action:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_comment:
-      condition: log_deleted & 2
+      condition: (log_deleted & 2)
       whitelisted: false
     log_deleted:
       whitelisted: true
     log_id:
       whitelisted: true
     log_namespace:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_page:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_params:
       condition: log_deleted
@@ -448,24 +446,24 @@
     log_timestamp:
       whitelisted: true
     log_title:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_type:
       whitelisted: true
     log_user:
-      condition: log_deleted & 4
+      condition: (log_deleted & 4)
       whitelisted: false
     log_user_text:
-      condition: log_deleted & 4
+      condition: (log_deleted & 4)
       whitelisted: false
-  where: logtype != 'suppress'
+  where: (log_type <> 'suppress')
 logging_logindex:
   table_name: logging
   columns:
     log_action:
       whitelisted: true
     log_comment:
-      condition: log_deleted & 2
+      condition: (log_deleted & 2)
       whitelisted: false
     log_deleted:
       whitelisted: true
@@ -485,30 +483,30 @@
     log_type:
       whitelisted: true
     log_user:
-      condition: log_deleted & 4
+      condition: (log_deleted & 4)
       whitelisted: false
     log_user_text:
-      condition: log_deleted & 4
+      condition: (log_deleted & 4)
       whitelisted: false
-  where: (log_deleted & 1) = 0 AND logtype != 'suppress'
+  where: (((log_deleted & 1) = 0) and (log_type <> 'suppress'))
 logging_userindex:
   table_name: logging
   columns:
     log_action:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_comment:
-      condition: log_deleted & 2
+      condition: (log_deleted & 2)
       whitelisted: false
     log_deleted:
       whitelisted: true
     log_id:
       whitelisted: true
     log_namespace:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_page:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_params:
       condition: log_deleted
@@ -516,7 +514,7 @@
     log_timestamp:
       whitelisted: true
     log_title:
-      condition: log_deleted & 1
+      condition: (log_deleted & 1)
       whitelisted: false
     log_type:
       whitelisted: true
@@ -524,7 +522,7 @@
       whitelisted: true
     log_user_text:
       whitelisted: true
-  where: log_deleted & 4 = 0 AND log_type != 'suppress'
+  where: (((log_deleted & 4) = 0) and (log_type <> 'suppress'))
 mark_as_helpful:
   columns:
     mah_id:
@@ -558,7 +556,7 @@
     oi_deleted:
       whitelisted: true
     oi_description:
-      condition: oi_deleted & 2
+      condition: (oi_deleted & 2)
       whitelisted: false
     oi_height:
       whitelisted: true
@@ -579,10 +577,10 @@
     oi_timestamp:
       whitelisted: true
     oi_user:
-      condition: oi_deleted & 4
+      condition: (oi_deleted & 4)
       whitelisted: false
     oi_user_text:
-      condition: oi_deleted & 4
+      condition: (oi_deleted & 4)
       whitelisted: false
     oi_width:
       whitelisted: true
@@ -596,7 +594,7 @@
     oi_deleted:
       whitelisted: true
     oi_description:
-      condition: oi_deleted & 2
+      condition: (oi_deleted & 2)
       whitelisted: false
     oi_height:
       whitelisted: true
@@ -622,13 +620,13 @@
       whitelisted: true
     oi_width:
       whitelisted: true
-  where: oi_deleted & 4 = 0
+  where: ((oi_deleted & 4) = 0)
 recentchanges:
   columns:
     rc_bot:
       whitelisted: true
     rc_comment:
-      condition: rc_deleted & 2
+      condition: (rc_deleted & 2)
       whitelisted: false
     rc_cur_id:
       whitelisted: true
@@ -639,7 +637,7 @@
     rc_id:
       whitelisted: true
     rc_ip:
-      whitelisted: true
+      whitelisted: false
     rc_last_oldid:
       whitelisted: true
     rc_log_action:
@@ -673,10 +671,10 @@
     rc_type:
       whitelisted: true
     rc_user:
-      condition: rc_deleted & 4
+      condition: (rc_deleted & 4)
       whitelisted: false
     rc_user_text:
-      condition: rc_deleted & 4
+      condition: (rc_deleted & 4)
       whitelisted: false
 recentchanges_userindex:
   table_name: recentchanges
@@ -684,7 +682,7 @@
     rc_bot:
       whitelisted: true
     rc_comment:
-      condition: rc_deleted & 2
+      condition: (rc_deleted & 2)
       whitelisted: false
     rc_cur_id:
       whitelisted: true
@@ -695,7 +693,7 @@
     rc_id:
       whitelisted: true
     rc_ip:
-      whitelisted: true
+      whitelisted: false
     rc_last_oldid:
       whitelisted: true
     rc_log_action:
@@ -732,11 +730,11 @@
       whitelisted: true
     rc_user_text:
       whitelisted: true
-  where: rc_deleted & 4 = 0
+  where: ((rc_deleted & 4) = 0)
 revision:
   columns:
     rev_comment:
-      condition: rev_deleted & 2
+      condition: (rev_deleted & 2)
       whitelisted: false
     rev_content_format:
       whitelisted: true
@@ -747,7 +745,7 @@
     rev_id:
       whitelisted: true
     rev_len:
-      condition: rev_deleted & 1
+      condition: (rev_deleted & 1)
       whitelisted: false
     rev_minor_edit:
       whitelisted: true
@@ -756,24 +754,24 @@
     rev_parent_id:
       whitelisted: true
     rev_sha1:
-      condition: rev_deleted & 1
+      condition: (rev_deleted & 1)
       whitelisted: false
     rev_text_id:
-      condition: rev_deleted & 1
+      condition: (rev_deleted & 1)
       whitelisted: false
     rev_timestamp:
       whitelisted: true
     rev_user:
-      condition: rev_deleted & 4
+      condition: (rev_deleted & 4)
       whitelisted: false
     rev_user_text:
-      condition: rev_deleted & 4
+      condition: (rev_deleted & 4)
       whitelisted: false
 revision_userindex:
   table_name: revision
   columns:
     rev_comment:
-      condition: rev_deleted & 2
+      condition: (rev_deleted & 2)
       whitelisted: false
     rev_content_format:
       whitelisted: true
@@ -784,7 +782,7 @@
     rev_id:
       whitelisted: true
     rev_len:
-      condition: rev_deleted & 1
+      condition: (rev_deleted & 1)
       whitelisted: false
     rev_minor_edit:
       whitelisted: true
@@ -793,10 +791,10 @@
     rev_parent_id:
       whitelisted: true
     rev_sha1:
-      condition: rev_deleted & 1
+      condition: (rev_deleted & 1)
       whitelisted: false
     rev_text_id:
-      condition: rev_deleted & 1
+      condition: (rev_deleted & 1)
       whitelisted: false
     rev_timestamp:
       whitelisted: true
@@ -804,7 +802,7 @@
       whitelisted: true
     rev_user_text:
       whitelisted: true
-  where: rev_deleted & 4 = 0
+  where: ((rev_deleted & 4) = 0)
 user:
   columns:
     user_editcount:
@@ -834,7 +832,7 @@
     user_real_name:
       whitelisted: true
     user_registration:
-      whitelisted: false
+      whitelisted: true
     user_token:
       whitelisted: false
     user_touched:
@@ -869,5 +867,4 @@
       whitelisted: true
     up_value:
       whitelisted: true
-  where: up_property in ('disablemail', 'fancysig', 'gender', 'language', 
'nickname',
-    'skin', 'timecorrection', 'variant')
+  where: (up_property in 
('disablemail','fancysig','gender','language','nickname','skin','timecorrection','variant'))
diff --git a/requirements.txt b/requirements.txt
index d6714a7..97156b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
+pyparsing==2.0.3
 MySQL-python==1.2.5
 PyYAML==3.11

-- 
To view, visit https://gerrit.wikimedia.org/r/182848
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c
Gerrit-PatchSet: 1
Gerrit-Project: operations/software/labsdb-auditor
Gerrit-Branch: master
Gerrit-Owner: Yuvipanda <yuvipa...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to