[MediaWiki-commits] [Gerrit] Add report that verifies view definitions - change (operations...labsdb-auditor)

2015-01-10 Thread Merlijn van Deen (Code Review)
Merlijn van Deen has submitted this change and it was merged.

Change subject: Add report that verifies view definitions
..


Add report that verifies view definitions

- Diff the Table and Column objects from greylisted/whitelisted
  files vs reality and report on things that are missing
- Use a combination of regex (to clean up) and pyparsing to
  parse the SQL used to define a view and convert that into
  Table and Column objects
- Fix greylisted.yaml to match reality - mostly human errors
  that were made when transcribing from the perl file
- Fully paranthesize all SQL expressions in greylisted.yaml
  since that is how MySQL seems to give them back when asking
  for view definitions
- Add diff_iters and common_iters helpers to do set operations
  on lists. We want to keep them as lists since if we output
  them as sets to YAML then YAML doesn't output them cleanly
  and adds a python set decorator

Bug: T85473

Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c
---
M auditor/audit.py
M auditor/models.py
M auditor/reports/databases.py
M auditor/reports/tables.py
A auditor/reports/viewdiffs.py
R auditor/utils.py
M greylisted.yaml
M requirements.txt
8 files changed, 257 insertions(+), 99 deletions(-)

Approvals:
  Merlijn van Deen: Verified; Looks good to me, approved



diff --git a/auditor/audit.py b/auditor/audit.py
index 958dda5..c3d6a6b 100644
--- a/auditor/audit.py
+++ b/auditor/audit.py
@@ -33,6 +33,7 @@
 
 from reports.databases import databases_report
 from reports.tables import extra_tables_report
+from reports.viewdiffs import views_schema_diff_report
 
 
 argparser = argparse.ArgumentParser()
@@ -57,6 +58,7 @@
 
 rr.register_report(databases_report)
 rr.register_report(extra_tables_report)
+rr.register_report(views_schema_diff_report)
 
 logging.info('Starting report generation')
 reports = rr.run()
diff --git a/auditor/models.py b/auditor/models.py
index 38d37b4..af4131c 100644
--- a/auditor/models.py
+++ b/auditor/models.py
@@ -17,6 +17,8 @@
 import yaml
 import os
 
+from utils import diff_iters
+
 
 class Column(object):
 
@@ -87,7 +89,11 @@
 else:
 # Greylisted table!
 for colname, coldata in tabledata['columns'].items():
-table.add_column(Column(colname, coldata.get('whitelisted'), 
coldata.get('condition'), coldata.get('replacewith')))
+col = Column(colname,
+ whitelisted=coldata.get('whitelisted'),
+ condition=coldata.get('condition'),
+ replacewith=coldata.get('replacewith'))
+table.add_column(col)
 return table
 
 
@@ -127,6 +133,6 @@
 with open(all_dblist_path) as all_file, open(private_dblist_path) as 
priv_file:
 all_dbs = [l.strip() for l in all_file.readlines()]
 priv_dbs = [l.strip() for l in priv_file.readlines()]
-dbs = list(set(all_dbs) - (set(priv_dbs)))
+dbs, _ = diff_iters(all_dbs, priv_dbs)
 
 return cls(dbs, tables)
diff --git a/auditor/reports/databases.py b/auditor/reports/databases.py
index c2ee418..4e64f4a 100644
--- a/auditor/reports/databases.py
+++ b/auditor/reports/databases.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import re
 
-from ..dbutils import get_databases
+from ..utils import get_databases
 
 
 def databases_report(config, model, conn):
diff --git a/auditor/reports/tables.py b/auditor/reports/tables.py
index e4309d6..ab35bce 100644
--- a/auditor/reports/tables.py
+++ b/auditor/reports/tables.py
@@ -14,7 +14,7 @@
 import logging
 import MySQLdb
 
-from ..dbutils import get_tables
+from ..utils import get_tables, diff_iters
 
 
 def _get_extra_tables(model, conn, dbs):
@@ -22,7 +22,7 @@
 for db in dbs:
 logging.debug('Finding extra tables on %s', db)
 try:
-diff = set(get_tables(conn, db)) - set(model.tables.keys())
+extra_tables, _ = diff_iters(get_tables(conn, db), model.tables)
 except MySQLdb.OperationalError as e:
 # Ignore missing database errors (cod 1049).
 # Not using MySQLdb.constants.ER because it is stupid and needs to 
be explicitly imported to use.
@@ -31,8 +31,8 @@
 continue  # We have other reports taking care of unknown dbs
 else:
 raise
-if diff:
-for tablename in diff:
+if extra_tables:
+for tablename in extra_tables:
 if tablename in extra_tables:
 extra_tables[tablename].append(db)
 else:
diff --git a/auditor/reports/viewdiffs.py b/auditor/reports/viewdiffs.py
new file mode 100644
index 000..e9e91d0
--- /dev/null
+++ b/auditor/reports/viewdiffs.py
@@ -0,0 +1,136 @@
+# Copyright 2015 Yuvi Panda yuvipa...@gmail.com
+#
+# Licensed under the Apache License, Version 2.0 (the License);
+# you may not use this 

[MediaWiki-commits] [Gerrit] Add report that verifies view definitions - change (operations...labsdb-auditor)

2015-01-05 Thread Yuvipanda (Code Review)
Yuvipanda has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/182848

Change subject: Add report that verifies view definitions
..

Add report that verifies view definitions

- Fix greylisted.yaml to match reality - mostly human errors
  that were made when transcribing from the perl file
- Fully paranthesize all SQL expressions in greylisted.yaml
  since that is how MySQL seems to give them back when asking
  for view definitions
- Use a combination of regex (to clean up) and pyparsing to
  parse the SQL used to define a view and convert that into
  Table and Column objects
- Diff the Table and Column objects from greylisted/whitelisted
  files vs reality and report on things that are missing

Bug: T85473

Change-Id: I2e8896fd55a75fa7f6ae1a63bdbb6c04e47bff5c
---
M auditor/audit.py
A auditor/reports/viewdiffs.py
M greylisted.yaml
M requirements.txt
4 files changed, 212 insertions(+), 93 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/software/labsdb-auditor 
refs/changes/48/182848/1

diff --git a/auditor/audit.py b/auditor/audit.py
index 958dda5..10b5371 100644
--- a/auditor/audit.py
+++ b/auditor/audit.py
@@ -33,6 +33,7 @@
 
 from reports.databases import databases_report
 from reports.tables import extra_tables_report
+from reports.viewdiffs import views_schema_diff_report
 
 
 argparser = argparse.ArgumentParser()
@@ -55,8 +56,9 @@
 
 rr = ReportRunner(config)
 
-rr.register_report(databases_report)
-rr.register_report(extra_tables_report)
+# rr.register_report(databases_report)
+# rr.register_report(extra_tables_report)
+rr.register_report(views_schema_diff_report)
 
 logging.info('Starting report generation')
 reports = rr.run()
diff --git a/auditor/reports/viewdiffs.py b/auditor/reports/viewdiffs.py
new file mode 100644
index 000..69415a5
--- /dev/null
+++ b/auditor/reports/viewdiffs.py
@@ -0,0 +1,119 @@
+# Copyright 2015 Yuvi Panda yuvipa...@gmail.com
+#
+# Licensed under the Apache License, Version 2.0 (the License);
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from pyparsing import OneOrMore, Optional, Word, SkipTo, StringEnd, alphanums
+
+from ..models import Column, Table
+from ..dbutils import get_tables
+
+
+def _diff(expected, actual, fields):
+diff = {}
+for field in fields:
+a = expected.__getattribute__(field)
+b = actual.__getattribute__(field)
+if a != b:
+diff[field] = {
+'expected': a,
+'found': b
+}
+return diff
+
+def diff_columns(expected, actual):
+return _diff(expected, actual, ('name', 'whitelisted', 'condition', 
'replacewith'))
+
+def diff_tables(expected, actual):
+table_diff = _diff(expected, actual, ('name', 'where', 'table_name'))
+missing_cols = list(set(expected.columns.keys()) - 
set(actual.columns.keys()))
+extra_cols = list(set(actual.columns.keys()) - 
set(expected.columns.keys()))
+common_columns = 
set(actual.columns.keys()).intersection(expected.columns.keys())
+column_diffs = {}
+for name in common_columns:
+col_diff = diff_columns(expected.columns[name], actual.columns[name])
+if col_diff:
+column_diffs[name] = col_diff
+
+column_info = {}
+if missing_cols:
+column_info['missing'] = missing_cols
+if extra_cols:
+column_info['extra'] = extra_cols
+if column_diffs:
+column_info['diff'] = column_diffs
+if column_info:
+table_diff['columns'] = column_info
+return table_diff if table_diff else None
+
+EXTRACT_VIEWDEF_RE = re.compile(rDEFINER VIEW `([^`]+)` AS (.*))
+def _cleanup_viewdefiner(full_sql, db, table):
+definer_sql = EXTRACT_VIEWDEF_RE.search(full_sql).groups()[1]
+private_db_name = db.replace('_p', '')
+sql = definer_sql.replace('`', '') \
+.replace('%s.%s.' % (private_db_name, table), '')
+
+return sql
+
+# We have to parse a specific subset of SQL, that is used to define views
+# The Grammar is:
+#
+# sql = SELECT { column-defintions } FROM wikiname.tablename [ WHERE 
cond ]
+# column-definition = column-name AS column-name
+# | NULL AS column-name
+# | IF( cond , NULL,  column-name ) AS column-name
+#
+# Where cond is a valid SQL relational expression, column-name, wikiname, 
tablename are strings
+
+identifier = alphanums + _
+if_definition = if( + SkipTo(,)(condition) + , + Word(identifier) + 
, + Word(identifier) + )