Milimetric has submitted this change and it was merged. Change subject: Speed up verification with scripts ......................................................................
Speed up verification with scripts Change-Id: I3d7fb6749632423b4b28799486f103881a1c3f73 --- M .gitignore A scripts/test-data-loading.sql A scripts/test-user-dimension.sql 3 files changed, 187 insertions(+), 0 deletions(-) Approvals: Milimetric: Verified; Looks good to me, approved diff --git a/.gitignore b/.gitignore index 1006ba8..5086269 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +*.swp *.pyc *.egg *egg-info diff --git a/scripts/test-data-loading.sql b/scripts/test-data-loading.sql new file mode 100644 index 0000000..fd20322 --- /dev/null +++ b/scripts/test-data-loading.sql @@ -0,0 +1,74 @@ +-- period of time to check for +set @to = '20140829000100'; +set @to = '20141001000100'; + +set @from = '20140829000000'; +set @to = '20140903000000'; + + +-- log of problems found using this script: +-- https://wikitech.wikimedia.org/wiki/Analytics/DataWarehouse/Requirements#Initial_Data_Verifications +-- edit fact matches source revision and archive table + select etl.wiki, + etl.k as date_with_bad_match, + etl.value as value_in_warehouse, + coalesce(en.value, de.value) as value_in_original_db + + from (select wiki, + date_format(time, '%Y%m%d') as k, + count(*) as value + from warehouse.edit + where time between @from and @to + and wiki in ('dewiki', 'enwiki') + group by wiki, k + ) etl + + left join + + (select 'dewiki' as wiki, + left(time, 8) as k, + archived, + count(*) as value + from (select rev_timestamp as time, + 0 as archived + from dewiki.revision + where rev_timestamp between @from and @to + + union all + + select ar_timestamp as time, + 1 as archived + from dewiki.archive + where ar_timestamp between @from and @to + ) all_revisions + group by wiki, k + ) de on de.k = etl.k + and de.value <> etl.value + and de.wiki = etl.wiki + + left join + + (select 'enwiki' as wiki, + left(time, 8) as k, + archived, + count(*) as value + from (select rev_timestamp as time, + 0 as archived + from enwiki.revision + where rev_timestamp between @from and @to + + union all + + select ar_timestamp as time, + 1 as archived + from enwiki.archive + where ar_timestamp between @from and @to + ) all_revisions + group by wiki, k + ) en on en.k = etl.k + and en.value <> etl.value + and en.wiki = etl.wiki + + where de.value is not null + or en.value is not null +; diff --git a/scripts/test-user-dimension.sql b/scripts/test-user-dimension.sql new file mode 100644 index 0000000..321b95c --- /dev/null +++ b/scripts/test-user-dimension.sql @@ -0,0 +1,112 @@ +-- user dimension matches source user table + + select count(*) as enwiki_users_that_should_be_bots + from warehouse.user + where user_id in ( + select ug_user + from enwiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'enwiki' + and in_bot_user_group = 0 +; + + select count(*) as enwiki_users_that_should_not_be_bots + from warehouse.user + where user_id not in ( + select ug_user + from enwiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'enwiki' + and in_bot_user_group = 1 +; + + + select count(*) as dewiki_users_that_should_be_bots + from warehouse.user + where user_id in ( + select ug_user + from dewiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'dewiki' + and in_bot_user_group = 0 +; + + select count(*) as dewiki_users_that_should_not_be_bots + from warehouse.user + where user_id not in ( + select ug_user + from dewiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'dewiki' + and in_bot_user_group = 1 +; + + + select count(*) as enwiki_users_that_should_be_bots + from warehouse.user + where user_id in ( + select ug_user + from enwiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'enwiki' + and in_bot_user_group = 0 +; + + select count(*) as enwiki_users_that_should_not_be_bots + from warehouse.user + where user_id not in ( + select ug_user + from enwiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'enwiki' + and in_bot_user_group = 1 +; + + + select count(*) as dewiki_users_that_should_be_bots + from warehouse.user + where user_id in ( + select ug_user + from dewiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'dewiki' + and in_bot_user_group = 0 +; + + select count(*) as dewiki_users_that_should_not_be_bots + from warehouse.user + where user_id not in ( + select ug_user + from dewiki.user_groups + where ug_group = 'bot' + ) + and wiki = 'dewiki' + and in_bot_user_group = 1 +; + + +-- registration type checks + select count(*) as wrong_action + from warehouse.user + left join + enwiki.logging on log_user = user_id + and log_type = 'newusers' + where wiki = 'enwiki' + and registration_type <> log_action +; + + select count(*) as wrong_action + from warehouse.user + left join + dewiki.logging on log_user = user_id + and log_type = 'newusers' + where wiki = 'dewiki' + and registration_type <> log_action +; -- To view, visit https://gerrit.wikimedia.org/r/185239 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I3d7fb6749632423b4b28799486f103881a1c3f73 Gerrit-PatchSet: 3 Gerrit-Project: analytics/data-warehouse Gerrit-Branch: master Gerrit-Owner: Milimetric <dandree...@wikimedia.org> Gerrit-Reviewer: Aklapper <aklap...@wikimedia.org> Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org> Gerrit-Reviewer: Springle <sprin...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits