Milimetric has submitted this change and it was merged.
Change subject: Speed up verification with scripts
..
Speed up verification with scripts
Change-Id: I3d7fb6749632423b4b28799486f103881a1c3f73
---
M .gitignore
A scripts/test-data-loading.sql
A scripts/test-user-dimension.sql
3 files changed, 187 insertions(+), 0 deletions(-)
Approvals:
Milimetric: Verified; Looks good to me, approved
diff --git a/.gitignore b/.gitignore
index 1006ba8..5086269 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.swp
*.pyc
*.egg
*egg-info
diff --git a/scripts/test-data-loading.sql b/scripts/test-data-loading.sql
new file mode 100644
index 000..fd20322
--- /dev/null
+++ b/scripts/test-data-loading.sql
@@ -0,0 +1,74 @@
+-- period of time to check for
+set @to = '20140829000100';
+set @to = '20141001000100';
+
+set @from = '2014082900';
+set @to = '2014090300';
+
+
+-- log of problems found using this script:
+--
https://wikitech.wikimedia.org/wiki/Analytics/DataWarehouse/Requirements#Initial_Data_Verifications
+-- edit fact matches source revision and archive table
+ select etl.wiki,
+etl.k as date_with_bad_match,
+etl.value as value_in_warehouse,
+coalesce(en.value, de.value)as value_in_original_db
+
+ from (select wiki,
+date_format(time, '%Y%m%d') as k,
+count(*) as value
+ from warehouse.edit
+ where time between @from and @to
+and wiki in ('dewiki', 'enwiki')
+ group by wiki, k
+) etl
+
+left join
+
+(select 'dewiki' as wiki,
+left(time, 8) as k,
+archived,
+count(*) as value
+ from (select rev_timestamp as time,
+0 as archived
+ from dewiki.revision
+ where rev_timestamp between @from and @to
+
+ union all
+
+ select ar_timestamp as time,
+1 as archived
+ from dewiki.archive
+ where ar_timestamp between @from and @to
+) all_revisions
+ group by wiki, k
+) deon de.k = etl.k
+ and de.value <> etl.value
+ and de.wiki = etl.wiki
+
+left join
+
+(select 'enwiki' as wiki,
+left(time, 8) as k,
+archived,
+count(*) as value
+ from (select rev_timestamp as time,
+0 as archived
+ from enwiki.revision
+ where rev_timestamp between @from and @to
+
+ union all
+
+ select ar_timestamp as time,
+1 as archived
+ from enwiki.archive
+ where ar_timestamp between @from and @to
+) all_revisions
+ group by wiki, k
+) enon en.k = etl.k
+ and en.value <> etl.value
+ and en.wiki = etl.wiki
+
+ where de.value is not null
+ or en.value is not null
+;
diff --git a/scripts/test-user-dimension.sql b/scripts/test-user-dimension.sql
new file mode 100644
index 000..321b95c
--- /dev/null
+++ b/scripts/test-user-dimension.sql
@@ -0,0 +1,112 @@
+-- user dimension matches source user table
+
+ select count(*) as enwiki_users_that_should_be_bots
+ from warehouse.user
+ where user_id in (
+ select ug_user
+ from enwiki.user_groups
+ where ug_group = 'bot'
+)
+and wiki = 'enwiki'
+and in_bot_user_group = 0
+;
+
+ select count(*) as enwiki_users_that_should_not_be_bots
+ from warehouse.user
+ where user_id not in (
+ select ug_user
+ from enwiki.user_groups
+ where ug_group = 'bot'
+)
+and wiki = 'enwiki'
+and in_bot_user_group = 1
+;
+
+
+ select count(*) as dewiki_users_that_should_be_bots
+ from warehouse.user
+ where user_id in (
+ select ug_user
+ from dewiki.user_groups
+ where ug_group = 'bot'
+)
+and wiki = 'dewiki'
+and in_bot_user_group = 0
+;
+
+ select count(*) as dewiki_users_that_should_not_be_bots
+ from warehouse.user
+ where user_id not in (
+ select ug_user
+ from dewiki.user_groups
+ where ug_group = 'bot'
+)
+and wiki = 'dewiki'
+and in_bot_user_group = 1
+;
+
+
+ select count(*) as enwiki_users_that_should_be_bots
+ from warehouse.user
+ where user_id in (
+ select ug_user
+ from enwiki.user_groups
+ where ug_group = 'bot'
+)
+and wiki = 'enwiki'
+and in_bot_user_group = 0
+;
+
+ select count(*) as enwiki_users_that_should_not_be_bots
+ from warehouse.user
+ where user_id not in (
+ select