Milimetric has submitted this change and it was merged.

Change subject: Speed up verification with scripts
......................................................................


Speed up verification with scripts

Change-Id: I3d7fb6749632423b4b28799486f103881a1c3f73
---
M .gitignore
A scripts/test-data-loading.sql
A scripts/test-user-dimension.sql
3 files changed, 187 insertions(+), 0 deletions(-)

Approvals:
  Milimetric: Verified; Looks good to me, approved



diff --git a/.gitignore b/.gitignore
index 1006ba8..5086269 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*.swp
 *.pyc
 *.egg
 *egg-info
diff --git a/scripts/test-data-loading.sql b/scripts/test-data-loading.sql
new file mode 100644
index 0000000..fd20322
--- /dev/null
+++ b/scripts/test-data-loading.sql
@@ -0,0 +1,74 @@
+-- period of time to check for
+set @to     = '20140829000100';
+set @to     = '20141001000100';
+
+set @from   = '20140829000000';
+set @to     = '20140903000000';
+
+
+-- log of problems found using this script:
+--   
https://wikitech.wikimedia.org/wiki/Analytics/DataWarehouse/Requirements#Initial_Data_Verifications
+-- edit fact matches source revision and archive table
+ select etl.wiki,
+        etl.k                           as date_with_bad_match,
+        etl.value                       as value_in_warehouse,
+        coalesce(en.value, de.value)    as value_in_original_db
+
+   from (select wiki,
+                date_format(time, '%Y%m%d') as k,
+                count(*) as value
+           from warehouse.edit
+          where time between @from and @to
+            and wiki in ('dewiki', 'enwiki')
+          group by wiki, k
+        ) etl
+
+            left join
+
+        (select 'dewiki' as wiki,
+                left(time, 8) as k,
+                archived,
+                count(*) as value
+           from (select rev_timestamp as time,
+                        0 as archived
+                   from dewiki.revision
+                  where rev_timestamp between @from and @to
+
+                  union all
+
+                 select ar_timestamp as time,
+                        1 as archived
+                   from dewiki.archive
+                  where ar_timestamp between @from and @to
+                ) all_revisions
+          group by wiki, k
+        ) de            on de.k = etl.k
+                       and de.value <> etl.value
+                       and de.wiki = etl.wiki
+
+            left join
+
+        (select 'enwiki' as wiki,
+                left(time, 8) as k,
+                archived,
+                count(*) as value
+           from (select rev_timestamp as time,
+                        0 as archived
+                   from enwiki.revision
+                  where rev_timestamp between @from and @to
+
+                  union all
+
+                 select ar_timestamp as time,
+                        1 as archived
+                   from enwiki.archive
+                  where ar_timestamp between @from and @to
+                ) all_revisions
+          group by wiki, k
+        ) en            on en.k = etl.k
+                       and en.value <> etl.value
+                       and en.wiki = etl.wiki
+
+  where de.value is not null
+     or en.value is not null
+;
diff --git a/scripts/test-user-dimension.sql b/scripts/test-user-dimension.sql
new file mode 100644
index 0000000..321b95c
--- /dev/null
+++ b/scripts/test-user-dimension.sql
@@ -0,0 +1,112 @@
+-- user dimension matches source user table
+
+ select count(*) as enwiki_users_that_should_be_bots
+   from warehouse.user
+  where user_id in (
+         select ug_user
+           from enwiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'enwiki'
+    and in_bot_user_group = 0
+;
+
+ select count(*) as enwiki_users_that_should_not_be_bots
+   from warehouse.user
+  where user_id not in (
+         select ug_user
+           from enwiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'enwiki'
+    and in_bot_user_group = 1
+;
+
+
+ select count(*) as dewiki_users_that_should_be_bots
+   from warehouse.user
+  where user_id in (
+         select ug_user
+           from dewiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'dewiki'
+    and in_bot_user_group = 0
+;
+
+ select count(*) as dewiki_users_that_should_not_be_bots
+   from warehouse.user
+  where user_id not in (
+         select ug_user
+           from dewiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'dewiki'
+    and in_bot_user_group = 1
+;
+
+
+ select count(*) as enwiki_users_that_should_be_bots
+   from warehouse.user
+  where user_id in (
+         select ug_user
+           from enwiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'enwiki'
+    and in_bot_user_group = 0
+;
+
+ select count(*) as enwiki_users_that_should_not_be_bots
+   from warehouse.user
+  where user_id not in (
+         select ug_user
+           from enwiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'enwiki'
+    and in_bot_user_group = 1
+;
+
+
+ select count(*) as dewiki_users_that_should_be_bots
+   from warehouse.user
+  where user_id in (
+         select ug_user
+           from dewiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'dewiki'
+    and in_bot_user_group = 0
+;
+
+ select count(*) as dewiki_users_that_should_not_be_bots
+   from warehouse.user
+  where user_id not in (
+         select ug_user
+           from dewiki.user_groups
+          where ug_group = 'bot'
+        )
+    and wiki = 'dewiki'
+    and in_bot_user_group = 1
+;
+
+
+-- registration type checks
+ select count(*) as wrong_action
+   from warehouse.user
+            left join
+        enwiki.logging  on log_user = user_id
+                       and log_type = 'newusers'
+  where wiki = 'enwiki'
+    and registration_type <> log_action
+;
+
+ select count(*) as wrong_action
+   from warehouse.user
+            left join
+        dewiki.logging  on log_user = user_id
+                       and log_type = 'newusers'
+  where wiki = 'dewiki'
+    and registration_type <> log_action
+;

-- 
To view, visit https://gerrit.wikimedia.org/r/185239
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I3d7fb6749632423b4b28799486f103881a1c3f73
Gerrit-PatchSet: 3
Gerrit-Project: analytics/data-warehouse
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Aklapper <aklap...@wikimedia.org>
Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Springle <sprin...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to