QChris has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/167839

Change subject: [WIP] Add schema for edit fact table
......................................................................

[WIP] Add schema for edit fact table

Change-Id: I499ac0eb56190ed3cba876177406b03af3b73ca4
---
A schemas/edit.sql
1 file changed, 70 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/data-warehouse 
refs/changes/39/167839/1

diff --git a/schemas/edit.sql b/schemas/edit.sql
new file mode 100644
index 0000000..265fa96
--- /dev/null
+++ b/schemas/edit.sql
@@ -0,0 +1,70 @@
+-- Edits from the revision and archive tables land here
+
+-- fact table
+ create table edit as (
+    time            timestamp,
+    wiki            varchar(100),
+    user_id         int,            -- see user dimension below
+    rev_id          int,
+    page_id         int,            -- see page dimension below
+ )
+
+-- user dimension
+ create table user as (
+    wiki                varchar(100),
+    user_id             int,
+    user_name           varchar(100),
+    user_registration   timestamp,
+    in_bot_user_group   tinyint,
+
+    valid_from          timestamp,
+    valid_to            timestamp,
+    valid_currently     tinyint,
+ )
+
+-- page dimension (not strictly needed right away)
+ create table page as (
+    wiki                varchar(100),
+    page_id             int,
+    namespace           int,
+    archived            tinyint,
+
+    valid_from          timestamp,
+    valid_to            timestamp,
+    valid_currently     tinyint,
+ )
+
+
+-- with this setup, we could re-write metrics as follows.
+-- The hope is that we can optimize these queries more easily than we can the 
current queries
+-- for simplicity:
+--      C = day to run
+--      B = C - 1 month
+--      A = C - 2 months
+
+-- rolling recurring old active editor
+ select user_id
+   from edit
+            inner join
+        user    on edit.user_id = user.user_id
+                and edit.wiki = user.wiki
+                and C between user.valid_from and user.valid_to
+  where time between A and C
+    and user.in_bot_user_group = 0
+    and user.user_registration < A
+  group by user_id
+ having sum(time <= B) >= 5
+    and sum(time > B) >= 5
+
+-- rolling new active editor
+ select user_id
+   from edit
+            inner join
+        user    on edit.user_id = user.user_id
+                and edit.wiki = user.wiki
+                and C between user.valid_from and user.valid_to
+  where time between B and C
+    and user.in_bot_user_group = 0
+    and user.user_registration >= B
+  group by user_id
+ having count(*) > 5

-- 
To view, visit https://gerrit.wikimedia.org/r/167839
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I499ac0eb56190ed3cba876177406b03af3b73ca4
Gerrit-PatchSet: 1
Gerrit-Project: analytics/data-warehouse
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Milimetric <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to