QChris has uploaded a new change for review. https://gerrit.wikimedia.org/r/167839
Change subject: [WIP] Add schema for edit fact table ...................................................................... [WIP] Add schema for edit fact table Change-Id: I499ac0eb56190ed3cba876177406b03af3b73ca4 --- A schemas/edit.sql 1 file changed, 70 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/data-warehouse refs/changes/39/167839/1 diff --git a/schemas/edit.sql b/schemas/edit.sql new file mode 100644 index 0000000..265fa96 --- /dev/null +++ b/schemas/edit.sql @@ -0,0 +1,70 @@ +-- Edits from the revision and archive tables land here + +-- fact table + create table edit as ( + time timestamp, + wiki varchar(100), + user_id int, -- see user dimension below + rev_id int, + page_id int, -- see page dimension below + ) + +-- user dimension + create table user as ( + wiki varchar(100), + user_id int, + user_name varchar(100), + user_registration timestamp, + in_bot_user_group tinyint, + + valid_from timestamp, + valid_to timestamp, + valid_currently tinyint, + ) + +-- page dimension (not strictly needed right away) + create table page as ( + wiki varchar(100), + page_id int, + namespace int, + archived tinyint, + + valid_from timestamp, + valid_to timestamp, + valid_currently tinyint, + ) + + +-- with this setup, we could re-write metrics as follows. +-- The hope is that we can optimize these queries more easily than we can the current queries +-- for simplicity: +-- C = day to run +-- B = C - 1 month +-- A = C - 2 months + +-- rolling recurring old active editor + select user_id + from edit + inner join + user on edit.user_id = user.user_id + and edit.wiki = user.wiki + and C between user.valid_from and user.valid_to + where time between A and C + and user.in_bot_user_group = 0 + and user.user_registration < A + group by user_id + having sum(time <= B) >= 5 + and sum(time > B) >= 5 + +-- rolling new active editor + select user_id + from edit + inner join + user on edit.user_id = user.user_id + and edit.wiki = user.wiki + and C between user.valid_from and user.valid_to + where time between B and C + and user.in_bot_user_group = 0 + and user.user_registration >= B + group by user_id + having count(*) > 5 -- To view, visit https://gerrit.wikimedia.org/r/167839 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I499ac0eb56190ed3cba876177406b03af3b73ca4 Gerrit-PatchSet: 1 Gerrit-Project: analytics/data-warehouse Gerrit-Branch: master Gerrit-Owner: QChris <[email protected]> Gerrit-Reviewer: Milimetric <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
