Matthias Mullie has uploaded a new change for review. https://gerrit.wikimedia.org/r/249731
Change subject: Exclude moderated data ...................................................................... Exclude moderated data We've been seeing a big increase in active boards/topics/replies/users, but there's also been a lot of spam lately (as also evidenced by the results of moderation-actions.sql) We don't currently know exactly how much of the increase was spam, making it hard to figure out the impact of the recently deployed opt-in feature. This will omit data from topics & posts that have been moderated. Notice how the queries in active-boards.sql & active-topics.sql have changed drastically. I'm unsure why the complex inner queries were needed - I may have overlooked something... Bug: T116797 Change-Id: I3cc6bd45ff5ec684bb7463cf762b9e78059e23a4 --- M flow/active-boards.sql M flow/active-topics.sql M flow/messages-posted.sql M flow/unique-users.sql 4 files changed, 47 insertions(+), 47 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/limn-flow-data refs/changes/31/249731/1 diff --git a/flow/active-boards.sql b/flow/active-boards.sql index 89116f5..0596af2 100644 --- a/flow/active-boards.sql +++ b/flow/active-boards.sql @@ -1,16 +1,13 @@ SELECT DATE('{from_timestamp}') as weekstart, - count(distinct workflow_wiki, workflow_namespace, workflow_title_text) as num_boards - FROM flow_workflow - JOIN ( - SELECT a.tree_ancestor_id, - DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.tree_descendant_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") as timestamp - FROM flow_tree_node a - JOIN ( - SELECT b.tree_descendant_id, MAX(b.tree_depth) as max - FROM flow_tree_node b - GROUP BY b.tree_descendant_id - ) y ON y.max = a.tree_depth AND y.tree_descendant_id = a.tree_descendant_id - ) z ON z.tree_ancestor_id = workflow_id - WHERE workflow_wiki NOT IN ( 'testwiki', 'test2wiki' ) - AND timestamp >= '{from_timestamp}' - AND timestamp < '{to_timestamp}'; + COUNT(DISTINCT workflow_wiki, workflow_namespace, workflow_title_text) AS num_boards +FROM +( + SELECT * + FROM flow_workflow + INNER JOIN flow_revision ON rev_type_id = workflow_id AND rev_type = 'post' # join needed to be able to exclude moderated topics + WHERE + workflow_wiki NOT IN ('testwiki', 'test2wiki') AND + workflow_type = 'topic' AND + DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(workflow_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") BETWEEN '{from_timestamp}' AND '{to_timestamp}' + GROUP BY rev_type, rev_type_id HAVING SUBSTRING_INDEX(GROUP_CONCAT(rev_change_type), ',', -1) NOT IN ('hide-topic', 'delete-topic', 'suppress-topic') # exclude topics where last revision was moderation +) AS temp; diff --git a/flow/active-topics.sql b/flow/active-topics.sql index 71e8073..4704181 100644 --- a/flow/active-topics.sql +++ b/flow/active-topics.sql @@ -1,16 +1,13 @@ SELECT DATE('{from_timestamp}') as weekstart, - count(distinct workflow_id) as num_topics - FROM flow_workflow - JOIN ( - SELECT a.tree_ancestor_id, - DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.tree_descendant_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") as timestamp - FROM flow_tree_node a - JOIN ( - SELECT b.tree_descendant_id, MAX(b.tree_depth) as max - FROM flow_tree_node b - GROUP BY b.tree_descendant_id - ) y ON y.max = a.tree_depth AND y.tree_descendant_id = a.tree_descendant_id - ) z ON z.tree_ancestor_id = workflow_id - WHERE workflow_wiki NOT IN ( 'testwiki', 'test2wiki' ) - AND timestamp >= '{from_timestamp}' - AND timestamp < '{to_timestamp}'; + COUNT(DISTINCT workflow_id) AS num_boards +FROM +( + SELECT * + FROM flow_workflow + INNER JOIN flow_revision ON rev_type_id = workflow_id AND rev_type = 'post' # join needed to be able to exclude moderated topics + WHERE + workflow_wiki NOT IN ('testwiki', 'test2wiki') AND + workflow_type = 'topic' AND + DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(workflow_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") BETWEEN '{from_timestamp}' AND '{to_timestamp}' + GROUP BY rev_type, rev_type_id HAVING SUBSTRING_INDEX(GROUP_CONCAT(rev_change_type), ',', -1) NOT IN ('hide-topic', 'delete-topic', 'suppress-topic') # exclude topics where last revision was moderation +) AS temp; diff --git a/flow/messages-posted.sql b/flow/messages-posted.sql index 652bde1..807c6d7 100644 --- a/flow/messages-posted.sql +++ b/flow/messages-posted.sql @@ -1,11 +1,14 @@ SELECT DATE('{from_timestamp}') AS Week, COUNT(*) AS Replies -FROM ( - SELECT DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") AS timestamp - FROM flow_revision - WHERE rev_change_type = 'reply' - AND rev_user_wiki NOT IN ('testwiki', 'test2wiki') - HAVING timestamp >= '{from_timestamp}' - AND timestamp < '{to_timestamp}' -) x; +FROM +( + SELECT 1 + FROM flow_revision AS a + LEFT JOIN flow_revision AS b ON a.rev_type = b.rev_type AND a.rev_type_id = b.rev_type_id + WHERE + a.rev_user_wiki NOT IN ('testwiki', 'test2wiki') AND + a.rev_change_type = 'reply' AND + DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") BETWEEN '{from_timestamp}' AND '{to_timestamp}' + GROUP BY a.rev_type, a.rev_type_id HAVING SUBSTRING_INDEX(GROUP_CONCAT(b.rev_change_type), ',', -1) NOT IN ('hide-post', 'delete-post', 'suppress-post') # exclude topics where last revision was moderation +) AS temp; diff --git a/flow/unique-users.sql b/flow/unique-users.sql index 07df6d9..23e452b 100644 --- a/flow/unique-users.sql +++ b/flow/unique-users.sql @@ -2,12 +2,15 @@ DATE('{from_timestamp}') AS Week, COUNT(DISTINCT rev_user_id, COALESCE(rev_user_ip, ''), rev_user_wiki) AS "Unique users" FROM ( - SELECT DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") AS timestamp, - rev_user_id, - rev_user_ip, - rev_user_wiki - FROM flow_revision - WHERE rev_user_wiki NOT IN ('testwiki', 'test2wiki') - HAVING timestamp >= '{from_timestamp}' - AND timestamp < '{to_timestamp}' -) x; + SELECT + a.rev_user_id, + a.rev_user_ip, + a.rev_user_wiki + FROM flow_revision AS a + LEFT JOIN flow_revision AS b ON a.rev_type = b.rev_type AND a.rev_type_id = b.rev_type_id + WHERE + a.rev_user_wiki NOT IN ('testwiki', 'test2wiki') AND + a.rev_change_type = 'reply' AND + DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S") BETWEEN '{from_timestamp}' AND '{to_timestamp}' + GROUP BY a.rev_type, a.rev_type_id HAVING SUBSTRING_INDEX(GROUP_CONCAT(b.rev_change_type), ',', -1) NOT IN ('hide-post', 'delete-post', 'suppress-post') # exclude topics where last revision was moderation +) AS temp; -- To view, visit https://gerrit.wikimedia.org/r/249731 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3cc6bd45ff5ec684bb7463cf762b9e78059e23a4 Gerrit-PatchSet: 1 Gerrit-Project: analytics/limn-flow-data Gerrit-Branch: master Gerrit-Owner: Matthias Mullie <mmul...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits