Matthias Mullie has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/249731

Change subject: Exclude moderated data
......................................................................

Exclude moderated data

We've been seeing a big increase in active boards/topics/replies/users,
but there's also been a lot of spam lately (as also evidenced by the results
of moderation-actions.sql)
We don't currently know exactly how much of the increase was spam, making it
hard to figure out the impact of the recently deployed opt-in feature.
This will omit data from topics & posts that have been moderated.

Notice how the queries in active-boards.sql & active-topics.sql have changed
drastically. I'm unsure why the complex inner queries were needed - I may have
overlooked something...

Bug: T116797
Change-Id: I3cc6bd45ff5ec684bb7463cf762b9e78059e23a4
---
M flow/active-boards.sql
M flow/active-topics.sql
M flow/messages-posted.sql
M flow/unique-users.sql
4 files changed, 47 insertions(+), 47 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/limn-flow-data 
refs/changes/31/249731/1

diff --git a/flow/active-boards.sql b/flow/active-boards.sql
index 89116f5..0596af2 100644
--- a/flow/active-boards.sql
+++ b/flow/active-boards.sql
@@ -1,16 +1,13 @@
 SELECT DATE('{from_timestamp}') as weekstart,
-       count(distinct workflow_wiki, workflow_namespace, workflow_title_text) 
as num_boards
-  FROM flow_workflow
-  JOIN (
-        SELECT a.tree_ancestor_id,
-               
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.tree_descendant_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 as timestamp
-          FROM flow_tree_node a
-          JOIN (
-                SELECT b.tree_descendant_id, MAX(b.tree_depth) as max
-                  FROM flow_tree_node b
-                 GROUP BY b.tree_descendant_id
-               ) y ON y.max = a.tree_depth AND y.tree_descendant_id = 
a.tree_descendant_id
-       ) z ON z.tree_ancestor_id = workflow_id
- WHERE workflow_wiki NOT IN ( 'testwiki', 'test2wiki' )
-   AND timestamp >= '{from_timestamp}'
-   AND timestamp < '{to_timestamp}';
+       COUNT(DISTINCT workflow_wiki, workflow_namespace, workflow_title_text) 
AS num_boards
+FROM
+(
+       SELECT *
+       FROM flow_workflow
+       INNER JOIN flow_revision ON rev_type_id = workflow_id AND rev_type = 
'post' # join needed to be able to exclude moderated topics
+       WHERE
+               workflow_wiki NOT IN ('testwiki', 'test2wiki') AND
+               workflow_type = 'topic' AND
+               
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(workflow_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+       GROUP BY rev_type, rev_type_id HAVING 
SUBSTRING_INDEX(GROUP_CONCAT(rev_change_type), ',', -1) NOT IN ('hide-topic', 
'delete-topic', 'suppress-topic') # exclude topics where last revision was 
moderation
+) AS temp;
diff --git a/flow/active-topics.sql b/flow/active-topics.sql
index 71e8073..4704181 100644
--- a/flow/active-topics.sql
+++ b/flow/active-topics.sql
@@ -1,16 +1,13 @@
 SELECT DATE('{from_timestamp}') as weekstart,
-       count(distinct workflow_id) as num_topics
-  FROM flow_workflow
-  JOIN (
-        SELECT a.tree_ancestor_id,
-               
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.tree_descendant_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 as timestamp
-          FROM flow_tree_node a
-          JOIN (
-                SELECT b.tree_descendant_id, MAX(b.tree_depth) as max
-                  FROM flow_tree_node b
-                 GROUP BY b.tree_descendant_id
-               ) y ON y.max = a.tree_depth AND y.tree_descendant_id = 
a.tree_descendant_id
-       ) z ON z.tree_ancestor_id = workflow_id
- WHERE workflow_wiki NOT IN ( 'testwiki', 'test2wiki' )
-   AND timestamp >= '{from_timestamp}'
-   AND timestamp < '{to_timestamp}';
+       COUNT(DISTINCT workflow_id) AS num_boards
+FROM
+(
+       SELECT *
+       FROM flow_workflow
+       INNER JOIN flow_revision ON rev_type_id = workflow_id AND rev_type = 
'post' # join needed to be able to exclude moderated topics
+       WHERE
+               workflow_wiki NOT IN ('testwiki', 'test2wiki') AND
+               workflow_type = 'topic' AND
+               
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(workflow_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+       GROUP BY rev_type, rev_type_id HAVING 
SUBSTRING_INDEX(GROUP_CONCAT(rev_change_type), ',', -1) NOT IN ('hide-topic', 
'delete-topic', 'suppress-topic') # exclude topics where last revision was 
moderation
+) AS temp;
diff --git a/flow/messages-posted.sql b/flow/messages-posted.sql
index 652bde1..807c6d7 100644
--- a/flow/messages-posted.sql
+++ b/flow/messages-posted.sql
@@ -1,11 +1,14 @@
 SELECT
        DATE('{from_timestamp}') AS Week,
        COUNT(*) AS Replies
-FROM (
-       SELECT 
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 AS timestamp
-       FROM flow_revision
-       WHERE rev_change_type = 'reply'
-       AND rev_user_wiki NOT IN ('testwiki', 'test2wiki')
-       HAVING timestamp >= '{from_timestamp}'
-       AND timestamp < '{to_timestamp}'
-) x;
+FROM
+(
+       SELECT 1
+       FROM flow_revision AS a
+       LEFT JOIN flow_revision AS b ON a.rev_type = b.rev_type AND 
a.rev_type_id = b.rev_type_id
+       WHERE
+               a.rev_user_wiki NOT IN ('testwiki', 'test2wiki') AND
+               a.rev_change_type = 'reply' AND
+               
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+       GROUP BY a.rev_type, a.rev_type_id HAVING 
SUBSTRING_INDEX(GROUP_CONCAT(b.rev_change_type), ',', -1) NOT IN ('hide-post', 
'delete-post', 'suppress-post') # exclude topics where last revision was 
moderation
+) AS temp;
diff --git a/flow/unique-users.sql b/flow/unique-users.sql
index 07df6d9..23e452b 100644
--- a/flow/unique-users.sql
+++ b/flow/unique-users.sql
@@ -2,12 +2,15 @@
        DATE('{from_timestamp}') AS Week,
        COUNT(DISTINCT rev_user_id, COALESCE(rev_user_ip, ''), rev_user_wiki) 
AS "Unique users"
 FROM (
-       SELECT 
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 AS timestamp,
-              rev_user_id,
-              rev_user_ip,
-              rev_user_wiki
-       FROM flow_revision
-       WHERE rev_user_wiki NOT IN ('testwiki', 'test2wiki')
-       HAVING timestamp >= '{from_timestamp}'
-       AND timestamp < '{to_timestamp}'
-) x;
+       SELECT
+               a.rev_user_id,
+               a.rev_user_ip,
+               a.rev_user_wiki
+       FROM flow_revision AS a
+       LEFT JOIN flow_revision AS b ON a.rev_type = b.rev_type AND 
a.rev_type_id = b.rev_type_id
+       WHERE
+               a.rev_user_wiki NOT IN ('testwiki', 'test2wiki') AND
+               a.rev_change_type = 'reply' AND
+               
DATE_FORMAT(FROM_UNIXTIME((conv(substring(hex(a.rev_id),1,12),16,10)>>2)/1000),"%Y%m%d%H%i%S")
 BETWEEN '{from_timestamp}' AND '{to_timestamp}'
+       GROUP BY a.rev_type, a.rev_type_id HAVING 
SUBSTRING_INDEX(GROUP_CONCAT(b.rev_change_type), ',', -1) NOT IN ('hide-post', 
'delete-post', 'suppress-post') # exclude topics where last revision was 
moderation
+) AS temp;

-- 
To view, visit https://gerrit.wikimedia.org/r/249731
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3cc6bd45ff5ec684bb7463cf762b9e78059e23a4
Gerrit-PatchSet: 1
Gerrit-Project: analytics/limn-flow-data
Gerrit-Branch: master
Gerrit-Owner: Matthias Mullie <mmul...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to