Nuria has submitted this change and it was merged.
Change subject: Filter out bots from relevant metrics
......................................................................
Filter out bots from relevant metrics
Using the definition that bots are users showing up in the user_groups
table with a ug_group value of 'bot', we filter out bots from the
rolling metrics.
Bug: 72134
Change-Id: I53258947816e35b770c4b1fccfe9f4a98f82554e
---
M tests/fixtures.py
M tests/test_metrics/test_rolling_active_editor.py
M tests/test_metrics/test_rolling_new_active_editor.py
M tests/test_metrics/test_rolling_surviving_new_active_editor.py
M tests/test_models/test_mappings.py
M wikimetrics/metrics/rolling_active_editor.py
M wikimetrics/metrics/rolling_new_active_editor.py
M wikimetrics/metrics/rolling_surviving_new_active_editor.py
M wikimetrics/models/mediawiki/__init__.py
A wikimetrics/models/mediawiki/user_groups.py
10 files changed, 110 insertions(+), 8 deletions(-)
Approvals:
Mforns: Checked; Looks good to me, but someone else must approve
Nuria: Looks good to me, approved
jenkins-bot: Verified
diff --git a/tests/fixtures.py b/tests/fixtures.py
index 3ac24b4..2b62b9a 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -23,6 +23,7 @@
Revision,
Page,
MediawikiUser,
+ MediawikiUserGroups,
Logging,
Archive,
)
@@ -593,6 +594,13 @@
self.mwSession.query(Revision).delete()
self.mwSession.commit()
+ def make_bot(self, user_id, session):
+ """
+ Update the database to make user_id a bot
+ """
+ session.add(MediawikiUserGroups(ug_user=user_id, ug_group='bot'))
+ session.commit()
+
def setUp(self):
#****************************************************************
# set up and clean database (Warning: this DESTROYS ALL DATA)
@@ -613,6 +621,7 @@
self.mwSession.query(Logging).delete()
self.mwSession.query(Revision).delete()
self.mwSession.query(Archive).delete()
+ self.mwSession.query(MediawikiUserGroups).delete()
self.mwSession.query(MediawikiUser).delete()
self.mwSession.query(Page).delete()
self.mwSession.commit()
diff --git a/tests/test_metrics/test_rolling_active_editor.py b/tests/test_metrics/test_rolling_active_editor.py
index 4734205..2312b4b 100644
--- a/tests/test_metrics/test_rolling_active_editor.py
+++ b/tests/test_metrics/test_rolling_active_editor.py
@@ -3,7 +3,7 @@
from tests.fixtures import DatabaseTest, i, d
from wikimetrics.utils import format_pretty_date as s
-from wikimetrics.models import Revision
+from wikimetrics.models import Revision, MediawikiUser
from wikimetrics.metrics import RollingActiveEditor
from wikimetrics.enums import TimeseriesChoices
@@ -116,3 +116,15 @@
assert_equal(results[user_id][metric.id], result)
# users with no edits at all just won't show up
assert_equal(results.get(self.editor_ids[3], -1), -1)
+
+ def test_wiki_cohort_all_bots(self):
+ # make everyone a bot and make sure they're excluded
+ for r in self.mwSession.query(MediawikiUser.user_id).all():
+ self.make_bot(r[0], self.mwSession)
+
+ metric = RollingActiveEditor(
+ end_date=self.r_plus_30,
+ )
+ results = metric(None, self.mwSession)
+
+ assert_equal(results.keys(), [])
diff --git a/tests/test_metrics/test_rolling_new_active_editor.py b/tests/test_metrics/test_rolling_new_active_editor.py
index 30b9880..b057819 100644
--- a/tests/test_metrics/test_rolling_new_active_editor.py
+++ b/tests/test_metrics/test_rolling_new_active_editor.py
@@ -3,7 +3,7 @@
from tests.fixtures import DatabaseTest, i, d
from wikimetrics.utils import format_pretty_date as s
-from wikimetrics.models import Revision, Logging
+from wikimetrics.models import Revision, Logging, MediawikiUser
from wikimetrics.metrics import RollingNewActiveEditor
from wikimetrics.enums import TimeseriesChoices
@@ -152,3 +152,15 @@
results = metric(None, self.mwSession)
assert_equal(results.keys(), [])
+
+ def test_wiki_cohort_all_bots(self):
+ # make everyone a bot and make sure they're excluded
+ for r in self.mwSession.query(MediawikiUser.user_id).all():
+ self.make_bot(r[0], self.mwSession)
+
+ metric = RollingNewActiveEditor(
+ end_date=self.r_plus_30,
+ )
+ results = metric(None, self.mwSession)
+
+ assert_equal(results.keys(), [])
diff --git a/tests/test_metrics/test_rolling_surviving_new_active_editor.py b/tests/test_metrics/test_rolling_surviving_new_active_editor.py
index a8eb480..d60f53a 100644
--- a/tests/test_metrics/test_rolling_surviving_new_active_editor.py
+++ b/tests/test_metrics/test_rolling_surviving_new_active_editor.py
@@ -3,7 +3,7 @@
from tests.fixtures import DatabaseTest, i, d
from wikimetrics.utils import format_pretty_date as s
-from wikimetrics.models import Revision, Logging
+from wikimetrics.models import Revision, Logging, MediawikiUser
from wikimetrics.metrics import RollingSurvivingNewActiveEditor
from wikimetrics.enums import TimeseriesChoices
@@ -165,3 +165,15 @@
results = metric(None, self.mwSession)
assert_equal(results.keys(), [])
+
+ def test_wiki_cohort_all_bots(self):
+ # make everyone a bot and make sure they're excluded
+ for r in self.mwSession.query(MediawikiUser.user_id).all():
+ self.make_bot(r[0], self.mwSession)
+
+ metric = RollingSurvivingNewActiveEditor(
+ end_date=self.r_plus_60,
+ )
+ results = metric(None, self.mwSession)
+
+ assert_equal(results.keys(), [])
diff --git a/tests/test_models/test_mappings.py b/tests/test_models/test_mappings.py
index 116201a..323f3a0 100644
--- a/tests/test_models/test_mappings.py
+++ b/tests/test_models/test_mappings.py
@@ -10,6 +10,7 @@
Logging,
Page,
MediawikiUser,
+ MediawikiUserGroups,
Revision,
)
from wikimetrics.enums import CohortUserRole, UserRole
@@ -71,6 +72,13 @@
row = self.mwSession.query(MediawikiUser).get(self.editors[0].user_id)
assert_equal(row.user_name, 'Editor test-specific-0')
+ def test_mediawiki_user_groups(self):
+ ug = MediawikiUserGroups(ug_user=self.editors[0].user_id, ug_group='test')
+ self.mwSession.add(ug)
+ self.mwSession.commit()
+ fetch = self.mwSession.query(MediawikiUserGroups).first()
+ assert_equal(fetch.ug_group, 'test')
+
def test_mediawiki_page(self):
row = self.mwSession.query(Page).get(self.revisions[0].rev_page)
assert_equal(row.page_title, 'test-specific-page')
diff --git a/wikimetrics/metrics/rolling_active_editor.py b/wikimetrics/metrics/rolling_active_editor.py
index db1d677..35ffc9c 100644
--- a/wikimetrics/metrics/rolling_active_editor.py
+++ b/wikimetrics/metrics/rolling_active_editor.py
@@ -6,7 +6,9 @@
from wikimetrics.forms.fields import BetterDateTimeField
from wikimetrics.utils import today
-from wikimetrics.models.mediawiki import Revision, MediawikiUser, Archive
+from wikimetrics.models.mediawiki import (
+ Revision, MediawikiUser, Archive, MediawikiUserGroups
+)
from metric import Metric
@@ -43,6 +45,12 @@
) AS user_content_revision_count
GROUP BY user_id
HAVING SUM(revisions) >= @n;
+
+ NOTE: updated to exclude bots as identified by:
+
+ SELECT ug_user
+ FROM user_groups
+ WHERE ug_group = 'bot'
"""
show_in_ui = True
@@ -94,11 +102,16 @@
.group_by(Archive.ar_user)
archived = self.filter(archived, user_ids, column=Archive.ar_user)
+ bot_user_ids = session.query(MediawikiUserGroups.ug_user)\
+ .filter(MediawikiUserGroups.ug_group == 'bot')\
+ .subquery()
+
edits = revisions.union_all(archived).subquery()
edits_by_user = session.query(
edits.c.user_id,
func.IF(func.SUM(edits.c.count) >= number_of_edits, 1, 0)
)\
+ .filter(edits.c.user_id.notin_(bot_user_ids))\
.group_by(edits.c.user_id)
metric_results = {r[0]: {self.id : r[1]} for r in edits_by_user.all()}
diff --git a/wikimetrics/metrics/rolling_new_active_editor.py b/wikimetrics/metrics/rolling_new_active_editor.py
index 3456485..1f2c9f9 100644
--- a/wikimetrics/metrics/rolling_new_active_editor.py
+++ b/wikimetrics/metrics/rolling_new_active_editor.py
@@ -5,7 +5,7 @@
from wikimetrics.forms.fields import BetterDateTimeField
from wikimetrics.utils import today
-from wikimetrics.models.mediawiki import Revision, Archive, Logging
+from wikimetrics.models.mediawiki import Revision, Archive, Logging, MediawikiUserGroups
from metric import Metric
@@ -51,6 +51,12 @@
) AS user_content_revision_count
GROUP BY user_id
HAVING SUM(revisions) >= @n;
+
+ NOTE: updated to exclude bots as identified by:
+
+ SELECT ug_user
+ FROM user_groups
+ WHERE ug_group = 'bot'
"""
show_in_ui = True
@@ -115,11 +121,16 @@
.filter(Archive.ar_user.in_(filtered_new))\
.group_by(Archive.ar_user)
+ bot_user_ids = session.query(MediawikiUserGroups.ug_user)\
+ .filter(MediawikiUserGroups.ug_group == 'bot')\
+ .subquery()
+
new_edits = revisions.union_all(archived).subquery()
new_edits_by_user = session.query(
new_edits.c.user_id,
func.IF(func.SUM(new_edits.c.count) >= number_of_edits, 1, 0)
)\
+ .filter(new_edits.c.user_id.notin_(bot_user_ids))\
.group_by(new_edits.c.user_id)
metric_results = {r[0]: {self.id : r[1]} for r in new_edits_by_user.all()}
diff --git a/wikimetrics/metrics/rolling_surviving_new_active_editor.py b/wikimetrics/metrics/rolling_surviving_new_active_editor.py
index 22e3c06..d43afb8 100644
--- a/wikimetrics/metrics/rolling_surviving_new_active_editor.py
+++ b/wikimetrics/metrics/rolling_surviving_new_active_editor.py
@@ -5,7 +5,7 @@
from wikimetrics.forms.fields import BetterDateTimeField
from wikimetrics.utils import today
-from wikimetrics.models.mediawiki import Revision, Archive, Logging
+from wikimetrics.models.mediawiki import Revision, Archive, Logging, MediawikiUserGroups
from metric import Metric
@@ -52,8 +52,14 @@
GROUP BY user_id
) AS user_content_revision_count
GROUP BY user_id
- HAVING SUM(revisions1) >= @n;
- AND SUM(revisions2) >= @n;
+ HAVING SUM(revisions1) >= @n
+ AND SUM(revisions2) >= @n
+
+ NOTE: updated to exclude bots as identified by:
+
+ SELECT ug_user
+ FROM user_groups
+ WHERE ug_group = 'bot'
"""
show_in_ui = True
@@ -131,6 +137,10 @@
.filter(Archive.ar_user.in_(filtered_new))\
.group_by(Archive.ar_user)
+ bot_user_ids = session.query(MediawikiUserGroups.ug_user)\
+ .filter(MediawikiUserGroups.ug_group == 'bot')\
+ .subquery()
+
# For each user, with both counts from both tables,
# sum the count_one values together, check it's >= number_of_edits
# sum the count_two values together, check it's >= number_of_edits
@@ -144,6 +154,7 @@
), 1, 0
)
)\
+ .filter(new_edits.c.user_id.notin_(bot_user_ids))\
.group_by(new_edits.c.user_id)
metric_results = {r[0]: {self.id : r[1]} for r in new_edits_by_user.all()}
diff --git a/wikimetrics/models/mediawiki/__init__.py b/wikimetrics/models/mediawiki/__init__.py
index 0eb46f8..bf82c52 100644
--- a/wikimetrics/models/mediawiki/__init__.py
+++ b/wikimetrics/models/mediawiki/__init__.py
@@ -3,6 +3,7 @@
from revision import *
from page import *
from user import *
+from user_groups import *
from logging import *
from archive import *
diff --git a/wikimetrics/models/mediawiki/user_groups.py b/wikimetrics/models/mediawiki/user_groups.py
new file mode 100644
index 0000000..b61324f
--- /dev/null
+++ b/wikimetrics/models/mediawiki/user_groups.py
@@ -0,0 +1,13 @@
+from sqlalchemy import Column, Integer, ForeignKey
+from sqlalchemy.dialects.mysql import VARBINARY
+
+from wikimetrics.configurables import db
+
+
+class MediawikiUserGroups(db.MediawikiBase):
+ __tablename__ = 'user_groups'
+
+ ug_user = Column(
+ Integer, ForeignKey('user.user_id'), nullable=False, default=0, primary_key=True
+ )
+ ug_group = Column(VARBINARY(255), nullable=False, primary_key=True)
--
To view, visit https://gerrit.wikimedia.org/r/167064
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I53258947816e35b770c4b1fccfe9f4a98f82554e
Gerrit-PatchSet: 2
Gerrit-Project: analytics/wikimetrics
Gerrit-Branch: master
Gerrit-Owner: Milimetric <[email protected]>
Gerrit-Reviewer: Mforns <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits