Mholloway has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/326544 )

Change subject: Reinstate manual blacklist
......................................................................

Reinstate manual blacklist

Bug: T153001
Change-Id: Ia82bc5d98764d6609b3e1455b64672475f5b8720
---
A etc/feed/blacklist.js
D lib/feed/filter-special.js
A lib/feed/filter.js
M lib/feed/most-read.js
M test/features/most-read/most-read.js
M test/features/most-read/page-filtering.js
6 files changed, 109 insertions(+), 72 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps 
refs/changes/44/326544/1

diff --git a/etc/feed/blacklist.js b/etc/feed/blacklist.js
new file mode 100644
index 0000000..d4d6ed4
--- /dev/null
+++ b/etc/feed/blacklist.js
@@ -0,0 +1,24 @@
+'use strict';
+
+/**
+ * A list of titles persistently in the most-read results but that appear to
+ * have few human viewers based on their traffic being almost all mobile or
+ * almost all non-mobile. See 
https://phabricator.wikimedia.org/T124716#2080637.
+ * TODO: Get this filtering or something like it into the upstream pageview 
API.
+ */
+const BLACKLIST = [
+    '-',
+    'Test_card',
+    'Web_scraping',
+    'XHamster',
+    'Java_(programming_language)',
+    'Images/upload/bel.jpg',
+    'Superintelligence:_Paths,_Dangers,_Strategies',
+    'Okto',
+    'Proyecto_40',
+    'AMGTV',
+    'Lali_Espósito',
+    'La7'
+];
+
+module.exports = BLACKLIST;
diff --git a/lib/feed/filter-special.js b/lib/feed/filter-special.js
deleted file mode 100644
index 5dab66a..0000000
--- a/lib/feed/filter-special.js
+++ /dev/null
@@ -1,18 +0,0 @@
-/**
- * Helper functions for handling blacklisted titles.
- */
-
-'use strict';
-
-const escape = require('escape-string-regexp');
-
-function filterSpecialPages(articles, mainPageTitle) {
-    const mainPageRegExp = new RegExp(`^${escape(mainPageTitle)}$`, 'i');
-    return articles.filter((article) => {
-        return article.pageid
-               && article.ns === 0
-               && !mainPageRegExp.test(article.title);
-    });
-}
-
-module.exports = filterSpecialPages;
diff --git a/lib/feed/filter.js b/lib/feed/filter.js
new file mode 100644
index 0000000..977bc69
--- /dev/null
+++ b/lib/feed/filter.js
@@ -0,0 +1,75 @@
+/**
+ * Helper functions for handling blacklisted titles.
+ */
+
+'use strict';
+
+const BLACKLIST = require('../../etc/feed/blacklist');
+const escape = require('escape-string-regexp');
+const HTTPError = require('../util').HTTPError;
+
+
+/**
+ * Articles with less than this proportion of pageviews on either desktop or
+ * mobile are likely bot traffic and will be filtered from the output.
+ *
+ * Must be in the interval [0, 1].
+ */
+const BOT_FILTER_THRESHOLD = 0.1;
+
+function filterBlacklist(titles) {
+    return titles.filter((title) => {
+        return BLACKLIST.indexOf(title.article) === -1;
+    });
+}
+
+function filterSpecialPages(articles, mainPageTitle) {
+    const mainPageRegExp = new RegExp(`^${escape(mainPageTitle)}$`, 'i');
+    return articles.filter((article) => {
+        return article.pageid
+               && article.ns === 0
+               && !mainPageRegExp.test(article.title);
+    });
+}
+
+/**
+ * Implements an editor-proposed heuristic in which top-pageview articles with
+ * almost all desktop pageviews or almost all mobile pageviews, rather than a
+ * relatively even mix of the two, are presumed to be inflated by bot traffic
+ * and not of sufficient human interest to include in the feed.
+ */
+function filterBotTraffic(allPlatformsMostRead, desktopMostRead) {
+
+    if (BOT_FILTER_THRESHOLD < 0 || BOT_FILTER_THRESHOLD > 1) {
+        throw new HTTPError({
+            status: 500,
+            type: 'internal_error',
+            title: 'Internal error',
+            detail: 'An internal error occurred'
+        });
+    }
+
+    // Create an object with title keys for speedy desktop article lookups when
+    // iterating over combined-platform top pageviews
+    const desktopArticles = (function() {
+        const result = {};
+        desktopMostRead.forEach((entry) => {
+            result[entry.article] = entry;
+        });
+        return result;
+    }());
+    return allPlatformsMostRead.filter((entry) => {
+        const title = entry.article;
+        const totalViews = entry.views;
+        const desktopViews = desktopArticles[title] && 
desktopArticles[title].views;
+        return (desktopViews
+            && desktopViews >= totalViews * BOT_FILTER_THRESHOLD
+            && desktopViews <= totalViews * (1 - BOT_FILTER_THRESHOLD));
+    });
+}
+
+module.exports = {
+    filterBlacklist,
+    filterSpecialPages,
+    filterBotTraffic
+};
diff --git a/lib/feed/most-read.js b/lib/feed/most-read.js
index 17aa1f7..fb64d95 100644
--- a/lib/feed/most-read.js
+++ b/lib/feed/most-read.js
@@ -8,17 +8,9 @@
 const mUtil = require('../mobile-util');
 const api = require('../api-util');
 const mwapi = require('../mwapi');
-const filterSpecialPages = require('./filter-special');
+const filter = require('./filter');
 const dateUtil = require('../dateUtil');
-const HTTPError = require('../util').HTTPError;
 
-/**
- * Articles with less than this proportion of pageviews on either desktop or
- * mobile are likely bot traffic and will be filtered from the output.
- *
- * Must be in the interval [0, 1].
- */
-const FILTER_THRESHOLD = 0.1;
 
 /**
  * Construct a list of title strings from the array of good article objects.
@@ -60,32 +52,6 @@
     });
 }
 
-/**
- * Implements an editor-proposed heuristic in which top-pageview articles with
- * almost all desktop pageviews or almost all mobile pageviews, rather than a
- * relatively even mix of the two, are presumed to be inflated by bot traffic
- * and not of sufficient human interest to include in the feed.
- */
-function filterBotTraffic(allPlatformsMostRead, desktopMostRead) {
-    // Create an object with title keys for speedy desktop article lookups when
-    // iterating over combined-platform top pageviews
-    const desktopArticles = (function() {
-        const result = {};
-        desktopMostRead.forEach((entry) => {
-            result[entry.article] = entry;
-        });
-        return result;
-    }());
-    return allPlatformsMostRead.filter((entry) => {
-        const title = entry.article;
-        const totalViews = entry.views;
-        const desktopViews = desktopArticles[title] && 
desktopArticles[title].views;
-        return (desktopViews
-            && desktopViews >= totalViews * FILTER_THRESHOLD
-            && desktopViews <= totalViews * (1 - FILTER_THRESHOLD));
-    });
-}
-
 function promise(app, req) {
     const aggregated = !!req.query.aggregated;
     let goodTitles;
@@ -96,15 +62,6 @@
             return BBPromise.resolve({});
         }
         dateUtil.throwDateError();
-    }
-
-    if (FILTER_THRESHOLD < 0 || FILTER_THRESHOLD > 1) {
-        throw new HTTPError({
-            status: 500,
-            type: 'internal_error',
-            title: 'Internal error',
-            detail: 'An internal error occurred'
-        });
     }
 
     return getTopPageviews(app, req, aggregated)
@@ -129,7 +86,7 @@
         const combinedArticlesSlice = combinedArticles && 
combinedArticles.slice(0, QUERY_TITLES);
         const desktopArticles = firstDesktopItems && 
firstDesktopItems.articles;
         const desktopArticlesSlice = desktopArticles && 
desktopArticles.slice(0, DESKTOP_TITLES);
-        goodTitles = filterBotTraffic(combinedArticlesSlice, 
desktopArticlesSlice);
+        goodTitles = filter.filterBotTraffic(combinedArticlesSlice, 
desktopArticlesSlice);
 
         if (mUtil.isEmpty(goodTitles)) {
             mUtil.throw404('No results found.');
@@ -158,7 +115,7 @@
 
         mUtil.fillInMemberKeys(goodTitles, ['title', 'article']);
         mUtil.mergeByProp(goodTitles, pages, 'title', true);
-        goodTitles = filterSpecialPages(goodTitles, mainPageTitle);
+        goodTitles = filter.filterSpecialPages(goodTitles, mainPageTitle);
 
         const results = goodTitles.map((entry) => {
             return Object.assign(entry, {
@@ -197,8 +154,5 @@
 }
 
 module.exports = {
-    promise,
-
-    // visible for testing
-    filterBotTraffic
+    promise
 };
diff --git a/test/features/most-read/most-read.js 
b/test/features/most-read/most-read.js
index 6871f08..6ff4346 100644
--- a/test/features/most-read/most-read.js
+++ b/test/features/most-read/most-read.js
@@ -5,6 +5,7 @@
 const server = require('../../utils/server');
 const headers = require('../../utils/headers');
 const testUtil = require('../../utils/testUtil');
+const BLACKLIST = require('../../../etc/feed/blacklist');
 
 function addDays(date, days) {
     const result = new Date(date);
@@ -37,6 +38,7 @@
                 assert.ok(res.body.date);
                 assert.ok(res.body.articles.length);
                 res.body.articles.forEach((elem) => {
+                    assert.ok(BLACKLIST.indexOf(elem.title) === -1, 
'blacklisted title');
                     assert.ok(elem.$merge, '$merge should be present');
                 });
             });
@@ -50,6 +52,7 @@
                 assert.ok(res.body.date);
                 assert.ok(res.body.articles.length);
                 res.body.articles.forEach((elem) => {
+                    assert.ok(BLACKLIST.indexOf(elem.title) === -1, 
'blacklisted title');
                     assert.ok(elem.$merge, '$merge should be present');
                 });
             });
diff --git a/test/features/most-read/page-filtering.js 
b/test/features/most-read/page-filtering.js
index 5ee343c..3ba5d84 100644
--- a/test/features/most-read/page-filtering.js
+++ b/test/features/most-read/page-filtering.js
@@ -4,8 +4,7 @@
 const path = require('path');
 const parse = require('csv-parse/lib/sync');
 const assert = require('../../utils/assert');
-const filterSpecialPages = require('../../../lib/feed/filter-special');
-const mostRead = require('../../../lib/feed/most-read');
+const filter = require('../../../lib/feed/filter');
 
 const combinedMostRead = require('./all-access-top-50').items[0].articles;
 const desktopMostRead = require('./desktop-top-100').items[0].articles;
@@ -17,12 +16,12 @@
 describe('page filtering', () => {
     it('main page filtering RegExp should handle all main page title chars', 
() => {
         mainPageTitles.forEach((title) => {
-            assert.ok(filterSpecialPages(articles, title));
+            assert.ok(filter.filterSpecialPages(articles, title));
         });
     });
 
     it('Page with skewed pageview distribution by platform should be omitted 
from results', () => {
-        mostRead.filterBotTraffic(combinedMostRead, 
desktopMostRead).forEach((entry) => {
+        filter.filterBotTraffic(combinedMostRead, 
desktopMostRead).forEach((entry) => {
             assert.ok(entry.article !== "AMGTV");
         });
     });

-- 
To view, visit https://gerrit.wikimedia.org/r/326544
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia82bc5d98764d6609b3e1455b64672475f5b8720
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Mholloway <mhollo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to