Mholloway has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/326544 )
Change subject: Reinstate manual blacklist ...................................................................... Reinstate manual blacklist Bug: T153001 Change-Id: Ia82bc5d98764d6609b3e1455b64672475f5b8720 --- A etc/feed/blacklist.js D lib/feed/filter-special.js A lib/feed/filter.js M lib/feed/most-read.js M test/features/most-read/most-read.js M test/features/most-read/page-filtering.js 6 files changed, 109 insertions(+), 72 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/44/326544/1 diff --git a/etc/feed/blacklist.js b/etc/feed/blacklist.js new file mode 100644 index 0000000..d4d6ed4 --- /dev/null +++ b/etc/feed/blacklist.js @@ -0,0 +1,24 @@ +'use strict'; + +/** + * A list of titles persistently in the most-read results but that appear to + * have few human viewers based on their traffic being almost all mobile or + * almost all non-mobile. See https://phabricator.wikimedia.org/T124716#2080637. + * TODO: Get this filtering or something like it into the upstream pageview API. + */ +const BLACKLIST = [ + '-', + 'Test_card', + 'Web_scraping', + 'XHamster', + 'Java_(programming_language)', + 'Images/upload/bel.jpg', + 'Superintelligence:_Paths,_Dangers,_Strategies', + 'Okto', + 'Proyecto_40', + 'AMGTV', + 'Lali_Espósito', + 'La7' +]; + +module.exports = BLACKLIST; diff --git a/lib/feed/filter-special.js b/lib/feed/filter-special.js deleted file mode 100644 index 5dab66a..0000000 --- a/lib/feed/filter-special.js +++ /dev/null @@ -1,18 +0,0 @@ -/** - * Helper functions for handling blacklisted titles.
- */ - -'use strict'; - -const escape = require('escape-string-regexp'); - -function filterSpecialPages(articles, mainPageTitle) { - const mainPageRegExp = new RegExp(`^${escape(mainPageTitle)}$`, 'i'); - return articles.filter((article) => { - return article.pageid - && article.ns === 0 - && !mainPageRegExp.test(article.title); - }); -} - -module.exports = filterSpecialPages; diff --git a/lib/feed/filter.js b/lib/feed/filter.js new file mode 100644 index 0000000..977bc69 --- /dev/null +++ b/lib/feed/filter.js @@ -0,0 +1,75 @@ +/** + * Helper functions for handling blacklisted titles. + */ + +'use strict'; + +const BLACKLIST = require('../../etc/feed/blacklist'); +const escape = require('escape-string-regexp'); +const HTTPError = require('../util').HTTPError; + + +/** + * Articles with less than this proportion of pageviews on either desktop or + * mobile are likely bot traffic and will be filtered from the output. + * + * Must be in the interval [0, 1]. + */ +const BOT_FILTER_THRESHOLD = 0.1; + +function filterBlacklist(titles) { + return titles.filter((title) => { + return BLACKLIST.indexOf(title.article) === -1; + }); +} + +function filterSpecialPages(articles, mainPageTitle) { + const mainPageRegExp = new RegExp(`^${escape(mainPageTitle)}$`, 'i'); + return articles.filter((article) => { + return article.pageid + && article.ns === 0 + && !mainPageRegExp.test(article.title); + }); +} + +/** + * Implements an editor-proposed heuristic in which top-pageview articles with + * almost all desktop pageviews or almost all mobile pageviews, rather than a + * relatively even mix of the two, are presumed to be inflated by bot traffic + * and not of sufficient human interest to include in the feed. 
+ */ +function filterBotTraffic(allPlatformsMostRead, desktopMostRead) { + + if (BOT_FILTER_THRESHOLD < 0 || BOT_FILTER_THRESHOLD > 1) { + throw new HTTPError({ + status: 500, + type: 'internal_error', + title: 'Internal error', + detail: 'An internal error occurred' + }); + } + + // Create an object with title keys for speedy desktop article lookups when + // iterating over combined-platform top pageviews + const desktopArticles = (function() { + const result = {}; + desktopMostRead.forEach((entry) => { + result[entry.article] = entry; + }); + return result; + }()); + return allPlatformsMostRead.filter((entry) => { + const title = entry.article; + const totalViews = entry.views; + const desktopViews = desktopArticles[title] && desktopArticles[title].views; + return (desktopViews + && desktopViews >= totalViews * BOT_FILTER_THRESHOLD + && desktopViews <= totalViews * (1 - BOT_FILTER_THRESHOLD)); + }); +} + +module.exports = { + filterBlacklist, + filterSpecialPages, + filterBotTraffic +}; diff --git a/lib/feed/most-read.js b/lib/feed/most-read.js index 17aa1f7..fb64d95 100644 --- a/lib/feed/most-read.js +++ b/lib/feed/most-read.js @@ -8,17 +8,9 @@ const mUtil = require('../mobile-util'); const api = require('../api-util'); const mwapi = require('../mwapi'); -const filterSpecialPages = require('./filter-special'); +const filter = require('./filter'); const dateUtil = require('../dateUtil'); -const HTTPError = require('../util').HTTPError; -/** - * Articles with less than this proportion of pageviews on either desktop or - * mobile are likely bot traffic and will be filtered from the output. - * - * Must be in the interval [0, 1]. - */ -const FILTER_THRESHOLD = 0.1; /** * Construct a list of title strings from the array of good article objects. 
@@ -60,32 +52,6 @@ }); } -/** - * Implements an editor-proposed heuristic in which top-pageview articles with - * almost all desktop pageviews or almost all mobile pageviews, rather than a - * relatively even mix of the two, are presumed to be inflated by bot traffic - * and not of sufficient human interest to include in the feed. - */ -function filterBotTraffic(allPlatformsMostRead, desktopMostRead) { - // Create an object with title keys for speedy desktop article lookups when - // iterating over combined-platform top pageviews - const desktopArticles = (function() { - const result = {}; - desktopMostRead.forEach((entry) => { - result[entry.article] = entry; - }); - return result; - }()); - return allPlatformsMostRead.filter((entry) => { - const title = entry.article; - const totalViews = entry.views; - const desktopViews = desktopArticles[title] && desktopArticles[title].views; - return (desktopViews - && desktopViews >= totalViews * FILTER_THRESHOLD - && desktopViews <= totalViews * (1 - FILTER_THRESHOLD)); - }); -} - function promise(app, req) { const aggregated = !!req.query.aggregated; let goodTitles; @@ -96,15 +62,6 @@ return BBPromise.resolve({}); } dateUtil.throwDateError(); - } - - if (FILTER_THRESHOLD < 0 || FILTER_THRESHOLD > 1) { - throw new HTTPError({ - status: 500, - type: 'internal_error', - title: 'Internal error', - detail: 'An internal error occurred' - }); } return getTopPageviews(app, req, aggregated) @@ -129,7 +86,7 @@ const combinedArticlesSlice = combinedArticles && combinedArticles.slice(0, QUERY_TITLES); const desktopArticles = firstDesktopItems && firstDesktopItems.articles; const desktopArticlesSlice = desktopArticles && desktopArticles.slice(0, DESKTOP_TITLES); - goodTitles = filterBotTraffic(combinedArticlesSlice, desktopArticlesSlice); + goodTitles = filter.filterBotTraffic(combinedArticlesSlice, desktopArticlesSlice); if (mUtil.isEmpty(goodTitles)) { mUtil.throw404('No results found.'); @@ -158,7 +115,7 @@ 
mUtil.fillInMemberKeys(goodTitles, ['title', 'article']); mUtil.mergeByProp(goodTitles, pages, 'title', true); - goodTitles = filterSpecialPages(goodTitles, mainPageTitle); + goodTitles = filter.filterSpecialPages(goodTitles, mainPageTitle); const results = goodTitles.map((entry) => { return Object.assign(entry, { @@ -197,8 +154,5 @@ } module.exports = { - promise, - - // visible for testing - filterBotTraffic + promise }; diff --git a/test/features/most-read/most-read.js b/test/features/most-read/most-read.js index 6871f08..6ff4346 100644 --- a/test/features/most-read/most-read.js +++ b/test/features/most-read/most-read.js @@ -5,6 +5,7 @@ const server = require('../../utils/server'); const headers = require('../../utils/headers'); const testUtil = require('../../utils/testUtil'); +const BLACKLIST = require('../../../etc/feed/blacklist'); function addDays(date, days) { const result = new Date(date); @@ -37,6 +38,7 @@ assert.ok(res.body.date); assert.ok(res.body.articles.length); res.body.articles.forEach((elem) => { + assert.ok(BLACKLIST.indexOf(elem.title) === -1, 'blacklisted title'); assert.ok(elem.$merge, '$merge should be present'); }); }); @@ -50,6 +52,7 @@ assert.ok(res.body.date); assert.ok(res.body.articles.length); res.body.articles.forEach((elem) => { + assert.ok(BLACKLIST.indexOf(elem.title) === -1, 'blacklisted title'); assert.ok(elem.$merge, '$merge should be present'); }); }); diff --git a/test/features/most-read/page-filtering.js b/test/features/most-read/page-filtering.js index 5ee343c..3ba5d84 100644 --- a/test/features/most-read/page-filtering.js +++ b/test/features/most-read/page-filtering.js @@ -4,8 +4,7 @@ const path = require('path'); const parse = require('csv-parse/lib/sync'); const assert = require('../../utils/assert'); -const filterSpecialPages = require('../../../lib/feed/filter-special'); -const mostRead = require('../../../lib/feed/most-read'); +const filter = require('../../../lib/feed/filter'); const combinedMostRead = 
require('./all-access-top-50').items[0].articles; const desktopMostRead = require('./desktop-top-100').items[0].articles; @@ -17,12 +16,12 @@ describe('page filtering', () => { it('main page filtering RegExp should handle all main page title chars', () => { mainPageTitles.forEach((title) => { - assert.ok(filterSpecialPages(articles, title)); + assert.ok(filter.filterSpecialPages(articles, title)); }); }); it('Page with skewed pageview distribution by platform should be omitted from results', () => { - mostRead.filterBotTraffic(combinedMostRead, desktopMostRead).forEach((entry) => { + filter.filterBotTraffic(combinedMostRead, desktopMostRead).forEach((entry) => { assert.ok(entry.article !== "AMGTV"); }); }); -- To view, visit https://gerrit.wikimedia.org/r/326544 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia82bc5d98764d6609b3e1455b64672475f5b8720 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: Mholloway <mhollo...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits