Joal has uploaded a new change for review. https://gerrit.wikimedia.org/r/237392
Change subject: Update bot filtering for webrequests. ...................................................................... Update bot filtering for webrequests. Rename is_crawler to isSpider to be more coherent with data tagging. Update spiders matching function with better regexp and WikimediaBot removal. Add a function matching WikimediaBot and associated UDF. Update and add tests. Change-Id: I3b468050b613c1e97d87b782cbfd90c9fdc433b8 --- M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java D refinery-core/src/test/resources/isCrawler_test_data.csv A refinery-core/src/test/resources/isSpider_test_data.csv M refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java 10 files changed, 315 insertions(+), 22 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/92/237392/1 diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java index 9ce2d4d..9fc229c 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java @@ -49,12 +49,17 @@ public static final String REFERER_EXTERNAL = "external"; /* - * Now back to the good part. 
- * Wikimedia-specific crawlers + * Spiders identification pattern (obviously not perfect...) */ - private static final Pattern crawlerPattern = Pattern.compile( - "(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot).*" - ); + private static final Pattern spiderPattern = Pattern.compile("(?i)^(" + + ".*(bot|spider|WordPress|AppEngine|AppleDictionaryService|Python-urllib|python-requests|Google-HTTP-Java-Client|[Ff]acebook|[Yy]ahoo|RockPeaks).*" + + "|(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot|Java/1\\.|curl|PHP/).*" + + "|-|)$"); + + /* + * WikimediaBot identification pattern + */ + private static final Pattern wikimediaBotPattern = Pattern.compile("\\bWikimediaBot\\b"); /** * Pattern for automatically-added subdomains that indicate zero, @@ -74,13 +79,31 @@ ); /** - * Identify Wikimedia-specific crawlers; returns TRUE - * if the user agent matches a known crawler. + * Identify a bunch of spiders; returns TRUE + * if the user agent matches a known spider and doesn't + * match the WikimediaBot convention. * @param userAgent the user agent associated with the request. * @return boolean */ + public boolean isSpider(String userAgent) { + return spiderPattern.matcher(userAgent).find() && ! wikimediaBotPattern.matcher(userAgent).find(); + } + /** + * Kept for backward compatibility. + */ + @Deprecated public boolean isCrawler(String userAgent) { - return crawlerPattern.matcher(userAgent).find(); + return isSpider(userAgent); + } + + /** + * Identify WikimediaBot; returns TRUE + * if the user agent matches the WikimediaBot convention. + * @param userAgent the user agent associated with the request. 
+ * @return boolean + */ + public boolean isWikimediaBot(String userAgent) { + return wikimediaBotPattern.matcher(userAgent).find(); } /** diff --git a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java index cad8863..07601e0 100644 --- a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java +++ b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java @@ -13,22 +13,46 @@ @RunWith(JUnitParamsRunner.class) public class TestWebrequest { + @Deprecated @Test @FileParameters( - value = "src/test/resources/isCrawler_test_data.csv", + value = "src/test/resources/isSpider_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + + public void testIsCrawler( + String test_description, + boolean is_crawler, + boolean is_WikimediaBot, + String user_agent + ) { + Webrequest webrequest_inst = Webrequest.getInstance(); + assertEquals( + test_description, + is_crawler, + webrequest_inst.isCrawler( + user_agent + ) + ); + } + + @Test + @FileParameters( + value = "src/test/resources/isSpider_test_data.csv", mapper = CsvWithHeaderMapper.class ) - public void testisCrawler( + public void testIsSpider( String test_description, - boolean is_crawler, + boolean isSpider, + boolean isWikimediaBot, String user_agent ) { Webrequest webrequest_inst = Webrequest.getInstance(); assertEquals( test_description, - is_crawler, - webrequest_inst.isCrawler( + isSpider, + webrequest_inst.isSpider( user_agent ) ); @@ -36,6 +60,28 @@ @Test @FileParameters( + value = "src/test/resources/isSpider_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + + public void testIsWikimediabot( + String test_description, + boolean isSpider, + boolean isWikimediaBot, + String user_agent + ) { + Webrequest webrequest_inst = Webrequest.getInstance(); + assertEquals( + test_description, + isWikimediaBot, + 
webrequest_inst.isWikimediaBot( + user_agent + ) + ); + } + + @Test + @FileParameters( value = "src/test/resources/x_analytics_test_data.csv", mapper = CsvWithHeaderMapper.class ) diff --git a/refinery-core/src/test/resources/isCrawler_test_data.csv b/refinery-core/src/test/resources/isCrawler_test_data.csv deleted file mode 100644 index d0cb88a..0000000 --- a/refinery-core/src/test/resources/isCrawler_test_data.csv +++ /dev/null @@ -1,7 +0,0 @@ -test_description, is_crawler,user_agent -Is crawler - Google, true,MediaWikiCrawler-Google/2.0 (+wikidata-exter...@google.com) -Is crawler – goo.ne.jp, true,goo wikipedia (http://help.goo.ne.jp/contact/) -Is crawler – wikiwix, true,wikiwix-bot-3.0 -Is Not Pageview - http_status != 200, false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko -Is Not Pageview - content_type does not match, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 -Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as a MIME type on certain class, false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10 diff --git a/refinery-core/src/test/resources/isSpider_test_data.csv b/refinery-core/src/test/resources/isSpider_test_data.csv new file mode 100644 index 0000000..d5323b1 --- /dev/null +++ b/refinery-core/src/test/resources/isSpider_test_data.csv @@ -0,0 +1,59 @@ +test_description, isSpider, isWikimediaBot, user_agent +is spider - Google, true,false,MediaWikiCrawler-Google/2.0 (+wikidata-exter...@google.com) +is spider – goo.ne.jp, true,false,goo wikipedia (http://help.goo.ne.jp/contact/) +is spider - bin bot, true, false,Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) +is spider - dash, true, false,- +is spider - google bot, true, false,Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) +is spider - yahoo bot, true, false,Mozilla/5.0 
(compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp) +is spider - peachy bot, true, false,Peachy MediaWiki Bot API Version 2.0 (alpha 8) +is spider - google bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML; like Gecko) Version/8.0 Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; +http://www.google.com/bot.html) +is spider - baidu bot, true, false,Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html) +is spider - yandex bot, true, false,Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots) +is spider - wikiwix bot, true, false,wikiwix-bot-3.0 +is spider - java 8 unknown bot, true, false,Java/1.8.0_51 +is spider - bing bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML; like Gecko) Version/7.0 Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; http://www.bing.com/bingbot.htm) +is spider - apple dictinnary bot, true, false,AppleDictionaryService/229 +is spider - php wikibot, true, false,php wikibot classes +is spider - MS Search bot, true, false,Mozilla/4.0 (compatible; MSIE 4.01; Windows NT; MS Search 6.0 Robot) +is spider - Python unknown bot, true, false,python-requests/2.7.0 CPython/3.4.2 Linux/3.16.0-4-amd64 +is spider - searchmetrics bot, true, false,Mozilla/5.0 (compatible; SearchmetricsBot; http://www.searchmetrics.com/en/searchmetrics-bot/) +is spider - facebook external hit, true, false,facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php) +is spider - apple dictinnary bot, true, false,AppleDictionaryService/229.1 +is spider - cliqzbot, true, false,Mozilla/5.0 (compatible; Cliqzbot/1.0 +http://cliqz.com/company/cliqzbot) +is spider - apple bot, true, false,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.2.5 (KHTML; like Gecko) Version/8.0.2 Safari/600.2.5 (Applebot/0.1; +http://www.apple.com/go/applebot) +is spider - java 8 unknown bot, true, 
false,Java/1.8.0_25 +is spider - DotNetWikiBot, true, false,DotNetWikiBot/2.101 (Microsoft Windows NT 6.2.9200.0; .NET CLR 4.0.30319.34209) +is spider - Pywikibot, true, false,wymowa (commons:commons; User:Alkamid) Pywikibot/2.0b3 (g3) requests/2.7.0 Python/3.4.0.final.0 +is spider - msn media bot, true, false,msnbot-media/1.1 (+http://search.msn.com/msnbot.htm) +is spider - youdaobot, true, false,Mozilla/5.0 (compatible; YoudaoBot/1.0; http://www.youdao.com/help/webmaster/spider/; ) +is spider - java 8 unknown bot, true, false,Java/1.8.0_40 +is spider - java 6 unknown bot, true, false,Java/1.6.0_20 +is spider - java 8 unknown bot, true, false,Java/1.8.0_45 +is spider - Python unknown bot, true, false,Python-urllib/2.7 +is spider - java 7 unknown bot, true, false,Java/1.7.0_67 +is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/Img/2.0; +http://go.mail.ru/help/robots) +is spider - java 7 unknown bot, true, false,Java/1.7.0_79 +is spider - RBot, true, false,RBot/0.3 (under...@wolfhome.com) +is spider - Pywikipediabot, true, false,pywikipedia-git-wdlabel.py/r581 Pywikipediabot/1.0 +is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots) +is spider - sogou bot, true, false,Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07) +is spider - java 7 unknown bot, true, false,Java/1.7.0_65 +is spider - taxonbot, true, false,TaxonBot@de.wikipedia <anima...@gmx.net> – MediaWiki Tcl Bot Framework 0.5 +is spider - apple dictionnary bot, true, false,AppleDictionaryService/208 +is spider - ClueBot, true, false,ClueBot/1.1 +is spider - Unknown bot, true, false,Mozilla/5.0 (MyMemory Bot http://mymemory.traslated.net/doc/) +is spider - baidu image bot, true, false,Baiduspider-image+(+http://www.baidu.com/search/spider.htm) +is spider - Pywikipediabot, true, false,pywikipedia-addzumra.py/rg11224 Pywikipediabot/1.0 Unknown +is spider - yeti bot, true, 
false,Mozilla/5.0 (compatible; Yeti/1.1; +http://help.naver.com/robots/) +is spider - Pywikipediabot, true, false,pwb/rg3113 Pywikipediabot/2.0 +is spider - exabot, true, false,Mozilla/5.0 (compatible; Exabot/3.0; +http://www.exabot.com/go/robot) +is spider - Python unknown bot, true, false,Python-urllib/2.6 +is spider - yacybot, true, false,yacybot (/global; amd64 Linux 3.13.0-63-generic; java 1.7.0_80; Europe/en) http://yacy.net/bot.html +is spider - Pywikibot, true, false,maj_articles_recents (wikipedia:fr; User:Z%C3%A9roBot) Pywikibot/2.0b3 (g4795) httplib2/0.7.2 Python/2.7.3.final.0 +is spider - Pywikibot, true, false,CategorieAutoriPer (wikisource:it; User:CandalBot) Pywikibot/2.0b3 (g5671) requests/2.0.0 Python/2.7.3.final.0 +is spider - curl bot, true, false,curl/7.35.0 +is spider - empty, true, false, +Is Not spider - firefox, false,false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko +Is Not spider - iphone, false,false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 +Is Not spider - opera, false,false,Opera/9.80 (Android; Opera Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10 +Is Not spider - WikimediaBot, false,true,Whatever UA info containing WikimediaBot should match. 
diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java index 51df7a4..6d13899 100644 --- a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java @@ -23,13 +23,14 @@ * A hive UDF to identify Wikimedia-specific crawlers, * which ua-parser misses (for obvious reasons) */ +@Deprecated public class IsCrawlerUDF extends UDF { public boolean evaluate( String user_agent ) { Webrequest webrequest_inst = Webrequest.getInstance(); return webrequest_inst.isCrawler( - user_agent + user_agent ); } } \ No newline at end of file diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java new file mode 100644 index 0000000..bb11963 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java @@ -0,0 +1,35 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.UDF; +import org.wikimedia.analytics.refinery.core.Webrequest; + +/** + * A hive UDF to identify spiders, + * which ua-parser misses (for obvious reasons) + */ +public class IsSpiderUDF extends UDF { + public boolean evaluate( + String user_agent + ) { + Webrequest webrequest_inst = Webrequest.getInstance(); + return webrequest_inst.isSpider( + user_agent + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java new file mode 100644 index 0000000..08c95b4 --- /dev/null +++ b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java @@ -0,0 +1,34 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.wikimedia.analytics.refinery.hive; + +import org.apache.hadoop.hive.ql.exec.UDF; +import org.wikimedia.analytics.refinery.core.Webrequest; + +/** + * A hive UDF to identify WikimediaBot. 
+ */ +public class IsWikimediaBotUDF extends UDF { + public boolean evaluate( + String user_agent + ) { + Webrequest webrequest_inst = Webrequest.getInstance(); + return webrequest_inst.isWikimediaBot( + user_agent + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java index 58effb3..aab9ed1 100644 --- a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java @@ -23,17 +23,19 @@ import junitparams.JUnitParamsRunner; import junitparams.mappers.CsvWithHeaderMapper; +@Deprecated @RunWith(JUnitParamsRunner.class) public class TestIsCrawlerUDF { @Test @FileParameters( - value = "../refinery-core/src/test/resources/isCrawler_test_data.csv", + value = "../refinery-core/src/test/resources/isSpider_test_data.csv", mapper = CsvWithHeaderMapper.class ) public void testIsCrawler( String test_description, boolean is_crawler, + boolean is_wikimediaBot, String user_agent ) { IsCrawlerUDF udf = new IsCrawlerUDF(); diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java new file mode 100644 index 0000000..1566e5f --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java @@ -0,0 +1,50 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wikimedia.analytics.refinery.hive; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; +import org.junit.Test; +import org.junit.runner.RunWith; + +import static org.junit.Assert.assertEquals; + +@RunWith(JUnitParamsRunner.class) +public class TestIsSpiderUDF { + + @Test + @FileParameters( + value = "../refinery-core/src/test/resources/isSpider_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsCrawler( + String test_description, + boolean isSpider, + boolean isWikimediaBot, + String user_agent + ) { + IsSpiderUDF udf = new IsSpiderUDF(); + + assertEquals( + test_description, + isSpider, + udf.evaluate( + user_agent + ) + ); + } +} \ No newline at end of file diff --git a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java new file mode 100644 index 0000000..b5f47e5 --- /dev/null +++ b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java @@ -0,0 +1,50 @@ +/** + * Copyright (C) 2014 Wikimedia Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.wikimedia.analytics.refinery.hive; + +import junitparams.FileParameters; +import junitparams.JUnitParamsRunner; +import junitparams.mappers.CsvWithHeaderMapper; +import org.junit.Test; +import org.junit.runner.RunWith; + +import static org.junit.Assert.assertEquals; + +@RunWith(JUnitParamsRunner.class) +public class TestIsWikimediaBotUDF { + + @Test + @FileParameters( + value = "../refinery-core/src/test/resources/isSpider_test_data.csv", + mapper = CsvWithHeaderMapper.class + ) + public void testIsWikimediaBot( + String test_description, + boolean isSpider, + boolean isWikimediaBot, + String user_agent + ) { + IsWikimediaBotUDF udf = new IsWikimediaBotUDF(); + + assertEquals( + test_description, + isWikimediaBot, + udf.evaluate( + user_agent + ) + ); + } +} \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/237392 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3b468050b613c1e97d87b782cbfd90c9fdc433b8 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery/source Gerrit-Branch: master Gerrit-Owner: Joal <j...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits