Joal has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/237392

Change subject: Update bot filtering for webrequests.
......................................................................

Update bot filtering for webrequests.

Rename is_crawler to isSpider to be more coherent with data tagging.
Update the spider-matching function with a better regexp that excludes WikimediaBot.
Add a function matching WikimediaBot and associated UDF.
Update and add tests.

Change-Id: I3b468050b613c1e97d87b782cbfd90c9fdc433b8
---
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
D refinery-core/src/test/resources/isCrawler_test_data.csv
A refinery-core/src/test/resources/isSpider_test_data.csv
M 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java
10 files changed, 315 insertions(+), 22 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/92/237392/1

diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
index 9ce2d4d..9fc229c 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/Webrequest.java
@@ -49,12 +49,17 @@
     public static final String REFERER_EXTERNAL = "external";
 
     /*
-     * Now back to the good part.
-     * Wikimedia-specific crawlers
+     * Spiders identification pattern (obvisouly not perfect...) 
      */
-    private static final Pattern crawlerPattern = Pattern.compile(
-        "(goo wikipedia|MediaWikiCrawler-Google|wikiwix-bot).*"
-    );
+    private static final Pattern spiderPattern = Pattern.compile("(?i)^(" +
+                    
".*(bot|spider|WordPress|AppEngine|AppleDictionaryService|Python-urllib|python-requests|Google-HTTP-Java-Client|[Ff]acebook|[Yy]ahoo|RockPeaks).*"
 +
+                    "|(goo 
wikipedia|MediaWikiCrawler-Google|wikiwix-bot|Java/1\\.|curl|PHP/).*" +
+                    "|-|)$");
+
+    /*
+     * WikimediaBot identification pattern 
+     */
+    private static final Pattern wikimediaBotPattern = 
Pattern.compile("\\bWikimediaBot\\b");
 
     /**
      * Pattern for automatically-added subdomains that indicate zero,
@@ -74,13 +79,31 @@
     );
 
     /**
-     * Identify Wikimedia-specific crawlers; returns TRUE
-     * if the user agent matches a known crawler.
+     * Identify a bunch of spiders; returns TRUE
+     * if the user agent matches a known spider and doesn't
+     * match the WikimediaBot convention.
      * @param    userAgent    the user agent associated with the request.
      * @return   boolean
      */
+    public boolean isSpider(String userAgent) {
+        return spiderPattern.matcher(userAgent).find() && ! 
wikimediaBotPattern.matcher(userAgent).find();
+    }
+    /**
+     * Kept for backward compatibility.
+     */
+    @Deprecated
     public boolean isCrawler(String userAgent) {
-        return crawlerPattern.matcher(userAgent).find();
+        return isSpider(userAgent);
+    }
+
+    /**
+     * Identify WikimediaBot; returns TRUE
+     * if the user agent matches the WikimediaBot convention.
+     * @param    userAgent    the user agent associated with the request.
+     * @return   boolean
+     */
+    public boolean isWikimediaBot(String userAgent) {
+        return wikimediaBotPattern.matcher(userAgent).find();
     }
 
     /**
diff --git 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
index cad8863..07601e0 100644
--- 
a/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
+++ 
b/refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestWebrequest.java
@@ -13,22 +13,46 @@
 @RunWith(JUnitParamsRunner.class)
 public class TestWebrequest {
 
+    @Deprecated
     @Test
     @FileParameters(
-        value = "src/test/resources/isCrawler_test_data.csv",
+            value = "src/test/resources/isSpider_test_data.csv",
+            mapper = CsvWithHeaderMapper.class
+    )
+
+    public void testIsCrawler(
+            String test_description,
+            boolean is_crawler,
+            boolean is_WikimediaBot,
+            String user_agent
+    ) {
+        Webrequest webrequest_inst = Webrequest.getInstance();
+        assertEquals(
+                test_description,
+                is_crawler,
+                webrequest_inst.isCrawler(
+                        user_agent
+                )
+        );
+    }
+
+    @Test
+    @FileParameters(
+        value = "src/test/resources/isSpider_test_data.csv",
         mapper = CsvWithHeaderMapper.class
     )
 
-    public void testisCrawler(
+    public void testIsSpider(
         String test_description,
-        boolean is_crawler,
+        boolean isSpider,
+        boolean isWikimediaBot,
         String user_agent
     ) {
         Webrequest webrequest_inst = Webrequest.getInstance();
         assertEquals(
             test_description,
-            is_crawler,
-            webrequest_inst.isCrawler(
+            isSpider,
+            webrequest_inst.isSpider(
                 user_agent
             )
         );
@@ -36,6 +60,28 @@
 
     @Test
     @FileParameters(
+            value = "src/test/resources/isSpider_test_data.csv",
+            mapper = CsvWithHeaderMapper.class
+    )
+
+    public void testIsWikimediabot(
+            String test_description,
+            boolean isSpider,
+            boolean isWikimediaBot,
+            String user_agent
+    ) {
+        Webrequest webrequest_inst = Webrequest.getInstance();
+        assertEquals(
+                test_description,
+                isWikimediaBot,
+                webrequest_inst.isWikimediaBot(
+                        user_agent
+                )
+        );
+    }
+
+    @Test
+    @FileParameters(
         value = "src/test/resources/x_analytics_test_data.csv",
         mapper = CsvWithHeaderMapper.class
     )
diff --git a/refinery-core/src/test/resources/isCrawler_test_data.csv 
b/refinery-core/src/test/resources/isCrawler_test_data.csv
deleted file mode 100644
index d0cb88a..0000000
--- a/refinery-core/src/test/resources/isCrawler_test_data.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-test_description, is_crawler,user_agent
-Is crawler - Google, true,MediaWikiCrawler-Google/2.0 
(+wikidata-exter...@google.com)
-Is crawler – goo.ne.jp, true,goo wikipedia (http://help.goo.ne.jp/contact/)
-Is crawler – wikiwix, true,wikiwix-bot-3.0
-Is Not Pageview - http_status != 200, false,Mozilla/5.0 (Windows NT 6.1; 
Trident/7.0; rv:11.0) like Gecko
-Is Not Pageview - content_type does not match, false,Mozilla/5.0 (iPhone; CPU 
iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) 
Version/7.0 Mobile/11D257 Safari/9537.53
-Is Not Pageview - API stupidity: it outputs a 200 status code and text/html as 
a MIME type on certain class, false,Opera/9.80 (Android; Opera 
Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10
diff --git a/refinery-core/src/test/resources/isSpider_test_data.csv 
b/refinery-core/src/test/resources/isSpider_test_data.csv
new file mode 100644
index 0000000..d5323b1
--- /dev/null
+++ b/refinery-core/src/test/resources/isSpider_test_data.csv
@@ -0,0 +1,59 @@
+test_description, isSpider, isWikimediaBot, user_agent
+is spider - Google, true,false,MediaWikiCrawler-Google/2.0 
(+wikidata-exter...@google.com)
+is spider – goo.ne.jp, true,false,goo wikipedia 
(http://help.goo.ne.jp/contact/)
+is spider - bin bot, true, false,Mozilla/5.0 (compatible; bingbot/2.0; 
+http://www.bing.com/bingbot.htm)
+is spider - dash, true, false,-
+is spider - google bot, true, false,Mozilla/5.0 (compatible; Googlebot/2.1; 
+http://www.google.com/bot.html)
+is spider - yahoo bot, true, false,Mozilla/5.0 (compatible; Yahoo! Slurp; 
http://help.yahoo.com/help/us/ysearch/slurp)
+is spider - peachy bot, true, false,Peachy MediaWiki Bot API Version 2.0 
(alpha 8)
+is spider - google bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 
8_3 like Mac OS X) AppleWebKit/600.1.4 (KHTML; like Gecko) Version/8.0 
Mobile/12F70 Safari/600.1.4 (compatible; Googlebot/2.1; 
+http://www.google.com/bot.html)
+is spider - baidu bot, true, false,Mozilla/5.0 (compatible; Baiduspider/2.0; 
+http://www.baidu.com/search/spider.html)
+is spider - yandex bot, true, false,Mozilla/5.0 (compatible; YandexBot/3.0; 
+http://yandex.com/bots)
+is spider - wikiwix bot, true, false,wikiwix-bot-3.0
+is spider - java 8 unknown bot, true, false,Java/1.8.0_51
+is spider - bing bot safari, true, false,Mozilla/5.0 (iPhone; CPU iPhone OS 
7_0 like Mac OS X) AppleWebKit/537.51.1 (KHTML; like Gecko) Version/7.0 
Mobile/11A465 Safari/9537.53 (compatible; bingbot/2.0; 
http://www.bing.com/bingbot.htm)
+is spider - apple dictinnary bot, true, false,AppleDictionaryService/229
+is spider - php wikibot, true, false,php wikibot classes
+is spider - MS Search bot, true, false,Mozilla/4.0 (compatible; MSIE 4.01; 
Windows NT; MS Search 6.0 Robot)
+is spider - Python unknown bot, true, false,python-requests/2.7.0 
CPython/3.4.2 Linux/3.16.0-4-amd64
+is spider - searchmetrics bot, true, false,Mozilla/5.0 (compatible; 
SearchmetricsBot; http://www.searchmetrics.com/en/searchmetrics-bot/)
+is spider - facebook external hit, true, false,facebookexternalhit/1.1 
(+http://www.facebook.com/externalhit_uatext.php)
+is spider - apple dictinnary bot, true, false,AppleDictionaryService/229.1
+is spider - cliqzbot, true, false,Mozilla/5.0 (compatible; Cliqzbot/1.0 
+http://cliqz.com/company/cliqzbot)
+is spider - apple bot, true, false,Mozilla/5.0 (Macintosh; Intel Mac OS X 
10_10_1) AppleWebKit/600.2.5 (KHTML; like Gecko) Version/8.0.2 Safari/600.2.5 
(Applebot/0.1; +http://www.apple.com/go/applebot)
+is spider - java 8 unknown bot, true, false,Java/1.8.0_25
+is spider - DotNetWikiBot, true, false,DotNetWikiBot/2.101 (Microsoft Windows 
NT 6.2.9200.0; .NET CLR 4.0.30319.34209)
+is spider - Pywikibot, true, false,wymowa (commons:commons; User:Alkamid) 
Pywikibot/2.0b3 (g3) requests/2.7.0 Python/3.4.0.final.0
+is spider - msn media bot, true, false,msnbot-media/1.1 
(+http://search.msn.com/msnbot.htm)
+is spider - youdaobot, true, false,Mozilla/5.0 (compatible; YoudaoBot/1.0; 
http://www.youdao.com/help/webmaster/spider/; )
+is spider - java 8 unknown bot, true, false,Java/1.8.0_40
+is spider - java 6 unknown bot, true, false,Java/1.6.0_20
+is spider - java 8 unknown bot, true, false,Java/1.8.0_45
+is spider - Python unknown bot, true, false,Python-urllib/2.7
+is spider - java 7 unknown bot, true, false,Java/1.7.0_67
+is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; 
Mail.RU_Bot/Img/2.0; +http://go.mail.ru/help/robots)
+is spider - java 7 unknown bot, true, false,Java/1.7.0_79
+is spider - RBot, true, false,RBot/0.3 (under...@wolfhome.com)
+is spider - Pywikipediabot, true, false,pywikipedia-git-wdlabel.py/r581 
Pywikipediabot/1.0
+is spider - mail.ru_bot, true, false,Mozilla/5.0 (compatible; Linux x86_64; 
Mail.RU_Bot/2.0; +http://go.mail.ru/help/robots)
+is spider - sogou bot, true, false,Sogou web 
spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)
+is spider - java 7 unknown bot, true, false,Java/1.7.0_65
+is spider - taxonbot, true, false,TaxonBot@de.wikipedia <anima...@gmx.net> – 
MediaWiki Tcl Bot Framework 0.5
+is spider - apple dictionnary bot, true, false,AppleDictionaryService/208
+is spider - ClueBot, true, false,ClueBot/1.1
+is spider - Unknown bot, true, false,Mozilla/5.0 (MyMemory Bot 
http://mymemory.traslated.net/doc/)
+is spider - baidu image bot, true, 
false,Baiduspider-image+(+http://www.baidu.com/search/spider.htm)
+is spider - Pywikipediabot, true, false,pywikipedia-addzumra.py/rg11224 
Pywikipediabot/1.0 Unknown
+is spider - yeti bot, true, false,Mozilla/5.0 (compatible; Yeti/1.1; 
+http://help.naver.com/robots/)
+is spider - Pywikipediabot, true, false,pwb/rg3113 Pywikipediabot/2.0
+is spider - exabot, true, false,Mozilla/5.0 (compatible; Exabot/3.0; 
+http://www.exabot.com/go/robot)
+is spider - Python unknown bot, true, false,Python-urllib/2.6
+is spider - yacybot, true, false,yacybot (/global; amd64 Linux 
3.13.0-63-generic; java 1.7.0_80; Europe/en) http://yacy.net/bot.html
+is spider - Pywikibot, true, false,maj_articles_recents (wikipedia:fr; 
User:Z%C3%A9roBot) Pywikibot/2.0b3 (g4795) httplib2/0.7.2 Python/2.7.3.final.0
+is spider - Pywikibot, true, false,CategorieAutoriPer (wikisource:it; 
User:CandalBot) Pywikibot/2.0b3 (g5671) requests/2.0.0 Python/2.7.3.final.0
+is spider - curl bot, true, false,curl/7.35.0
+is spider - empty, true, false,
+Is Not spider - firefox, false,false,Mozilla/5.0 (Windows NT 6.1; Trident/7.0; 
rv:11.0) like Gecko
+Is Not spider - iphone, false,false,Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 
like Mac OS X) AppleWebKit/537.51.2 (KHTML like Gecko) Version/7.0 
Mobile/11D257 Safari/9537.53
+Is Not spider - opera, false,false,Opera/9.80 (Android; Opera 
Mini/7.6.35843/35.5858; U; en) Presto/2.8.119 Version/11.10
+Is Not spider - WikimediaBot, false,true,Whatever UA info containing 
WikimediaBot should match.
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java
index 51df7a4..6d13899 100644
--- 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsCrawlerUDF.java
@@ -23,13 +23,14 @@
  * A hive UDF to identify Wikimedia-specific crawlers,
  * which ua-parser misses (for obvious reasons)
  */
+@Deprecated
 public class IsCrawlerUDF extends UDF {
     public boolean evaluate(
         String user_agent
     ) {
         Webrequest webrequest_inst = Webrequest.getInstance();
         return webrequest_inst.isCrawler(
-             user_agent
+                user_agent
         );
     }
 }
\ No newline at end of file
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java
new file mode 100644
index 0000000..bb11963
--- /dev/null
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsSpiderUDF.java
@@ -0,0 +1,35 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.wikimedia.analytics.refinery.core.Webrequest;
+
+/**
+ * A hive UDF to identify spiders,
+ * which ua-parser misses (for obvious reasons)
+ */
+public class IsSpiderUDF extends UDF {
+    public boolean evaluate(
+        String user_agent
+    ) {
+        Webrequest webrequest_inst = Webrequest.getInstance();
+        return webrequest_inst.isSpider(
+                user_agent
+        );
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java
 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java
new file mode 100644
index 0000000..08c95b4
--- /dev/null
+++ 
b/refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/IsWikimediaBotUDF.java
@@ -0,0 +1,34 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.wikimedia.analytics.refinery.hive;
+
+import org.apache.hadoop.hive.ql.exec.UDF;
+import org.wikimedia.analytics.refinery.core.Webrequest;
+
+/**
+ * A hive UDF to identify WikimediaBot.
+ */
+public class IsWikimediaBotUDF extends UDF {
+    public boolean evaluate(
+        String user_agent
+    ) {
+        Webrequest webrequest_inst = Webrequest.getInstance();
+        return webrequest_inst.isWikimediaBot(
+                user_agent
+        );
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java
index 58effb3..aab9ed1 100644
--- 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsCrawlerUDF.java
@@ -23,17 +23,19 @@
 import junitparams.JUnitParamsRunner;
 import junitparams.mappers.CsvWithHeaderMapper;
 
+@Deprecated
 @RunWith(JUnitParamsRunner.class)
 public class TestIsCrawlerUDF {
 
     @Test
     @FileParameters(
-        value = "../refinery-core/src/test/resources/isCrawler_test_data.csv",
+        value = "../refinery-core/src/test/resources/isSpider_test_data.csv",
         mapper = CsvWithHeaderMapper.class
     )
     public void testIsCrawler(
         String test_description,
         boolean is_crawler,
+        boolean is_wikimediaBot,
         String user_agent
     ) {
         IsCrawlerUDF udf = new IsCrawlerUDF();
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java
new file mode 100644
index 0000000..1566e5f
--- /dev/null
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsSpiderUDF.java
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestIsSpiderUDF {
+
+    @Test
+    @FileParameters(
+        value = "../refinery-core/src/test/resources/isSpider_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testIsCrawler(
+        String test_description,
+        boolean isSpider,
+        boolean isWikimediaBot,
+        String user_agent
+    ) {
+        IsSpiderUDF udf = new IsSpiderUDF();
+
+        assertEquals(
+            test_description,
+            isSpider,
+            udf.evaluate(
+                user_agent
+            )
+        );
+    }
+}
\ No newline at end of file
diff --git 
a/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java
 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java
new file mode 100644
index 0000000..b5f47e5
--- /dev/null
+++ 
b/refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsWikimediaBotUDF.java
@@ -0,0 +1,50 @@
+/**
+ * Copyright (C) 2014  Wikimedia Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.wikimedia.analytics.refinery.hive;
+
+import junitparams.FileParameters;
+import junitparams.JUnitParamsRunner;
+import junitparams.mappers.CsvWithHeaderMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+
+import static org.junit.Assert.assertEquals;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestIsWikimediaBotUDF {
+
+    @Test
+    @FileParameters(
+        value = "../refinery-core/src/test/resources/isSpider_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testIsWikimediaBot(
+        String test_description,
+        boolean isSpider,
+        boolean isWikimediaBot,
+        String user_agent
+    ) {
+        IsWikimediaBotUDF udf = new IsWikimediaBotUDF();
+
+        assertEquals(
+            test_description,
+            isWikimediaBot,
+            udf.evaluate(
+                user_agent
+            )
+        );
+    }
+}
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/237392
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3b468050b613c1e97d87b782cbfd90c9fdc433b8
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: Joal <j...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to