OliverKeyes has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/181049

Change subject: [WIP] UDF for identifying if a request meets the legacy 
pageview definition.
......................................................................

[WIP] UDF for identifying if a request meets the legacy pageview definition.

This is very much a work in progress, by which I mean "the most you can say
about it is that it compiles". It still needs proper test integration
and an actual UDF. It would be enhanced by someone working out
how we go about integrating the test datasets in a non-painful way
(that may be dependent on Otto's branch getting merged).

Change-Id: Iaf376e702806d664332d87a69cc34cb7e4c9f095
---
M refinery-core/pom.xml
A 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/LegacyPageview.java
A refinery-core/src/test/java/org/wikimedia/mediawiki/TestLegacyPageview.java
A refinery-core/src/test/resources/legacy_pageview_test_data.csv
4 files changed, 188 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/49/181049/1

diff --git a/refinery-core/pom.xml b/refinery-core/pom.xml
index e69de29..d81f982 100644
--- a/refinery-core/pom.xml
+++ b/refinery-core/pom.xml
@@ -0,0 +1,67 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+      <groupId>org.wikimedia.analytics.refinery</groupId>
+      <artifactId>refinery</artifactId>
+      <version>0.0.3-SNAPSHOT</version>
+  </parent>
+
+  <groupId>org.wikimedia.analytics.refinery.core</groupId>
+  <artifactId>refinery-core</artifactId>
+  <name>Wikimedia Analytics Refinery Core</name>
+  <packaging>jar</packaging>
+
+  <dependencies>
+      <dependency>
+         <groupId>org.apache.hadoop</groupId>
+         <artifactId>hadoop-common</artifactId>
+      </dependency>
+
+      <dependency>
+          <groupId>org.apache.hadoop</groupId>
+          <artifactId>hadoop-client</artifactId>
+          <version>2.3.0-cdh5.0.2</version>
+      </dependency>
+
+      <dependency>
+          <groupId>junit</groupId>
+          <artifactId>junit</artifactId>
+          <version>4.11</version>
+          <scope>test</scope>
+      </dependency>
+
+      <dependency>
+          <groupId>pl.pragmatists</groupId>
+          <artifactId>JUnitParams</artifactId>
+          <version>1.0.3</version>
+          <scope>test</scope>
+      </dependency>
+
+  </dependencies>
+
+  <build>
+      <plugins>
+          <plugin>
+              <groupId>org.apache.maven.plugins</groupId>
+              <artifactId>maven-shade-plugin</artifactId>
+              <version>2.0</version>
+              <configuration>
+                  <shadedArtifactAttached>false</shadedArtifactAttached>
+              </configuration>
+              <executions>
+                  <execution>
+                      <phase>package</phase>
+                      <goals>
+                          <goal>shade</goal>
+                      </goals>
+                      <configuration>
+                          
<createDependencyReducedPom>false</createDependencyReducedPom>
+                      </configuration>
+                  </execution>
+              </executions>
+          </plugin>
+      </plugins>
+  </build>
+</project>
diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/LegacyPageview.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/LegacyPageview.java
new file mode 100644
index 0000000..ad93a2e
--- /dev/null
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/LegacyPageview.java
@@ -0,0 +1,81 @@
+// Copyright 2014 Wikimedia Foundation
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package org.wikimedia.analytics.refinery.core;
+
+import java.util.regex.Pattern;
+import java.util.HashSet;
+import java.util.Arrays;
+
/**
 * Static functions to work with Wikimedia webrequest data.
 * This class was originally created while reading
 * https://gist.github.com/Ironholds/96558613fe38dd4d1961
 */
public class LegacyPageview {

    /**
     * Suffix pattern for hosts of recognised projects (e.g. {@code en.wikipedia.org}).
     * It is a suffix pattern, so it must be applied with {@code find()}, not {@code matches()}.
     */
    private static final Pattern acceptedUriHostsPattern = Pattern.compile(
        "\\.(wik(ipedia|ibooks|tionary|imediafoundation|inews|iquote|isource|iversity|ivoyage|idata)|mediawiki)\\.org$"
    );

    /**
     * Whole-host pattern for the accepted *.wikimedia.org projects, optionally in
     * their mobile ({@code .m}) form, e.g. {@code commons.m.wikimedia.org}.
     * Applied with {@code matches()} so that the entire host has to match.
     */
    private static final Pattern acceptedMetaUriHostsPattern = Pattern.compile(
        "(commons|meta|incubator|species|strategy|outreach|usability|quality)(\\.m)?\\.wikimedia\\.org$"
    );

    /** Path prefix every pageview must have. */
    private static final String acceptedUriPaths = "/wiki/";

    /** Path prefix that is never a pageview, even though it lives under /wiki/. */
    private static final String rejectedUriPaths = "/wiki/Special:CentralAutoLogin/";

    /** Exact paths that are never pageviews. */
    private static final HashSet<String> rejectedUriPathPages = new HashSet<String>(Arrays.asList(
        "/wiki/undefined",
        "/wiki/Undefined"
    ));

    /** HTTP status codes (redirects) that are never pageviews. */
    private static final HashSet<String> rejectedStatusCodes = new HashSet<String>(Arrays.asList(
        "301",
        "302",
        "303"
    ));

    /**
     * Source IP ranges whose requests are rejected unless an X-Forwarded-For
     * value is present.
     * NOTE(review): presumably Wikimedia's own infrastructure ranges - confirm.
     */
    private static final Pattern rejectedIPPattern = Pattern.compile(
        "^(10\\.20\\.0|10\\.64\\.0|10\\.128\\.0|10\\.64\\.32|208\\.80\\.15[2-5]|91\\.198\\.174)\\..+"
    );

    /**
     * Given a webrequest ip, x_forwarded_for, uri_host, uri_path, and http_status,
     * returns true if we consider this a legacy 'pageview', false otherwise.
     *
     * A request is a legacy pageview when all of the following hold:
     * <ul>
     *   <li>http_status is not one of 301, 302, 303;</li>
     *   <li>the request is to a "recognised" project host;</li>
     *   <li>the path starts with /wiki/;</li>
     *   <li>the path is not under /wiki/Special:CentralAutoLogin/;</li>
     *   <li>the page is not /wiki/undefined or /wiki/Undefined;</li>
     *   <li>the source IP is not in a rejected range, or it is but the
     *       X-Forwarded-For field is not "-".</li>
     * </ul>
     *
     * See: https://meta.wikimedia.org/wiki/Research:Page_view/Generalised_filters
     *      for information on how to classify a pageview.
     *
     * @param ipAddress  source IP of the request
     * @param xForwarded X-Forwarded-For value; "-" means empty, and null is treated like "-"
     * @param uriHost    requested host, e.g. {@code en.wikipedia.org}
     * @param uriPath    requested path, e.g. {@code /wiki/Horseshoe_crab}
     * @param httpStatus HTTP status code as a string, e.g. {@code "200"}
     * @return true if the request is a legacy pageview; false otherwise, including
     *         when ipAddress, uriHost, uriPath or httpStatus is null
     */
    public static boolean isPageview(String ipAddress, String xForwarded,
                                     String uriHost, String uriPath, String httpStatus) {
        // A request with missing fields cannot be classified; do not count it.
        if (ipAddress == null || uriHost == null || uriPath == null || httpStatus == null) {
            return false;
        }

        if (rejectedStatusCodes.contains(httpStatus)) {
            return false;
        }

        // The project pattern is a suffix match (find); the meta pattern covers the whole host.
        boolean acceptedHost = acceptedUriHostsPattern.matcher(uriHost).find()
            || acceptedMetaUriHostsPattern.matcher(uriHost).matches();
        if (!acceptedHost) {
            return false;
        }

        // startsWith is safe for paths shorter than the prefix, unlike substring.
        if (!uriPath.startsWith(acceptedUriPaths)
                || uriPath.startsWith(rejectedUriPaths)
                || rejectedUriPathPages.contains(uriPath)) {
            return false;
        }

        // Requests from a rejected IP range only count when they carry an XFF value.
        boolean xffEmpty = xForwarded == null || xForwarded.equals("-");
        return !(rejectedIPPattern.matcher(ipAddress).matches() && xffEmpty);
    }
}
\ No newline at end of file
diff --git 
a/refinery-core/src/test/java/org/wikimedia/mediawiki/TestLegacyPageview.java 
b/refinery-core/src/test/java/org/wikimedia/mediawiki/TestLegacyPageview.java
new file mode 100644
index 0000000..19a089f
--- /dev/null
+++ 
b/refinery-core/src/test/java/org/wikimedia/mediawiki/TestLegacyPageview.java
@@ -0,0 +1,34 @@
import static org.junit.Assert.assertEquals;

import junitparams.FileParameters;
import junitparams.JUnitParamsRunner;
import junitparams.mappers.CsvWithHeaderMapper;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.wikimedia.analytics.refinery.core.LegacyPageview;
+
+@RunWith(JUnitParamsRunner.class)
+public class TestLegacyPageview {
+
+
+    @Test
+    @FileParameters(
+        value = "src/test/resources/legacy_pageview_test_data.csv",
+        mapper = CsvWithHeaderMapper.class
+    )
+    public void testIsPageview(
+        String test_description,
+        boolean is_pageview,
+        String uri_host, 
+        String uri_path, 
+        String http_status, 
+        String ip_address, 
+        String x_forwarded_for
+    ) {
+        assertEquals(
+            test_description,
+            is_pageview,
+            Pageview.isPageview(
+                ip_address,
+                x_forwarded_for,
+                uri_host,
+                uri_path,
+                http_status
+            )
+        );
+    }
+}
\ No newline at end of file
diff --git a/refinery-core/src/test/resources/legacy_pageview_test_data.csv 
b/refinery-core/src/test/resources/legacy_pageview_test_data.csv
new file mode 100644
index 0000000..ae8b8d3
--- /dev/null
+++ b/refinery-core/src/test/resources/legacy_pageview_test_data.csv
@@ -0,0 +1,6 @@
+test_description, is_pageview, uri_host, uri_path, 
http_status,ip_address,x_forwarded_for
+Is Pageview - Desktop, true, en.wikipedia.org, 
/wiki/Horseshoe_crab,200,174.62.175.82,-
+Is Not Pageview – not /wiki/, false, en.wikipedia.org, 
/w/api.php,200,174.62.175.82,-
+Is Not Pageview - http_status is 301, false, en.wikipedia.org, 
/wiki/Noppperrrrs,301,174.62.175.82,-
+Is Not Pageview – IP excluded, false, en.wikipedia.org, 
/wiki/Noppperrrrs,200,10.20.0.5,-
+"Is Pageview – IP excluded, but XFF is not -", false, en.wikipedia.org, 
/w/api.php,200,10.20.0.5,turnip

-- 
To view, visit https://gerrit.wikimedia.org/r/181049
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iaf376e702806d664332d87a69cc34cb7e4c9f095
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery/source
Gerrit-Branch: master
Gerrit-Owner: OliverKeyes <oke...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to