[MediaWiki-commits] [Gerrit] Add get pageview_info udf and underlying functions - change (analytics...source)

2015-06-02 Thread Ottomata (Code Review)
Ottomata has submitted this change and it was merged.

Change subject: Add get pageview_info udf and underlying functions
..


Add get pageview_info udf and underlying functions

Update getProjectFromHost function in PageviewDefinition
Add getDialectFromPath and getPageTitleFromUri in PageviewDefinition
Modify test data to test new functions
Adapt existing test functions to modified test data

Change-Id: Ieed48b6c520c09e62d3bc05085e67f11ed3a96b7
---
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
M refinery-core/src/test/resources/pageview_test_data.csv
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetPageviewInfoUDF.java
D 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetPageviewInfoUDF.java
D 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
11 files changed, 578 insertions(+), 166 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
index 808c857..96f46e3 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
@@ -16,9 +16,9 @@
 
 package org.wikimedia.analytics.refinery.core;
 
-import java.util.regex.Pattern;
-import java.util.HashSet;
 import java.util.Arrays;
+import java.util.HashSet;
+import java.util.regex.Pattern;
 
 /**
  * Static functions to work with Wikimedia webrequest data.
@@ -105,6 +105,15 @@
 www,
 download
 ));
+
+/**
+ * Placeholder and default values for project, dialect and page title
+ */
+public static final String UNKNOWN_PROJECT_VALUE = -;
+public static final String UNKNOWN_DIALECT_VALUE = -;
+public static final String UNKNOWN_PAGE_TITLE_VALUE = -;
+public static final String DEFAULT_DIALECT_VALUE = default;
+
 
 /**
  * All API request uriPaths will contain this
@@ -212,7 +221,7 @@
  * @return The project identifier in format [xxx.] (en.wikipedia or 
wikisource for instance)
  */
 public String getProjectFromHost(String uriHost) {
-if (uriHost == null) return -;
+if (uriHost == null) return UNKNOWN_PROJECT_VALUE;
 String[] uri_parts = uriHost.toLowerCase().split(\\.);
 switch (uri_parts.length) {
 // case wikixxx.org
@@ -237,7 +246,181 @@
 else
 return uri_parts[0] + . + uri_parts[3];
 default:
-return -;
+return UNKNOWN_PROJECT_VALUE;
 }
 }
+
+/**
+ * Normalize uriPath to maximize dialect and page title extraction 
correctness
+ * Normalization extracts the path if uriPath is a complete URL, and 
collapses repeated slashes
+ *
+ * @param uriPath The url's path
+ * @return The normalized uriPath
+ */
+private String normalizeUriPath(String uriPath) {
+// Prevent null pointer exception
+String normPath = (uriPath == null) ?  : uriPath;
+
+// Special case where full url ends-up in uriPath
+// Extract path manually to prevent url encoding issues
+int idxpathBeginning = 0;
+if (normPath.startsWith(http))
+idxpathBeginning = normPath.indexOf(/,  9); // look for / 
after http(s)://
+
+int idxPathEnding = normPath.indexOf(?,  idxpathBeginning); // look 
for query ? after path
+idxPathEnding = (idxPathEnding  0) ? normPath.length() : 
idxPathEnding;
+
+normPath = normPath.substring(idxpathBeginning, idxPathEnding);
+
+
+// Collapse repeated slashes in uriPath into a single slash
+normPath = normPath.replaceAll(//+, /);
+normPath = normPath.trim();
+
+return normPath;
+}
+
+/**
+ * Identifies the dialect from a pageview uriPath
+ * NOTE: Provides correct result only if used with is_pageview = true
+ *
+ * @param uriPath The url's path
+ * @return The dialect name (if any)
+ */
+public String getDialectFromPath(String uriPath) {
+// Normalize uriPath
+String normPath = normalizeUriPath(uriPath);
+
+// In 

[MediaWiki-commits] [Gerrit] Add get pageview_info udf and underlying functions - change (analytics...source)

2015-05-28 Thread Joal (Code Review)
Joal has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/214349

Change subject: Add get pageview_info udf and underlying functions
..

Add get pageview_info udf and underlying functions

Update getProjectFromHost function in PageviewDefinition
Add getDialectFromPath and getArticleFromUri in PageviewDefinition
Modify test data to test new functions
Adapt existing test functions to modified test data

Change-Id: Ieed48b6c520c09e62d3bc05085e67f11ed3a96b7
---
M 
refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java
M 
refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java
M refinery-core/src/test/resources/pageview_test_data.csv
A 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetPageviewInfoUDF.java
D 
refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java
A 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetPageviewInfoUDF.java
D 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java
M 
refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java
11 files changed, 530 insertions(+), 164 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source 
refs/changes/49/214349/1

diff --git 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
index 808c857..e9df39c 100644
--- 
a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
+++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java
@@ -16,6 +16,8 @@
 
 package org.wikimedia.analytics.refinery.core;
 
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
 import java.util.regex.Pattern;
 import java.util.HashSet;
 import java.util.Arrays;
@@ -105,6 +107,12 @@
 www,
 download
 ));
+
+public static final String UNKNOWN_PROJECT_VALUE = -;
+public static final String UNKNOWN_DIALECT_VALUE = -;
+public static final String UNKNOWN_ARTICLE_VALUE = -;
+
+public static final String DEFAULT_DIALECT_VALUE = default;
 
 /**
  * All API request uriPaths will contain this
@@ -212,7 +220,7 @@
  * @return The project identifier in format [xxx.] (en.wikipedia or 
wikisource for instance)
  */
 public String getProjectFromHost(String uriHost) {
-if (uriHost == null) return -;
+if (uriHost == null) return UNKNOWN_PROJECT_VALUE;
 String[] uri_parts = uriHost.toLowerCase().split(\\.);
 switch (uri_parts.length) {
 // case wikixxx.org
@@ -237,7 +245,134 @@
 else
 return uri_parts[0] + . + uri_parts[3];
 default:
-return -;
+return UNKNOWN_PROJECT_VALUE;
 }
 }
+
+/**
+ * Identifies the dialect from a pageview uriPath
+ * NOTE: Provides correct result only if used with is_pageview = true
+ *
+ * @param uriPath The url's path
+ * @return The dialect name (if any)
+ */
+public String getDialectFromPath(String uriPath) {
+if (uriPath == null) return UNKNOWN_DIALECT_VALUE;
+
+// In case of api, unknown dialect
+if (uriPath.startsWith(/w/api.php))
+return UNKNOWN_DIALECT_VALUE;
+
+// Default wiki urls, default dialect
+if (uriPath.equals(/) || uriPath.equals(/wiki) || 
uriPath.equals(/w)
+|| uriPath.startsWith(/wiki/) || uriPath.startsWith(/w/))
+return DEFAULT_DIALECT_VALUE;
+
+// Special dialect case,
+// Extract dialect if it contains a -
+// or default dialect otherwise
+int startIdx = uriPath.indexOf(/);
+startIdx = (startIdx = 0)?(startIdx + 1):startIdx;
+int middleIdx = uriPath.indexOf(-, startIdx);
+int endIdx = uriPath.indexOf(/, startIdx);
+endIdx = (endIdx  0)?endIdx:(uriPath.length());
+if ((startIdx = 0)  (startIdx  endIdx)) {
+if ((middleIdx  0)  (middleIdx  endIdx))
+return uriPath.substring(startIdx, endIdx);
+else
+return DEFAULT_DIALECT_VALUE;
+}
+
+// extraction failed, unknown dialect
+return UNKNOWN_DIALECT_VALUE;
+
+}
+
+/**
+ * Extracts an article name from uriPath
+ * NOTE: Assumes that the page is not index.*
+ *
+ * @param path The url's path
+ *