[MediaWiki-commits] [Gerrit] Add get pageview_info udf and underlying functions - change (analytics...source)
Ottomata has submitted this change and it was merged. Change subject: Add get pageview_info udf and underlying functions .. Add get pageview_info udf and underlying functions Update getProjectFromHost function in PageviewDefinition Add getDialectFromPath and getPageTitleFromUri in PageviewDefinition Modify test data to test new functions Adapt existing test functions to modified test data Change-Id: Ieed48b6c520c09e62d3bc05085e67f11ed3a96b7 --- M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java M refinery-core/src/test/resources/pageview_test_data.csv A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetPageviewInfoUDF.java D refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetPageviewInfoUDF.java D refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java 11 files changed, 578 insertions(+), 166 deletions(-) Approvals: Ottomata: Verified; Looks good to me, approved diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java index 808c857..96f46e3 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java +++ b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java @@ -16,9 +16,9 @@ package 
org.wikimedia.analytics.refinery.core; -import java.util.regex.Pattern; -import java.util.HashSet; import java.util.Arrays; +import java.util.HashSet; +import java.util.regex.Pattern; /** * Static functions to work with Wikimedia webrequest data. @@ -105,6 +105,15 @@ www, download )); + +/** + * Static values for project, dialect and article + */ +public static final String UNKNOWN_PROJECT_VALUE = -; +public static final String UNKNOWN_DIALECT_VALUE = -; +public static final String UNKNOWN_PAGE_TITLE_VALUE = -; +public static final String DEFAULT_DIALECT_VALUE = default; + /** * All API request uriPaths will contain this @@ -212,7 +221,7 @@ * @return The project identifier in format [xxx.] (en.wikipedia or wikisource for instance) */ public String getProjectFromHost(String uriHost) { -if (uriHost == null) return -; +if (uriHost == null) return UNKNOWN_PROJECT_VALUE; String[] uri_parts = uriHost.toLowerCase().split(\\.); switch (uri_parts.length) { // case wikixxx.org @@ -237,7 +246,181 @@ else return uri_parts[0] + . + uri_parts[3]; default: -return -; +return UNKNOWN_PROJECT_VALUE; } } + +/** + * Normalize uriPath to maximize dialect and page title extraction correctness + * Normalization export path if uriPath is a complete URL, and removes double backslashes + * + * @param uriPath The url's path + * @return The normalized uriPath + */ +private String normalizeUriPath(String uriPath) { +// Prevent null pointer exception +String normPath = (uriPath == null) ? : uriPath; + +// Special case where full url ends-up in uriPath +// Extract path manually to prevent url encoding issues +int idxpathBeginning = 0; +if (normPath.startsWith(http)) +idxpathBeginning = normPath.indexOf(/, 9); // look for / after http(s):// + +int idxPathEnding = normPath.indexOf(?, idxpathBeginning); // look for query ? after path +idxPathEnding = (idxPathEnding 0) ? 
normPath.length() : idxPathEnding; + +normPath = normPath.substring(idxpathBeginning, idxPathEnding); + + +// Clean uriPath of double backslashes +normPath = normPath.replaceAll("//+", "/"); +normPath = normPath.trim(); + +return normPath; +} + +/** + * Identifies the dialect from a pageview uriPath + * NOTE: Provides correct result only if used with is_pageview = true + * + * @param uriPath The url's path + * @return The dialect name (if any) + */ +public String getDialectFromPath(String uriPath) { +// Normalize uriPath +String normPath = normalizeUriPath(uriPath); + +// In
[MediaWiki-commits] [Gerrit] Add get pageview_info udf and underlying functions - change (analytics...source)
Joal has uploaded a new change for review. https://gerrit.wikimedia.org/r/214349 Change subject: Add get pageview_info udf and underlying functions .. Add get pageview_info udf and underlying functions Update getProjectFromHost function in PageviewDefinition Add getDialectFromPath and getArticleFromUri in PageviewDefinition Modify test data to test new functions Adapt existing test functions to modified test data Change-Id: Ieed48b6c520c09e62d3bc05085e67f11ed3a96b7 --- M refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestLegacyPageviewDefinition.java M refinery-core/src/test/java/org/wikimedia/analytics/refinery/core/TestPageview.java M refinery-core/src/test/resources/pageview_test_data.csv A refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetPageviewInfoUDF.java D refinery-hive/src/main/java/org/wikimedia/analytics/refinery/hive/GetProjectUDF.java A refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetPageviewInfoUDF.java D refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestGetProjectUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsAppPageviewUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsLegacyPageviewUDF.java M refinery-hive/src/test/java/org/wikimedia/analytics/refinery/hive/TestIsPageviewUDF.java 11 files changed, 530 insertions(+), 164 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery/source refs/changes/49/214349/1 diff --git a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java index 808c857..e9df39c 100644 --- a/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java +++ 
b/refinery-core/src/main/java/org/wikimedia/analytics/refinery/core/PageviewDefinition.java @@ -16,6 +16,8 @@ package org.wikimedia.analytics.refinery.core; +import java.io.UnsupportedEncodingException; +import java.net.URLDecoder; import java.util.regex.Pattern; import java.util.HashSet; import java.util.Arrays; @@ -105,6 +107,12 @@ www, download )); + +public static final String UNKNOWN_PROJECT_VALUE = -; +public static final String UNKNOWN_DIALECT_VALUE = -; +public static final String UNKNOWN_ARTICLE_VALUE = -; + +public static final String DEFAULT_DIALECT_VALUE = default; /** * All API request uriPaths will contain this @@ -212,7 +220,7 @@ * @return The project identifier in format [xxx.] (en.wikipedia or wikisource for instance) */ public String getProjectFromHost(String uriHost) { -if (uriHost == null) return -; +if (uriHost == null) return UNKNOWN_PROJECT_VALUE; String[] uri_parts = uriHost.toLowerCase().split(\\.); switch (uri_parts.length) { // case wikixxx.org @@ -237,7 +245,134 @@ else return uri_parts[0] + . 
+ uri_parts[3]; default: -return "-"; +return UNKNOWN_PROJECT_VALUE; } } + +/** + * Identifies the dialect from a pageview uriPath + * NOTE: Provides correct result only if used with is_pageview = true + * + * @param uriPath The url's path + * @return The dialect name (if any) + */ +public String getDialectFromPath(String uriPath) { +if (uriPath == null) return UNKNOWN_DIALECT_VALUE; + +// In case of api, unknown dialect +if (uriPath.startsWith("/w/api.php")) +return UNKNOWN_DIALECT_VALUE; + +// Default wiki urls, default dialect +if (uriPath.equals("/") || uriPath.equals("/wiki") || uriPath.equals("/w") +|| uriPath.startsWith("/wiki/") || uriPath.startsWith("/w/")) +return DEFAULT_DIALECT_VALUE; + +// Special dialect case, +// Extract dialect if it contains a - +// or default dialect otherwise +int startIdx = uriPath.indexOf("/"); +startIdx = (startIdx >= 0)?(startIdx + 1):startIdx; +int middleIdx = uriPath.indexOf("-", startIdx); +int endIdx = uriPath.indexOf("/", startIdx); +endIdx = (endIdx > 0)?endIdx:(uriPath.length()); +if ((startIdx >= 0) && (startIdx < endIdx)) { +if ((middleIdx > 0) && (middleIdx < endIdx)) +return uriPath.substring(startIdx, endIdx); +else +return DEFAULT_DIALECT_VALUE; +} + +// extraction failed, unknown dialect +return UNKNOWN_DIALECT_VALUE; + +} + +/** + * Extracts an article name from uriPath + * NOTE: Assumes that the page is not index.* + * + * @param path The url's path + *