QChris has uploaded a new change for review. https://gerrit.wikimedia.org/r/117321
Change subject: Since mobile jobs have been turned off, remove now unneeded pig scripts ...................................................................... Since mobile jobs have been turned off, remove now unneeded pig scripts The mobile jobs output seems to be unused [1] and have hence been turned off. We remove the corresponding (now unused) pig scripts, as they depend on dclass, and would be in the way for the upcoming removal of dclass. [1] http://lists.wikimedia.org/pipermail/analytics/2014-March/001662.html Change-Id: I13caedc40b43d01d005c299bd9e3088c1ba603c4 --- D pig/mobile_device_props.pig D pig/mobile_platform.pig 2 files changed, 0 insertions(+), 152 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/kraken refs/changes/21/117321/1 diff --git a/pig/mobile_device_props.pig b/pig/mobile_device_props.pig deleted file mode 100644 index 8ecaa01..0000000 --- a/pig/mobile_device_props.pig +++ /dev/null @@ -1,60 +0,0 @@ -REGISTER 'geoip-1.2.9-patch-2-SNAPSHOT.jar' -REGISTER 'kraken-generic-0.0.2-SNAPSHOT-jar-with-dependencies.jar' -REGISTER 'kraken-dclass-0.0.2-SNAPSHOT.jar' -REGISTER 'kraken-pig-0.0.2-SNAPSHOT.jar' - --- Script Parameters: pass via -p param_name=param_value, ex: -p date_bucket_regex=2013-03-24_00 -%default date_bucket_format 'yyyy-MM-dd_HH'; -- Format applied to timestamps for aggregation into buckets. Default: hourly. -%default date_bucket_regex '.*'; -- Regex used to filter the formatted date_buckets; must match whole line. Default: no filtering. 
- -DEFINE DATE_BUCKET org.wikimedia.analytics.kraken.pig.ConvertDateFormat('yyyy-MM-dd\'T\'HH:mm:ss', '$date_bucket_format'); -DEFINE DCLASS org.wikimedia.analytics.kraken.pig.UserAgentClassifier(); -DEFINE IS_PAGEVIEW org.wikimedia.analytics.kraken.pig.PageViewFilterFunc(); - -IMPORT 'include/load_webrequest.pig'; -- See include/load_webrequest.pig -log_fields = LOAD_WEBREQUEST('$input'); - -log_fields = FILTER log_fields - BY ( (DATE_BUCKET(timestamp) MATCHES '$date_bucket_regex') - AND IS_PAGEVIEW(uri, referer, user_agent, http_status, remote_addr, content_type, request_method) - ); - -device_info_with_non_wmf = FOREACH log_fields - GENERATE - DATE_BUCKET(timestamp) AS date_bucket:chararray, - FLATTEN(DCLASS(user_agent)) AS ( - vendor:chararray, - model:chararray, - device_os:chararray, - device_os_version:chararray, - device_class:chararray, - browser:chararray, - browser_version:chararray, - wmf_mobile_app:chararray, - has_javascript:boolean, - display_dimensions:chararray, - input_device:chararray, - non_wmf_mobile_app:chararray - ); - -device_info = FOREACH device_info_with_non_wmf - GENERATE - date_bucket, - vendor, - model, - device_os, - device_os_version, - device_class, - browser, - browser_version, - wmf_mobile_app, - has_javascript, - display_dimensions, - input_device - ; - -device_info_count = FOREACH (GROUP device_info BY (date_bucket, device_class, device_os)) - GENERATE FLATTEN($0), COUNT($1) AS num:int; -device_info_count = ORDER device_info_count BY date_bucket, device_class, device_os; - -STORE device_info_count INTO '$output' USING PigStorage(); diff --git a/pig/mobile_platform.pig b/pig/mobile_platform.pig deleted file mode 100644 index e90047c..0000000 --- a/pig/mobile_platform.pig +++ /dev/null @@ -1,92 +0,0 @@ -REGISTER 'kraken-generic-0.0.2-SNAPSHOT-jar-with-dependencies.jar' -REGISTER 'kraken-dclass-0.0.2-SNAPSHOT.jar' -REGISTER 'kraken-pig-0.0.2-SNAPSHOT.jar' - --- Script Parameters: pass via -p param_name=param_value, ex: -p 
date_bucket_regex=2013-03-24 -%default date_bucket_format 'yyyy-MM-dd'; -- Format applied to timestamps for aggregation into buckets. Default: Daily. -%default date_bucket_regex '.*'; -- Regex used to filter the formatted date_buckets; must match whole line. Default: no filtering. - -DEFINE DATE_BUCKET org.wikimedia.analytics.kraken.pig.ConvertDateFormat('yyyy-MM-dd\'T\'HH:mm:ss', '$date_bucket_format'); -/* For testing: -DEFINE DATE_BUCKET org.wikimedia.analytics.kraken.pig.ConvertDateFormat('yyyy-MM-dd\'T\'HH:mm:ss', 'yyyy-MM-dd'); -*/ -DEFINE DCLASS org.wikimedia.analytics.kraken.pig.UserAgentClassifier(); -DEFINE IS_PAGEVIEW org.wikimedia.analytics.kraken.pig.PageViewFilterFunc(); - -IMPORT 'include/load_webrequest.pig'; - -/* For testing: -************* Unsampled ************* -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-01-31_*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-01-31_22.30*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-03-22_16.30*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-03-25_22.30*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-03-25_16.30*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-04-01_16.45*,hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-04-01_17.*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-wikipedia-mobile/dt=2013-04-01*'); - -************* Sampled *************** -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-all-sampled-1000/dt=2013-04-15_12*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-all-sampled-1000/dt=2013-04-15*'); -log_fields = LOAD_WEBREQUEST('hdfs:///wmf/raw/webrequest/webrequest-all-sampled-1000/dt=2013-04*'); - -************* 
Local ***************** -log_fields = LOAD_WEBREQUEST('pig.sample.webrequest.wikipedia.mobile*'); -*/ -log_fields = LOAD_WEBREQUEST('$input'); - -log_fields = FOREACH log_fields - GENERATE - DATE_BUCKET(timestamp) as date_bucket, - FLATTEN(STRSPLIT(remote_addr,'\\|')) as (ip_addr:chararray, country_code:chararray), - uri, referer, user_agent, http_status, content_type, request_method - ; - -/* For testing: -matching_log_fields = FILTER log_fields BY ( - (date_bucket MATCHES '.*') - AND IS_PAGEVIEW(uri, referer, user_agent, http_status, ip_addr, content_type, request_method) -); -NOTE: the ip_addr field is anonymized. During anonymization, internal IP addresses can be made external and vice versa. -TODO: investigate more carefully whether the anonymization coupled with "IS_PAGEVIEW" changes the results significantly -*/ -matching_log_fields = FILTER log_fields BY ( - (date_bucket MATCHES '$date_bucket_regex') - AND IS_PAGEVIEW(uri, referer, user_agent, http_status, ip_addr, content_type, request_method) -); - -platform_info = FOREACH matching_log_fields - GENERATE - date_bucket, - FLATTEN(DCLASS(REPLACE(user_agent, '%20', ' '))) AS ( - vendor:chararray, - model:chararray, - device_os:chararray, - device_os_version:chararray, - device_class:chararray, - browser:chararray, - browser_version:chararray, - wmf_mobile_app:chararray, - has_javascript:boolean, - display_dimensions:chararray, - input_device:chararray, - non_wmf_mobile_app:chararray - ) - ; -official_platform_info = FILTER platform_info BY wmf_mobile_app is not null; -official_platform_info = FOREACH official_platform_info GENERATE date_bucket, wmf_mobile_app; - -official_platform_info_group = GROUP official_platform_info BY (date_bucket, wmf_mobile_app); -official_platform_info_count = FOREACH official_platform_info_group GENERATE FLATTEN(group), COUNT(official_platform_info) * 1000; -ordered_official_platform_info_count = ORDER official_platform_info_count BY date_bucket, wmf_mobile_app; - -STORE 
ordered_official_platform_info_count INTO '$output/official' USING PigStorage(); - -unofficial_platform_info = FILTER platform_info BY non_wmf_mobile_app is not null; -unofficial_platform_info = FOREACH unofficial_platform_info GENERATE date_bucket, non_wmf_mobile_app; - -unofficial_platform_info_group = GROUP unofficial_platform_info BY (date_bucket, non_wmf_mobile_app); -unofficial_platform_info_count = FOREACH unofficial_platform_info_group GENERATE FLATTEN(group), COUNT(unofficial_platform_info) * 1000; -ordered_unofficial_platform_info_count = ORDER unofficial_platform_info_count BY date_bucket, non_wmf_mobile_app; - -STORE ordered_unofficial_platform_info_count INTO '$output/unofficial' USING PigStorage(); -- To view, visit https://gerrit.wikimedia.org/r/117321 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I13caedc40b43d01d005c299bd9e3088c1ba603c4 Gerrit-PatchSet: 1 Gerrit-Project: analytics/kraken Gerrit-Branch: master Gerrit-Owner: QChris <christ...@quelltextlich.at> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits