Mforns has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/331794 )
Change subject: Add banner impressions jobs ...................................................................... Add banner impressions jobs Bug: T155141 Change-Id: Iae85a39048596c71a8b4838a1b4ee60b9f6f6b7f --- A oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql A oozie/banner_impressions/druid/daily/load_banner_impressions.json 2 files changed, 155 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/94/331794/1 diff --git a/oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql b/oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql new file mode 100644 index 0000000..d430fe6 --- /dev/null +++ b/oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql @@ -0,0 +1,84 @@ + +SET hive.exec.compress.output=true; +SET mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec; +ADD JAR /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar; + + +DROP TABLE IF EXISTS andyrussg.tmp_banner_impressions; +CREATE EXTERNAL TABLE IF NOT EXISTS andyrussg.tmp_banner_impressions ( + `ts` string, + `campaign` string, + `banner` string, + `uselang` string, + `project` string, + `device` string, + `status_code` string, + `country` string, + `anonymous` boolean, + `bucket` string, + `region` string, + -- `geocoded_consistent` boolean, TODO + `sample_rate` float, + `request_count` bigint, + `normalized_impressions` bigint +) +ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' +STORED AS TEXTFILE +LOCATION '/user/andyrussg/banner_impressions'; + + +CREATE TEMPORARY MACRO uri_param_value(param_name string, col string) + parse_url(concat('http://bla.org/woo/', col), 'QUERY', param_name); + +INSERT OVERWRITE TABLE andyrussg.tmp_banner_impressions +SELECT + CONCAT( + LPAD(year, 4, '0'), '-', + LPAD(month, 2, '0'), '-', + LPAD(day, 2, '0'), 'T', + LPAD(hour, 2, '0'), ':00:00Z') AS ts, + uri_param_value('campaign', uri_query) AS campaign, + uri_param_value('banner', uri_query) AS banner, + uri_param_value('uselang', uri_query) AS uselang, + uri_param_value('project', uri_query) AS project, + uri_param_value('device', uri_query) AS device, + uri_param_value('statusCode', uri_query) AS status_code, + uri_param_value('country', uri_query) AS country, + IF (uri_param_value('anonymous', uri_query) = 'true', true, false) AS anonymous, + uri_param_value('bucket', uri_query) AS bucket, + geocoded_data['subdivision'] AS region, + -- geocoded_data['country_code'] = uri_param_value('country', uri_query) AS geocoded_consistent TODO + cast(uri_param_value('recordImpressionSampleRate', uri_query) AS float) AS sample_rate, + COUNT(*) AS request_count, + cast(COUNT(*) / cast(uri_param_value('recordImpressionSampleRate', uri_query) AS float) AS bigint) AS normalized_impressions +FROM + wmf.webrequest +WHERE + year=2016 + AND month=12 + AND day=18 + AND webrequest_source = 'text' + AND uri_path = '/beacon/impression' + AND agent_type = 'user' + AND uri_param_value('debug', uri_query) = 'false' + -- AND x_analytics_map['proxy'] IS NULL Note: add once added to pageviews +GROUP BY + CONCAT( + LPAD(year, 4, '0'), '-', + LPAD(month, 2, '0'), '-', + LPAD(day, 2, '0'), 'T', + LPAD(hour, 2, '0'), ':00:00Z'), + uri_param_value('campaign', uri_query), + uri_param_value('banner', uri_query), + uri_param_value('uselang', uri_query), + uri_param_value('project', uri_query), + uri_param_value('device', uri_query), + uri_param_value('statusCode', uri_query), + uri_param_value('country', uri_query), + IF (uri_param_value('anonymous', uri_query) = 'true', true, false), + uri_param_value('bucket', uri_query), + uri_param_value('recordImpressionSampleRate', uri_query), + geocoded_data['subdivision']; + -- geocoded_data['country_code'] = uri_param_value('country', uri_query); TODO + +-- DROP TABLE IF EXISTS andyrussg.tmp_banner_impressions; diff --git a/oozie/banner_impressions/druid/daily/load_banner_impressions.json b/oozie/banner_impressions/druid/daily/load_banner_impressions.json new file mode 100644 index 0000000..250dab2 --- /dev/null +++ b/oozie/banner_impressions/druid/daily/load_banner_impressions.json @@ -0,0 +1,71 @@ +{ + "type" : "index_hadoop", + "spec" : { + "ioConfig" : { + "type" : "hadoop", + "inputSpec" : { + "type" : "static", + "paths" : "hdfs://analytics-hadoop/user/andyrussg/banner_impressions" + } + }, + "dataSchema" : { + "dataSource" : "banner-impressions-hourly", + "granularitySpec" : { + "type" : "uniform", + "segmentGranularity" : "day", + "queryGranularity" : "hour", + "intervals" : ["2016-12-18/2016-12-19"] + }, + "parser" : { + "type" : "string", + "parseSpec" : { + "format" : "json", + "dimensionsSpec" : { + "dimensions" : [ + "campaign", + "banner", + "uselang", + "project", + "device", + "status_code", + "country", + "anonymous", + "bucket", + "region", + "sample_rate" + ] + }, + "timestampSpec" : { + "format" : "auto", + "column" : "ts" + } + } + }, + "metricsSpec" : [ + { + "name" : "request_count", + "type" : "longSum", + "fieldName": "request_count" + }, + { + "name" : "normalized_impressions", + "type" : "longSum", + "fieldName": "normalized_impressions" + } + ] + }, + "tuningConfig" : { + "type" : "hadoop", + "ignoreInvalidRows" : false, + "partitionsSpec" : { + "type" : "hashed", + "numShards" : 4 + }, + "jobProperties" : { + "mapreduce.reduce.memory.mb" : "2024", + "mapreduce.output.fileoutputformat.compress": "org.apache.hadoop.io.compress.GzipCodec", + "mapreduce.job.queuename": "default" + } + } + } +} -- To view, visit https://gerrit.wikimedia.org/r/331794 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iae85a39048596c71a8b4838a1b4ee60b9f6f6b7f Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Mforns <mfo...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits