Mforns has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/331794 )

Change subject: Add banner impressions jobs
......................................................................

Add banner impressions jobs

Bug: T155141
Change-Id: Iae85a39048596c71a8b4838a1b4ee60b9f6f6b7f
---
A 
oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql
A oozie/banner_impressions/druid/daily/load_banner_impressions.json
2 files changed, 155 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/94/331794/1

diff --git 
a/oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql
 
b/oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql
new file mode 100644
index 0000000..d430fe6
--- /dev/null
+++ 
b/oozie/banner_impressions/druid/daily/generate_daily_druid_banner_impressions.hql
@@ -0,0 +1,84 @@
+
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+ADD JAR /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;
+
+
+DROP TABLE IF EXISTS andyrussg.tmp_banner_impressions;
+CREATE EXTERNAL TABLE IF NOT EXISTS andyrussg.tmp_banner_impressions (
+    `ts`                             string,
+    `campaign`                       string,
+    `banner`                         string,
+    `uselang`                        string,
+    `project`                        string,
+    `device`                         string,
+    `status_code`                    string,
+    `country`                        string,
+    `anonymous`                      boolean,
+    `bucket`                         string,
+    `region`                         string,
+    -- `geocoded_consistent`            boolean, TODO
+    `sample_rate`                    float,
+    `request_count`                  bigint,
+    `normalized_impressions`         bigint
+)
+ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+STORED AS TEXTFILE
+LOCATION '/user/andyrussg/banner_impressions';
+
+
+CREATE TEMPORARY MACRO uri_param_value(param_name string, col string)
+    parse_url(concat('http://bla.org/woo/', col), 'QUERY', param_name);
+
+INSERT OVERWRITE TABLE andyrussg.tmp_banner_impressions
+SELECT
+    CONCAT(
+        LPAD(year, 4, '0'), '-',
+        LPAD(month, 2, '0'), '-',
+        LPAD(day, 2, '0'), 'T',
+        LPAD(hour, 2, '0'), ':00:00Z') AS ts,
+    uri_param_value('campaign', uri_query) AS campaign,
+    uri_param_value('banner', uri_query) AS banner,
+    uri_param_value('uselang', uri_query) AS uselang,
+    uri_param_value('project', uri_query) AS project,
+    uri_param_value('device', uri_query) AS device,
+    uri_param_value('statusCode', uri_query) AS status_code,
+    uri_param_value('country', uri_query) AS country,
+    IF (uri_param_value('anonymous', uri_query) = 'true', true, false) AS 
anonymous,
+    uri_param_value('bucket', uri_query) AS bucket,
+    geocoded_data['subdivision'] AS region,
+    -- geocoded_data['country_code'] = uri_param_value('country', uri_query) 
AS geocoded_consistent TODO
+    cast(uri_param_value('recordImpressionSampleRate', uri_query) AS float) AS 
sample_rate,
+    COUNT(*) AS request_count,
+    cast(COUNT(*) / cast(uri_param_value('recordImpressionSampleRate', 
uri_query) AS float) AS bigint) AS normalized_impressions
+FROM
+    wmf.webrequest
+WHERE
+    year=2016
+    AND month=12
+    AND day=18
+    AND webrequest_source = 'text'
+    AND uri_path = '/beacon/impression'
+    AND agent_type = 'user'
+    AND uri_param_value('debug', uri_query) = 'false'
+    -- AND x_analytics_map['proxy'] IS NULL Note: add once added to pageviews
+GROUP BY
+    CONCAT(
+        LPAD(year, 4, '0'), '-',
+        LPAD(month, 2, '0'), '-',
+        LPAD(day, 2, '0'), 'T',
+        LPAD(hour, 2, '0'), ':00:00Z'),
+    uri_param_value('campaign', uri_query),
+    uri_param_value('banner', uri_query),
+    uri_param_value('uselang', uri_query),
+    uri_param_value('project', uri_query),
+    uri_param_value('device', uri_query),
+    uri_param_value('statusCode', uri_query),
+    uri_param_value('country', uri_query),
+    IF (uri_param_value('anonymous', uri_query) = 'true', true, false),
+    uri_param_value('bucket', uri_query),
+    uri_param_value('recordImpressionSampleRate', uri_query),
+    geocoded_data['subdivision'];
+    -- geocoded_data['country_code'] = uri_param_value('country', uri_query); 
TODO
+
+-- DROP TABLE IF EXISTS andyrussg.tmp_banner_impressions;
diff --git a/oozie/banner_impressions/druid/daily/load_banner_impressions.json 
b/oozie/banner_impressions/druid/daily/load_banner_impressions.json
new file mode 100644
index 0000000..250dab2
--- /dev/null
+++ b/oozie/banner_impressions/druid/daily/load_banner_impressions.json
@@ -0,0 +1,71 @@
+{
+  "type" : "index_hadoop",
+  "spec" : {
+    "ioConfig" : {
+      "type" : "hadoop",
+      "inputSpec" : {
+        "type" : "static",
+        "paths" : "hdfs://analytics-hadoop/user/andyrussg/banner_impressions"
+      }
+    },
+    "dataSchema" : {
+      "dataSource" : "banner-impressions-hourly",
+      "granularitySpec" : {
+        "type" : "uniform",
+        "segmentGranularity" : "day",
+        "queryGranularity" : "hour",
+        "intervals" : ["2016-12-18/2016-12-19"]
+      },
+      "parser" : {
+        "type" : "string",
+        "parseSpec" : {
+          "format" : "json",
+          "dimensionsSpec" : {
+            "dimensions" : [
+                "campaign",
+                "banner",
+                "uselang",
+                "project",
+                "device",
+                "status_code",
+                "country",
+                "anonymous",
+                "bucket",
+                "region",
+                "sample_rate"
+            ]
+          },
+          "timestampSpec" : {
+            "format" : "auto",
+            "column" : "ts"
+          }
+        }
+      },
+      "metricsSpec" : [
+        {
+          "name" : "request_count",
+          "type" : "longSum",
+          "fieldName": "request_count"
+        },
+        {
+          "name" : "normalized_impressions",
+          "type" : "longSum",
+          "fieldName": "normalized_impressions"
+        }
+      ]
+    },
+    "tuningConfig" : {
+      "type" : "hadoop",
+      "ignoreInvalidRows" : false,
+      "partitionsSpec" : {
+        "type" : "hashed",
+        "numShards" : 4
+      },
+      "jobProperties" : {
+        "mapreduce.reduce.memory.mb" : "2024",
+        "mapreduce.output.fileoutputformat.compress": 
"org.apache.hadoop.io.compress.GzipCodec",
+        "mapreduce.job.queuename": "default"
+      }
+    }
+  }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/331794
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iae85a39048596c71a8b4838a1b4ee60b9f6f6b7f
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Mforns <mfo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to