Joal has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/217229

Change subject: Add 2 fields and 1 update to refine table
......................................................................

Add 2 fields and 1 update to refine table

Add pageview_info and normalized_host fields from hive UDF.
Add wiki_app_version in existing user_agent_map modifying user_agent parsing 
UDF.

Bug: T96044,T99918,T99932
Change-Id: I5d0527ce1ad76a67ab26eb62b9e96c36c82ef787
---
M hive/webrequest/create_webrequest_table.hql
M oozie/webrequest/refine/bundle.properties
M oozie/webrequest/refine/refine_webrequest.hql
3 files changed, 11 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/29/217229/1

diff --git a/hive/webrequest/create_webrequest_table.hql 
b/hive/webrequest/create_webrequest_table.hql
index 184c3d9..90b75e9 100644
--- a/hive/webrequest/create_webrequest_table.hql
+++ b/hive/webrequest/create_webrequest_table.hql
@@ -44,13 +44,15 @@
     -- Next two fields are to replace original ua and x_analytics ones.
     -- However such schema modification implies backward incompatibility.
     -- We will replace once we feel confident enough that 'every' backward 
incompatible change is done.
-    `user_agent_map`    map<string, string>  COMMENT 'User-agent map with 
browser_name, browser_major, device, os_name, os_minor, os_major keys and 
associated values',
+    `user_agent_map`    map<string, string>  COMMENT 'User-agent map with 
browser_name, browser_major, device, os_name, os_major, os_minor and 
wmf_app_version keys and associated values',
     `x_analytics_map`   map<string, string>  COMMENT 'X_analytics map view of 
the x_analytics field',
     `ts`                timestamp            COMMENT 'Unix timestamp in 
milliseconds extracted from dt',
     `access_method`     string  COMMENT 'Method used to accessing the site 
(mobile app|mobile web|desktop)',
     `agent_type`        string  COMMENT 'Categorise the agent making the 
webrequest as either user or spider (automatas to be added).',
     `is_zero`           boolean COMMENT 'Indicates if the webrequest is 
accessed through a zero provider',
-    `referer_class`     string  COMMENT 'Indicates if a referer is internal, 
external or unknown.'
+    `referer_class`     string  COMMENT 'Indicates if a referer is internal, 
external or unknown.',
+    `normalized_host`   struct<project_class: string, project:string, 
qualifiers: array<string>, tld: String>  COMMENT 'struct containing 
project_class (such as wikipedia or wikidata for instance), project (such as en 
or commons), qualifiers (a list of in-between values, such as m and/or zero) 
and tld (org most often)',
+    `pageview_info`     map<string, string>  COMMENT 'map containing project, 
dialect and page_title values only when is_pageview = TRUE.'
 )
 PARTITIONED BY (
     `webrequest_source` string  COMMENT 'Source cluster',
diff --git a/oozie/webrequest/refine/bundle.properties 
b/oozie/webrequest/refine/bundle.properties
index dbee34a..a654bd9 100644
--- a/oozie/webrequest/refine/bundle.properties
+++ b/oozie/webrequest/refine/bundle.properties
@@ -51,14 +51,14 @@
 hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
 
 # Version of Hive UDF jar to import
-refinery_jar_version              = 0.0.11
+refinery_jar_version              = 0.0.12
 
 # Fully qualified Hive table name.
 source_table                      = wmf_raw.webrequest
 destination_table                 = wmf.webrequest
 
 # Record version to keep track of changes
-record_version                    = 0.0.4
+record_version                    = 0.0.5
 
 # HDFS path to directory where webrequest data is time bucketed.
 webrequest_raw_data_directory     = ${name_node}/wmf/data/raw/webrequest
diff --git a/oozie/webrequest/refine/refine_webrequest.hql 
b/oozie/webrequest/refine/refine_webrequest.hql
index 3bc559a..cdc4f96 100644
--- a/oozie/webrequest/refine/refine_webrequest.hql
+++ b/oozie/webrequest/refine/refine_webrequest.hql
@@ -55,6 +55,8 @@
 CREATE TEMPORARY FUNCTION get_access_method as 
'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF';
 CREATE TEMPORARY FUNCTION is_crawler as 
'org.wikimedia.analytics.refinery.hive.IsCrawlerUDF';
 CREATE TEMPORARY FUNCTION classify_referer AS 
'org.wikimedia.analytics.refinery.hive.RefererClassifierUDF';
+CREATE TEMPORARY FUNCTION get_pageview_info AS 
'org.wikimedia.analytics.refinery.hive.GetPageviewInfoUDF';
+CREATE TEMPORARY FUNCTION normalize_host AS 
'org.wikimedia.analytics.refinery.hive.HostNormalizerUDF';
 
 
 INSERT OVERWRITE TABLE ${destination_table}
@@ -97,7 +99,9 @@
             ELSE 'user'
         END as agent_type,
         (str_to_map(x_analytics, '\;', '=')['zero'] IS NOT NULL) as is_zero,
-        classify_referer(referer) as referer_class
+        classify_referer(referer) as referer_class,
+        normalize_host(uri_host) as normalized_host,
+        get_pageview_info(uri_host, uri_path, uri_query) as pageview_info
     FROM
         ${source_table}
     WHERE

-- 
To view, visit https://gerrit.wikimedia.org/r/217229
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5d0527ce1ad76a67ab26eb62b9e96c36c82ef787
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <j...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to