Joal has uploaded a new change for review. https://gerrit.wikimedia.org/r/217229
Change subject: Add 2 fields and 1 update to refine table ...................................................................... Add 2 fields and 1 update to refine table Add pageview_info and normalized_host fields from hive UDF. Add wiki_app_version in existing user_agent_map modifying user_agent parsing UDF. Bug: T96044,T99918,T99932 Change-Id: I5d0527ce1ad76a67ab26eb62b9e96c36c82ef787 --- M hive/webrequest/create_webrequest_table.hql M oozie/webrequest/refine/bundle.properties M oozie/webrequest/refine/refine_webrequest.hql 3 files changed, 11 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/29/217229/1 diff --git a/hive/webrequest/create_webrequest_table.hql b/hive/webrequest/create_webrequest_table.hql index 184c3d9..90b75e9 100644 --- a/hive/webrequest/create_webrequest_table.hql +++ b/hive/webrequest/create_webrequest_table.hql @@ -44,13 +44,15 @@ -- Next two fields are to replace original ua and x_analytics ones. -- However such schema modification implies backward incompatibility. -- We will replace once we feel confident enough that 'every' backward incompatible change is done. - `user_agent_map` map<string, string> COMMENT 'User-agent map with browser_name, browser_major, device, os_name, os_minor, os_major keys and associated values', + `user_agent_map` map<string, string> COMMENT 'User-agent map with browser_name, browser_major, device, os_name, os_major, os_minor and wmf_app_version keys and associated values', `x_analytics_map` map<string, string> COMMENT 'X_analytics map view of the x_analytics field', `ts` timestamp COMMENT 'Unix timestamp in milliseconds extracted from dt', `access_method` string COMMENT 'Method used to accessing the site (mobile app|mobile web|desktop)', `agent_type` string COMMENT 'Categorise the agent making the webrequest as either user or spider (automatas to be added).', `is_zero` boolean COMMENT 'Indicates if the webrequest is accessed through a zero provider', - `referer_class` string COMMENT 'Indicates if a referer is internal, external or unknown.' + `referer_class` string COMMENT 'Indicates if a referer is internal, external or unknown.', + `normalized_host` struct<project_class: string, project:string, qualifiers: array<string>, tld: String> COMMENT 'struct containing project_class (such as wikipedia or wikidata for instance), project (such as en or commons), qualifiers (a list of in-between values, such as m and/or zero) and tld (org most often)', + `pageview_info` map<string, string> COMMENT 'map containing project, dialect and page_title values only when is_pageview = TRUE.' ) PARTITIONED BY ( `webrequest_source` string COMMENT 'Source cluster', diff --git a/oozie/webrequest/refine/bundle.properties b/oozie/webrequest/refine/bundle.properties index dbee34a..a654bd9 100644 --- a/oozie/webrequest/refine/bundle.properties +++ b/oozie/webrequest/refine/bundle.properties @@ -51,14 +51,14 @@ hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml # Version of Hive UDF jar to import -refinery_jar_version = 0.0.11 +refinery_jar_version = 0.0.12 # Fully qualified Hive table name. source_table = wmf_raw.webrequest destination_table = wmf.webrequest # Record version to keep track of changes -record_version = 0.0.4 +record_version = 0.0.5 # HDFS path to directory where webrequest data is time bucketed. webrequest_raw_data_directory = ${name_node}/wmf/data/raw/webrequest diff --git a/oozie/webrequest/refine/refine_webrequest.hql b/oozie/webrequest/refine/refine_webrequest.hql index 3bc559a..cdc4f96 100644 --- a/oozie/webrequest/refine/refine_webrequest.hql +++ b/oozie/webrequest/refine/refine_webrequest.hql @@ -55,6 +55,8 @@ CREATE TEMPORARY FUNCTION get_access_method as 'org.wikimedia.analytics.refinery.hive.GetAccessMethodUDF'; CREATE TEMPORARY FUNCTION is_crawler as 'org.wikimedia.analytics.refinery.hive.IsCrawlerUDF'; CREATE TEMPORARY FUNCTION classify_referer AS 'org.wikimedia.analytics.refinery.hive.RefererClassifierUDF'; +CREATE TEMPORARY FUNCTION get_pageview_info AS 'org.wikimedia.analytics.refinery.hive.GetPageviewInfoUDF'; +CREATE TEMPORARY FUNCTION normalize_host AS 'org.wikimedia.analytics.refinery.hive.HostNormalizerUDF'; INSERT OVERWRITE TABLE ${destination_table} @@ -97,7 +99,9 @@ ELSE 'user' END as agent_type, (str_to_map(x_analytics, '\;', '=')['zero'] IS NOT NULL) as is_zero, - classify_referer(referer) as referer_class + classify_referer(referer) as referer_class, + normalize_host(uri_host) as normalized_host, + get_pageview_info(uri_host, uri_path, uri_query) as pageview_info FROM ${source_table} WHERE -- To view, visit https://gerrit.wikimedia.org/r/217229 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5d0527ce1ad76a67ab26eb62b9e96c36c82ef787 Gerrit-PatchSet: 1 Gerrit-Project: analytics/refinery Gerrit-Branch: master Gerrit-Owner: Joal <j...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits