Madhuvishy has submitted this change and it was merged.

Change subject: Add oozie job extracting aqs usage statistics
......................................................................
Add oozie job extracting aqs usage statistics

Oozie job to extract data hourly from the refined webrequest Hive
table into the aqs_hourly table, to facilitate gathering statistics
on pageview API usage.

Bug: T118938
Change-Id: Id59012c7f57b13d62ea413762dc21e7412131838
---
A hive/aqs/create_aqs_hourly_table.hql
A oozie/aqs/datasets.xml
A oozie/aqs/hourly/README.md
A oozie/aqs/hourly/aqs_hourly.hql
A oozie/aqs/hourly/coordinator.properties
A oozie/aqs/hourly/coordinator.xml
A oozie/aqs/hourly/workflow.xml
7 files changed, 459 insertions(+), 0 deletions(-)

Approvals:
  Madhuvishy: Verified; Looks good to me, approved


diff --git a/hive/aqs/create_aqs_hourly_table.hql b/hive/aqs/create_aqs_hourly_table.hql
new file mode 100644
index 0000000..3d0982b
--- /dev/null
+++ b/hive/aqs/create_aqs_hourly_table.hql
@@ -0,0 +1,35 @@
+-- Create table statement for aqs_hourly table.
+--
+-- NOTE: When choosing partition field types,
+-- one should take into consideration Hive's
+-- insistence on storing partition values
+-- as strings. See:
+-- https://wikitech.wikimedia.org/wiki/File:Hive_partition_formats.png
+-- and
+-- http://bots.wmflabs.org/~wm-bot/logs/%23wikimedia-analytics/20140721.txt
+--
+-- Parameters:
+--     <none>
+--
+-- Usage:
+--     hive -f create_aqs_hourly_table.hql --database wmf
+--
+
+CREATE EXTERNAL TABLE IF NOT EXISTS `aqs_hourly`(
+    `cache_status`  string  COMMENT 'Cache status',
+    `http_status`   string  COMMENT 'HTTP status of response',
+    `http_method`   string  COMMENT 'HTTP method of request',
+    `response_size` bigint  COMMENT 'Response size',
+    `uri_host`      string  COMMENT 'Host of request',
+    `uri_path`      string  COMMENT 'Path of request',
+    `request_count` bigint  COMMENT 'Number of requests'
+)
+PARTITIONED BY (
+    `year`  int COMMENT 'Unpadded year of request',
+    `month` int COMMENT 'Unpadded month of request',
+    `day`   int COMMENT 'Unpadded day of request',
+    `hour`  int COMMENT 'Unpadded hour of request'
+)
+STORED AS PARQUET
+LOCATION '/wmf/data/wmf/aqs/hourly'
+;

diff --git a/oozie/aqs/datasets.xml b/oozie/aqs/datasets.xml
new file mode 100644
index 0000000..35c38cb
--- /dev/null
+++ b/oozie/aqs/datasets.xml
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+Defines reusable datasets for aqs data.
+Use this dataset in your coordinator.xml files by setting:
+
+    ${start_time}         - the initial instance of your data.
+                            Example: 2014-04-01T00:00Z
+    ${aqs_data_directory} - Path to directory where aqs data is time bucketed.
+                            Example: /wmf/data/wmf/aqs
+-->

+<datasets>
+
+    <!--
+    To unpad MONTH, DAY, and HOUR, we force coercion to a number by
+    adding 0.
+
+    Note that we do not use "${...}" but "${"$"}{...}", as dataset files are
+    passed to EL twice in cascade, and at the first EL level, ${MONTH}
+    evaluates to the empty string. Hence, we escape the dollar sign in
+    "${...}" to "${"$"}{...}". At the first EL level, "${"$"}" gets turned
+    into a dollar sign, and "{...}" is just passed along. Hence, we arrive
+    at "${...}" as input for the second EL level. There, the variables hold
+    their expected values, and we can start unpadding them.
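To make the two-pass EL evaluation above concrete, here is a minimal sketch of
the mechanism (Python, illustrative only; both helper functions are
hypothetical stand-ins for Oozie's EL passes, not part of this change):

```python
import re

# First EL pass: the escape ${"$"} collapses to a bare "$"; an unescaped
# ${MONTH} would already be consumed here, since MONTH is undefined at
# this level and evaluates to the empty string.
def first_el_pass(template: str) -> str:
    return template.replace('${"$"}', '$')

# Second EL pass: ${MONTH + 0} etc. are now live expressions; adding 0
# coerces the zero-padded string to a number, which drops the padding.
def second_el_pass(template: str, values: dict) -> str:
    def unpad(match):
        return str(int(values[match.group(1)]) + 0)
    return re.sub(r'\$\{(\w+) \+ 0\}', unpad, template)

template = ('/wmf/data/wmf/aqs/hourly/year=2015'
            '/month=${"$"}{MONTH + 0}/day=${"$"}{DAY + 0}/hour=${"$"}{HOUR + 0}')
resolved = second_el_pass(first_el_pass(template),
                          {'MONTH': '11', 'DAY': '01', 'HOUR': '05'})
print(resolved)
# /wmf/data/wmf/aqs/hourly/year=2015/month=11/day=1/hour=5
```

The unpadded result matches exactly the partition layout that the aqs_hourly
table above declares.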
diff --git a/oozie/aqs/hourly/README.md b/oozie/aqs/hourly/README.md
new file mode 100644
index 0000000..8c185fc
--- /dev/null
+++ b/oozie/aqs/hourly/README.md
@@ -0,0 +1,20 @@
+# Aggregation / reduction of aqs data from webrequest
+
+This job is responsible for filtering aqs data
+from the refined webrequest table and aggregating it into
+statistically interesting dimensions.
+
+Output is appended into (year, month, day, hour) partitions
+in /wmf/data/wmf/aqs/hourly.
+
+# Outline
+
+* ```coordinator.properties``` is used to define parameters to the
+  aggregation pipeline.
+* ```coordinator.xml``` injects the aggregation workflow for each dataset.
+* ```workflow.xml```
+  * Runs a hive query to aggregate from webrequest into aqs_hourly
+
+Note that this job uses the refined dataset. If a webrequest refinement job
+does not have the _SUCCESS done-flag in the directory, the data for that
+hour will not be aggregated until it does.

diff --git a/oozie/aqs/hourly/aqs_hourly.hql b/oozie/aqs/hourly/aqs_hourly.hql
new file mode 100644
index 0000000..44d307f
--- /dev/null
+++ b/oozie/aqs/hourly/aqs_hourly.hql
@@ -0,0 +1,50 @@
+-- Parameters:
+--     source_table      -- Fully qualified table name to compute the
+--                          aggregation for.
+--     destination_table -- Fully qualified table name to fill in
+--                          aggregated values.
+--     year              -- year of partition to compute aggregation for.
+--     month             -- month of partition to compute aggregation for.
+--     day               -- day of partition to compute aggregation for.
+--     hour              -- hour of partition to compute aggregation for.
+--
+-- Usage:
+--     hive -f aqs_hourly.hql \
+--         -d source_table=wmf.webrequest \
+--         -d destination_table=wmf.aqs_hourly \
+--         -d year=2015 \
+--         -d month=11 \
+--         -d day=1 \
+--         -d hour=0
+--
+
+SET parquet.compression = SNAPPY;
+SET mapred.reduce.tasks = 8;
+
+INSERT OVERWRITE TABLE ${destination_table}
+    PARTITION(year=${year},month=${month},day=${day},hour=${hour})
+    SELECT
+        cache_status,
+        http_status,
+        http_method,
+        response_size,
+        uri_host,
+        uri_path,
+        COUNT(1) AS request_count
+    FROM
+        ${source_table}
+    WHERE webrequest_source = 'text'
+        AND year=${year} AND month=${month} AND day=${day} AND hour=${hour}
+        AND uri_path LIKE '/api/rest_v1/metrics/pageviews/%'
+    GROUP BY
+        cache_status,
+        http_status,
+        http_method,
+        response_size,
+        uri_host,
+        uri_path
+;
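The usage block above is what a manual backfill of one hour looks like; a tiny
helper like the following (hypothetical, not part of this change) renders that
command from a timestamp, keeping the partition values unpadded as the table
expects:

```python
from datetime import datetime

# Hypothetical convenience: build the manual backfill command shown in the
# usage block of aqs_hourly.hql for a given hour. datetime attributes are
# plain ints, so partition values come out unpadded (month=1, not month=01).
def backfill_command(ts: datetime,
                     source: str = 'wmf.webrequest',
                     dest: str = 'wmf.aqs_hourly') -> str:
    return ('hive -f aqs_hourly.hql'
            f' -d source_table={source}'
            f' -d destination_table={dest}'
            f' -d year={ts.year} -d month={ts.month}'
            f' -d day={ts.day} -d hour={ts.hour}')

print(backfill_command(datetime(2015, 11, 1, 0)))
# hive -f aqs_hourly.hql -d source_table=wmf.webrequest
#   -d destination_table=wmf.aqs_hourly -d year=2015 -d month=11 -d day=1 -d hour=0
```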
diff --git a/oozie/aqs/hourly/coordinator.properties b/oozie/aqs/hourly/coordinator.properties
new file mode 100644
index 0000000..6ced06b
--- /dev/null
+++ b/oozie/aqs/hourly/coordinator.properties
@@ -0,0 +1,57 @@
+# Configures a coordinator to automatically aggregate aqs data from
+# the refined webrequest table.
+# Any of the following properties are overridable with -D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config oozie/aqs/hourly/coordinator.properties
+#
+# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
+#       .xml files exist there when this job is submitted.
+
+
+name_node = hdfs://analytics-hadoop
+job_tracker = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name = default
+
+user = hdfs
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory = ${name_node}/wmf/refinery/current/oozie
+
+# HDFS path to coordinator to run.
+coordinator_file = ${oozie_directory}/aqs/hourly/coordinator.xml
+
+# HDFS path to workflow to run.
+workflow_file = ${oozie_directory}/aqs/hourly/workflow.xml
+
+# HDFS path to refined webrequest dataset definitions
+webrequest_datasets_file = ${oozie_directory}/webrequest/datasets.xml
+webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
+
+# HDFS path to aqs dataset definitions
+aqs_datasets_file = ${oozie_directory}/aqs/datasets.xml
+aqs_data_directory = ${name_node}/wmf/data/wmf/aqs
+
+# Initial import time of the webrequest dataset.
+start_time = 2015-11-01T00:00Z
+
+# Time to stop running this coordinator. Year 3000 == never!
+stop_time = 3000-01-01T00:00Z
+
+# HDFS path to workflow to mark a directory as done
+mark_directory_done_workflow_file = ${oozie_directory}/util/mark_directory_done/workflow.xml
+
+# Workflow to send an error email
+send_error_email_workflow_file = ${oozie_directory}/util/send_error_email/workflow.xml
+
+# HDFS path to hive-site.xml file. This is needed to run hive actions.
+hive_site_xml = ${oozie_directory}/util/hive/hive-site.xml
+
+# Fully qualified Hive table names.
+webrequest_table = wmf.webrequest
+aqs_hourly_table = wmf.aqs_hourly
+
+# Coordinator to start.
+oozie.coord.application.path = ${coordinator_file}
+oozie.use.system.libpath = true
+oozie.action.external.stats.write = true
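Since most values above are composed from ${name_node} and ${oozie_directory},
it can help to eyeball the fully resolved paths before submitting. A rough
sketch of such a check (Python; `load_properties` is a hypothetical helper,
and real Oozie/Hadoop property resolution is more involved than this):

```python
import re

# Hypothetical helper: parse simple "key = value" lines and expand
# ${other_key} references, to preview what e.g. coordinator_file
# resolves to. Not how Oozie itself resolves job properties.
def load_properties(text: str) -> dict:
    props = {}
    for line in text.splitlines():
        line = line.strip()
        if line and not line.startswith('#'):
            key, _, value = line.partition('=')
            props[key.strip()] = value.strip()
    ref = re.compile(r'\$\{([^}]+)\}')
    changed = True
    while changed:  # keep substituting until no reference expands further
        changed = False
        for key, value in props.items():
            expanded = ref.sub(lambda m: props.get(m.group(1), m.group(0)), value)
            if expanded != value:
                props[key], changed = expanded, True
    return props

with open('coordinator.properties') as f:
    props = load_properties(f.read())
print(props['coordinator_file'])
# hdfs://analytics-hadoop/wmf/refinery/current/oozie/aqs/hourly/coordinator.xml
```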
diff --git a/oozie/aqs/hourly/coordinator.xml b/oozie/aqs/hourly/coordinator.xml
new file mode 100644
index 0000000..ddc8dda
--- /dev/null
+++ b/oozie/aqs/hourly/coordinator.xml
@@ -0,0 +1,116 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="aqs_hourly-coord"
+    frequency="${coord:hours(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>queue_name</name></property>
+
+        <property><name>workflow_file</name></property>
+
+        <property><name>start_time</name></property>
+        <property><name>stop_time</name></property>
+
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+
+        <property><name>aqs_datasets_file</name></property>
+        <property><name>aqs_data_directory</name></property>
+
+        <property><name>hive_site_xml</name></property>
+
+        <property><name>webrequest_table</name></property>
+        <property><name>aqs_hourly_table</name></property>
+
+        <property><name>mark_directory_done_workflow_file</name></property>
+    </parameters>
+
+    <controls>
+        <!--
+        By having materialized jobs not timeout, we ease backfilling incidents
+        after recoverable hiccups on the dataset producers.
+        -->
+        <timeout>-1</timeout>
+
+        <!--
+        Pageview aggregation is not too heavy, but we limit
+        concurrency for resource sharing.
+
+        Also note that back-filling is not limited by the
+        coordinator's frequency, so back-filling works nicely
+        even though the concurrency is low.
+        -->
+        <concurrency>2</concurrency>
+
+
+        <!--
+        Since we expect only one incarnation per hourly dataset, the
+        default throttle of 12 is way too high, and there is no need
+        to keep that many materialized jobs around.
+
+        By reducing it to 2, we keep the hdfs checks on the datasets
+        low, while still being able to easily feed the concurrency.
+        -->
+        <throttle>2</throttle>
+    </controls>
+
+    <datasets>
+        <!--
+        Include refined and aqs datasets files.
+        $webrequest_datasets_file will be used as the input events
+        $aqs_datasets_file will be used as the output events
+        -->
+        <include>${webrequest_datasets_file}</include>
+        <include>${aqs_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="text_refined_input" dataset="webrequest_text">
+            <instance>${coord:current(0)}</instance>
+        </data-in>
+    </input-events>
+
+    <output-events>
+        <data-out name="aqs_hourly_output" dataset="aqs_hourly">
+            <instance>${coord:current(0)}</instance>
+        </data-out>
+    </output-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "y")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "M")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "d")}</value>
+                </property>
+                <property>
+                    <name>hour</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "H")}</value>
+                </property>
+                <property>
+                    <!-- To mark directory done after success -->
+                    <name>aqs_hourly_dataset_directory</name>
+                    <value>${coord:dataOut('aqs_hourly_output')}</value>
+                </property>
+
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
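Note that the single-letter formatTime patterns ("y", "M", "d", "H") yield
unpadded numbers, so the year/month/day/hour passed to the workflow line up
with both the dataset URI template and the table's unpadded partitions. A
quick illustration of the intended result (Python mimicking the EL calls,
not Oozie itself):

```python
from datetime import datetime

# Mimics what coord:formatTime(coord:nominalTime(), "y"/"M"/"d"/"H")
# should produce for a nominal time: unpadded values, matching the
# unpadded Hive partition scheme (month=5 rather than month=05).
nominal = datetime(2015, 11, 1, 5)
params = {'year': nominal.year, 'month': nominal.month,
          'day': nominal.day, 'hour': nominal.hour}
print('year={year}/month={month}/day={day}/hour={hour}'.format(**params))
# year=2015/month=11/day=1/hour=5
```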
diff --git a/oozie/aqs/hourly/workflow.xml b/oozie/aqs/hourly/workflow.xml
new file mode 100644
index 0000000..92f7bc1
--- /dev/null
+++ b/oozie/aqs/hourly/workflow.xml
@@ -0,0 +1,148 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<workflow-app xmlns="uri:oozie:workflow:0.4"
+    name="aqs-hourly-${webrequest_table}->${aqs_hourly_table}-${year},${month},${day},${hour}-wf">
+
+    <parameters>
+
+        <!-- Default values for inner oozie settings -->
+        <property>
+            <name>oozie_launcher_queue_name</name>
+            <value>${queue_name}</value>
+        </property>
+        <property>
+            <name>oozie_launcher_memory</name>
+            <value>256</value>
+        </property>
+
+        <!-- Required properties -->
+        <property><name>queue_name</name></property>
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+
+
+        <!-- Aggregation related configuration properties -->
+        <property>
+            <name>hive_script</name>
+            <!-- This is relative to the containing directory of this file. -->
+            <value>aqs_hourly.hql</value>
+            <description>Hive script to run.</description>
+        </property>
+
+        <property>
+            <name>hive_site_xml</name>
+            <description>hive-site.xml file path in HDFS</description>
+        </property>
+        <property>
+            <name>webrequest_table</name>
+            <description>Hive table to aggregate from</description>
+        </property>
+        <property>
+            <name>aqs_hourly_table</name>
+            <description>The destination table to store aggregated data in.</description>
+        </property>
+
+        <property>
+            <name>year</name>
+            <description>The partition's year</description>
+        </property>
+        <property>
+            <name>month</name>
+            <description>The partition's month</description>
+        </property>
+        <property>
+            <name>day</name>
+            <description>The partition's day</description>
+        </property>
+        <property>
+            <name>hour</name>
+            <description>The partition's hour</description>
+        </property>
+        <property>
+            <name>mark_directory_done_workflow_file</name>
+            <description>Workflow for marking a directory done</description>
+        </property>
+        <property>
+            <name>aqs_hourly_dataset_directory</name>
+            <description>aqs_hourly directory to generate the done flag in</description>
+        </property>
+
+    </parameters>
+
+    <start to="aggregate"/>
+
+    <action name="aggregate">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!-- make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+            <script>${hive_script}</script>
+            <param>source_table=${webrequest_table}</param>
+            <param>destination_table=${aqs_hourly_table}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+            <param>hour=${hour}</param>
+        </hive>
+
+        <ok to="mark_aggregated_aqs_dataset_done"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="mark_aggregated_aqs_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${aqs_hourly_dataset_directory}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="end"/>
+        <error to="send_error_email"/>
+    </action>
+
+    <action name="send_error_email">
+        <sub-workflow>
+            <app-path>${send_error_email_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>parent_name</name>
+                    <value>${wf:name()}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="kill"/>
+        <error to="kill"/>
+    </action>
+
+    <kill name="kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    <end name="end"/>
+</workflow-app>
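Downstream consumers should key off the _SUCCESS done-flag that the
mark_directory_done sub-workflow writes, the same flag the dataset definition
declares. A hedged sketch of such a check (Python; the helper function is
hypothetical, while `hdfs dfs -test -e` is the standard HDFS CLI existence
test):

```python
import subprocess

# Hypothetical consumer-side check: only trust an aqs_hourly partition
# once the mark_directory_done sub-workflow has written its _SUCCESS flag.
# Partition values are unpadded, matching the layout created above.
def partition_is_done(year: int, month: int, day: int, hour: int) -> bool:
    flag = (f'/wmf/data/wmf/aqs/hourly/year={year}/month={month}'
            f'/day={day}/hour={hour}/_SUCCESS')
    # `hdfs dfs -test -e PATH` exits 0 iff PATH exists.
    return subprocess.run(['hdfs', 'dfs', '-test', '-e', flag]).returncode == 0

if partition_is_done(2015, 11, 1, 0):
    print('hour is complete; safe to read')
```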
--
To view, visit https://gerrit.wikimedia.org/r/259662
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id59012c7f57b13d62ea413762dc21e7412131838
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <j...@wikimedia.org>
Gerrit-Reviewer: Madhuvishy <mviswanat...@wikimedia.org>
Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>