Joal has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/357871 )
Change subject: Update cassandra loading for unique devices
......................................................................
Update cassandra loading for unique devices
Having updated unique devices jobs, cassandra loading must
match the change.
Bug: T167043
Change-Id: I11531dedda20362cf3ee451d1c3d022fee0c57d4
---
M oozie/cassandra/README.md
M oozie/cassandra/bundle.properties
M oozie/cassandra/bundle.xml
M oozie/cassandra/coord_unique_devices_daily.properties
M oozie/cassandra/coord_unique_devices_monthly.properties
M oozie/cassandra/daily/unique_devices.hql
M oozie/cassandra/monthly/unique_devices.hql
7 files changed, 36 insertions(+), 33 deletions(-)
Approvals:
Ottomata: Looks good to me, but someone else must approve
Joal: Verified; Looks good to me, approved
diff --git a/oozie/cassandra/README.md b/oozie/cassandra/README.md
index 393ae69..fff5e4c 100644
--- a/oozie/cassandra/README.md
+++ b/oozie/cassandra/README.md
@@ -1,10 +1,13 @@
-# Cassandra loader for pageview API
+# Cassandra loader for AQS
The bundle is responsible for data transformation and load
-into cassandra for the pageview API. It has three main datasets:
+into cassandra for the Analytics Query Service. It has 5 main datasets:
* pageview per article
* pageview per project
* pageview top articles
+* Unique devices
+* Historical pagecounts per project
+
First data is written in TSV files with aggregates computed
in a cube fashion for some dimensions, then those flat files are
loaded into cassandra, and fianlly removed if the job ends correctly.
diff --git a/oozie/cassandra/bundle.properties
b/oozie/cassandra/bundle.properties
index 5ba952f..de90779 100644
--- a/oozie/cassandra/bundle.properties
+++ b/oozie/cassandra/bundle.properties
@@ -45,10 +45,10 @@
# HDFS path to datasets definitions
pageview_datasets_file = ${oozie_directory}/pageview/datasets.xml
projectview_datasets_file =
${oozie_directory}/projectview/datasets.xml
-last_access_uniques_datasets_file =
${oozie_directory}/last_access_uniques/datasets.xml
+unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
pageview_data_directory = ${name_node}/wmf/data/wmf/pageview
projectview_data_directory = ${name_node}/wmf/data/wmf/projectview
-last_access_uniques_data_directory =
${name_node}/wmf/data/wmf/last_access_uniques
+unique_devices_data_directory = ${name_node}/wmf/data/wmf/unique_devices
# Initial import time of the webrequest dataset.
start_time = 2015-05-01T00:00Z
diff --git a/oozie/cassandra/bundle.xml b/oozie/cassandra/bundle.xml
index 26960b9..0a4e656 100644
--- a/oozie/cassandra/bundle.xml
+++ b/oozie/cassandra/bundle.xml
@@ -216,7 +216,7 @@
<property>
<name>unique_devices_datasets_file</name>
- <value>${last_access_uniques_datasets_file}</value>
+ <value>${unique_devices_datasets_file}</value>
</property>
<property>
<name>unique_devices_hive_script</name>
@@ -680,7 +680,7 @@
</property>
<property>
<name>dataset_name</name>
- <value>last_access_uniques_daily</value>
+ <value>unique_devices_per_domain_daily</value>
</property>
<!-- frequency of the dataset to manage different datasets
correctly in coordinator -->
@@ -696,7 +696,7 @@
</property>
<property>
<name>source_table</name>
- <value>wmf.last_access_uniques_daily</value>
+ <value>wmf.unique_devices_per_domain_daily</value>
</property>
<!-- cassandra loader job stuff -->
@@ -746,7 +746,7 @@
</property>
<property>
<name>dataset_name</name>
- <value>last_access_uniques_monthly</value>
+ <value>unique_devices_per_domain_monthly</value>
</property>
<!-- frequency of the dataset to manage different datasets
correctly in coordinator -->
@@ -762,7 +762,7 @@
</property>
<property>
<name>source_table</name>
- <value>wmf.last_access_uniques_monthly</value>
+ <value>wmf.unique_devices_per_domain_monthly</value>
</property>
<!-- cassandra loader job stuff -->
diff --git a/oozie/cassandra/coord_unique_devices_daily.properties
b/oozie/cassandra/coord_unique_devices_daily.properties
index bd66561..c98a4dc 100644
--- a/oozie/cassandra/coord_unique_devices_daily.properties
+++ b/oozie/cassandra/coord_unique_devices_daily.properties
@@ -43,8 +43,8 @@
send_error_email_workflow_file =
${oozie_directory}/util/send_error_email/workflow.xml
# HDFS path to datasets definitions
-last_access_uniques_data_directory =
${name_node}/wmf/data/wmf/last_access_uniques
-last_access_uniques_datasets_file =
${oozie_directory}/last_access_uniques/datasets.xml
+unique_devices_data_directory = ${name_node}/wmf/data/wmf/unique_devices
+unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
dataset_freq = day
# Initial import time of the webrequest dataset.
@@ -79,10 +79,10 @@
constant_output_tid_value = 0,timeuuid
workflow_file = ${workflow_file_daily}
-datasets_file = ${last_access_uniques_datasets_file}
-dataset_name = last_access_uniques_daily
+datasets_file = ${unique_devices_datasets_file}
+dataset_name = unique_devices_per_domain_daily
hive_script = unique_devices.hql
-source_table = wmf.last_access_uniques_daily
+source_table = wmf.unique_devices_per_domain_daily
cassandra_parallel_loaders = 1
cassandra_keyspace = local_group_default_T_unique_devices
cassandra_cql = UPDATE "${cassandra_keyspace}"."data" SET
"devices" = ?
diff --git a/oozie/cassandra/coord_unique_devices_monthly.properties
b/oozie/cassandra/coord_unique_devices_monthly.properties
index ca047fe..e50dac8 100644
--- a/oozie/cassandra/coord_unique_devices_monthly.properties
+++ b/oozie/cassandra/coord_unique_devices_monthly.properties
@@ -43,8 +43,8 @@
send_error_email_workflow_file =
${oozie_directory}/util/send_error_email/workflow.xml
# HDFS path to datasets definitions
-last_access_uniques_data_directory =
${name_node}/wmf/data/wmf/last_access_uniques
-last_access_uniques_datasets_file =
${oozie_directory}/last_access_uniques/datasets.xml
+unique_devices_data_directory = ${name_node}/wmf/data/wmf/unique_devices
+unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
# provides the frequency of the dataset used to manage
#different datasets correctly in coordinator
dataset_freq = month
@@ -81,10 +81,10 @@
constant_output_tid_value = 0,timeuuid
workflow_file = ${workflow_file_monthly}
-datasets_file = ${last_access_uniques_datasets_file}
-dataset_name = last_access_uniques_monthly
+datasets_file = ${unique_devices_datasets_file}
+dataset_name = unique_devices_per_domain_monthly
hive_script = unique_devices.hql
-source_table = wmf.last_access_uniques_monthly
+source_table = wmf.unique_devices_per_domain_monthly
cassandra_parallel_loaders = 1
cassandra_keyspace = local_group_default_T_unique_devices
cassandra_cql = UPDATE "${cassandra_keyspace}"."data" SET
"devices" = ?
diff --git a/oozie/cassandra/daily/unique_devices.hql
b/oozie/cassandra/daily/unique_devices.hql
index 74d18ba..2c3f043 100644
--- a/oozie/cassandra/daily/unique_devices.hql
+++ b/oozie/cassandra/daily/unique_devices.hql
@@ -9,7 +9,7 @@
-- Usage:
-- hive -f unique_devices.hql \
-- -d destination_directory=/tmp/unique_devices \
--- -d source_table=wmf.last_access_uniques_daily \
+-- -d source_table=wmf.unique_devices_per_domain_daily \
-- -d separator=\t \
-- -d year=2016 \
-- -d month=1 \
@@ -24,10 +24,10 @@
WITH unique_devices AS (
SELECT
CONCAT(
- regexp_extract(uri_host,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
- regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+ regexp_extract(domain,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+ regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
) AS project,
- CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+ CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
THEN 'mobile-site'
ELSE 'desktop-site'
END AS access_site,
@@ -41,10 +41,10 @@
AND day = ${day}
GROUP BY
CONCAT(
- regexp_extract(uri_host,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
- regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+ regexp_extract(domain,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+ regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
),
- CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+ CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
THEN 'mobile-site'
ELSE 'desktop-site'
END,
diff --git a/oozie/cassandra/monthly/unique_devices.hql
b/oozie/cassandra/monthly/unique_devices.hql
index a19935b..29d67e4 100644
--- a/oozie/cassandra/monthly/unique_devices.hql
+++ b/oozie/cassandra/monthly/unique_devices.hql
@@ -8,7 +8,7 @@
-- Usage:
-- hive -f unique_devices.hql \
-- -d destination_directory=/tmp/unique_devices \
--- -d source_table=wmf.last_access_uniques_monthly \
+-- -d source_table=wmf.unique_devices_per_domain_monthly \
-- -d separator=\t \
-- -d year=2016 \
-- -d month=1 \
@@ -22,10 +22,10 @@
WITH unique_devices AS (
SELECT
CONCAT(
- regexp_extract(uri_host,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
- regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+ regexp_extract(domain,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+ regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
) AS project,
- CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+ CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
THEN 'mobile-site'
ELSE 'desktop-site'
END AS access_site,
@@ -38,10 +38,10 @@
AND month = ${month}
GROUP BY
CONCAT(
- regexp_extract(uri_host,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
- regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+ regexp_extract(domain,
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+ regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
),
- CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+ CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
THEN 'mobile-site'
ELSE 'desktop-site'
END,
--
To view, visit https://gerrit.wikimedia.org/r/357871
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I11531dedda20362cf3ee451d1c3d022fee0c57d4
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits