Joal has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/357871 )

Change subject: Update cassandra loading for unique devices
......................................................................


Update cassandra loading for unique devices

Having updated unique devices jobs, cassandra loading must
match the change.

Bug: T167043

Change-Id: I11531dedda20362cf3ee451d1c3d022fee0c57d4
---
M oozie/cassandra/README.md
M oozie/cassandra/bundle.properties
M oozie/cassandra/bundle.xml
M oozie/cassandra/coord_unique_devices_daily.properties
M oozie/cassandra/coord_unique_devices_monthly.properties
M oozie/cassandra/daily/unique_devices.hql
M oozie/cassandra/monthly/unique_devices.hql
7 files changed, 36 insertions(+), 33 deletions(-)

Approvals:
  Ottomata: Looks good to me, but someone else must approve
  Joal: Verified; Looks good to me, approved



diff --git a/oozie/cassandra/README.md b/oozie/cassandra/README.md
index 393ae69..fff5e4c 100644
--- a/oozie/cassandra/README.md
+++ b/oozie/cassandra/README.md
@@ -1,10 +1,13 @@
-# Cassandra loader for pageview API
+# Cassandra loader for AQS
 
 The bundle is responsible for data transformation and load
-into cassandra for the pageview API. It has three main datasets:
+into cassandra for the Analytics Query Service. It has 5 main datasets:
 * pageview per article
 * pageview per project
 * pageview top articles
+* Unique devices
+* Historical pagecounts per project
+
 First data is written in TSV files with aggregates computed
 in a cube fashion for some dimensions, then those flat files are
 loaded into cassandra, and fianlly removed if the job ends correctly.
diff --git a/oozie/cassandra/bundle.properties 
b/oozie/cassandra/bundle.properties
index 5ba952f..de90779 100644
--- a/oozie/cassandra/bundle.properties
+++ b/oozie/cassandra/bundle.properties
@@ -45,10 +45,10 @@
 # HDFS path to datasets definitions
 pageview_datasets_file             = ${oozie_directory}/pageview/datasets.xml
 projectview_datasets_file          = 
${oozie_directory}/projectview/datasets.xml
-last_access_uniques_datasets_file  = 
${oozie_directory}/last_access_uniques/datasets.xml
+unique_devices_datasets_file       = 
${oozie_directory}/unique_devices/datasets.xml
 pageview_data_directory            = ${name_node}/wmf/data/wmf/pageview
 projectview_data_directory         = ${name_node}/wmf/data/wmf/projectview
-last_access_uniques_data_directory = 
${name_node}/wmf/data/wmf/last_access_uniques
+unique_devices_data_directory      = ${name_node}/wmf/data/wmf/unique_devices
 
 # Initial import time of the webrequest dataset.
 start_time                         = 2015-05-01T00:00Z
diff --git a/oozie/cassandra/bundle.xml b/oozie/cassandra/bundle.xml
index 26960b9..0a4e656 100644
--- a/oozie/cassandra/bundle.xml
+++ b/oozie/cassandra/bundle.xml
@@ -216,7 +216,7 @@
 
         <property>
             <name>unique_devices_datasets_file</name>
-            <value>${last_access_uniques_datasets_file}</value>
+            <value>${unique_devices_datasets_file}</value>
         </property>
         <property>
             <name>unique_devices_hive_script</name>
@@ -680,7 +680,7 @@
             </property>
             <property>
                 <name>dataset_name</name>
-                <value>last_access_uniques_daily</value>
+                <value>unique_devices_per_domain_daily</value>
             </property>
             <!-- frequency of the dataset to manage different datasets
                  correctly in coordinator -->
@@ -696,7 +696,7 @@
             </property>
             <property>
                 <name>source_table</name>
-                <value>wmf.last_access_uniques_daily</value>
+                <value>wmf.unique_devices_per_domain_daily</value>
             </property>
 
             <!-- cassandra loader job stuff -->
@@ -746,7 +746,7 @@
             </property>
             <property>
                 <name>dataset_name</name>
-                <value>last_access_uniques_monthly</value>
+                <value>unique_devices_per_domain_monthly</value>
             </property>
             <!-- frequency of the dataset to manage different datasets
                  correctly in coordinator -->
@@ -762,7 +762,7 @@
             </property>
             <property>
                 <name>source_table</name>
-                <value>wmf.last_access_uniques_monthly</value>
+                <value>wmf.unique_devices_per_domain_monthly</value>
             </property>
 
             <!-- cassandra loader job stuff -->
diff --git a/oozie/cassandra/coord_unique_devices_daily.properties 
b/oozie/cassandra/coord_unique_devices_daily.properties
index bd66561..c98a4dc 100644
--- a/oozie/cassandra/coord_unique_devices_daily.properties
+++ b/oozie/cassandra/coord_unique_devices_daily.properties
@@ -43,8 +43,8 @@
 send_error_email_workflow_file     = 
${oozie_directory}/util/send_error_email/workflow.xml
 
 # HDFS path to datasets definitions
-last_access_uniques_data_directory = 
${name_node}/wmf/data/wmf/last_access_uniques
-last_access_uniques_datasets_file  = 
${oozie_directory}/last_access_uniques/datasets.xml
+unique_devices_data_directory      = ${name_node}/wmf/data/wmf/unique_devices
+unique_devices_datasets_file       = 
${oozie_directory}/unique_devices/datasets.xml
 dataset_freq                       = day
 
 # Initial import time of the webrequest dataset.
@@ -79,10 +79,10 @@
 constant_output_tid_value          = 0,timeuuid
 
 workflow_file                      = ${workflow_file_daily}
-datasets_file                      = ${last_access_uniques_datasets_file}
-dataset_name                       = last_access_uniques_daily
+datasets_file                      = ${unique_devices_datasets_file}
+dataset_name                       = unique_devices_per_domain_daily
 hive_script                        = unique_devices.hql
-source_table                       = wmf.last_access_uniques_daily
+source_table                       = wmf.unique_devices_per_domain_daily
 cassandra_parallel_loaders         = 1
 cassandra_keyspace                 = local_group_default_T_unique_devices
 cassandra_cql                      = UPDATE "${cassandra_keyspace}"."data" SET 
"devices" = ?
diff --git a/oozie/cassandra/coord_unique_devices_monthly.properties 
b/oozie/cassandra/coord_unique_devices_monthly.properties
index ca047fe..e50dac8 100644
--- a/oozie/cassandra/coord_unique_devices_monthly.properties
+++ b/oozie/cassandra/coord_unique_devices_monthly.properties
@@ -43,8 +43,8 @@
 send_error_email_workflow_file     = 
${oozie_directory}/util/send_error_email/workflow.xml
 
 # HDFS path to datasets definitions
-last_access_uniques_data_directory = 
${name_node}/wmf/data/wmf/last_access_uniques
-last_access_uniques_datasets_file  = 
${oozie_directory}/last_access_uniques/datasets.xml
+unique_devices_data_directory      = ${name_node}/wmf/data/wmf/unique_devices
+unique_devices_datasets_file       = 
${oozie_directory}/unique_devices/datasets.xml
 # provides the frequency of the dataset used to manage
 #different datasets correctly in coordinator
 dataset_freq                       = month
@@ -81,10 +81,10 @@
 constant_output_tid_value          = 0,timeuuid
 
 workflow_file                      = ${workflow_file_monthly}
-datasets_file                      = ${last_access_uniques_datasets_file}
-dataset_name                       = last_access_uniques_monthly
+datasets_file                      = ${unique_devices_datasets_file}
+dataset_name                       = unique_devices_per_domain_monthly
 hive_script                        = unique_devices.hql
-source_table                       = wmf.last_access_uniques_monthly
+source_table                       = wmf.unique_devices_per_domain_monthly
 cassandra_parallel_loaders         = 1
 cassandra_keyspace                 = local_group_default_T_unique_devices
 cassandra_cql                      = UPDATE "${cassandra_keyspace}"."data" SET 
"devices" = ?
diff --git a/oozie/cassandra/daily/unique_devices.hql 
b/oozie/cassandra/daily/unique_devices.hql
index 74d18ba..2c3f043 100644
--- a/oozie/cassandra/daily/unique_devices.hql
+++ b/oozie/cassandra/daily/unique_devices.hql
@@ -9,7 +9,7 @@
 -- Usage:
 --     hive -f unique_devices.hql                                 \
 --         -d destination_directory=/tmp/unique_devices           \
---         -d source_table=wmf.last_access_uniques_daily          \
+--         -d source_table=wmf.unique_devices_per_domain_daily    \
 --         -d separator=\t                                        \
 --         -d year=2016                                           \
 --         -d month=1                                             \
@@ -24,10 +24,10 @@
 WITH unique_devices AS (
     SELECT
         CONCAT(
-            regexp_extract(uri_host, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
-            regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+            regexp_extract(domain, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+            regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
             ) AS project,
-        CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+        CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
             THEN 'mobile-site'
             ELSE 'desktop-site'
             END AS access_site,
@@ -41,10 +41,10 @@
         AND day = ${day}
     GROUP BY
         CONCAT(
-            regexp_extract(uri_host, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
-            regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+            regexp_extract(domain, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+            regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
             ),
-        CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+        CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
             THEN 'mobile-site'
             ELSE 'desktop-site'
             END,
diff --git a/oozie/cassandra/monthly/unique_devices.hql 
b/oozie/cassandra/monthly/unique_devices.hql
index a19935b..29d67e4 100644
--- a/oozie/cassandra/monthly/unique_devices.hql
+++ b/oozie/cassandra/monthly/unique_devices.hql
@@ -8,7 +8,7 @@
 -- Usage:
 --     hive -f unique_devices.hql                                 \
 --         -d destination_directory=/tmp/unique_devices           \
---         -d source_table=wmf.last_access_uniques_monthly        \
+--         -d source_table=wmf.unique_devices_per_domain_monthly  \
 --         -d separator=\t                                        \
 --         -d year=2016                                           \
 --         -d month=1                                             \
@@ -22,10 +22,10 @@
 WITH unique_devices AS (
     SELECT
         CONCAT(
-            regexp_extract(uri_host, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
-            regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+            regexp_extract(domain, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+            regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
             ) AS project,
-        CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+        CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
             THEN 'mobile-site'
             ELSE 'desktop-site'
             END AS access_site,
@@ -38,10 +38,10 @@
         AND month = ${month}
     GROUP BY
         CONCAT(
-            regexp_extract(uri_host, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
-            regexp_extract(uri_host, '([a-z0-9-_]+)\\.org$')
+            regexp_extract(domain, 
'^((?!www)([a-z0-9-_]+)\\.)(m\\.)?\\w+\\.org$'),
+            regexp_extract(domain, '([a-z0-9-_]+)\\.org$')
             ),
-        CASE WHEN uri_host RLIKE '(^(m)\\.)|\\.m\\.'
+        CASE WHEN domain RLIKE '(^(m)\\.)|\\.m\\.'
             THEN 'mobile-site'
             ELSE 'desktop-site'
             END,

-- 
To view, visit https://gerrit.wikimedia.org/r/357871
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I11531dedda20362cf3ee451d1c3d022fee0c57d4
Gerrit-PatchSet: 2
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
Gerrit-Reviewer: Joal <[email protected]>
Gerrit-Reviewer: Nuria <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to