Joal has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/360327 )
Change subject: Rename unique devices project-wide
......................................................................
Rename unique devices project-wide
We have reconsidered the name of the project-wide unique devices
dataset, and think it would be better to name it per_project_class,
where a project_class is, for instance, wikipedia or wikivoyage.
Change-Id: If5298a9ce5ae03483b52fd6fb251a91fadf80abe
---
R
hive/unique_devices/per_project_class/create_unique_devices_per_project_class_daily_table.hql
R
hive/unique_devices/per_project_class/create_unique_devices_per_project_class_monthly_table.hql
M oozie/unique_devices/README.md
M oozie/unique_devices/datasets.xml
R oozie/unique_devices/per_project_class/daily/coordinator.properties
R oozie/unique_devices/per_project_class/daily/coordinator.xml
R
oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily.hql
R
oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily_to_archive.hql
R oozie/unique_devices/per_project_class/daily/workflow.xml
R oozie/unique_devices/per_project_class/druid/daily/coordinator.properties
R oozie/unique_devices/per_project_class/druid/daily/coordinator.xml
A
oozie/unique_devices/per_project_class/druid/daily/generate_druid_unique_devices_per_project_class_daily.hql
R
oozie/unique_devices/per_project_class/druid/daily/load_unique_devices_per_project_class_daily.json.template
R oozie/unique_devices/per_project_class/druid/daily/workflow.xml
R oozie/unique_devices/per_project_class/monthly/coordinator.properties
R oozie/unique_devices/per_project_class/monthly/coordinator.xml
R
oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly.hql
R
oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly_to_archive.hql
R oozie/unique_devices/per_project_class/monthly/workflow.xml
D
oozie/unique_devices/project_wide/druid/daily/generate_druid_unique_devices_project_wide_daily.hql
20 files changed, 204 insertions(+), 204 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/27/360327/1
diff --git
a/hive/unique_devices/project_wide/create_unique_devices_project_wide_daily_table.hql
b/hive/unique_devices/per_project_class/create_unique_devices_per_project_class_daily_table.hql
similarity index 68%
rename from
hive/unique_devices/project_wide/create_unique_devices_project_wide_daily_table.hql
rename to
hive/unique_devices/per_project_class/create_unique_devices_per_project_class_daily_table.hql
index ece0e2e..3440f69 100644
---
a/hive/unique_devices/project_wide/create_unique_devices_project_wide_daily_table.hql
+++
b/hive/unique_devices/per_project_class/create_unique_devices_per_project_class_daily_table.hql
@@ -1,14 +1,14 @@
--- Creates table statement for unique devices project-wide daily table.
+-- Creates table statement for unique devices per project-class daily table.
--
-- Parameters:
-- <none>
--
-- Usage
--- hive -f create_unique_devices_project_wide_daily_table.hql --database
wmf
+-- hive -f create_unique_devices_per_project_class_daily_table.hql
--database wmf
-CREATE EXTERNAL TABLE IF NOT EXISTS `unique_devices_project_wide_daily`(
- `project` string COMMENT 'The lower cased project (wikipedia
for instance)',
+CREATE EXTERNAL TABLE IF NOT EXISTS `unique_devices_per_project_class_daily`(
+ `project_class` string COMMENT 'The lower cased project-class
(wikipedia for instance)',
`country` string COMMENT 'Country name of the accessing
agents (computed using maxmind GeoIP database)',
`country_code` string COMMENT '2 letter country code',
`uniques_underestimate` int COMMENT 'Under estimation of unique devices
seen based on last-access-global cookie, and the nocookies header',
@@ -21,5 +21,5 @@
`day` int COMMENT 'Unpadded day of requests'
)
STORED AS PARQUET
-LOCATION '/wmf/data/wmf/unique_devices/project_wide/daily'
+LOCATION '/wmf/data/wmf/unique_devices/per_project_class/daily'
;
diff --git
a/hive/unique_devices/project_wide/create_unique_devices_project_wide_monthly_table.hql
b/hive/unique_devices/per_project_class/create_unique_devices_per_project_class_monthly_table.hql
similarity index 72%
rename from
hive/unique_devices/project_wide/create_unique_devices_project_wide_monthly_table.hql
rename to
hive/unique_devices/per_project_class/create_unique_devices_per_project_class_monthly_table.hql
index 74251b7..df42ab9 100644
---
a/hive/unique_devices/project_wide/create_unique_devices_project_wide_monthly_table.hql
+++
b/hive/unique_devices/per_project_class/create_unique_devices_per_project_class_monthly_table.hql
@@ -4,11 +4,11 @@
-- <none>
--
-- Usage
--- hive -f create_unique_devices_project_wide_monthly_table.hql --database
wmf
+-- hive -f create_unique_devices_per_project_class_monthly_table.hql
--database wmf
-CREATE EXTERNAL TABLE IF NOT EXISTS `unique_devices_project_wide_monthly`(
- `project` string COMMENT 'The lower cased project (wikipedia
for instance)',
+CREATE EXTERNAL TABLE IF NOT EXISTS `unique_devices_per_project_class_monthly`(
+ `project_class` string COMMENT 'The lower cased project-class
(wikipedia for instance)',
`country` string COMMENT 'Country name of the accessing
agents (computed using maxmind GeoIP database)',
`country_code` string COMMENT '2 letter country code',
`uniques_underestimate` int COMMENT 'Under estimation of unique devices
seen based on last-access-global cookie, and the nocookies header',
@@ -20,5 +20,5 @@
`month` int COMMENT 'Unpadded month of requests'
)
STORED AS PARQUET
-LOCATION '/wmf/data/wmf/unique_devices/project_wide/monthly'
+LOCATION '/wmf/data/wmf/unique_devices/per_project_class/monthly'
;
diff --git a/oozie/unique_devices/README.md b/oozie/unique_devices/README.md
index 2ced214..04cefcb 100644
--- a/oozie/unique_devices/README.md
+++ b/oozie/unique_devices/README.md
@@ -1,9 +1,9 @@
-Contains jobs computing per-domain and project-wide unique devices
+Contains jobs computing per-domain and per-project-class unique devices
based on last-access cookies.
Also contains jobs to load druid with hive-computed values.
-Project-wide jobs are currently launched as:
+Per-project-class jobs are currently launched as:
```
# Daily computation
@@ -13,7 +13,7 @@
-Doozie_launcher_queue_name=production \
-Doozie_launcher_memory=2048 \
-Dstart_time=2015-12-01T00:00Z \
- -config project_wide/daily/coordinator.properties \
+ -config per_project_class/daily/coordinator.properties \
-run
# Monthly computation
@@ -23,7 +23,7 @@
-Doozie_launcher_queue_name=production \
-Doozie_launcher_memory=2048 \
-Dstart_time=2015-12-01T00:00Z \
- -config project_wide/monthly/coordinator.properties \
+ -config per_project_class/monthly/coordinator.properties \
-run
# Daily druid loading
@@ -33,7 +33,7 @@
-Doozie_launcher_queue_name=production \
-Doozie_launcher_memory=2048 \
-Dstart_time=2015-12-01T00:00Z \
- -config project_wide/druid/daily/coordinator.properties \
+ -config per_project_class/druid/daily/coordinator.properties \
-run
```
diff --git a/oozie/unique_devices/datasets.xml
b/oozie/unique_devices/datasets.xml
index 2113ffc..44e0979 100644
--- a/oozie/unique_devices/datasets.xml
+++ b/oozie/unique_devices/datasets.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
-Defines reusable datasets for unique devicess data.
+Defines reusable datasets for unique devices data.
Use this dataset in your coordinator.xml files by setting:
${start_time} - the initial instance of your data.
@@ -13,8 +13,8 @@
<datasets>
<!--
- The unique_devices_project_wide_[daily,monthly] datasets contain
project-wide
- uniques devices per project and country, counted using the last_access
method.
+ The unique_devices_per_project_class_[daily,monthly] datasets contain unique
+ devices per project-class and country, counted using the last_access
method.
Note that we do not use “${...}” but “${"$"}{...}", as dataset files are
passed to EL twice in cascade, and in the first EL level, ${MONTH}
@@ -24,19 +24,19 @@
at “${...}” as input for the second EL level. There, the variables hold
their expected values, and we can start unpadding them.
-->
- <dataset name="unique_devices_project_wide_daily"
+ <dataset name="unique_devices_per_project_class_daily"
frequency="${coord:days(1)}"
initial-instance="${start_time}"
timezone="Universal">
-
<uri-template>${unique_devices_data_directory}/project_wide/daily/year=${YEAR}/month=${"$"}{MONTH
+ 0}/day=${"$"}{DAY + 0}</uri-template>
+
<uri-template>${unique_devices_data_directory}/per_project_class/daily/year=${YEAR}/month=${"$"}{MONTH
+ 0}/day=${"$"}{DAY + 0}</uri-template>
<done-flag>_SUCCESS</done-flag>
</dataset>
- <dataset name="unique_devices_project_wide_monthly"
+ <dataset name="unique_devices_per_project_class_monthly"
frequency="${coord:months(1)}"
initial-instance="${start_time}"
timezone="Universal">
-
<uri-template>${unique_devices_data_directory}/project_wide/monthly/year=${YEAR}/month=${"$"}{MONTH
+ 0}</uri-template>
+
<uri-template>${unique_devices_data_directory}/per_project_class/monthly/year=${YEAR}/month=${"$"}{MONTH
+ 0}</uri-template>
<done-flag>_SUCCESS</done-flag>
</dataset>
diff --git a/oozie/unique_devices/project_wide/daily/coordinator.properties
b/oozie/unique_devices/per_project_class/daily/coordinator.properties
similarity index 75%
rename from oozie/unique_devices/project_wide/daily/coordinator.properties
rename to oozie/unique_devices/per_project_class/daily/coordinator.properties
index e703cad..aaf42f2 100644
--- a/oozie/unique_devices/project_wide/daily/coordinator.properties
+++ b/oozie/unique_devices/per_project_class/daily/coordinator.properties
@@ -1,10 +1,10 @@
-# Configures a coordinator to generate daily project-wide unique devices
+# Configures a coordinator to generate daily per-project-class unique devices
# using last-access method on webrequest table and save a reduced version
# (project aggregates except wikimedia) in archive folders.
#
# Any of the following properties are overridable with -D.
# Usage:
-# oozie job -Duser=$USER -Dstart_time=2017-04-01T00:00Z -submit -config
oozie/unique_devices/project_wide/daily/coordinator.properties
+# oozie job -Duser=$USER -Dstart_time=2017-04-01T00:00Z -submit -config
oozie/unique_devices/per_project_class/daily/coordinator.properties
#
# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
# .xml files exist there when this job is submitted.
@@ -35,21 +35,21 @@
webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
webrequest_datasets_file = ${oozie_directory}/webrequest/datasets.xml
-# unique_devices_project_wide_daily related settings
-unique_devices_project_wide_daily_table =
wmf.unique_devices_project_wide_daily
-unique_devices_data_directory =
${name_node}/wmf/data/wmf/unique_devices
-unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
-unique_devices_project_wide_archive_directory =
${archive_directory}/unique_devices/project_wide
+# unique_devices_per_project_class_daily related settings
+unique_devices_per_project_class_daily_table =
wmf.unique_devices_per_project_class_daily
+unique_devices_data_directory =
${name_node}/wmf/data/wmf/unique_devices
+unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
+unique_devices_per_project_class_archive_directory =
${archive_directory}/unique_devices/per_project_class
# refinery-hive jar path for RedirectToPageview UDF
refinery_hive_jar =
${artifacts_directory}/org/wikimedia/analytics/refinery/refinery-hive-0.0.46.jar
user = hdfs
-workflow_file =
${oozie_directory}/unique_devices/project_wide/daily/workflow.xml
+workflow_file =
${oozie_directory}/unique_devices/per_project_class/daily/workflow.xml
temporary_directory = ${name_node}/tmp
# Coordinator app to run.
oozie.use.system.libpath = true
oozie.action.external.stats.write = true
-oozie.coord.application.path =
${oozie_directory}/unique_devices/project_wide/daily/coordinator.xml
+oozie.coord.application.path =
${oozie_directory}/unique_devices/per_project_class/daily/coordinator.xml
diff --git a/oozie/unique_devices/project_wide/daily/coordinator.xml
b/oozie/unique_devices/per_project_class/daily/coordinator.xml
similarity index 86%
rename from oozie/unique_devices/project_wide/daily/coordinator.xml
rename to oozie/unique_devices/per_project_class/daily/coordinator.xml
index a871da5..2ae3384 100644
--- a/oozie/unique_devices/project_wide/daily/coordinator.xml
+++ b/oozie/unique_devices/per_project_class/daily/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="unique_devices-project_wide-daily-coord"
+ name="unique_devices-per_project_class-daily-coord"
frequency="${coord:days(1)}"
start="${start_time}"
end="${stop_time}"
@@ -22,10 +22,10 @@
<property><name>webrequest_data_directory</name></property>
<property><name>webrequest_datasets_file</name></property>
-
<property><name>unique_devices_project_wide_daily_table</name></property>
+
<property><name>unique_devices_per_project_class_daily_table</name></property>
<property><name>unique_devices_data_directory</name></property>
<property><name>unique_devices_datasets_file</name></property>
-
<property><name>unique_devices_project_wide_archive_directory</name></property>
+
<property><name>unique_devices_per_project_class_archive_directory</name></property>
<property><name>temporary_directory</name></property>
@@ -58,7 +58,7 @@
</input-events>
<output-events>
- <data-out name="unique_devices_project_wide_daily_output"
dataset="unique_devices_project_wide_daily">
+ <data-out name="unique_devices_per_project_class_daily_output"
dataset="unique_devices_per_project_class_daily">
<instance>${coord:current(0)}</instance>
</data-out>
</output-events>
@@ -95,8 +95,8 @@
<value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
</property>
<property>
-
<name>unique_devices_project_wide_daily_current_directory</name>
-
<value>${coord:dataOut('unique_devices_project_wide_daily_output')}</value>
+
<name>unique_devices_per_project_class_daily_current_directory</name>
+
<value>${coord:dataOut('unique_devices_per_project_class_daily_output')}</value>
</property>
</configuration>
</workflow>
diff --git
a/oozie/unique_devices/project_wide/daily/unique_devices_project_wide_daily.hql
b/oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily.hql
similarity index 77%
rename from
oozie/unique_devices/project_wide/daily/unique_devices_project_wide_daily.hql
rename to
oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily.hql
index 206b8e5..48642ca 100644
---
a/oozie/unique_devices/project_wide/daily/unique_devices_project_wide_daily.hql
+++
b/oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily.hql
@@ -1,4 +1,4 @@
--- Generates unique devices project-wide daily based on WMF-Last-Access-global
cookie
+-- Generates unique devices per-project-class daily based on
WMF-Last-Access-global cookie
--
-- Parameters:
-- source_table -- Table containing source data
@@ -9,9 +9,9 @@
-- refinery_hive_jar -- The path to refinery-hive jar for UDF
--
-- Usage:
--- hive -f unique_devices_project_wide_daily.hql \
+-- hive -f unique_devices_per_project_class_daily.hql \
-- -d source_table=wmf.webrequest \
--- -d destination_table=wmf.unique_devices_project_wide_daily \
+-- -d destination_table=wmf.unique_devices_per_project_class_daily \
-- -d year=2017 \
-- -d month=4 \
-- -d day=1 \
@@ -30,7 +30,7 @@
year,
month,
day,
- normalized_host.project_class AS project,
+ normalized_host.project_class AS project_class,
geocoded_data['country'] AS country,
geocoded_data['country_code'] AS country_code,
unix_timestamp(x_analytics_map['WMF-Last-Access-Global'],
'dd-MMM-yyyy') AS last_access_global,
@@ -53,29 +53,29 @@
-- (fresh sessions are not counted with last_access method)
fresh_sessions_aggregated AS (
SELECT
- project,
+ project_class,
country,
country_code,
COUNT(1) AS uniques_offset
FROM (
SELECT
- hash(ip, user_agent, accept_language, project) AS id,
- project,
+ hash(ip, user_agent, accept_language, project_class) AS id,
+ project_class,
country,
country_code,
SUM(CASE WHEN (nocookies IS NOT NULL) THEN 1 ELSE 0 END)
FROM
last_access_dates
GROUP BY
- hash(ip, user_agent, accept_language, project),
- project,
+ hash(ip, user_agent, accept_language, project_class),
+ project_class,
country,
country_code
-- Only keeping clients having done 1 event without cookies
HAVING SUM(CASE WHEN (nocookies IS NOT NULL) THEN 1 ELSE 0 END) = 1
) fresh_sessions
GROUP BY
- project,
+ project_class,
country,
country_code
)
@@ -83,12 +83,12 @@
PARTITION(year = ${year}, month = ${month}, day = ${day})
SELECT
- COALESCE(la.project, fresh.project) as project,
+ COALESCE(la.project_class, fresh.project_class) as project_class,
COALESCE(la.country, fresh.country) as country,
COALESCE(la.country_code, fresh.country_code) as country_code,
SUM(CASE
- -- project set, last-access-global not set and client accept cookies
--> first visit, count
- WHEN (la.project IS NOT NULL AND la.last_access_global IS NULL AND
la.nocookies is NULL) THEN 1
+ -- project_class set, last-access-global not set and client accept
cookies --> first visit, count
+ WHEN (la.project_class IS NOT NULL AND la.last_access_global IS NULL
AND la.nocookies is NULL) THEN 1
-- last-access-global set and its date is before today --> First visit
today, count
WHEN ((la.last_access_global IS NOT NULL)
AND (la.last_access_global < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-', LPAD('${day}', 2, '0')), 'yyyy-MM-dd'))) THEN 1
@@ -97,8 +97,8 @@
END) AS uniques_underestimate,
COALESCE(fresh.uniques_offset, 0) AS uniques_offset,
SUM(CASE
- -- project set, last-access-global not set and client accept cookies
--> first visit, count
- WHEN (la.project IS NOT NULL AND la.last_access_global IS NULL AND
la.nocookies is NULL) THEN 1
+ -- project_class set, last-access-global not set and client accept
cookies --> first visit, count
+ WHEN (la.project_class IS NOT NULL AND la.last_access_global IS NULL
AND la.nocookies is NULL) THEN 1
-- last-access-global set and its date is before today --> First visit
today, count
WHEN ((la.last_access_global IS NOT NULL)
AND (la.last_access_global < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-', LPAD('${day}', 2, '0')), 'yyyy-MM-dd'))) THEN 1
@@ -109,10 +109,10 @@
last_access_dates AS la
-- Outer join to keep every row from both table
FULL OUTER JOIN fresh_sessions_aggregated AS fresh
- ON (la.project = fresh.project
+ ON (la.project_class = fresh.project_class
AND la.country_code = fresh.country_code)
GROUP BY
- COALESCE(la.project, fresh.project),
+ COALESCE(la.project_class, fresh.project_class),
COALESCE(la.country, fresh.country),
COALESCE(la.country_code, fresh.country_code),
COALESCE(fresh.uniques_offset, 0)
diff --git
a/oozie/unique_devices/project_wide/daily/unique_devices_project_wide_daily_to_archive.hql
b/oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily_to_archive.hql
similarity index 80%
rename from
oozie/unique_devices/project_wide/daily/unique_devices_project_wide_daily_to_archive.hql
rename to
oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily_to_archive.hql
index 07eac66..7a38894 100644
---
a/oozie/unique_devices/project_wide/daily/unique_devices_project_wide_daily_to_archive.hql
+++
b/oozie/unique_devices/per_project_class/daily/unique_devices_per_project_class_daily_to_archive.hql
@@ -1,4 +1,4 @@
--- Aggregate unique devices project-wide daily by project and remove wikimedia.
+-- Aggregate unique devices per-project-class daily by project_class and
remove wikimedia.
--
-- Parameters:
-- source_table -- Table containing source data
@@ -8,9 +8,9 @@
-- day -- day of the to-be-generated
--
-- Usage:
--- hive -f unique_devices_project_wide_daily_to_archive.hql \
--- -d source_table=wmf.unique_devices_project_wide_daily \
--- -d
destination_directory=/tmp/archive/unique_devices/project_wide_daily \
+-- hive -f unique_devices_per_project_class_daily_to_archive.hql \
+-- -d source_table=wmf.unique_devices_per_project_class_daily \
+-- -d
destination_directory=/tmp/archive/unique_devices/per_project_class \
-- -d year=2017 \
-- -d month=4 \
-- -d day=1
@@ -28,13 +28,13 @@
-- Set 0 as volume column since we don't use it.
SELECT
CONCAT_WS('\t',
- project,
+ project_class,
cast(uniques_underestimate AS string),
cast(uniques_offset AS string),
cast(uniques_estimate AS string)) AS line
FROM (
SELECT
- project,
+ project_class,
SUM(uniques_underestimate) AS uniques_underestimate,
SUM(uniques_offset) AS uniques_offset,
SUM(uniques_estimate) AS uniques_estimate
@@ -42,9 +42,9 @@
WHERE year=${year}
AND month=${month}
AND day=${day}
- AND project != 'wikimedia'
+ AND project_class != 'wikimedia'
GROUP BY
- project
+ project_class
ORDER BY
uniques_estimate DESC
LIMIT 100000000
diff --git a/oozie/unique_devices/project_wide/daily/workflow.xml
b/oozie/unique_devices/per_project_class/daily/workflow.xml
similarity index 89%
rename from oozie/unique_devices/project_wide/daily/workflow.xml
rename to oozie/unique_devices/per_project_class/daily/workflow.xml
index c75f3dc..d3f109e 100644
--- a/oozie/unique_devices/project_wide/daily/workflow.xml
+++ b/oozie/unique_devices/per_project_class/daily/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
- name="unique_devices-project_wide-daily-wf-${year}-${month}-${day}">
+ name="unique_devices-per_project_class-daily-wf-${year}-${month}-${day}">
<parameters>
<!-- Default values for oozie settings -->
@@ -28,7 +28,7 @@
<description>Hive table to read data from.</description>
</property>
<property>
- <name>unique_devices_project_wide_daily_table</name>
+ <name>unique_devices_per_project_class_daily_table</name>
<description>The table to store project-wide daily unique devices
data in.</description>
</property>
<property>
@@ -60,11 +60,11 @@
<description>Path to refinery-hive jar containing
UDFs</description>
</property>
<property>
- <name>unique_devices_project_wide_daily_current_directory</name>
+
<name>unique_devices_per_project_class_daily_current_directory</name>
<description>Directory where the currently computed data is
stored</description>
</property>
<property>
- <name>unique_devices_project_wide_archive_directory</name>
+ <name>unique_devices_per_project_class_archive_directory</name>
<description>Directory where the archived data is
stored</description>
</property>
<property>
@@ -81,9 +81,9 @@
</property>
</parameters>
- <start to="compute_unique_devices_project_wide_daily"/>
+ <start to="compute_unique_devices_per_project_class_daily"/>
- <action name="compute_unique_devices_project_wide_daily">
+ <action name="compute_unique_devices_per_project_class_daily">
<hive xmlns="uri:oozie:hive-action:0.2">
<job-tracker>${job_tracker}</job-tracker>
<name-node>${name_node}</name-node>
@@ -112,9 +112,9 @@
<value>/tmp/hive-${user}</value>
</property>
</configuration>
- <script>unique_devices_project_wide_daily.hql</script>
+ <script>unique_devices_per_project_class_daily.hql</script>
<param>source_table=${webrequest_table}</param>
-
<param>destination_table=${unique_devices_project_wide_daily_table}</param>
+
<param>destination_table=${unique_devices_per_project_class_daily_table}</param>
<param>year=${year}</param>
<param>month=${month}</param>
<param>day=${day}</param>
@@ -130,7 +130,7 @@
<configuration>
<property>
<name>directory</name>
-
<value>${unique_devices_project_wide_daily_current_directory}</value>
+
<value>${unique_devices_per_project_class_daily_current_directory}</value>
</property>
</configuration>
</sub-workflow>
@@ -168,10 +168,10 @@
</property>
</configuration>
- <script>unique_devices_project_wide_daily_to_archive.hql</script>
+
<script>unique_devices_per_project_class_daily_to_archive.hql</script>
<!-- Here, the source for archive is the
destination of the previous job -->
-
<param>source_table=${unique_devices_project_wide_daily_table}</param>
+
<param>source_table=${unique_devices_per_project_class_daily_table}</param>
<param>year=${year}</param>
<param>month=${month}</param>
<param>day=${day}</param>
@@ -211,7 +211,7 @@
</property>
<property>
<name>archive_file</name>
-
<value>${unique_devices_project_wide_archive_directory}/${padded_year}/${padded_year}-${padded_month}/unique_devices_project_wide_daily-${padded_year}-${padded_month}-${padded_day}.gz</value>
+
<value>${unique_devices_per_project_class_archive_directory}/${padded_year}/${padded_year}-${padded_month}/unique_devices_per_project_class_daily-${padded_year}-${padded_month}-${padded_day}.gz</value>
</property>
</configuration>
</sub-workflow>
diff --git
a/oozie/unique_devices/project_wide/druid/daily/coordinator.properties
b/oozie/unique_devices/per_project_class/druid/daily/coordinator.properties
similarity index 81%
rename from oozie/unique_devices/project_wide/druid/daily/coordinator.properties
rename to
oozie/unique_devices/per_project_class/druid/daily/coordinator.properties
index 3abd6ac..ee0699a 100644
--- a/oozie/unique_devices/project_wide/druid/daily/coordinator.properties
+++ b/oozie/unique_devices/per_project_class/druid/daily/coordinator.properties
@@ -1,9 +1,9 @@
-# Configures a coordinator to generate a day of JSON daily project-wide
+# Configures a coordinator to generate a day of JSON daily per-project-class
# unique devices and load them in druid.
#
# Any of the following properties are override-able with -D.
# Usage:
-# oozie job -Dstart_time=2017-04-01T00:00Z -submit -config
oozie/uniques_devices/project_wide/druid/daily/coordinator.properties
+# oozie job -Dstart_time=2017-04-01T00:00Z -submit -config
oozie/unique_devices/per_project_class/druid/daily/coordinator.properties
#
# NOTE: The $refinery_directory must be synced to HDFS so that all relevant
# .xml files exist there when this job is submitted.
@@ -28,16 +28,16 @@
hive_site_xml = ${name_node}/user/hive/hive-site.xml
# HDFS path to coordinator to run.
-coordinator_file =
${oozie_directory}/unique_devices/project_wide/druid/daily/coordinator.xml
+coordinator_file =
${oozie_directory}/unique_devices/per_project_class/druid/daily/coordinator.xml
# HDFS path to workflow to run.
-workflow_file =
${oozie_directory}/unique_devices/project_wide/druid/daily/workflow.xml
+workflow_file =
${oozie_directory}/unique_devices/per_project_class/druid/daily/workflow.xml
-# unique devices project-wide settings
+# unique devices per-project-class settings
unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
unique_devices_data_directory = ${name_node}/wmf/data/wmf/unique_devices
-unique_devices_project_wide_daily_table =
wmf.unique_devices_project_wide_daily
+unique_devices_per_project_class_daily_table =
wmf.unique_devices_per_project_class_daily
-# Initial import time of the unique_devices_project_wide_daily dataset.
+# Initial import time of the unique_devices_per_project_class_daily dataset.
start_time = 2017-04-01T00:00Z
# Time to stop running this coordinator. Year 3000 == never!
@@ -47,7 +47,7 @@
temporary_directory = ${name_node}/tmp
# HDFS path to template to use.
-druid_template_file =
${oozie_directory}/unique_devices/project_wide/druid/daily/load_unique_devices_project_wide_daily.json.template
+druid_template_file =
${oozie_directory}/unique_devices/per_project_class/druid/daily/load_unique_devices_per_project_class_daily.json.template
# Druid overlord url
druid_overlord_url = http://druid1001.eqiad.wmnet:8090
diff --git a/oozie/unique_devices/project_wide/druid/daily/coordinator.xml
b/oozie/unique_devices/per_project_class/druid/daily/coordinator.xml
similarity index 90%
rename from oozie/unique_devices/project_wide/druid/daily/coordinator.xml
rename to oozie/unique_devices/per_project_class/druid/daily/coordinator.xml
index c135d97..136d6a9 100644
--- a/oozie/unique_devices/project_wide/druid/daily/coordinator.xml
+++ b/oozie/unique_devices/per_project_class/druid/daily/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="unique_devices-project_wide-druid-daily-coord"
+ name="unique_devices-per_project_class-druid-daily-coord"
frequency="${coord:days(1)}"
start="${start_time}"
end="${stop_time}"
@@ -17,7 +17,7 @@
<property><name>workflow_file</name></property>
<property><name>unique_devices_datasets_file</name></property>
<property><name>unique_devices_data_directory</name></property>
-
<property><name>unique_devices_project_wide_daily_table</name></property>
+
<property><name>unique_devices_per_project_class_daily_table</name></property>
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
@@ -49,7 +49,7 @@
</datasets>
<input-events>
- <data-in name="unique_devices_project_wide_daily_input"
dataset="unique_devices_project_wide_daily">
+ <data-in name="unique_devices_per_project_class_daily_input"
dataset="unique_devices_per_project_class_daily">
<!-- Dataset uses day as frequency -->
<instance>${coord:current(0)}</instance>
</data-in>
diff --git
a/oozie/unique_devices/per_project_class/druid/daily/generate_druid_unique_devices_per_project_class_daily.hql
b/oozie/unique_devices/per_project_class/druid/daily/generate_druid_unique_devices_per_project_class_daily.hql
new file mode 100644
index 0000000..bebff9a
--- /dev/null
+++
b/oozie/unique_devices/per_project_class/druid/daily/generate_druid_unique_devices_per_project_class_daily.hql
@@ -0,0 +1,55 @@
+-- Extracts one day of json formatted daily unique devices
+-- per-project-class to be loaded in Druid
+--
+-- Usage:
+-- hive -f generate_druid_unique_devices_per_project_class_daily.hql \
+-- -d source_table=wmf.unique_devices_per_project_class_daily \
+-- -d
destination_directory=/tmp/druid/unique_devices_per_project_class_daily_json \
+-- -d year=2017 \
+-- -d month=4 \
+-- -d day=1
+--
+
+SET hive.exec.compress.output=true;
+SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+
+
+ADD JAR /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;
+
+
+DROP TABLE IF EXISTS
tmp_druid_unique_devices_per_project_class_daily_${year}_${month}_${day};
+
+
+CREATE EXTERNAL TABLE IF NOT EXISTS
tmp_druid_unique_devices_per_project_class_daily_${year}_${month}_${day} (
+ `dt` string,
+ `project_class` string,
+ `country` string,
+ `country_code` string,
+ `uniques_underestimate` bigint,
+ `uniques_offset` bigint,
+ `uniques_estimate` bigint
+)
+ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
+STORED AS TEXTFILE
+LOCATION '${destination_directory}';
+
+
+INSERT OVERWRITE TABLE
tmp_druid_unique_devices_per_project_class_daily_${year}_${month}_${day}
+SELECT
+ CONCAT(
+ LPAD(year, 4, '0'), '-',
+ LPAD(month, 2, '0'), '-',
+ LPAD(day, 2, '0'), 'T00:00:00Z') AS dt,
+ project_class AS project_class,
+ country AS country,
+ country_code AS country_code,
+ uniques_underestimate AS uniques_underestimate,
+ uniques_offset AS uniques_offset,
+ uniques_estimate AS uniques_estimate
+FROM ${source_table}
+WHERE year = ${year}
+ AND month = ${month}
+ AND day = ${day};
+
+
+DROP TABLE IF EXISTS
tmp_druid_unique_devices_per_project_class_daily_${year}_${month}_${day};
diff --git
a/oozie/unique_devices/project_wide/druid/daily/load_unique_devices_project_wide_daily.json.template
b/oozie/unique_devices/per_project_class/druid/daily/load_unique_devices_per_project_class_daily.json.template
similarity index 93%
rename from
oozie/unique_devices/project_wide/druid/daily/load_unique_devices_project_wide_daily.json.template
rename to
oozie/unique_devices/per_project_class/druid/daily/load_unique_devices_per_project_class_daily.json.template
index 2ed1ed0..3b333e4 100644
---
a/oozie/unique_devices/project_wide/druid/daily/load_unique_devices_project_wide_daily.json.template
+++
b/oozie/unique_devices/per_project_class/druid/daily/load_unique_devices_per_project_class_daily.json.template
@@ -9,7 +9,7 @@
}
},
"dataSchema" : {
- "dataSource" : "unique-devices-project-wide-daily",
+ "dataSource" : "unique-devices-per_project_class-daily",
"granularitySpec" : {
"type" : "uniform",
"segmentGranularity" : "day",
@@ -22,7 +22,7 @@
"format" : "json",
"dimensionsSpec" : {
"dimensions" : [
- "project",
+ "project_class",
"country",
"country_code"
]
diff --git a/oozie/unique_devices/project_wide/druid/daily/workflow.xml
b/oozie/unique_devices/per_project_class/druid/daily/workflow.xml
similarity index 87%
rename from oozie/unique_devices/project_wide/druid/daily/workflow.xml
rename to oozie/unique_devices/per_project_class/druid/daily/workflow.xml
index 9c427f8..8893212 100644
--- a/oozie/unique_devices/project_wide/druid/daily/workflow.xml
+++ b/oozie/unique_devices/per_project_class/druid/daily/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
- name="unique_devices-project_wide-druid-daily-wf-${year}-${month}-${day}">
+
name="unique_devices-per_project_class-druid-daily-wf-${year}-${month}-${day}">
<parameters>
@@ -25,7 +25,7 @@
</property>
<property>
- <name>unique_devices_project_wide_daily_table</name>
+ <name>unique_devices_per_project_class_daily_table</name>
<description>The hive unique devices daily table to
use</description>
</property>
@@ -73,9 +73,9 @@
</parameters>
- <start to="generate_json_unique_devices_project_wide_daily"/>
+ <start to="generate_json_unique_devices_per_project_class_daily"/>
- <action name="generate_json_unique_devices_project_wide_daily">
+ <action name="generate_json_unique_devices_per_project_class_daily">
<hive xmlns="uri:oozie:hive-action:0.2">
<job-tracker>${job_tracker}</job-tracker>
<name-node>${name_node}</name-node>
@@ -105,24 +105,24 @@
<value>/tmp/hive-${user}</value>
</property>
</configuration>
-
<script>generate_druid_unique_devices_project_wide_daily.hql</script>
-
<param>source_table=${unique_devices_project_wide_daily_table}</param>
-
<param>destination_directory=${temporary_directory}/${wf:id()}-druid-unique_devices-project_wide-daily-${year}-${month}-${day}</param>
+
<script>generate_druid_unique_devices_per_project_class_daily.hql</script>
+
<param>source_table=${unique_devices_per_project_class_daily_table}</param>
+
<param>destination_directory=${temporary_directory}/${wf:id()}-druid-unique_devices-per_project_class-daily-${year}-${month}-${day}</param>
<param>year=${year}</param>
<param>month=${month}</param>
<param>day=${day}</param>
</hive>
- <ok to="mark_json_unique_devices_project_wide_daily_dataset_done" />
+ <ok to="mark_json_unique_devices_per_project_class_daily_dataset_done"
/>
<error to="send_error_email" />
</action>
- <action name="mark_json_unique_devices_project_wide_daily_dataset_done">
+ <action
name="mark_json_unique_devices_per_project_class_daily_dataset_done">
<sub-workflow>
<app-path>${mark_directory_done_workflow_file}</app-path>
<configuration>
<property>
<name>directory</name>
-
<value>${temporary_directory}/${wf:id()}-druid-unique_devices-project_wide-daily-${year}-${month}-${day}</value>
+
<value>${temporary_directory}/${wf:id()}-druid-unique_devices-per_project_class-daily-${year}-${month}-${day}</value>
</property>
</configuration>
</sub-workflow>
@@ -138,7 +138,7 @@
<configuration>
<property>
<name>source_directory</name>
-
<value>${temporary_directory}/${wf:id()}-druid-unique_devices-project_wide-daily-${year}-${month}-${day}</value>
+
<value>${temporary_directory}/${wf:id()}-druid-unique_devices-per_project_class-daily-${year}-${month}-${day}</value>
</property>
<property>
<name>template_file</name>
@@ -160,7 +160,7 @@
<action name="remove_temporary_data">
<fs>
- <delete
path="${temporary_directory}/${wf:id()}-druid-unique_devices-project_wide-daily-${year}-${month}-${day}"/>
+ <delete
path="${temporary_directory}/${wf:id()}-druid-unique_devices-per_project_class-daily-${year}-${month}-${day}"/>
</fs>
<ok to="end"/>
<error to="send_error_email"/>
diff --git a/oozie/unique_devices/project_wide/monthly/coordinator.properties
b/oozie/unique_devices/per_project_class/monthly/coordinator.properties
similarity index 78%
rename from oozie/unique_devices/project_wide/monthly/coordinator.properties
rename to oozie/unique_devices/per_project_class/monthly/coordinator.properties
index 329f3d0..4968a19 100644
--- a/oozie/unique_devices/project_wide/monthly/coordinator.properties
+++ b/oozie/unique_devices/per_project_class/monthly/coordinator.properties
@@ -1,10 +1,10 @@
-# Configures a coordinator to generate monthly project-wide unique devices
+# Configures a coordinator to generate monthly per-project-class unique devices
# using last_access method on webrequest table and save a reduced version
-# (project aggregates without wikimedia) in archive folders.
+# (project-class aggregates without wikimedia) in archive folders.
#
# Any of the following properties are overridable with -D.
# Usage:
-# oozie job -Duser=$USER -Dstart_time=2017-04-01T00:00Z -submit -config
oozie/unique_devices/project_wide/monthly/coordinator.properties
+# oozie job -Duser=$USER -Dstart_time=2017-04-01T00:00Z -submit -config
oozie/unique_devices/per_project_class/monthly/coordinator.properties
#
# NOTE: The $oozie_directory must be synced to HDFS so that all relevant
# .xml files exist there when this job is submitted.
@@ -32,11 +32,11 @@
webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
webrequest_datasets_file = ${oozie_directory}/webrequest/datasets.xml
-# uniques devices project_wide monthly settings
-unique_devices_project_wide_monthly_table =
wmf.unique_devices_project_wide_monthly
+# unique devices per_project_class monthly settings
+unique_devices_per_project_class_monthly_table =
wmf.unique_devices_per_project_class_monthly
unique_devices_data_directory =
${name_node}/wmf/data/wmf/unique_devices
unique_devices_datasets_file =
${oozie_directory}/unique_devices/datasets.xml
-unique_devices_project_wide_archive_directory =
${archive_directory}/unique_devices/project_wide
+unique_devices_per_project_class_archive_directory =
${archive_directory}/unique_devices/per_project_class
# refinery-hive jar path for RedirectToPageview UDF
refinery_hive_jar =
${artifacts_directory}/org/wikimedia/analytics/refinery/refinery-hive-0.0.46.jar
@@ -44,9 +44,9 @@
temporary_directory = ${name_node}/tmp
user = hdfs
-workflow_file =
${oozie_directory}/unique_devices/project_wide/monthly/workflow.xml
+workflow_file =
${oozie_directory}/unique_devices/per_project_class/monthly/workflow.xml
# Coordinator app to run.
oozie.use.system.libpath = true
oozie.action.external.stats.write = true
-oozie.coord.application.path =
${oozie_directory}/unique_devices/project_wide/monthly/coordinator.xml
+oozie.coord.application.path =
${oozie_directory}/unique_devices/per_project_class/monthly/coordinator.xml
diff --git a/oozie/unique_devices/project_wide/monthly/coordinator.xml
b/oozie/unique_devices/per_project_class/monthly/coordinator.xml
similarity index 84%
rename from oozie/unique_devices/project_wide/monthly/coordinator.xml
rename to oozie/unique_devices/per_project_class/monthly/coordinator.xml
index 694a6ac..217086e 100644
--- a/oozie/unique_devices/project_wide/monthly/coordinator.xml
+++ b/oozie/unique_devices/per_project_class/monthly/coordinator.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<coordinator-app xmlns="uri:oozie:coordinator:0.4"
- name="unique_devices-project_wide-monthly-coord"
+ name="unique_devices-per_project_class-monthly-coord"
frequency="${coord:months(1)}"
start="${start_time}"
end="${stop_time}"
@@ -22,10 +22,10 @@
<property><name>webrequest_data_directory</name></property>
<property><name>webrequest_datasets_file</name></property>
-
<property><name>unique_devices_project_wide_monthly_table</name></property>
+
<property><name>unique_devices_per_project_class_monthly_table</name></property>
<property><name>unique_devices_data_directory</name></property>
<property><name>unique_devices_datasets_file</name></property>
-
<property><name>unique_devices_project_wide_archive_directory</name></property>
+
<property><name>unique_devices_per_project_class_archive_directory</name></property>
<property><name>temporary_directory</name></property>
@@ -58,7 +58,7 @@
</input-events>
<output-events>
- <data-out name="unique_devices_project_wide_monthly_output"
dataset="unique_devices_project_wide_monthly">
+ <data-out name="unique_devices_per_project_class_monthly_output"
dataset="unique_devices_per_project_class_monthly">
<instance>${coord:current(0)}</instance>
</data-out>
</output-events>
@@ -86,8 +86,8 @@
<value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
</property>
<property>
-
<name>unique_devices_project_wide_monthly_current_directory</name>
-
<value>${coord:dataOut('unique_devices_project_wide_monthly_output')}</value>
+
<name>unique_devices_per_project_class_monthly_current_directory</name>
+
<value>${coord:dataOut('unique_devices_per_project_class_monthly_output')}</value>
</property>
</configuration>
</workflow>
diff --git
a/oozie/unique_devices/project_wide/monthly/unique_devices_project_wide_monthly.hql
b/oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly.hql
similarity index 76%
rename from
oozie/unique_devices/project_wide/monthly/unique_devices_project_wide_monthly.hql
rename to
oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly.hql
index 5ddf6cb..1ba32d8 100644
---
a/oozie/unique_devices/project_wide/monthly/unique_devices_project_wide_monthly.hql
+++
b/oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly.hql
@@ -1,4 +1,4 @@
--- Generates unique devices project-wide monthly based on
WMF-Last-Access-global cookie
+-- Generates unique devices per-project-class monthly based on
WMF-Last-Access-global cookie
--
-- Parameters:
-- source_table -- Table containing source data
@@ -8,9 +8,9 @@
-- refinery_hive_jar -- The path to refinery-hive jar for UDF
--
-- Usage
--- hive -f unique_devices_project_wide_monthly.hql \
+-- hive -f unique_devices_per_project_class_monthly.hql \
-- -d source_table=wmf.webrequest \
--- -d destination_table=wmf.unique_devices_project_wide_monthly \
+-- -d destination_table=wmf.unique_devices_per_project_class_monthly \
-- -d year=2017 \
-- -d month=4 \
-- -d
refinery_hive_jar=/wmf/refinery/current/artifacts/refinery-hive.jar
@@ -29,7 +29,7 @@
SELECT
year,
month,
- normalized_host.project_class AS project,
+ normalized_host.project_class AS project_class,
geocoded_data['country'] AS country,
geocoded_data['country_code'] AS country_code,
unix_timestamp(x_analytics_map['WMF-Last-Access-Global'],
'dd-MMM-yyyy') AS last_access_global,
@@ -51,29 +51,29 @@
-- (fresh sessions are not counted with last_access method)
fresh_sessions_aggregated AS (
SELECT
- project,
+ project_class,
country,
country_code,
COUNT(1) AS uniques_offset
FROM (
SELECT
- hash(ip, user_agent, accept_language, project) AS id,
- project,
+ hash(ip, user_agent, accept_language, project_class) AS id,
+ project_class,
country,
country_code,
SUM(CASE WHEN (nocookies IS NOT NULL) THEN 1 ELSE 0 END)
FROM
last_access_dates
GROUP BY
- hash(ip, user_agent, accept_language, project),
- project,
+ hash(ip, user_agent, accept_language, project_class),
+ project_class,
country,
country_code
-- Only keeping clients having done 1 event without cookies
HAVING SUM(CASE WHEN (nocookies IS NOT NULL) THEN 1 ELSE 0 END) = 1
) fresh_sessions
GROUP BY
- project,
+ project_class,
country,
country_code
)
@@ -82,12 +82,12 @@
PARTITION(year = ${year}, month = ${month})
SELECT
- COALESCE(la.project, fresh.project) as project,
+ COALESCE(la.project_class, fresh.project_class) as project_class,
COALESCE(la.country, fresh.country) as country,
COALESCE(la.country_code, fresh.country_code) as country_code,
SUM(CASE
- -- project set, last-access-global not set and client accept cookies
--> first visit, count
- WHEN (la.project IS NOT NULL AND la.last_access_global IS NULL AND
la.nocookies is NULL) THEN 1
+ -- project_class set, last-access-global not set and client accept
cookies --> first visit, count
+ WHEN (la.project_class IS NOT NULL AND la.last_access_global IS NULL
AND la.nocookies is NULL) THEN 1
-- last-access-global set and its date is before today --> First visit
today, count
WHEN ((la.last_access_global IS NOT NULL)
AND (la.last_access_global < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-','01'), 'yyyy-MM-dd'))) THEN 1
@@ -96,8 +96,8 @@
END) AS uniques_underestimate,
COALESCE(fresh.uniques_offset, 0) AS uniques_offset,
SUM(CASE
- -- project set, last-access-global not set and client accept cookies
--> first visit, count
- WHEN (la.project IS NOT NULL AND la.last_access_global IS NULL AND
la.nocookies is NULL) THEN 1
+ -- project_class set, last-access-global not set and client accept
cookies --> first visit, count
+ WHEN (la.project_class IS NOT NULL AND la.last_access_global IS NULL
AND la.nocookies is NULL) THEN 1
-- last-access-global set and its date is before today --> First visit
today, count
WHEN ((la.last_access_global IS NOT NULL)
AND (la.last_access_global < unix_timestamp(CONCAT('${year}-',
LPAD('${month}', 2, '0'), '-','01'), 'yyyy-MM-dd'))) THEN 1
@@ -108,9 +108,9 @@
last_access_dates AS la
-- Outer join to keep every row from both table
FULL OUTER JOIN fresh_sessions_aggregated AS fresh
- ON (la.project = fresh.project AND la.country_code =
fresh.country_code)
+ ON (la.project_class = fresh.project_class AND la.country_code =
fresh.country_code)
GROUP BY
- COALESCE(la.project, fresh.project),
+ COALESCE(la.project_class, fresh.project_class),
COALESCE(la.country, fresh.country),
COALESCE(la.country_code, fresh.country_code),
COALESCE(fresh.uniques_offset, 0)
diff --git
a/oozie/unique_devices/project_wide/monthly/unique_devices_project_wide_monthly_to_archive.hql
b/oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly_to_archive.hql
similarity index 79%
rename from
oozie/unique_devices/project_wide/monthly/unique_devices_project_wide_monthly_to_archive.hql
rename to
oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly_to_archive.hql
index b6069b1..8b512b2 100644
---
a/oozie/unique_devices/project_wide/monthly/unique_devices_project_wide_monthly_to_archive.hql
+++
b/oozie/unique_devices/per_project_class/monthly/unique_devices_per_project_class_monthly_to_archive.hql
@@ -1,4 +1,4 @@
--- Aggregate unique devices project-wide monthly by project and remove
wikimedia.
+-- Aggregate unique devices per-project-class monthly by project and remove
wikimedia.
--
-- Parameters:
-- source_table -- Table containing source data
@@ -7,9 +7,9 @@
-- month -- month of the to-be-generated
--
-- Usage:
--- hive -f unique_devices_project_wide_monthly_to_archive.hql \
--- -d source_table=wmf.unique_devices_project_wide_monthly \
--- -d destination_directory=/tmp/archive/unique_devices/project_wide \
+-- hive -f unique_devices_per_project_class_monthly_to_archive.hql \
+-- -d source_table=wmf.unique_devices_per_project_class_monthly \
+-- -d
destination_directory=/tmp/archive/unique_devices/per_project_class \
-- -d year=2017 \
-- -d month=4
@@ -26,22 +26,22 @@
-- Set 0 as volume column since we don't use it.
SELECT
CONCAT_WS('\t',
- project,
+ project_class,
cast(uniques_underestimate AS string),
cast(uniques_offset AS string),
cast(uniques_estimate AS string)) AS line
FROM (
SELECT
- project,
+ project_class,
SUM(uniques_underestimate) AS uniques_underestimate,
SUM(uniques_offset) AS uniques_offset,
SUM(uniques_estimate) AS uniques_estimate
FROM ${source_table}
WHERE year=${year}
AND month=${month}
- AND project != 'wikimedia'
+ AND project_class != 'wikimedia'
GROUP BY
- project
+ project_class
ORDER BY
uniques_estimate DESC
LIMIT 100000000
diff --git a/oozie/unique_devices/project_wide/monthly/workflow.xml
b/oozie/unique_devices/per_project_class/monthly/workflow.xml
similarity index 88%
rename from oozie/unique_devices/project_wide/monthly/workflow.xml
rename to oozie/unique_devices/per_project_class/monthly/workflow.xml
index c4cff08..4b11bcd 100644
--- a/oozie/unique_devices/project_wide/monthly/workflow.xml
+++ b/oozie/unique_devices/per_project_class/monthly/workflow.xml
@@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<workflow-app xmlns="uri:oozie:workflow:0.4"
- name="unique_devices-project_wide-monthly-wf-${year}-${month}">
+ name="unique_devices-per_project_class-monthly-wf-${year}-${month}">
<parameters>
<!-- Default values for oozie settings -->
@@ -28,7 +28,7 @@
<description>Hive table to read data from.</description>
</property>
<property>
- <name>unique_devices_project_wide_monthly_table</name>
+ <name>unique_devices_per_project_class_monthly_table</name>
<description>The table to store project-wide monthly unique
devices data in.</description>
</property>
<property>
@@ -52,11 +52,11 @@
<description>Path to refinery-hive jar containing
UDFs</description>
</property>
<property>
- <name>unique_devices_project_wide_monthly_current_directory</name>
+
<name>unique_devices_per_project_class_monthly_current_directory</name>
<description>Directory where the currently computed data is
stored</description>
</property>
<property>
- <name>unique_devices_project_wide_archive_directory</name>
+ <name>unique_devices_per_project_class_archive_directory</name>
<description>Directory where the archived data is
stored</description>
</property>
@@ -74,9 +74,9 @@
</property>
</parameters>
- <start to="compute_unique_devices_project_wide_monthly"/>
+ <start to="compute_unique_devices_per_project_class_monthly"/>
- <action name="compute_unique_devices_project_wide_monthly">
+ <action name="compute_unique_devices_per_project_class_monthly">
<hive xmlns="uri:oozie:hive-action:0.2">
<job-tracker>${job_tracker}</job-tracker>
<name-node>${name_node}</name-node>
@@ -105,9 +105,9 @@
<value>/tmp/hive-${user}</value>
</property>
</configuration>
- <script>unique_devices_project_wide_monthly.hql</script>
+ <script>unique_devices_per_project_class_monthly.hql</script>
<param>source_table=${webrequest_table}</param>
-
<param>destination_table=${unique_devices_project_wide_monthly_table}</param>
+
<param>destination_table=${unique_devices_per_project_class_monthly_table}</param>
<param>year=${year}</param>
<param>month=${month}</param>
<param>refinery_hive_jar=${refinery_hive_jar}</param>
@@ -122,7 +122,7 @@
<configuration>
<property>
<name>directory</name>
-
<value>${unique_devices_project_wide_monthly_current_directory}</value>
+
<value>${unique_devices_per_project_class_monthly_current_directory}</value>
</property>
</configuration>
</sub-workflow>
@@ -161,10 +161,10 @@
</property>
</configuration>
- <script>unique_devices_project_wide_monthly_to_archive.hql</script>
+
<script>unique_devices_per_project_class_monthly_to_archive.hql</script>
<!-- Here, the source for archive is the
destination of the previous job -->
-
<param>source_table=${unique_devices_project_wide_monthly_table}</param>
+
<param>source_table=${unique_devices_per_project_class_monthly_table}</param>
<param>year=${year}</param>
<param>month=${month}</param>
<param>destination_directory=${temporary_directory}/${wf:id()}</param>
@@ -203,7 +203,7 @@
</property>
<property>
<name>archive_file</name>
-
<value>${unique_devices_project_wide_archive_directory}/${padded_year}/${padded_year}-${padded_month}/unique_devices_project_wide_monthly-${padded_year}-${padded_month}.gz</value>
+
<value>${unique_devices_per_project_class_archive_directory}/${padded_year}/${padded_year}-${padded_month}/unique_devices_per_project_class_monthly-${padded_year}-${padded_month}.gz</value>
</property>
</configuration>
</sub-workflow>
diff --git
a/oozie/unique_devices/project_wide/druid/daily/generate_druid_unique_devices_project_wide_daily.hql
b/oozie/unique_devices/project_wide/druid/daily/generate_druid_unique_devices_project_wide_daily.hql
deleted file mode 100644
index ba892fa..0000000
---
a/oozie/unique_devices/project_wide/druid/daily/generate_druid_unique_devices_project_wide_daily.hql
+++ /dev/null
@@ -1,55 +0,0 @@
--- Extracts one day of json formatted daily unique devices
--- project-wide to be loaded in Druid
---
--- Usage:
--- hive -f generate_druid_unique_devices_project_wide_daily.hql \
--- -d source_table=wmf.unique_devices_project_wide_daily \
--- -d
destination_directory=/tmp/druid/unique_devices_project_wide_daily_json \
--- -d year=2017 \
--- -d month=4 \
--- -d day=1
---
-
-SET hive.exec.compress.output=true;
-SET
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
-
-
-ADD JAR /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar;
-
-
-DROP TABLE IF EXISTS
tmp_druid_unique_devices_project_wide_daily_${year}_${month}_${day};
-
-
-CREATE EXTERNAL TABLE IF NOT EXISTS
tmp_druid_unique_devices_project_wide_daily_${year}_${month}_${day} (
- `dt` string,
- `project` string,
- `country` string,
- `country_code` string,
- `uniques_underestimate` bigint,
- `uniques_offset` bigint,
- `uniques_estimate` bigint
-)
-ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
-STORED AS TEXTFILE
-LOCATION '${destination_directory}';
-
-
-INSERT OVERWRITE TABLE
tmp_druid_unique_devices_project_wide_daily_${year}_${month}_${day}
-SELECT
- CONCAT(
- LPAD(year, 4, '0'), '-',
- LPAD(month, 2, '0'), '-',
- LPAD(day, 2, '0'), 'T00:00:00Z') AS dt,
- project AS project,
- country AS country,
- country_code AS country_code,
- uniques_underestimate AS uniques_underestimate,
- uniques_offset AS uniques_offset,
- uniques_estimate AS uniques_estimate
-FROM ${source_table}
-WHERE year = ${year}
- AND month = ${month}
- AND day = ${day};
-
-
-DROP TABLE IF EXISTS
tmp_druid_unique_devices_project_wide_daily_${year}_${month}_${day};
--
To view, visit https://gerrit.wikimedia.org/r/360327
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If5298a9ce5ae03483b52fd6fb251a91fadf80abe
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Joal <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits