Hello Ottomata,
I'd like you to do a code review. Please visit
https://gerrit.wikimedia.org/r/185781
to review the following change.
Change subject: Switch pagecounts-all-sites/archive to raw datasets
......................................................................
Switch pagecounts-all-sites/archive to raw datasets
Since commit
e8523e431a8eec10b88a623374f79e859feee9cf,
the refined datasets come without quality guarantees. This makes them
unusable for production jobs. Hence, we switch pagecounts-all-sites
back to the raw webrequests, so that we can detect issues and get a
chance to fix them before computing the datasets.
Change-Id: If7f3a2e8e4b7ed22174afe6a3f2f8d910585c9ee
---
M diagrams/oozie-overview.dia
M oozie/pagecounts-all-sites/load/coordinator.properties
M oozie/pagecounts-all-sites/load/coordinator.xml
3 files changed, 12 insertions(+), 10 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery
refs/changes/81/185781/1
diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index 360b1f5..5910071 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git a/oozie/pagecounts-all-sites/load/coordinator.properties
b/oozie/pagecounts-all-sites/load/coordinator.properties
index 375cb76..9bf5568 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.properties
+++ b/oozie/pagecounts-all-sites/load/coordinator.properties
@@ -21,7 +21,7 @@
workflow_file =
${oozie_directory}/pagecounts-all-sites/load/workflow.xml
# HDFS path to webrequest dataset definition
-webrequest_datasets_file =
${oozie_directory}/webrequest/datasets.xml
+webrequest_raw_datasets_file =
${oozie_directory}/webrequest/datasets_raw.xml
# HDFS path to pagecounts-all-sites dataset definition
pagecounts_all_sites_datasets_file =
${oozie_directory}/pagecounts-all-sites/datasets.xml
@@ -39,13 +39,15 @@
hive_site_xml =
${oozie_directory}/util/hive/hive-site.xml
# Table to read webrequests from (fully qualified)
-webrequest_table = wmf.webrequest
+# (We're using the raw table for now, as the refined ones come without
+# quality guarantees)
+webrequest_raw_table = wmf_raw.webrequest
# Table to write hourly pagecounts to (fully qualified)
pagecounts_all_sites_table = wmf.pagecounts_all_sites
# HDFS paths to directories where webrequest data is time bucketed.
-webrequest_data_directory = ${name_node}/wmf/data/wmf/webrequest
+webrequest_raw_data_directory = ${name_node}/wmf/data/raw/webrequest
# HDFS path to directory where pagecounts-all-sites data is time bucketed.
pagecounts_all_sites_data_directory =
${name_node}/wmf/data/wmf/pagecounts-all-sites
diff --git a/oozie/pagecounts-all-sites/load/coordinator.xml
b/oozie/pagecounts-all-sites/load/coordinator.xml
index f659326..e92ad67 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.xml
+++ b/oozie/pagecounts-all-sites/load/coordinator.xml
@@ -17,14 +17,14 @@
<property><name>job_tracker</name></property>
<property><name>start_time</name></property>
<property><name>stop_time</name></property>
- <property><name>webrequest_datasets_file</name></property>
- <property><name>webrequest_data_directory</name></property>
+ <property><name>webrequest_raw_datasets_file</name></property>
+ <property><name>webrequest_raw_data_directory</name></property>
<property><name>pagecounts_all_sites_datasets_file</name></property>
<property><name>pagecounts_all_sites_data_directory</name></property>
<property><name>hive_site_xml</name></property>
<property><name>workflow_file</name></property>
- <property><name>webrequest_table</name></property>
+ <property><name>webrequest_raw_table</name></property>
<property><name>pagecounts_all_sites_table</name></property>
<property><name>mark_directory_done_workflow_file</name></property>
</parameters>
@@ -54,15 +54,15 @@
</controls>
<datasets>
- <include>${webrequest_datasets_file}</include>
+ <include>${webrequest_raw_datasets_file}</include>
<include>${pagecounts_all_sites_datasets_file}</include>
</datasets>
<input-events>
- <data-in name="input_text" dataset="webrequest_text">
+ <data-in name="input_text" dataset="webrequest_text_raw">
<instance>${coord:current(0)}</instance>
</data-in>
- <data-in name="input_mobile" dataset="webrequest_mobile">
+ <data-in name="input_mobile" dataset="webrequest_mobile_raw">
<instance>${coord:current(0)}</instance>
</data-in>
</input-events>
@@ -89,7 +89,7 @@
</property>
<property>
<name>source_table</name>
- <value>${webrequest_table}</value>
+ <value>${webrequest_raw_table}</value>
</property>
<property>
<name>destination_table</name>
--
To view, visit https://gerrit.wikimedia.org/r/185781
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: If7f3a2e8e4b7ed22174afe6a3f2f8d910585c9ee
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits