Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/185781

to review the following change.

Change subject: Switch pagecounts-all-sites/archive to raw datasets
......................................................................

Switch pagecounts-all-sites/archive to raw datasets

Since

  e8523e431a8eec10b88a623374f79e859feee9cf

the refined datasets come without quality guarantees. This makes them
unusable for production jobs. Hence, we switch pagecounts-all-sites
back to the raw webrequests, so that we can detect issues and get a
chance to fix them before computing the datasets.

Change-Id: If7f3a2e8e4b7ed22174afe6a3f2f8d910585c9ee
---
M diagrams/oozie-overview.dia
M oozie/pagecounts-all-sites/load/coordinator.properties
M oozie/pagecounts-all-sites/load/coordinator.xml
3 files changed, 12 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/81/185781/1

diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index 360b1f5..5910071 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git a/oozie/pagecounts-all-sites/load/coordinator.properties b/oozie/pagecounts-all-sites/load/coordinator.properties
index 375cb76..9bf5568 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.properties
+++ b/oozie/pagecounts-all-sites/load/coordinator.properties
@@ -21,7 +21,7 @@
 workflow_file                       = ${oozie_directory}/pagecounts-all-sites/load/workflow.xml
 
 # HDFS path to webrequest dataset definition
-webrequest_datasets_file            = ${oozie_directory}/webrequest/datasets.xml
+webrequest_raw_datasets_file        = ${oozie_directory}/webrequest/datasets_raw.xml
 
 # HDFS path to pagecounts-all-sites dataset definition
 pagecounts_all_sites_datasets_file  = ${oozie_directory}/pagecounts-all-sites/datasets.xml
@@ -39,13 +39,15 @@
 hive_site_xml                       = ${oozie_directory}/util/hive/hive-site.xml
 
 # Table to read webrequests from (fully qualified)
-webrequest_table                    = wmf.webrequest
+# (We're using the raw table for now, as the refined ones come without
+# quality guarantees)
+webrequest_raw_table                = wmf_raw.webrequest
 
 # Table to write hourly pagecounts to (fully qualified)
 pagecounts_all_sites_table          = wmf.pagecounts_all_sites
 
 # HDFS paths to directories where webrequest data is time bucketed.
-webrequest_data_directory           = ${name_node}/wmf/data/wmf/webrequest
+webrequest_raw_data_directory       = ${name_node}/wmf/data/raw/webrequest
 
 # HDFS path to directory where pagecounts-all-sites data is time bucketed.
 pagecounts_all_sites_data_directory = ${name_node}/wmf/data/wmf/pagecounts-all-sites
diff --git a/oozie/pagecounts-all-sites/load/coordinator.xml b/oozie/pagecounts-all-sites/load/coordinator.xml
index f659326..e92ad67 100644
--- a/oozie/pagecounts-all-sites/load/coordinator.xml
+++ b/oozie/pagecounts-all-sites/load/coordinator.xml
@@ -17,14 +17,14 @@
         <property><name>job_tracker</name></property>
         <property><name>start_time</name></property>
         <property><name>stop_time</name></property>
-        <property><name>webrequest_datasets_file</name></property>
-        <property><name>webrequest_data_directory</name></property>
+        <property><name>webrequest_raw_datasets_file</name></property>
+        <property><name>webrequest_raw_data_directory</name></property>
         <property><name>pagecounts_all_sites_datasets_file</name></property>
         <property><name>pagecounts_all_sites_data_directory</name></property>
 
         <property><name>hive_site_xml</name></property>
         <property><name>workflow_file</name></property>
-        <property><name>webrequest_table</name></property>
+        <property><name>webrequest_raw_table</name></property>
         <property><name>pagecounts_all_sites_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
     </parameters>
@@ -54,15 +54,15 @@
     </controls>
 
     <datasets>
-        <include>${webrequest_datasets_file}</include>
+        <include>${webrequest_raw_datasets_file}</include>
         <include>${pagecounts_all_sites_datasets_file}</include>
     </datasets>
 
     <input-events>
-        <data-in name="input_text" dataset="webrequest_text">
+        <data-in name="input_text" dataset="webrequest_text_raw">
             <instance>${coord:current(0)}</instance>
         </data-in>
-        <data-in name="input_mobile" dataset="webrequest_mobile">
+        <data-in name="input_mobile" dataset="webrequest_mobile_raw">
             <instance>${coord:current(0)}</instance>
         </data-in>
     </input-events>
@@ -89,7 +89,7 @@
                 </property>
                 <property>
                     <name>source_table</name>
-                    <value>${webrequest_table}</value>
+                    <value>${webrequest_raw_table}</value>
                 </property>
                 <property>
                     <name>destination_table</name>

-- 
To view, visit https://gerrit.wikimedia.org/r/185781
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If7f3a2e8e4b7ed22174afe6a3f2f8d910585c9ee
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to