Hello Ottomata,

I'd like you to do a code review.  Please visit

    https://gerrit.wikimedia.org/r/185708

to review the following change.

Change subject: Add pagecounts-raw computation to pagecounts-all-sites
......................................................................

Add pagecounts-raw computation to pagecounts-all-sites

Change-Id: If32afcc082ae28248eca46b58dc2748811b00489
---
M oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
M oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
M oozie/pagecounts-all-sites/archive/bundle.properties
M oozie/pagecounts-all-sites/archive/bundle.xml
M oozie/pagecounts-all-sites/archive/coordinator.xml
M oozie/pagecounts-all-sites/archive/workflow.xml
6 files changed, 111 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery refs/changes/08/185708/1

diff --git a/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql 
b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
index ae6bdf9..94cd202 100644
--- a/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_pagecounts.hql
@@ -13,6 +13,8 @@
 --     month             -- month of the to-be-generated hour
 --     day               -- day of the to-be-generated hour
 --     hour              -- hour of the to-be-generated-hour
+--     extra_filter      -- additional condition by which to filter the
+--                          selected rows
 --
 --
 -- Usage:
@@ -40,5 +42,6 @@
         AND month=${month}
         AND day=${day}
         AND hour=${hour}
+        ${extra_filter}
     ORDER BY line
     LIMIT 100000000;
diff --git a/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql 
b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
index 36f31c0..8f1e5ca 100644
--- a/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
+++ b/oozie/pagecounts-all-sites/archive/archive_projectcounts.hql
@@ -12,6 +12,8 @@
 --     month             -- month of the to-be-generated hour
 --     day               -- day of the to-be-generated hour
 --     hour              -- hour of the to-be-generated-hour
+--     extra_filter      -- additional condition by which to filter the
+--                          selected rows
 --
 --
 -- Usage:
@@ -39,6 +41,7 @@
         AND month=${month}
         AND day=${day}
         AND hour=${hour}
+        ${extra_filter}
     GROUP BY qualifier
     ORDER BY line
     LIMIT 100000;
diff --git a/oozie/pagecounts-all-sites/archive/bundle.properties 
b/oozie/pagecounts-all-sites/archive/bundle.properties
index 7ef158f..d27c924 100644
--- a/oozie/pagecounts-all-sites/archive/bundle.properties
+++ b/oozie/pagecounts-all-sites/archive/bundle.properties
@@ -55,6 +55,12 @@
 # Archive directory for pagecounts-all-sites
 pagecounts_all_sites_archive_directory = 
${archive_directory}/pagecounts-all-sites
 
+# Archive directory for pagecounts-raw
+pagecounts_raw_archive_directory       = ${archive_directory}/pagecounts-raw
+
+# Extra filter for pagecounts-raw
+pagecounts_raw_extra_filter            = ( NOT qualifier RLIKE 
'\\\\.zero(\\\\.|$)' AND ( NOT qualifier RLIKE '\\\\.m(\\\\.|$)' ) OR qualifier 
RLIKE 
'^(commons|meta|incubator|species|strategy|outreach|usability|quality)\\\\.m$')
+
 # Coordintator to start.
 oozie.bundle.application.path          = 
${oozie_directory}/pagecounts-all-sites/archive/bundle.xml
 oozie.use.system.libpath               = true
diff --git a/oozie/pagecounts-all-sites/archive/bundle.xml 
b/oozie/pagecounts-all-sites/archive/bundle.xml
index 901e820..99c428e 100644
--- a/oozie/pagecounts-all-sites/archive/bundle.xml
+++ b/oozie/pagecounts-all-sites/archive/bundle.xml
@@ -21,12 +21,17 @@
         <property><name>mark_directory_done_workflow_file</name></property>
         <property><name>temporary_directory</name></property>
         
<property><name>pagecounts_all_sites_archive_directory</name></property>
+        <property><name>pagecounts_raw_archive_directory</name></property>
         <property><name>archive_job_output_workflow_file</name></property>
     </parameters>
 
-    <coordinator name="pagecounts_all_sites_archive-pagecounts">
+    <coordinator 
name="pagecounts_all_sites_archive-pagecounts_all_sites-pagecounts">
         <app-path>${coordinator_file}</app-path>
         <configuration>
+            <property>
+                <name>dataset_name</name>
+                <value>pagecounts-all-sites</value>
+            </property>
             <property>
                 <name>aspect_name</name>
                 <value>pagecounts</value>
@@ -35,12 +40,20 @@
                 <name>aspect_compression_ending</name>
                 <value>.gz</value>
             </property>
+            <property>
+                <name>workflow_archive_directory</name>
+                <value>${pagecounts_all_sites_archive_directory}</value>
+            </property>
         </configuration>
     </coordinator>
 
-    <coordinator name="pagecounts_all_sites_archive-projectcounts">
+    <coordinator 
name="pagecounts_all_sites_archive-pagecounts_all_sites-projectcounts">
         <app-path>${coordinator_file}</app-path>
         <configuration>
+            <property>
+                <name>dataset_name</name>
+                <value>pagecounts-all-sites</value>
+            </property>
             <property>
                 <name>aspect_name</name>
                 <value>projectcounts</value>
@@ -49,6 +62,62 @@
                 <name>aspect_compression_ending</name>
                 <value>EMPTY</value>
             </property>
+            <property>
+                <name>workflow_archive_directory</name>
+                <value>${pagecounts_all_sites_archive_directory}</value>
+            </property>
+        </configuration>
+    </coordinator>
+
+    <coordinator name="pagecounts_all_sites_archive-pagecounts_raw-pagecounts">
+        <app-path>${coordinator_file}</app-path>
+        <configuration>
+            <property>
+                <name>dataset_name</name>
+                <value>pagecounts-raw</value>
+            </property>
+            <property>
+                <name>aspect_name</name>
+                <value>pagecounts</value>
+            </property>
+            <property>
+                <name>aspect_compression_ending</name>
+                <value>.gz</value>
+            </property>
+            <property>
+                <name>workflow_archive_directory</name>
+                <value>${pagecounts_raw_archive_directory}</value>
+            </property>
+            <property>
+                <name>extra_filter</name>
+                <value>${pagecounts_raw_extra_filter}</value>
+            </property>
+        </configuration>
+    </coordinator>
+
+    <coordinator 
name="pagecounts_all_sites_archive-pagecounts_raw-projectcounts">
+        <app-path>${coordinator_file}</app-path>
+        <configuration>
+            <property>
+                <name>dataset_name</name>
+                <value>pagecounts-raw</value>
+            </property>
+            <property>
+                <name>aspect_name</name>
+                <value>projectcounts</value>
+            </property>
+            <property>
+                <name>aspect_compression_ending</name>
+                <value>EMPTY</value>
+            </property>
+            <property>
+                <name>workflow_archive_directory</name>
+                <value>${pagecounts_raw_archive_directory}</value>
+            </property>
+            <property>
+                <name>extra_filter</name>
+                <value>${pagecounts_raw_extra_filter}</value>
+            </property>
         </configuration>
     </coordinator>
 
diff --git a/oozie/pagecounts-all-sites/archive/coordinator.xml 
b/oozie/pagecounts-all-sites/archive/coordinator.xml
index 88c0493..0aa06bb 100644
--- a/oozie/pagecounts-all-sites/archive/coordinator.xml
+++ b/oozie/pagecounts-all-sites/archive/coordinator.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <coordinator-app xmlns="uri:oozie:coordinator:0.4"
-    name="pagecounts_all_sites_archive-${aspect_name}-coord"
+    name="pagecounts_all_sites_archive-${dataset_name}-${aspect_name}-coord"
     frequency="${coord:hours(1)}"
     start="${start_time}"
     end="${stop_time}"
@@ -11,8 +11,13 @@
             <name>queue_name</name>
             <value>default</value>
         </property>
+        <property>
+            <name>extra_filter</name>
+            <value></value>
+        </property>
 
         <!-- Required properties. -->
+        <property><name>dataset_name</name></property>
         <property><name>name_node</name></property>
         <property><name>job_tracker</name></property>
         <property><name>start_time</name></property>
@@ -24,7 +29,7 @@
         <property><name>pagecounts_all_sites_table</name></property>
         <property><name>mark_directory_done_workflow_file</name></property>
         <property><name>temporary_directory</name></property>
-        
<property><name>pagecounts_all_sites_archive_directory</name></property>
+        <property><name>workflow_archive_directory</name></property>
         <property><name>archive_job_output_workflow_file</name></property>
         <property><name>aspect_name</name></property>
         <property><name>aspect_compression_ending</name></property>
@@ -70,6 +75,7 @@
             <configuration>
 
                 <!-- Pass these properties through to the workflow -->
+                
<property><name>dataset_name</name><value>${dataset_name}</value></property>
                 
<property><name>name_node</name><value>${name_node}</value></property>
                 
<property><name>job_tracker</name><value>${job_tracker}</value></property>
                 
<property><name>queue_name</name><value>${queue_name}</value></property>
@@ -123,8 +129,8 @@
                     <value>${temporary_directory}</value>
                 </property>
                 <property>
-                    <name>pagecounts_all_sites_archive_directory</name>
-                    <value>${pagecounts_all_sites_archive_directory}</value>
+                    <name>workflow_archive_directory</name>
+                    <value>${workflow_archive_directory}</value>
                 </property>
                 <property>
                     <name>archive_job_output_workflow_file</name>
@@ -138,6 +144,10 @@
                     <name>aspect_compression_ending</name>
                     <value>${aspect_compression_ending}</value>
                 </property>
+                <property>
+                    <name>extra_filter</name>
+                    <value>${extra_filter}</value>
+                </property>
             </configuration>
         </workflow>
     </action>
diff --git a/oozie/pagecounts-all-sites/archive/workflow.xml 
b/oozie/pagecounts-all-sites/archive/workflow.xml
index f067889..466c8f3 100644
--- a/oozie/pagecounts-all-sites/archive/workflow.xml
+++ b/oozie/pagecounts-all-sites/archive/workflow.xml
@@ -1,14 +1,23 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <workflow-app xmlns="uri:oozie:workflow:0.4"
-    
name="pagecounts_all_sites_archive-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
+    
name="pagecounts_all_sites_archive-${dataset_name}-${aspect_name}-${year}-${month}-${day}-${hour}-wf">
 
     <parameters>
         <property>
             <name>queue_name</name>
             <value>default</value>
         </property>
+        <property>
+            <name>extra_filter</name>
+            <value>0=0</value> <!-- sadly enough, 'true' does not work here-->
+            <description>
+                Additional filter to apply when selecting data from the
+                pagecounts-all-site table.
+            </description>
+        </property>
 
         <!-- Required properties -->
+        <property><name>dataset_name</name></property>
         <property><name>name_node</name></property>
         <property><name>job_tracker</name></property>
 
@@ -63,8 +72,8 @@
             <description>A directory in HDFS for temporary files</description>
         </property>
         <property>
-            <name>pagecounts_all_sites_archive_directory</name>
-            <description>Directory for archive of pagecounts-all-sites 
files</description>
+            <name>workflow_archive_directory</name>
+            <description>Directory to archive the workflow output 
to</description>
         </property>
         <property>
             <name>archive_job_output_workflow_file</name>
@@ -102,6 +111,7 @@
             <param>month=${month}</param>
             <param>day=${day}</param>
             <param>hour=${hour}</param>
+            <param>extra_filter= AND ${extra_filter}</param>
         </hive>
         <ok to="mark_dataset_done"/>
         <error to="kill"/>
@@ -152,7 +162,7 @@
                     timestamp in the filename. To not break scripts of people,
                     we also name files that way.
                     -->
-                    
<value>${pagecounts_all_sites_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
 eq 'EMPTY' ? '' : aspect_compression_ending}</value>
+                    
<value>${workflow_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/${aspect_name}-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000${aspect_compression_ending
 eq 'EMPTY' ? '' : aspect_compression_ending}</value>
                 </property>
             </configuration>
         </sub-workflow>

-- 
To view, visit https://gerrit.wikimedia.org/r/185708
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If32afcc082ae28248eca46b58dc2748811b00489
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to