Ottomata has submitted this change and it was merged.

Change subject: Use misc when producing legacy tsvs
......................................................................


Use misc when producing legacy tsvs

Change-Id: I083e5a9d4e08117ec585f1fe9058b1effd635a16
---
M diagrams/oozie-overview.dia
M oozie/webrequest/legacy_tsvs/bundle.properties
M oozie/webrequest/legacy_tsvs/bundle.xml
A oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
M oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
5 files changed, 146 insertions(+), 3 deletions(-)

Approvals:
  Ottomata: Verified; Looks good to me, approved



diff --git a/diagrams/oozie-overview.dia b/diagrams/oozie-overview.dia
index 2944843..1da1b88 100644
--- a/diagrams/oozie-overview.dia
+++ b/diagrams/oozie-overview.dia
Binary files differ
diff --git a/oozie/webrequest/legacy_tsvs/bundle.properties b/oozie/webrequest/legacy_tsvs/bundle.properties
index 06b5c73..900fbd3 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.properties
+++ b/oozie/webrequest/legacy_tsvs/bundle.properties
@@ -21,6 +21,7 @@
 # webrequest_sources they depend on. This allows to for example turn off upload
 # and have the coordinators that depend on upload block, while the coordinators
 # that do not depend on upload continue to run.
+coordinator_misc_mobile_text_file   = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
 coordinator_mobile_file             = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile.xml
 coordinator_mobile_text_file        = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text.xml
 coordinator_mobile_text_upload_file = ${oozie_directory}/webrequest/legacy_tsvs/coordinator_mobile_text_upload.xml
diff --git a/oozie/webrequest/legacy_tsvs/bundle.xml b/oozie/webrequest/legacy_tsvs/bundle.xml
index 7bff053..97979ad 100644
--- a/oozie/webrequest/legacy_tsvs/bundle.xml
+++ b/oozie/webrequest/legacy_tsvs/bundle.xml
@@ -9,6 +9,7 @@
         </property>
 
         <!-- Required properties. -->
+        <property><name>coordinator_misc_mobile_text_file</name></property>
         <property><name>coordinator_mobile_file</name></property>
         <property><name>coordinator_mobile_text_file</name></property>
         <property><name>coordinator_mobile_text_upload_file</name></property>
@@ -137,7 +138,7 @@
     </coordinator>
 
     <coordinator name="webrequest_legacy_tsvs-5xx">
-        <app-path>${coordinator_mobile_text_file}</app-path>
+        <app-path>${coordinator_misc_mobile_text_file}</app-path>
         <configuration>
             <property>
                 <name>aspect_name</name>
diff --git a/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
new file mode 100644
index 0000000..56fbb84
--- /dev/null
+++ b/oozie/webrequest/legacy_tsvs/coordinator_misc_mobile_text.xml
@@ -0,0 +1,142 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<coordinator-app xmlns="uri:oozie:coordinator:0.4"
+    name="webrequest_legacy_tsvs-${aspect_name}-coord"
+    frequency="${coord:days(1)}"
+    start="${start_time}"
+    end="${stop_time}"
+    timezone="Universal">
+
+    <parameters>
+        <property>
+            <name>queue_name</name>
+            <value>default</value>
+        </property>
+
+        <!-- Required properties. -->
+        <property><name>name_node</name></property>
+        <property><name>job_tracker</name></property>
+        <property><name>start_time</name></property>
+        <property><name>hour_offset</name></property>
+        <property><name>stop_time</name></property>
+        <property><name>webrequest_datasets_file</name></property>
+        <property><name>webrequest_data_directory</name></property>
+        <property><name>hive_site_xml</name></property>
+        <property><name>workflow_file</name></property>
+        <property><name>webrequest_table</name></property>
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>temporary_directory</name></property>
+        <property><name>aspect_tsv_archive_directory</name></property>
+        <property><name>archive_job_output_workflow_file</name></property>
+        <property><name>aspect_name</name></property>
+    </parameters>
+
+    <controls>
+        <!--
+        By having materialized jobs not timeout, we ease backfilling incidents
+        after recoverable hiccups on the dataset producers.
+        -->
+        <timeout>-1</timeout>
+
+        <!--
+        Since the job only runs daily, even low concurrency allows to catch up
+        pretty fast. Hence, we can limit concurrency to 1, as the tsvs typically
+        process quite some data.
+        -->
+        <concurrency>1</concurrency>
+
+        <!--
+        In order to keep backfilling after an incident simple, we only start
+        throttling materialization after 4 days.
+        Due to the low concurrency, and low discrepancy between progressing
+        time, and expected availability of datasets, we should typically have
+        far less materialized jobs.
+        -->
+        <throttle>4</throttle>
+    </controls>
+
+    <datasets>
+        <include>${webrequest_datasets_file}</include>
+    </datasets>
+
+    <input-events>
+        <data-in name="webrequest_misc" dataset="webrequest_misc">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+
+        <data-in name="webrequest_mobile" dataset="webrequest_mobile">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+
+        <data-in name="webrequest_text" dataset="webrequest_text">
+            <start-instance>${coord:current(0)}</start-instance>
+            <end-instance>${coord:current(23)}</end-instance>
+        </data-in>
+
+        <!--
+        The following dataset is not required as input to the
+        workflow, but only helps to delay running it.
+
+        The 24 hours offset is for a full day. And we subtract 2 hours, as
+        webrequest processing starts 2 hours after the respective hour.
+        -->
+        <data-in name="delay" dataset="webrequest_text">
+            <instance>${coord:current(24-2+hour_offset)}</instance>
+        </data-in>
+    </input-events>
+
+    <action>
+        <workflow>
+            <app-path>${workflow_file}</app-path>
+            <configuration>
+
+                <!-- Pass these properties through to the workflow -->
+                <property><name>name_node</name><value>${name_node}</value></property>
+                <property><name>job_tracker</name><value>${job_tracker}</value></property>
+                <property><name>queue_name</name><value>${queue_name}</value></property>
+
+                <property>
+                    <name>hive_site_xml</name>
+                    <value>${hive_site_xml}</value>
+                </property>
+                <property>
+                    <name>webrequest_table</name>
+                    <value>${webrequest_table}</value>
+                </property>
+                <property>
+                    <name>year</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "yyyy")}</value>
+                </property>
+                <property>
+                    <name>month</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "MM")}</value>
+                </property>
+                <property>
+                    <name>day</name>
+                    <value>${coord:formatTime(coord:nominalTime(), "dd")}</value>
+                </property>
+                <property>
+                    <name>mark_directory_done_workflow_file</name>
+                    <value>${mark_directory_done_workflow_file}</value>
+                </property>
+                <property>
+                    <name>temporary_directory</name>
+                    <value>${temporary_directory}</value>
+                </property>
+                <property>
+                    <name>aspect_name</name>
+                    <value>${aspect_name}</value>
+                </property>
+                <property>
+                    <name>aspect_tsv_archive_directory</name>
+                    <value>${aspect_tsv_archive_directory}</value>
+                </property>
+                <property>
+                    <name>archive_job_output_workflow_file</name>
+                    <value>${archive_job_output_workflow_file}</value>
+                </property>
+            </configuration>
+        </workflow>
+    </action>
+</coordinator-app>
diff --git a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
index 2990dee..ecbcadf 100644
--- a/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
+++ b/oozie/webrequest/legacy_tsvs/generate_5xx_tsv.hql
@@ -56,9 +56,8 @@
             ) line,
             dt
         FROM ${webrequest_table}
-        WHERE webrequest_source IN ('mobile', 'text')
+        WHERE webrequest_source IN ('misc', 'mobile', 'text')
             -- TODO: Add 'bits', once it's turned on again
-            -- TODO: Add 'misc', once it's available
             AND year=${year}
             AND month=${month}
             AND day=${day}

-- 
To view, visit https://gerrit.wikimedia.org/r/186776
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I083e5a9d4e08117ec585f1fe9058b1effd635a16
Gerrit-PatchSet: 3
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: QChris <[email protected]>
Gerrit-Reviewer: Ottomata <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to