Joal has submitted this change and it was merged.

Change subject: Archive hourly pageviews in legacy format
......................................................................


Archive hourly pageviews in legacy format

The legacy format (formerly called the webstatcollector format) is
used by all other page statistics dumps. This change therefore outputs
pageviews computed with the new definition and with normalized article
titles, but in this common format.

Deployment plan:
1. merge aggregator change: https://gerrit.wikimedia.org/r/#/c/247323/
2. merge this change
3. deploy this change, restart the coordinators
4. rename the existing files generated by this coordinator
   to "views" instead of "counts" and move them
   to "legacy" from "webstatcollector"
5. merge puppet change: https://gerrit.wikimedia.org/r/247458
6. merge a future puppet change that syncs the output of these two
   coordinators to pageviews (a new dataset that should deprecate
   pageviews-all-sites)

Bug: T114379
Change-Id: I98cbd753433de7dbf9fc3dfde3e97c79ddca31d3
---
M oozie/pageview/hourly/README.md
M oozie/pageview/hourly/coordinator.properties
M oozie/pageview/hourly/coordinator.xml
C oozie/pageview/hourly/transform_pageview_to_legacy_format.hql
M oozie/pageview/hourly/workflow.xml
M oozie/projectview/hourly/README.md
M oozie/projectview/hourly/coordinator.properties
R oozie/projectview/hourly/transform_projectview_to_legacy_format.hql
M oozie/projectview/hourly/workflow.xml
9 files changed, 194 insertions(+), 30 deletions(-)

Approvals:
  Ottomata: Looks good to me, but someone else must approve
  Joal: Verified; Looks good to me, approved



diff --git a/oozie/pageview/hourly/README.md b/oozie/pageview/hourly/README.md
index 0b21c9a..c75cb75 100644
--- a/oozie/pageview/hourly/README.md
+++ b/oozie/pageview/hourly/README.md
@@ -1,11 +1,14 @@
 # Aggregation phase for pageview from webrequest
+# and archive into legacy format.
 
-This job is responsible for filtering pageview from
-refined webrequest table, then aggregating them into
-interesting dimensions.
+This job is responsible for filtering pageview data
+from the refined webrequest table, aggregating it into
+interesting dimensions, and finally transforming and
+archiving it into legacy format.
 
 Output is appended into (year, month, day, hour) partitions
-in /wmf/data/wmf/pageview/hourly
+in /wmf/data/wmf/pageview/hourly, and then archived into
+/wmf/data/archive/pageview/legacy/hourly
 
 # Outline
 
diff --git a/oozie/pageview/hourly/coordinator.properties 
b/oozie/pageview/hourly/coordinator.properties
index 805b295..9917a65 100644
--- a/oozie/pageview/hourly/coordinator.properties
+++ b/oozie/pageview/hourly/coordinator.properties
@@ -1,5 +1,6 @@
 # Configures a coordinator to manage automatically aggregating pageview from
-# the refined webrequest table. Any of the following properties are overidable 
with -D.
+# the refined webrequest table, transforming to legacy format, and archiving.
+# Any of the following properties are overridable with -D.
 # Usage:
 #   oozie job -Duser=$USER -Dstart_time=2015-01-05T00:00Z -submit -config 
oozie/pageview/hourly/coordinator.properties
 #
@@ -35,8 +36,8 @@
 webrequest_data_directory         = ${name_node}/wmf/data/wmf/webrequest
 
 # HDFS path to pageview dataset definitions
-pageview_datasets_file          = ${oozie_directory}/pageview/datasets.xml
-pageview_data_directory         = ${name_node}/wmf/data/wmf/pageview
+pageview_datasets_file            = ${oozie_directory}/pageview/datasets.xml
+pageview_data_directory           = ${name_node}/wmf/data/wmf/pageview
 
 # Initial import time of the webrequest dataset.
 start_time                        = 2015-05-01T00:00Z
@@ -46,6 +47,8 @@
 
 # HDFS path to workflow to mark a directory as done
 mark_directory_done_workflow_file = 
${oozie_directory}/util/mark_directory_done/workflow.xml
+# HDFS path to workflow to archive output.
+archive_job_output_workflow_file  = 
${oozie_directory}/util/archive_job_output/workflow.xml
 
 # HDFS path to hive-site.xml file.  This is needed to run hive actions.
 hive_site_xml                     = ${oozie_directory}/util/hive/hive-site.xml
@@ -57,7 +60,17 @@
 # Record version to keep track of changes
 record_version                    = 0.0.3
 
+
+# Temporary directory for archiving
+temporary_directory               = ${name_node}/tmp
+
+# Archive base directory
+archive_directory                 = ${name_node}/wmf/data/archive
+
+# Archive directory for pageview_hourly_legacy_format
+pageview_archive_directory        = ${archive_directory}/pageview/legacy/hourly
+
 # Coordintator to start.
-oozie.coord.application.path     = ${coordinator_file}
+oozie.coord.application.path      = ${coordinator_file}
 oozie.use.system.libpath          = true
 oozie.action.external.stats.write = true
diff --git a/oozie/pageview/hourly/coordinator.xml 
b/oozie/pageview/hourly/coordinator.xml
index 16619cd..255d53c 100644
--- a/oozie/pageview/hourly/coordinator.xml
+++ b/oozie/pageview/hourly/coordinator.xml
@@ -27,6 +27,12 @@
         <property><name>hive_site_xml</name></property>
         <property><name>source_table</name></property>
         <property><name>destination_table</name></property>
+
+        <property><name>temporary_directory</name></property>
+        <property><name>pageview_archive_directory</name></property>
+
+        <property><name>mark_directory_done_workflow_file</name></property>
+        <property><name>archive_job_output_workflow_file</name></property>
     </parameters>
 
     <controls>
@@ -87,6 +93,7 @@
         <workflow>
             <app-path>${workflow_file}</app-path>
             <configuration>
+
                 <property>
                     <name>year</name>
                     <value>${coord:formatTime(coord:nominalTime(), 
"y")}</value>
@@ -109,6 +116,24 @@
                     <value>${coord:dataOut('pageview_hourly_output')}</value>
                 </property>
 
+                <!-- To mimic webstatcollector, file name must be the end of 
the aggregated hour-->
+                <property>
+                    <name>year_plus_1_hour</name>
+                    
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), 1, "HOUR"), 
"yyyy")}</value>
+                </property>
+                <property>
+                    <name>month_plus_1_hour</name>
+                    
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), 1, "HOUR"), 
"MM")}</value>
+                </property>
+                <property>
+                    <name>day_plus_1_hour</name>
+                    
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), 1, "HOUR"), 
"dd")}</value>
+                </property>
+                <property>
+                    <name>hour_plus_1_hour</name>
+                    
<value>${coord:formatTime(coord:dateOffset(coord:nominalTime(), 1, "HOUR"), 
"HH")}</value>
+                </property>
+
             </configuration>
         </workflow>
     </action>
diff --git 
a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql 
b/oozie/pageview/hourly/transform_pageview_to_legacy_format.hql
similarity index 93%
copy from oozie/projectview/hourly/transform_projectview_to_projectcounts.hql
copy to oozie/pageview/hourly/transform_pageview_to_legacy_format.hql
index 7cce87d..9dd56bf 100644
--- a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql
+++ b/oozie/pageview/hourly/transform_pageview_to_legacy_format.hql
@@ -9,8 +9,8 @@
 --     hour              -- hour of partition to compute statistics for.
 --
 -- Usage:
---     hive -f transform_projectview_to_projectcounts.hql         \
---         -d source_table=wmf.projectview_hourly                 \
+--     hive -f transform_pageview_to_legacy_format.hql            \
+--         -d source_table=wmf.pageview_hourly                    \
 --         -d destination_directory=/tmp/example                  \
 --         -d year=2015                                           \
 --         -d month=6                                             \
@@ -28,7 +28,7 @@
     -- prepare the lines by hand through concatenation :-(
     -- Set 0 as volume column since we don't use it.
     SELECT
-        CONCAT_WS(" ", qualifier, "-", cast(view_count AS string), "0") line
+        CONCAT_WS(" ", qualifier, page_title, cast(view_count AS string), "0") 
line
     FROM (
         SELECT
             CONCAT(
@@ -70,7 +70,8 @@
                     ELSE NULL
                 END
             ) qualifier,
-            SUM(view_count) AS view_count
+            page_title,
+            SUM(view_count) as view_count
         FROM ${source_table}
         WHERE year=${year}
             AND month=${month}
@@ -116,8 +117,9 @@
                     WHEN 'wikidata' THEN '.wd'
                     ELSE NULL
                 END
-            )
-    ) projectview_transformed
+            ),
+            page_title
+    ) pageview_transformed
     ORDER BY line
-    LIMIT 100000
-;
\ No newline at end of file
+    LIMIT 100000000
+;
diff --git a/oozie/pageview/hourly/workflow.xml 
b/oozie/pageview/hourly/workflow.xml
index 8850abf..90da188 100644
--- a/oozie/pageview/hourly/workflow.xml
+++ b/oozie/pageview/hourly/workflow.xml
@@ -3,6 +3,7 @@
     
name="pageview-hourly-${source_table}->${destination_table}-${year},${month},${day},${hour}-wf">
 
     <parameters>
+
         <!-- Default values for inner oozie settings -->
         <property>
             <name>oozie_launcher_queue_name</name>
@@ -21,7 +22,7 @@
 
         <!-- Aggregation related configuration properties-->
         <property>
-            <name>hive_script</name>
+            <name>hive_script_aggregate</name>
             <!-- This is relative to the containing directory of this file. -->
             <value>pageview_hourly.hql</value>
             <description>Hive script to run.</description>
@@ -63,6 +64,44 @@
             <name>destination_dataset_directory</name>
             <description>Directory to generate the done flag in</description>
         </property>
+
+        <property>
+            <name>hive_script_transform</name>
+            <!-- This is relative to the containing directory of this file. -->
+            <value>transform_pageview_to_legacy_format.hql</value>
+            <description>Hive script to run for archiving with the legacy 
format used on dumps through 2015.</description>
+        </property>
+        <!-- To mimic webstatcollector, file name must be the end of the 
aggregated hour-->
+        <property>
+            <name>year_plus_1_hour</name>
+            <description>The partition's year plus one hour</description>
+        </property>
+        <property>
+            <name>month_plus_1_hour</name>
+            <description>The partition's month plus one hour</description>
+        </property>
+        <property>
+            <name>day_plus_1_hour</name>
+            <description>The partition's day plus one hour</description>
+        </property>
+        <property>
+            <name>hour_plus_1_hour</name>
+            <description>The partition's hour plus one hour</description>
+        </property>
+
+        <property>
+            <name>temporary_directory</name>
+            <description>A directory in HDFS for temporary files</description>
+        </property>
+        <property>
+            <name>pageview_archive_directory</name>
+            <description>Directory to archive the workflow output 
to</description>
+        </property>
+        <property>
+            <name>archive_job_output_workflow_file</name>
+            <description>Workflow to move a data file to the 
archive</description>
+        </property>
+
     </parameters>
 
     <start to="aggregate"/>
@@ -92,7 +131,7 @@
                 </property>
             </configuration>
 
-            <script>${hive_script}</script>
+            <script>${hive_script_aggregate}</script>
             <param>source_table=${source_table}</param>
             <param>destination_table=${destination_table}</param>
             <param>record_version=${record_version}</param>
@@ -102,11 +141,11 @@
             <param>hour=${hour}</param>
         </hive>
 
-        <ok to="mark_dataset_done"/>
+        <ok to="mark_aggregated_pageview_dataset_done"/>
         <error to="kill"/>
     </action>
 
-    <action name="mark_dataset_done">
+    <action name="mark_aggregated_pageview_dataset_done">
         <sub-workflow>
             <app-path>${mark_directory_done_workflow_file}</app-path>
             <configuration>
@@ -116,6 +155,88 @@
                 </property>
             </configuration>
         </sub-workflow>
+        <ok to="transform"/>
+        <error to="kill"/>
+    </action>
+
+    <action name="transform">
+        <hive xmlns="uri:oozie:hive-action:0.2">
+            <job-tracker>${job_tracker}</job-tracker>
+            <name-node>${name_node}</name-node>
+            <job-xml>${hive_site_xml}</job-xml>
+            <configuration>
+                <property>
+                    <name>mapreduce.job.queuename</name>
+                    <value>${queue_name}</value>
+                </property>
+                <!--make sure oozie:launcher runs in a low priority queue -->
+                <property>
+                    <name>oozie.launcher.mapred.job.queue.name</name>
+                    <value>${oozie_launcher_queue_name}</value>
+                </property>
+                <property>
+                    <name>oozie.launcher.mapreduce.map.memory.mb</name>
+                    <value>${oozie_launcher_memory}</value>
+                </property>
+                <property>
+                    <name>hive.exec.scratchdir</name>
+                    <value>/tmp/hive-${user}</value>
+                </property>
+            </configuration>
+
+            <script>${hive_script_transform}</script>
+            <!-- Here, the source for archive is the
+                 destination of the previous job -->
+            <param>source_table=${destination_table}</param>
+            <param>year=${year}</param>
+            <param>month=${month}</param>
+            <param>day=${day}</param>
+            <param>hour=${hour}</param>
+            
<param>destination_directory=${temporary_directory}/${wf:id()}</param>
+        </hive>
+
+        <ok to="mark_transformed_pageview_dataset_done"/>
+        <error to="kill"/>
+    </action>
+
+    <action name="mark_transformed_pageview_dataset_done">
+        <sub-workflow>
+            <app-path>${mark_directory_done_workflow_file}</app-path>
+            <configuration>
+                <property>
+                    <name>directory</name>
+                    <value>${temporary_directory}/${wf:id()}</value>
+                </property>
+            </configuration>
+        </sub-workflow>
+        <ok to="move_data_to_archive"/>
+        <error to="kill"/>
+    </action>
+
+    <action name="move_data_to_archive">
+        <sub-workflow>
+            <app-path>${archive_job_output_workflow_file}</app-path>
+            <propagate-configuration/>
+            <configuration>
+                <property>
+                    <name>source_directory</name>
+                    <value>${temporary_directory}/${wf:id()}</value>
+                </property>
+                <property>
+                    <name>expected_filename_ending</name>
+                    <value>EMPTY</value>
+                </property>
+                <property>
+                    <name>archive_file</name>
+                    <!--
+                    webstatscollector used the end of the collection period as
+                    timestamp in the filename. To not break scripts of people,
+                    we also name files that way.
+                    -->
+                    
<value>${pageview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/pageviews-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value>
+                </property>
+            </configuration>
+        </sub-workflow>
         <ok to="end"/>
         <error to="kill"/>
     </action>
diff --git a/oozie/projectview/hourly/README.md 
b/oozie/projectview/hourly/README.md
index 911de92..2a595bd 100644
--- a/oozie/projectview/hourly/README.md
+++ b/oozie/projectview/hourly/README.md
@@ -1,13 +1,13 @@
 # Aggregation phase for projectview from pageview
-# and archive into webstatcollector format.
+# and archive into legacy format.
 
 This job is responsible for aggregating projectview
 from pageview, and then transform/archive this data
-into webstatcollector format.
+into legacy format.
 
 Output is appended into (year, month, day, hour) partitions
 in /wmf/data/wmf/projectview/hourly, and then archived into
-/wmf/data/archive/projectview/webstatcollector/hourly
+/wmf/data/archive/projectview/legacy/hourly
 
 # Outline
 
diff --git a/oozie/projectview/hourly/coordinator.properties 
b/oozie/projectview/hourly/coordinator.properties
index 4064bec..c598270 100644
--- a/oozie/projectview/hourly/coordinator.properties
+++ b/oozie/projectview/hourly/coordinator.properties
@@ -66,8 +66,8 @@
 # Archive base directory
 archive_directory                 = ${name_node}/wmf/data/archive
 
-# Archive directory for projectview_hourly_webstatcollector_format
-projectview_archive_directory     = 
${archive_directory}/projectview/webstatcollector/hourly
+# Archive directory for projectview_hourly_legacy_format
+projectview_archive_directory     = 
${archive_directory}/projectview/legacy/hourly
 
 # Coordintator to start.
 oozie.coord.application.path      = ${coordinator_file}
diff --git 
a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql 
b/oozie/projectview/hourly/transform_projectview_to_legacy_format.hql
similarity index 98%
rename from oozie/projectview/hourly/transform_projectview_to_projectcounts.hql
rename to oozie/projectview/hourly/transform_projectview_to_legacy_format.hql
index 7cce87d..4754f74 100644
--- a/oozie/projectview/hourly/transform_projectview_to_projectcounts.hql
+++ b/oozie/projectview/hourly/transform_projectview_to_legacy_format.hql
@@ -9,7 +9,7 @@
 --     hour              -- hour of partition to compute statistics for.
 --
 -- Usage:
---     hive -f transform_projectview_to_projectcounts.hql         \
+--     hive -f transform_projectview_to_legacy_format.hql         \
 --         -d source_table=wmf.projectview_hourly                 \
 --         -d destination_directory=/tmp/example                  \
 --         -d year=2015                                           \
@@ -120,4 +120,4 @@
     ) projectview_transformed
     ORDER BY line
     LIMIT 100000
-;
\ No newline at end of file
+;
diff --git a/oozie/projectview/hourly/workflow.xml 
b/oozie/projectview/hourly/workflow.xml
index 2d8ac0a..15eadd6 100644
--- a/oozie/projectview/hourly/workflow.xml
+++ b/oozie/projectview/hourly/workflow.xml
@@ -68,8 +68,8 @@
         <property>
             <name>hive_script_transform</name>
             <!-- This is relative to the containing directory of this file. -->
-            <value>transform_projectview_to_projectcounts.hql</value>
-            <description>Hive script to run for archiving with 
webstatcollector format.</description>
+            <value>transform_projectview_to_legacy_format.hql</value>
+            <description>Hive script to run for archiving with the legacy 
format used on dumps through 2015.</description>
         </property>
         <!-- To mimic webstatcollector, file name must be the end of the 
aggregated hour-->
         <property>
@@ -233,7 +233,7 @@
                     timestamp in the filename. To not break scripts of people,
                     we also name files that way.
                     -->
-                    
<value>${projectview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/projectcounts-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value>
+                    
<value>${projectview_archive_directory}/${year_plus_1_hour}/${year_plus_1_hour}-${month_plus_1_hour}/projectviews-${year_plus_1_hour}${month_plus_1_hour}${day_plus_1_hour}-${hour_plus_1_hour}0000</value>
                 </property>
             </configuration>
         </sub-workflow>

-- 
To view, visit https://gerrit.wikimedia.org/r/246149
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I98cbd753433de7dbf9fc3dfde3e97c79ddca31d3
Gerrit-PatchSet: 7
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Joal <j...@wikimedia.org>
Gerrit-Reviewer: Milimetric <dandree...@wikimedia.org>
Gerrit-Reviewer: Nuria <nu...@wikimedia.org>
Gerrit-Reviewer: Ottomata <o...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to