Fdans has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394062 )

Change subject: [wip] Add pageview by country oozie jobs
......................................................................

[wip] Add pageview by country oozie jobs

Adds coordinators and hive scripts necessary to load pageviews by country
data into cassandra.

Bug: T181521
Change-Id: I35b7736c07c5f2346cecf6c388dd252632aab88c
---
M oozie/cassandra/bundle.xml
A oozie/cassandra/coord_pageviews_bycountry_daily.properties
A oozie/cassandra/coord_pageviews_bycountry_monthly.properties
M oozie/cassandra/coord_top_articles_daily.properties
A oozie/cassandra/daily/pageview_top_bycountry.hql
A oozie/cassandra/monthly/pageview_top_bycountry.sql
6 files changed, 593 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/62/394062/1

diff --git a/oozie/cassandra/bundle.xml b/oozie/cassandra/bundle.xml
index 0a4e656..99b8226 100644
--- a/oozie/cassandra/bundle.xml
+++ b/oozie/cassandra/bundle.xml
@@ -211,6 +211,59 @@
         </property>
 
         <!--**************************************************
+                       Pageviews by country parameters
+        **************************************************-->
+
+        <property>
+            <name>top_bycountry_datasets_file</name>
+            <value>${pageview_datasets_file}</value>
+        </property>
+        <property>
+            <name>top_bycountry_dataset_name</name>
+            <value>pageview_hourly</value>
+        </property>
+        <property>
+            <name>top_bycountry_dataset_freq</name>
+            <value>hour</value>
+        </property>
+        <property>
+            <name>top_bycountry_hive_script</name>
+            <value>pageview_top_bycountry.hql</value>
+        </property>
+        <property>
+            <name>top_bycountry_source_table</name>
+            <value>wmf.pageview_hourly</value>
+        </property>
+        <property>
+            <name>top_bycountry_parallel_loaders</name>
+            <value>3</value>
+        </property>
+        <property>
+            <name>top_bycountry_keyspace</name>
+            <value>local_group_default_T_top_pageviews_bycountry</value>
+        </property>
+        <property>
+            <name>top_bycountry_cql</name>
+            <value>UPDATE "${top_bycountry_keyspace}"."data" SET 
"countriesJSON" = ?</value>
+        </property>
+        <property>
+            <name>top_bycountry_hive_fields</name>
+            <value>project,year,month,day,countriesJSON</value>
+        </property>
+        <property>
+            <name>top_bycountry_hive_fields_types</name>
+            <value>text,text,text,text,text</value>
+        </property>
+        <property>
+            <name>top_bycountry_cassandra_fields</name>
+            <value>countriesJSON</value>
+        </property>
+        <property>
+            <name>top_bycountry_cassandra_primary_keys</name>
+            <value>_domain,project,year,month,day,_tid</value>
+        </property>
+
+        <!--**************************************************
                        Unique devices parameters
         **************************************************-->
 
@@ -660,6 +713,142 @@
         </configuration>
     </coordinator>
 
+    <!--**************************************************
+                       Pageviews by country coordinators
+        **************************************************-->
+
+    <coordinator name="cassandra-coord-pageviews-bycountry-daily">
+        <app-path>${coordinator_file_daily}</app-path>
+        <configuration>
+            <property>
+                <name>workflow_file</name>
+                <value>${workflow_file_daily}</value>
+            </property>
+
+            <!-- Dataset the job depends on -->
+            <property>
+                <name>datasets_file</name>
+                <value>${top_bycountry_datasets_file}</value>
+            </property>
+            <property>
+                <name>dataset_name</name>
+                <value>${top_bycountry_dataset_name}</value>
+            </property>
+            <!-- frequency of the dataset to manage different datasets
+                 correctly in coordinator -->
+            <property>
+                <name>dataset_freq</name>
+                <value>${top_bycountry_dataset_freq}</value>
+            </property>
+
+            <!-- Hive data preparation job stuff -->
+            <property>
+                <name>hive_script</name>
+                <value>${top_bycountry_hive_script}</value>
+            </property>
+            <property>
+                <name>source_table</name>
+                <value>${top_bycountry_source_table}</value>
+            </property>
+
+            <!-- cassandra loader job stuff -->
+            <property>
+                <name>cassandra_parallel_loaders</name>
+                <value>${top_bycountry_parallel_loaders}</value>
+            </property>
+            <property>
+                <name>cassandra_cql</name>
+                <value>${top_bycountry_cql}</value>
+            </property>
+            <property>
+                <name>cassandra_keyspace</name>
+                <value>${top_bycountry_keyspace}</value>
+            </property>
+            <property>
+                <name>hive_fields</name>
+                <value>${top_bycountry_hive_fields}</value>
+            </property>
+            <property>
+                <name>hive_fields_types</name>
+                <value>${top_bycountry_hive_fields_types}</value>
+            </property>
+            <property>
+                <name>cassandra_fields</name>
+                <value>${top_bycountry_cassandra_fields}</value>
+            </property>
+            <property>
+                <name>cassandra_primary_keys</name>
+                <value>${top_bycountry_cassandra_primary_keys}</value>
+            </property>
+        </configuration>
+    </coordinator>
+
+    <coordinator name="cassandra-coord-pageviews-bycountry-monthly">
+        <app-path>${coordinator_file_monthly}</app-path>
+        <configuration>
+            <property>
+                <name>workflow_file</name>
+                <value>${workflow_file_monthly}</value>
+            </property>
+
+            <!-- Dataset the job depends on -->
+            <property>
+                <name>datasets_file</name>
+                <value>${top_bycountry_datasets_file}</value>
+            </property>
+            <property>
+                <name>dataset_name</name>
+                <value>${top_bycountry_dataset_name}</value>
+            </property>
+            <!-- frequency of the dataset to manage different datasets
+                 correctly in coordinator -->
+            <property>
+                <name>dataset_freq</name>
+                <value>${top_bycountry_dataset_freq}</value>
+            </property>
+
+            <!-- Hive data preparation job stuff -->
+            <property>
+                <name>hive_script</name>
+                <value>${top_bycountry_hive_script}</value>
+            </property>
+            <property>
+                <name>source_table</name>
+                <value>${top_bycountry_source_table}</value>
+            </property>
+
+            <!-- cassandra loader job stuff -->
+            <property>
+                <name>cassandra_parallel_loaders</name>
+                <value>${top_bycountry_parallel_loaders}</value>
+            </property>
+            <property>
+                <name>cassandra_cql</name>
+                <value>${top_bycountry_cql}</value>
+            </property>
+            <property>
+                <name>cassandra_keyspace</name>
+                <value>${top_bycountry_keyspace}</value>
+            </property>
+            <property>
+                <name>hive_fields</name>
+                <value>${top_bycountry_hive_fields}</value>
+            </property>
+            <property>
+                <name>hive_fields_types</name>
+                <value>${top_bycountry_hive_fields_types}</value>
+            </property>
+            <property>
+                <name>cassandra_fields</name>
+                <value>${top_bycountry_cassandra_fields}</value>
+            </property>
+            <property>
+                <name>cassandra_primary_keys</name>
+                <value>${top_bycountry_cassandra_primary_keys}</value>
+            </property>
+        </configuration>
+    </coordinator>
+
 
     <!--**************************************************
                        Unique devices coordinators
diff --git a/oozie/cassandra/coord_pageviews_bycountry_daily.properties 
b/oozie/cassandra/coord_pageviews_bycountry_daily.properties
new file mode 100644
index 0000000..89f195a
--- /dev/null
+++ b/oozie/cassandra/coord_pageviews_bycountry_daily.properties
@@ -0,0 +1,99 @@
+# Configures a coordinator to manage loading cassandra for the top countries 
daily
+# pageview API. Any of the following properties are overridable with -D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-05-05T00:00Z -submit -config 
oozie/cassandra/coord_pageviews_bycountry_daily.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+
+user                              = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should
+# override this to point directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory.
+# E.g.  /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                = ${name_node}/wmf/refinery/current
+
+# HDFS path to the refinery job jar that will be used by this job.
+refinery_cassandra_jar_path       = 
${refinery_directory}/artifacts/org/wikimedia/analytics/refinery/refinery-cassandra-0.0.35.jar
+cassandra_reducer_class           = 
org.wikimedia.analytics.refinery.cassandra.ReducerToCassandra
+cassandra_output_format_class     = 
org.wikimedia.analytics.refinery.cassandra.CqlOutputFormat
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                   = ${refinery_directory}/oozie
+
+# HDFS path to coordinators to run.
+coordinator_file_hourly           = 
${oozie_directory}/cassandra/hourly/coordinator.xml
+coordinator_file_daily            = 
${oozie_directory}/cassandra/daily/coordinator.xml
+coordinator_file_monthly          = 
${oozie_directory}/cassandra/monthly/coordinator.xml
+
+# HDFS path to workflows to run.
+workflow_file_hourly              = 
${oozie_directory}/cassandra/hourly/workflow.xml
+workflow_file_daily               = 
${oozie_directory}/cassandra/daily/workflow.xml
+workflow_file_monthly             = 
${oozie_directory}/cassandra/monthly/workflow.xml
+
+# Workflow to send an error email
+send_error_email_workflow_file    = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+# HDFS path to datasets definitions
+pageview_datasets_file            = ${oozie_directory}/pageview/datasets.xml
+projectview_datasets_file         = ${oozie_directory}/projectview/datasets.xml
+pageview_data_directory           = ${name_node}/wmf/data/wmf/pageview
+projectview_data_directory        = ${name_node}/wmf/data/wmf/projectview
+dataset_freq                      = hour
+
+# Initial import time of the webrequest dataset.
+start_time                        = 2015-05-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+hive_site_xml                     = ${name_node}/user/hive/hive-site.xml
+# Temporary directory
+temporary_directory               = ${name_node}/tmp
+
+# Cassandra cluster info
+cassandra_host                    = aqs1004-a.eqiad.wmnet
+cassandra_port                    = 9042
+cassandra_username                = aqsloader
+cassandra_password                = cassandra
+cassandra_nodes                   = 6
+batch_size                        = 1024
+
+# Hive value separator
+hive_value_separator              = \\t
+# Cassandra table to be loaded (not job dependant)
+cassandra_table                   = data
+
+# Constant field names and value to be loaded into cassandra
+constant_output_domain_field      = _domain
+constant_output_domain_value      = analytics.wikimedia.org,text
+constant_output_granularity_field = granularity
+constant_output_tid_field         = _tid
+constant_output_tid_value         = 0,timeuuid
+
+workflow_file                     = ${workflow_file_daily}
+datasets_file                     = ${pageview_datasets_file}
+dataset_name                      = pageview_hourly
+hive_script                       = pageview_top_bycountry.hql
+source_table                      = wmf.pageview_hourly
+cassandra_parallel_loaders        = 6
+cassandra_keyspace                = 
local_group_default_T_top_pageviews_bycountry
+cassandra_cql                     = UPDATE "${cassandra_keyspace}"."data" SET 
"countriesJSON" = ?
+hive_fields                       = project,year,month,day,countriesJSON
+hive_fields_types                 = text,text,text,text,text
+cassandra_fields                  = countriesJSON
+cassandra_primary_keys            = _domain,project,year,month,day,_tid
+
+# Coordinator to start.
+oozie.coord.application.path     = ${coordinator_file_daily}
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
\ No newline at end of file
diff --git a/oozie/cassandra/coord_pageviews_bycountry_monthly.properties 
b/oozie/cassandra/coord_pageviews_bycountry_monthly.properties
new file mode 100644
index 0000000..01ee58a
--- /dev/null
+++ b/oozie/cassandra/coord_pageviews_bycountry_monthly.properties
@@ -0,0 +1,101 @@
+# Configures a coordinator to manage loading cassandra for the top countries 
monthly
+# pageview API. Any of the following properties are overridable with -D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2015-05-05T00:00Z -submit -config 
oozie/cassandra/coord_pageviews_bycountry_monthly.properties
+#
+# NOTE:  The $oozie_directory must be synced to HDFS so that all relevant
+#        .xml files exist there when this job is submitted.
+
+
+name_node                         = hdfs://analytics-hadoop
+job_tracker                       = resourcemanager.analytics.eqiad.wmnet:8032
+queue_name                        = default
+
+user                              = hdfs
+
+# Base path in HDFS to refinery.
+# When submitting this job for production, you should
+# override this to point directly at a deployed
+# directory name, and not the 'symbolic' 'current' directory.
+# E.g.  /wmf/refinery/2015-01-05T17.59.18Z--7bb7f07
+refinery_directory                = ${name_node}/wmf/refinery/current
+
+# HDFS path to the refinery job jar that will be used by this job.
+refinery_cassandra_jar_path       = 
${refinery_directory}/artifacts/org/wikimedia/analytics/refinery/refinery-cassandra-0.0.35.jar
+cassandra_reducer_class           = 
org.wikimedia.analytics.refinery.cassandra.ReducerToCassandra
+cassandra_output_format_class     = 
org.wikimedia.analytics.refinery.cassandra.CqlOutputFormat
+
+# Base path in HDFS to oozie files.
+# Other files will be used relative to this path.
+oozie_directory                   = ${refinery_directory}/oozie
+
+# HDFS path to coordinators to run.
+coordinator_file_hourly           = 
${oozie_directory}/cassandra/hourly/coordinator.xml
+coordinator_file_daily            = 
${oozie_directory}/cassandra/daily/coordinator.xml
+coordinator_file_monthly          = 
${oozie_directory}/cassandra/monthly/coordinator.xml
+
+# HDFS path to workflows to run.
+workflow_file_hourly              = 
${oozie_directory}/cassandra/hourly/workflow.xml
+workflow_file_daily               = 
${oozie_directory}/cassandra/daily/workflow.xml
+workflow_file_monthly             = 
${oozie_directory}/cassandra/monthly/workflow.xml
+
+# Workflow to send an error email
+send_error_email_workflow_file    = 
${oozie_directory}/util/send_error_email/workflow.xml
+
+# HDFS path to datasets definitions
+pageview_datasets_file            = ${oozie_directory}/pageview/datasets.xml
+projectview_datasets_file         = ${oozie_directory}/projectview/datasets.xml
+pageview_data_directory           = ${name_node}/wmf/data/wmf/pageview
+projectview_data_directory        = ${name_node}/wmf/data/wmf/projectview
+# provides the frequency of the dataset used to manage
+#different datasets correctly in coordinator
+dataset_freq                      = hour
+
+# Initial import time of the webrequest dataset.
+start_time                        = 2015-05-01T00:00Z
+
+# Time to stop running this coordinator.  Year 3000 == never!
+stop_time                         = 3000-01-01T00:00Z
+
+# HDFS path to hive-site.xml file.  This is needed to run hive actions.
+hive_site_xml                     = ${name_node}/user/hive/hive-site.xml
+# Temporary directory
+temporary_directory               = ${name_node}/tmp
+
+# Cassandra cluster info
+cassandra_host                    = aqs1004-a.eqiad.wmnet
+cassandra_port                    = 9042
+cassandra_username                = aqsloader
+cassandra_password                = cassandra
+cassandra_nodes                   = 6
+batch_size                        = 1024
+
+# Hive value separator
+hive_value_separator              = \\t
+# Cassandra table to be loaded (not job dependant)
+cassandra_table                   = data
+
+# Constant field names and value to be loaded into cassandra
+constant_output_domain_field      = _domain
+constant_output_domain_value      = analytics.wikimedia.org,text
+constant_output_granularity_field = granularity
+constant_output_tid_field         = _tid
+constant_output_tid_value         = 0,timeuuid
+
+workflow_file                     = ${workflow_file_monthly}
+datasets_file                     = ${pageview_datasets_file}
+dataset_name                      = pageview_hourly
+hive_script                       = pageview_top_bycountry.hql
+source_table                      = wmf.pageview_hourly
+cassandra_parallel_loaders        = 6
+cassandra_keyspace                = 
local_group_default_T_top_pageviews_bycountry
+cassandra_cql                     = UPDATE "${cassandra_keyspace}"."data" SET 
"countriesJSON" = ?
+hive_fields                       = project,year,month,day,countriesJSON
+hive_fields_types                 = text,text,text,text,text
+cassandra_fields                  = countriesJSON
+cassandra_primary_keys            = _domain,project,year,month,day,_tid
+
+# Coordinator to start.
+oozie.coord.application.path      = ${coordinator_file_monthly}
+oozie.use.system.libpath          = true
+oozie.action.external.stats.write = true
\ No newline at end of file
diff --git a/oozie/cassandra/coord_top_articles_daily.properties 
b/oozie/cassandra/coord_top_articles_daily.properties
index 0c4c7c1..5d41246 100644
--- a/oozie/cassandra/coord_top_articles_daily.properties
+++ b/oozie/cassandra/coord_top_articles_daily.properties
@@ -83,15 +83,15 @@
 workflow_file                     = ${workflow_file_daily}
 datasets_file                     = ${pageview_datasets_file}
 dataset_name                      = pageview_hourly
-hive_script                       = pageview_top_articles.hql
+hive_script                       = pageview_top_countries.hql
 source_table                      = wmf.pageview_hourly
 cassandra_parallel_loaders        = 6
-cassandra_keyspace                = local_group_default_T_top_pageviews
-cassandra_cql                     = UPDATE "${cassandra_keyspace}"."data" SET 
"articlesJSON" = ?
-hive_fields                       = project,access,year,month,day,articlesJSON
-hive_fields_types                 = text,text,text,text,text,text
-cassandra_fields                  = articlesJSON
-cassandra_primary_keys            = _domain,project,access,year,month,day,_tid
+cassandra_keyspace                = 
local_group_default_T_top_pageviews_bycountry
+cassandra_cql                     = UPDATE "${cassandra_keyspace}"."data" SET 
"countriesJSON" = ?
+hive_fields                       = project,year,month,day,countriesJSON
+hive_fields_types                 = text,text,text,text,text
+cassandra_fields                  = countriesJSON
+cassandra_primary_keys            = _domain,project,year,month,day,_tid
 
 # Coordintator to start.
 oozie.coord.application.path     = ${coordinator_file_daily}
diff --git a/oozie/cassandra/daily/pageview_top_bycountry.hql 
b/oozie/cassandra/daily/pageview_top_bycountry.hql
new file mode 100644
index 0000000..72aaa48
--- /dev/null
+++ b/oozie/cassandra/daily/pageview_top_bycountry.hql
@@ -0,0 +1,102 @@
+-- Parameters:
+--     destination_directory -- HDFS path to write output files
+--     source_table          -- Fully qualified table name to compute from.
+--     separator             -- Separator for values
+--     year                  -- year of partition to compute from.
+--     month                 -- month of partition to compute from.
+--     day                   -- day of partition to compute from.
+--
+-- Usage:
+--     hive -f pageview_top_bycountry.hql                         \
+--         -d destination_directory=/tmp/pageview_top_bycountry   \
+--         -d source_table=wmf.pageview_hourly                    \
+--         -d separator=\t                                        \
+--         -d year=2015                                           \
+--         -d month=5                                             \
+--         -d day=1                                               \
+--
+
+
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+
+
+WITH ranked AS (
+    SELECT
+        project,
+        country,
+        year,
+        month,
+        day,
+        views,
+        rank() OVER (PARTITION BY project, year, month, day ORDER BY views 
DESC) as rank,
+        row_number() OVER (PARTITION BY project, year, month, day ORDER BY 
views DESC) as rn
+    FROM (
+        SELECT
+            project,
+            country,
+            LPAD(year, 4, "0") as year,
+            LPAD(month, 2, "0") as month,
+            LPAD(day, 2, "0") as day,
+            SUM(view_count) as views
+        FROM ${source_table}
+        WHERE
+            year = ${year}
+            AND month = ${month}
+            AND day = ${day}
+            -- Remove special unknown pageview (see T117346)
+            AND page_title != '-'
+        GROUP BY project, country, year, month, day
+        GROUPING SETS (
+            (project, country, year, month, day)
+        )
+    ) raw
+),
+max_rank AS (
+    SELECT
+        project,
+        year,
+        month,
+        day,
+        rank as max_rank
+    FROM ranked
+    WHERE
+        rn = 1001
+    GROUP BY
+        project,
+        year,
+        month,
+        day,
+        rank
+)
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+-- Since "ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '" only
+-- works for exports to local directories (see HIVE-5672), we have to
+-- prepare the lines by hand through concatenation :-(
+SELECT
+    CONCAT_WS("${separator}",
+        ranked.project,
+        ranked.year,
+        ranked.month,
+        ranked.day,
+        CONCAT('[',
+            CONCAT_WS(',', collect_set(
+                CONCAT('{"country":"', ranked.country,
+                    '","views":', CAST(ranked.views AS STRING),
+                    ',"rank":', CAST(ranked.rank AS STRING), '}'))
+            ),']')
+    )
+FROM ranked
+LEFT JOIN max_rank ON (
+    ranked.project = max_rank.project
+    AND ranked.year = max_rank.year
+    AND ranked.month = max_rank.month
+    AND ranked.day = max_rank.day
+)
+WHERE ranked.rank < COALESCE(max_rank.max_rank, 1001)
+GROUP BY
+    ranked.project,
+    ranked.year,
+    ranked.month,
+    ranked.day
+;
\ No newline at end of file
diff --git a/oozie/cassandra/monthly/pageview_top_bycountry.sql 
b/oozie/cassandra/monthly/pageview_top_bycountry.sql
new file mode 100644
index 0000000..05ed92b
--- /dev/null
+++ b/oozie/cassandra/monthly/pageview_top_bycountry.sql
@@ -0,0 +1,95 @@
+-- Parameters:
+--     destination_directory -- HDFS path to write output files
+--     source_table          -- Fully qualified table name to compute from.
+--     separator             -- Separator for values
+--     year                  -- year of partition to compute from.
+--     month                 -- month of partition to compute from.
+--     day                   -- day of partition to compute from.
+--
+-- Usage:
+--     hive -f pageview_top_bycountry.hql                         \
+--         -d destination_directory=/tmp/pageview_top_bycountry   \
+--         -d source_table=wmf.pageview_hourly                    \
+--         -d separator=\t                                        \
+--         -d year=2015                                           \
+--         -d month=5                                             \
+--         -d day=1                                               \
+--
+
+
+SET hive.exec.compress.output=true;
+SET 
mapreduce.output.fileoutputformat.compress.codec=org.apache.hadoop.io.compress.GzipCodec;
+
+
+WITH ranked AS (
+    SELECT
+        project,
+        country,
+        year,
+        month,
+        views,
+        rank() OVER (PARTITION BY project, year, month ORDER BY views 
DESC) as rank,
+        row_number() OVER (PARTITION BY project, year, month ORDER BY 
views DESC) as rn
+    FROM (
+        SELECT
+            project,
+            country,
+            LPAD(year, 4, "0") as year,
+            LPAD(month, 2, "0") as month,
+            SUM(view_count) as views
+        FROM ${source_table}
+        WHERE
+            year = ${year}
+            AND month = ${month}
+            -- Remove special unknown pageview (see T117346)
+            AND page_title != '-'
+        GROUP BY project, country, year, month
+        GROUPING SETS (
+            (project, country, year, month)
+        )
+    ) raw
+),
+max_rank AS (
+    SELECT
+        project,
+        year,
+        month,
+        rank as max_rank
+    FROM ranked
+    WHERE
+        rn = 1001
+    GROUP BY
+        project,
+        year,
+        month,
+        rank
+)
+INSERT OVERWRITE DIRECTORY "${destination_directory}"
+-- Since "ROW FORMAT DELIMITED FIELDS TERMINATED BY ' '" only
+-- works for exports to local directories (see HIVE-5672), we have to
+-- prepare the lines by hand through concatenation :-(
+SELECT
+    CONCAT_WS("${separator}",
+        ranked.project,
+        ranked.year,
+        ranked.month,
+        'all-days',
+        CONCAT('[',
+            CONCAT_WS(',', collect_set(
+                CONCAT('{"country":"', ranked.country,
+                    '","views":', CAST(ranked.views AS STRING),
+                    ',"rank":', CAST(ranked.rank AS STRING), '}'))
+            ),']')
+    )
+FROM ranked
+LEFT JOIN max_rank ON (
+    ranked.project = max_rank.project
+    AND ranked.year = max_rank.year
+    AND ranked.month = max_rank.month
+)
+WHERE ranked.rank < COALESCE(max_rank.max_rank, 1001)
+GROUP BY
+    ranked.project,
+    ranked.year,
+    ranked.month
+;
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/394062
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I35b7736c07c5f2346cecf6c388dd252632aab88c
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Fdans <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to