Yurik has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/168168

Change subject: create table if doesn't exist, runner script
......................................................................

create table if doesn't exist, runner script

Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
---
A scripts/run-hivezero.sh
M scripts/zero-counts.hql
2 files changed, 64 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/zero-sms 
refs/changes/68/168168/1

diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
new file mode 100644
index 0000000..c1a8e96
--- /dev/null
+++ b/scripts/run-hivezero.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+if [[ -z "$4" ]]; then
+       last=$3
+else
+       last=$4
+fi
+
+for ((day = $3; day <= $last; day++)); do
+       printf -v p "%04d-%02d-%02d" $1 $2 $day
+       echo hive -f zero-counts.hql -d "year="$1 -d "month="$2 -d "day="$3 -d 
"date="$p
+       hive -f zero-counts.hql -d "year="$1 -d "month="$2 -d "day="$3 -d 
"date="$p
+done
diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql
index 67a7fb6..2e7d0db 100644
--- a/scripts/zero-counts.hql
+++ b/scripts/zero-counts.hql
@@ -5,25 +5,38 @@
 -- Extracts zero stats from webrequests into a separate table
 --
 -- Usage:
---     hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15
+--     hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 -d 
date=2014-09-15
+--     Date is duplicated because I haven't figured an easy way to set 
date=printf()
 --
--- Range usage:
---      cat inp.txt | xargs -I % hive -f zero-counts.hql -d year=2014 -d 
month=10 -d "day="%
--- or
---      for i in {1..5}; do hive -f zero-counts.hql -d year=2014 -d month=10 
-d "day="$i; done
-
 -- set hivevar:year=2014;
 -- set hivevar:month=10;
 -- set hivevar:day=21;
+-- set hivevar:date=2014-10-21;
+
+use yurik;
+
+
+CREATE TABLE IF NOT EXISTS yurik.zero_webstats (
+  xcs string,
+  via string,
+  ipset string,
+  https string,
+  lang string,
+  subdomain string,
+  site string,
+  count bigint)
+PARTITIONED BY (
+  date string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
 
 
 INSERT OVERWRITE TABLE yurik.zero_webstats
-    PARTITION(date) IF NOT EXISTS
+    PARTITION(date="${date}") IF NOT EXISTS
     SELECT
-        xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date
+        xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count
     FROM (
         SELECT
-            printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date,
             regexp_extract(x_analytics, 'zero=([^\;]+)') xcs,
             regexp_extract(x_analytics, 'proxy=([^\;]+)') via,
             regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset,
@@ -58,5 +71,32 @@
             AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] 
RLIKE '^[Uu]ndefined$')
 
     ) prepared
-    GROUP BY date, xcs, via, ipset, https, lang, subdomain, site
-    DISTRIBUTE BY date;
+    GROUP BY xcs, via, ipset, https, lang, subdomain, site
+    DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day});
+
+
+-- CREATE TABLE IF NOT EXISTS yurik.zero_webstats_sum (
+--   date string,
+--   tag string,
+--   count bigint)
+-- PARTITIONED BY (
+--   xcs string)
+-- ROW FORMAT DELIMITED
+--   FIELDS TERMINATED BY '\t';
+
+--   xcs string,
+--   via string,
+--   ipset string,
+--   https string,
+--   lang string,
+--   subdomain string,
+--   site string,
+--   count bigint)
+
+-- INSERT OVERWRITE TABLE yurik.zero_webstats_sum
+--     PARTITION(xcs)
+--     SELECT
+--         date, tag, COUNT(*) count, xcs
+--     FROM yurik.zero_webstats
+--     GROUP BY xcs, date, tag
+--     DISTRIBUTE BY xcs;

-- 
To view, visit https://gerrit.wikimedia.org/r/168168
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
Gerrit-PatchSet: 1
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to