Yurik has submitted this change and it was merged.
Change subject: create table if doesn't exist, runner script
......................................................................
create table if doesn't exist, runner script
Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
---
A scripts/run-hivezero.sh
M scripts/zero-counts.hql
2 files changed, 64 insertions(+), 11 deletions(-)
Approvals:
Yurik: Verified; Looks good to me, approved
diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
new file mode 100644
index 0000000..c1a8e96
--- /dev/null
+++ b/scripts/run-hivezero.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+if [[ -z "$4" ]]; then
+ last=$3
+else
+ last=$4
+fi
+
+for ((day = $3; day <= $last; day++)); do
+ printf -v p "%04d-%02d-%02d" $1 $2 $day
+ echo hive -f zero-counts.hql -d "year="$1 -d "month="$2 -d "day="$3 -d
"date="$p
+ hive -f zero-counts.hql -d "year="$1 -d "month="$2 -d "day="$3 -d
"date="$p
+done
diff --git a/scripts/zero-counts.hql b/scripts/zero-counts.hql
index 67a7fb6..2e7d0db 100644
--- a/scripts/zero-counts.hql
+++ b/scripts/zero-counts.hql
@@ -5,25 +5,38 @@
-- Extracts zero stats from webrequests into a separate table
--
-- Usage:
--- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15
+-- hive -f zero-counts.hql -d year=2014 -d month=9 -d day=15 -d
date=2014-09-15
+-- Date is duplicated because I haven't figured an easy way to set
date=printf()
--
--- Range usage:
--- cat inp.txt | xargs -I % hive -f zero-counts.hql -d year=2014 -d
month=10 -d "day="%
--- or
--- for i in {1..5}; do hive -f zero-counts.hql -d year=2014 -d month=10
-d "day="$i; done
-
-- set hivevar:year=2014;
-- set hivevar:month=10;
-- set hivevar:day=21;
+-- set hivevar:date=2014-10-21;
+
+use yurik;
+
+
+CREATE TABLE IF NOT EXISTS yurik.zero_webstats (
+ xcs string,
+ via string,
+ ipset string,
+ https string,
+ lang string,
+ subdomain string,
+ site string,
+ count bigint)
+PARTITIONED BY (
+ date string)
+ROW FORMAT DELIMITED
+ FIELDS TERMINATED BY '\t';
INSERT OVERWRITE TABLE yurik.zero_webstats
- PARTITION(date) IF NOT EXISTS
+ PARTITION(date="${date}") IF NOT EXISTS
SELECT
- xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count, date
+ xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count
FROM (
SELECT
- printf('%d-%02d-%02d', ${year}, ${month}, ${day}) date,
regexp_extract(x_analytics, 'zero=([^\;]+)') xcs,
regexp_extract(x_analytics, 'proxy=([^\;]+)') via,
regexp_extract(x_analytics, 'zeronet=([^\;]+)') ipset,
@@ -58,5 +71,32 @@
AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0]
RLIKE '^[Uu]ndefined$')
) prepared
- GROUP BY date, xcs, via, ipset, https, lang, subdomain, site
- DISTRIBUTE BY date;
+ GROUP BY xcs, via, ipset, https, lang, subdomain, site
+ DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day});
+
+
+-- CREATE TABLE IF NOT EXISTS yurik.zero_webstats_sum (
+-- date string,
+-- tag string,
+-- count bigint)
+-- PARTITIONED BY (
+-- xcs string)
+-- ROW FORMAT DELIMITED
+-- FIELDS TERMINATED BY '\t';
+
+-- xcs string,
+-- via string,
+-- ipset string,
+-- https string,
+-- lang string,
+-- subdomain string,
+-- site string,
+-- count bigint)
+
+-- INSERT OVERWRITE TABLE yurik.zero_webstats_sum
+-- PARTITION(xcs)
+-- SELECT
+-- date, tag, COUNT(*) count, xcs
+-- FROM yurik.zero_webstats
+-- GROUP BY xcs, date, tag
+-- DISTRIBUTE BY xcs;
--
To view, visit https://gerrit.wikimedia.org/r/168168
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I29f04352801dabc312d08664e9521ab115d5e498
Gerrit-PatchSet: 1
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik <[email protected]>
Gerrit-Reviewer: Yurik <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits