Yurik has submitted this change and it was merged.

Change subject: Generified scripts, extra script for new hive fmt
......................................................................


Generified scripts, extra script for new hive fmt

Change-Id: If6a01ec549ec8d78efe27d0bce176bb694a71889
---
M scripts/run-hivezero.sh
M scripts/weblogs2.py
A scripts/zero-counts2.hql
A scripts/zero-counts3.hql
4 files changed, 187 insertions(+), 15 deletions(-)

Approvals:
  Yurik: Verified; Looks good to me, approved



diff --git a/scripts/run-hivezero.sh b/scripts/run-hivezero.sh
index 0081304..35c2855 100755
--- a/scripts/run-hivezero.sh
+++ b/scripts/run-hivezero.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
 
-#                   $1                 $2   $3 $4 %5 $6
-# ./run-hivezero.sh wmf_raw.webrequest 2014 10 1  31
-# ./run-hivezero.sh webreq_archive     2014 10 1  31 overwrite
+#                   $1                 $2   $3 $4 %5 $6              $7        
       $8
+# ./run-hivezero.sh wmf_raw.webrequest 2014 10 1  31 zero_webstats   
zero-counts.hql
+# ./run-hivezero.sh wmf.webrequest     2014 10 1  31 zero_webstats2  
zero-counts2.hql
+# ./run-hivezero.sh webreq_archive     2014 10 1  31 zero_webstats__ 
zero-counts.hql  overwrite
 
 set -e
 
@@ -23,6 +24,19 @@
        monthTo=$3
 fi
 
+if [[ -z "$6" ]]; then
+       dsttable=zero_webstats2
+else
+       dsttable=$6
+fi
+
+if [[ -z "$7" ]]; then
+       script=zero-counts2.hql
+else
+       script=$7
+fi
+
+
 for ((month = $monthFrom; month <= $monthTo; month++)); do
 for ((day = $4; day <= $last; day++)); do
 
@@ -32,6 +46,8 @@
 
                if [[ "$table" == 'wmf_raw.webrequest' ]]; then
                        
path="/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/$year/$(printf 
"%02d" $month)/$(printf "%02d" $day)/23"
+               elif [[ "$table" == 'wmf.webrequest' ]]; then
+                       
path="/mnt/hdfs/wmf/data/wmf/webrequest/webrequest_source=mobile/year=$year/month=$month/day=$day/hour=23"
                else
                        
path="/mnt/hdfs/user/hive/warehouse/yurik.db/$table/year=$year/month=$month/day=$day"
                fi
@@ -45,20 +61,20 @@
                        continue
                fi
 
-               
path="/mnt/hdfs/user/hive/warehouse/yurik.db/zero_webstats/date="$date
+               
path="/mnt/hdfs/user/hive/warehouse/yurik.db/"$dsttable"/date="$date
                echo "***** Checking if '$path' exists"
                if [ -d $path ]; then
-                       if [ "$6" == "overwrite" ]; then
+                       if [ "$8" == "overwrite" ]; then
                                echo "***** Droping partition '$date'"
-                               hive -e "use yurik; ALTER TABLE zero_webstats 
DROP IF EXISTS PARTITION(date = '$date');"
+                               hive -e "use yurik; ALTER TABLE "$dsttable" 
DROP IF EXISTS PARTITION(date = '$date');"
                        else
                                echo "***** Skipping '$date'"
                                continue
                        fi
                fi
                echo -e "*****\n*****\n*****\n*****"
-               echo "*****" hive -f zero-counts.hql -d "table="$table -d 
"year="$year -d "month="$month -d "day="$day -d "date="$date
-               export HADOOP_HEAPSIZE=2048 && hive -f zero-counts.hql -d 
"table="$table -d "year="$year -d "month="$month -d "day="$day -d "date="$date
+               echo "*****" hive -f $script -d "table="$table  -d 
"dsttable="$dsttable -d "year="$year -d "month="$month -d "day="$day -d 
"date="$date
+               export HADOOP_HEAPSIZE=2048 && hive -f $script -d 
"table="$table  -d "dsttable="$dsttable -d "year="$year -d "month="$month -d 
"day="$day -d "date="$date
 
        fi
 
diff --git a/scripts/weblogs2.py b/scripts/weblogs2.py
index f2367df..43023e6 100644
--- a/scripts/weblogs2.py
+++ b/scripts/weblogs2.py
@@ -117,7 +117,8 @@
 
 
 class WebLogProcessor2(LogProcessor):
-    def __init__(self, settingsFile='settings/weblogs2.json'):
+    def __init__(self, settingsFile):
+        print('Using settings %s' % settingsFile)
         super(WebLogProcessor2, self).__init__(settingsFile, 'web2')
 
         self._configs = None
@@ -129,6 +130,10 @@
     def defaultSettings(self, suffix):
         s = super(WebLogProcessor2, self).defaultSettings(suffix)
         s.checkAfterTs = False
+        s.hiveTable = 'wmf_raw.webrequest'
+        s.dstTable = 'zero_webstats'
+        s.hqlScript = 'zero-counts.hql'
+        s.wikiPageSuffix = ''
         return s
 
     def onSavingSettings(self):
@@ -146,9 +151,17 @@
     def runHql(self):
         os.environ["HADOOP_HEAPSIZE"] = "2048"
 
-        for date in dateRange(self.settings.checkAfterTs, datetime.today()):
-            path = 
'/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/%s/23' \
+        if self.settings.hiveTable == 'wmf_raw.webrequest':
+            pathFunc = lambda dt: 
'/mnt/hdfs/wmf/data/raw/webrequest/webrequest_upload/hourly/%s/23' \
                    % strftime("%Y/%m/%d", date.timetuple())
+        elif self.settings.hiveTable == 'wmf.webrequest':
+            pathFunc = lambda dt: 
'/mnt/hdfs/wmf/data/wmf/webrequest/webrequest_source=mobile/year=%s/month=%s/day=%s/hour=23'
 \
+                   % (date.year, date.month, date.day)
+        else:
+            raise 'Unknown hiveTable = ' + str(self.settings.hiveTable)
+
+        for date in dateRange(self.settings.checkAfterTs, datetime.today()):
+            path = pathFunc(date)
             if not os.path.exists(path):
                 continue
             size = sum(os.path.getsize(os.path.join(path, f)) for f in 
os.listdir(path) if
@@ -162,9 +175,10 @@
                 continue
 
             cmd = ['hive',
-                   '-f', 'zero-counts.hql',
+                   '-f', self.settings.hqlScript,
                    '-S',  # --silent
-                   '-d', 'table=wmf_raw.webrequest',
+                   '-d', 'table=' + self.settings.hiveTable,
+                   '-d', 'dsttable=' + self.settings.,
                    '-d', 'year=' + strftime("%Y", date.timetuple()),
                    '-d', 'month=' + strftime("%m", date.timetuple()),
                    '-d', 'day=' + strftime("%d", date.timetuple()),
@@ -474,7 +488,7 @@
             wiki = self.getWiki()
             wiki(
                 'edit',
-                title=wikiTitle,
+                title=wikiTitle + self.settings.wikiPageSuffix,
                 summary='refreshing data',
                 text=text,
                 token=wiki.token()
@@ -496,4 +510,5 @@
 
 if __name__ == '__main__':
     # WebLogProcessor2('settings/weblogs2.local.json').manualRun()
-    WebLogProcessor2().safeRun()
+    import sys
+    WebLogProcessor2('settings/weblogs2.json' if len(sys.argv) < 2 else 
sys.argv[1]).safeRun()
diff --git a/scripts/zero-counts2.hql b/scripts/zero-counts2.hql
new file mode 100644
index 0000000..4c89b6c
--- /dev/null
+++ b/scripts/zero-counts2.hql
@@ -0,0 +1,79 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+SET hive.exec.compress.output=false;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Extracts zero stats from webrequests into a separate table
+--
+-- Usage:
+--     hive -f zero-counts2.hql -d year=2014 -d month=9 -d day=15 -d 
date=2014-09-15 -d dsttable=zero_webstats3
+--     Date is duplicated because I haven't figured an easy way to set 
date=printf()
+--
+-- set hivevar:year=2014;
+-- set hivevar:month=10;
+-- set hivevar:day=21;
+-- set hivevar:date=2014-10-21;
+-- set hivevar:dsttable=zero_webstats3;
+
+use yurik;
+
+
+CREATE TABLE IF NOT EXISTS ${dsttable} (
+  xcs string,
+  via string,
+  ipset string,
+  https string,
+  lang string,
+  subdomain string,
+  site string,
+  count bigint)
+PARTITIONED BY (
+  date string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+
+-- ALTER TABLE ${dsttable} DROP IF EXISTS PARTITION(date < '${date}')
+
+
+INSERT OVERWRITE TABLE ${dsttable}
+    PARTITION(date="${date}") IF NOT EXISTS
+    SELECT
+        xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count
+    FROM (
+        SELECT
+            COALESCE(regexp_extract(x_analytics, 'zero=([^\;]+)'), '') xcs,
+            COALESCE(regexp_extract(x_analytics, 'proxy=([^\;]+)'), '') via,
+            COALESCE(regexp_extract(x_analytics, 'zeronet=([^\;]+)'), '') 
ipset,
+            if (x_analytics LIKE '%https=1%', 'https', '') https,
+            COALESCE(regexp_extract(uri_host, 
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 1), '') lang,
+            COALESCE(regexp_extract(uri_host, 
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 3), '') subdomain,
+            COALESCE(regexp_extract(uri_host, 
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 4), '') site
+
+        FROM wmf.webrequest
+        WHERE
+            webrequest_source IN ('text', 'mobile')
+            AND year=${year}
+            AND month=${month}
+            AND day=${day}
+            AND x_analytics LIKE '%zero=%'
+            AND SUBSTR(uri_path, 1, 6) = '/wiki/'
+            AND (
+                    (
+                        SUBSTR(ip, 1, 9) != '10.128.0.'
+                        AND SUBSTR(ip, 1, 11) NOT IN (
+                            '208.80.152.',
+                            '208.80.153.',
+                            '208.80.154.',
+                            '208.80.155.',
+                            '91.198.174.'
+                        )
+                    ) OR x_forwarded_for != '-'
+                )
+            AND SUBSTR(uri_path, 1, 31) != '/wiki/Special:CentralAutoLogin/'
+            AND http_status NOT IN ( '301', '302', '303' )
+            AND uri_host RLIKE '^[A-Za-z0-9-]+(\\.(zero|m))?\\.[a-z]*\\.org$'
+            AND NOT (SPLIT(TRANSLATE(SUBSTR(uri_path, 7), ' ', '_'), '#')[0] 
RLIKE '^[Uu]ndefined$')
+
+    ) prepared
+    GROUP BY xcs, via, ipset, https, lang, subdomain, site
+    DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day});
diff --git a/scripts/zero-counts3.hql b/scripts/zero-counts3.hql
new file mode 100644
index 0000000..8315b49
--- /dev/null
+++ b/scripts/zero-counts3.hql
@@ -0,0 +1,62 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+SET hive.exec.compress.output=false;
+--^ To work around HIVE-3296, we have SETs before any comments
+
+-- Extracts zero stats from webrequests into a separate table
+--
+-- Usage:
+--     hive -f zero-counts3.hql -d year=2015 -d month=4 -d day=9 -d 
date=2015-04-09 -d dsttable=zero_webstats3
+--     Date is duplicated because I haven't figured an easy way to set 
date=printf()
+--
+-- set hivevar:year=2014;
+-- set hivevar:month=10;
+-- set hivevar:day=21;
+-- set hivevar:date=2014-10-21;
+-- set hivevar:dsttable=zero_webstats3;
+
+use yurik;
+
+
+CREATE TABLE IF NOT EXISTS ${dsttable} (
+  xcs string,
+  via string,
+  ipset string,
+  https string,
+  lang string,
+  subdomain string,
+  site string,
+  count bigint)
+PARTITIONED BY (
+  date string)
+ROW FORMAT DELIMITED
+  FIELDS TERMINATED BY '\t';
+
+
+-- ALTER TABLE ${dsttable} DROP IF EXISTS PARTITION(date < '${date}')
+
+
+INSERT OVERWRITE TABLE ${dsttable}
+    PARTITION(date="${date}") IF NOT EXISTS
+    SELECT
+        xcs, via, ipset, https, lang, subdomain, site, COUNT(*) count
+    FROM (
+        SELECT
+            COALESCE(regexp_extract(x_analytics, 'zero=([^\;]+)'), '') xcs,
+            COALESCE(regexp_extract(x_analytics, 'proxy=([^\;]+)'), '') via,
+            COALESCE(regexp_extract(x_analytics, 'zeronet=([^\;]+)'), '') 
ipset,
+            if (x_analytics LIKE '%https=1%', 'https', '') https,
+            COALESCE(regexp_extract(uri_host, 
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 1), '') lang,
+            COALESCE(regexp_extract(uri_host, 
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 3), '') subdomain,
+            COALESCE(regexp_extract(uri_host, 
'^([A-Za-z0-9-]+)(\\.(zero|m))?\\.([a-z]*)\\.org$', 4), '') site
+
+        FROM wmf.webrequest
+        WHERE
+            webrequest_source IN ('text', 'mobile')
+            AND year=${year}
+            AND month=${month}
+            AND day=${day}
+            AND x_analytics LIKE '%zero=%'  -- use is_zero field instead, once 
db is fixed
+            AND is_pageview
+    ) prepared
+    GROUP BY xcs, via, ipset, https, lang, subdomain, site
+    DISTRIBUTE BY printf('%d-%02d-%02d', ${year}, ${month}, ${day});

-- 
To view, visit https://gerrit.wikimedia.org/r/203523
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: If6a01ec549ec8d78efe27d0bce176bb694a71889
Gerrit-PatchSet: 2
Gerrit-Project: analytics/zero-sms
Gerrit-Branch: master
Gerrit-Owner: Yurik <yu...@wikimedia.org>
Gerrit-Reviewer: Yurik <yu...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to