QChris has submitted this change and it was merged.

Change subject: Add --location CLI opt to refinery-drop-webrequest-partitions
......................................................................


Add --location CLI opt to refinery-drop-webrequest-partitions

Since data location in external tables is defined by partition, not
by table schema, and since we recently removed the base table
location information from the webrequest table schema, this
script needs to know where to look for webrequest data in HDFS.

Change-Id: Icfd327195569ef8f1eba12ea50ba6c0dba4fb5e3
---
M bin/refinery-drop-webrequest-partitions
M python/refinery/util.py
2 files changed, 23 insertions(+), 7 deletions(-)

Approvals:
  QChris: Verified; Looks good to me, approved



diff --git a/bin/refinery-drop-webrequest-partitions b/bin/refinery-drop-webrequest-partitions
index bb996c1..ddb871a 100755
--- a/bin/refinery-drop-webrequest-partitions
+++ b/bin/refinery-drop-webrequest-partitions
@@ -27,6 +27,8 @@
     -d --older-than-days=<days>         Drop data older than this number of days.  [default: 60]
     -D --database=<dbname>              Hive database name.  [default: default]
     -t --table=<table>                  Name of webrequest table.  [default: webrequest]
+    -l --location=<location>            Base HDFS location path of the webrequest table.  If not
+                                        specified, this will be inferred from the table schema metadata.
     -o --hive-options=<options>         Any valid Hive CLI options you want to pass to Hive commands.
                                         Example: '--auxpath /path/to/hive-serdes-1.0-SNAPSHOT.jar'
     -v --verbose                        Turn on verbose debug logging.
@@ -39,6 +41,7 @@
 import logging
 import re
 import os
+import sys
 from refinery.util import HiveUtils, HdfsUtils
 
 # from pprint import pprint as pp
@@ -48,12 +51,13 @@
     # parse arguments
     arguments = docopt(__doc__)
     # pp(arguments)
-    days          = int(arguments['--older-than-days'])
-    database      = arguments['--database']
-    table         = arguments['--table']
-    hive_options  = arguments['--hive-options']
-    verbose       = arguments['--verbose']
-    dry_run       = arguments['--dry-run']
+    days           = int(arguments['--older-than-days'])
+    database       = arguments['--database']
+    table          = arguments['--table']
+    table_location = arguments['--location']
+    hive_options   = arguments['--hive-options']
+    verbose        = arguments['--verbose']
+    dry_run        = arguments['--dry-run']
 
     log_level = logging.INFO
     if verbose:
@@ -83,7 +87,15 @@
     hive = HiveUtils(database, hive_options)
 
     # The base location of this webrequest table in HDFS.
-    table_location = hive.table_location(table)
+    # If it was not provided via the CLI, then attempt to
+    # infer it from the table metadata.
+    if table_location == None:
+        table_location = hive.table_location(table)
+
+    if not HdfsUtils.validate_path(table_location):
+        logging.error('{0} table location \'{1}\' is not a valid HDFS path.  Path must start with \'/\' or \'hdfs://\'.  Aborting.'
+            .format(table, table_location))
+        sys.exit(1)
 
     # This glob will be used to list out all partition paths in HDFS.
     partition_glob = os.path.join(table_location, '*', 'hourly', '*', '*', '*', '*')
diff --git a/python/refinery/util.py b/python/refinery/util.py
index c77dd29..9092a37 100755
--- a/python/refinery/util.py
+++ b/python/refinery/util.py
@@ -443,3 +443,7 @@
 
         return sh(['hdfs', 'dfs', '-rm', '-R'] + paths)
 
+    @staticmethod
+    def validate_path(path):
+        return path.startswith('/') or path.startswith('hdfs://')
+

-- 
To view, visit https://gerrit.wikimedia.org/r/155281
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Icfd327195569ef8f1eba12ea50ba6c0dba4fb5e3
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: QChris <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to