QChris has submitted this change and it was merged.
Change subject: Add --location CLI opt to refinery-drop-webrequest-partitions
......................................................................
Add --location CLI opt to refinery-drop-webrequest-partitions
Since data location in external tables is defined by partition, not
by table schema, and since we recently removed the base table
location information from the webrequest table schema, this
script needs to know where to look for webrequest data in HDFS.
Change-Id: Icfd327195569ef8f1eba12ea50ba6c0dba4fb5e3
---
M bin/refinery-drop-webrequest-partitions
M python/refinery/util.py
2 files changed, 23 insertions(+), 7 deletions(-)
Approvals:
QChris: Verified; Looks good to me, approved
diff --git a/bin/refinery-drop-webrequest-partitions
b/bin/refinery-drop-webrequest-partitions
index bb996c1..ddb871a 100755
--- a/bin/refinery-drop-webrequest-partitions
+++ b/bin/refinery-drop-webrequest-partitions
@@ -27,6 +27,8 @@
-d --older-than-days=<days> Drop data older than this number of
days. [default: 60]
-D --database=<dbname> Hive database name. [default: default]
-t --table=<table> Name of webrequest table. [default:
webrequest]
+ -l --location=<location> Base HDFS location path of the
webrequest table. If not
+ specified, this will be inferred from
the table schema metadata.
-o --hive-options=<options> Any valid Hive CLI options you want to
pass to Hive commands.
Example: '--auxpath
/path/to/hive-serdes-1.0-SNAPSHOT.jar'
-v --verbose Turn on verbose debug logging.
@@ -39,6 +41,7 @@
import logging
import re
import os
+import sys
from refinery.util import HiveUtils, HdfsUtils
# from pprint import pprint as pp
@@ -48,12 +51,13 @@
# parse arguments
arguments = docopt(__doc__)
# pp(arguments)
- days = int(arguments['--older-than-days'])
- database = arguments['--database']
- table = arguments['--table']
- hive_options = arguments['--hive-options']
- verbose = arguments['--verbose']
- dry_run = arguments['--dry-run']
+ days = int(arguments['--older-than-days'])
+ database = arguments['--database']
+ table = arguments['--table']
+ table_location = arguments['--location']
+ hive_options = arguments['--hive-options']
+ verbose = arguments['--verbose']
+ dry_run = arguments['--dry-run']
log_level = logging.INFO
if verbose:
@@ -83,7 +87,15 @@
hive = HiveUtils(database, hive_options)
# The base location of this webrequest table in HDFS.
- table_location = hive.table_location(table)
+ # If it was not provided via the CLI, then attempt to
+    # infer it from the table metadata.
+ if table_location == None:
+ table_location = hive.table_location(table)
+
+ if not HdfsUtils.validate_path(table_location):
+ logging.error('{0} table location \'{1}\' is not a valid HDFS path.
Path must start with \'/\' or \'hdfs://\'. Aborting.'
+ .format(table, table_location))
+ sys.exit(1)
# This glob will be used to list out all partition paths in HDFS.
partition_glob = os.path.join(table_location, '*', 'hourly', '*', '*',
'*', '*')
diff --git a/python/refinery/util.py b/python/refinery/util.py
index c77dd29..9092a37 100755
--- a/python/refinery/util.py
+++ b/python/refinery/util.py
@@ -443,3 +443,7 @@
return sh(['hdfs', 'dfs', '-rm', '-R'] + paths)
+ @staticmethod
+ def validate_path(path):
+ return path.startswith('/') or path.startswith('hdfs://')
+
--
To view, visit https://gerrit.wikimedia.org/r/155281
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Icfd327195569ef8f1eba12ea50ba6c0dba4fb5e3
Gerrit-PatchSet: 4
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Ottomata <[email protected]>
Gerrit-Reviewer: QChris <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits