Madhuvishy has uploaded a new change for review. https://gerrit.wikimedia.org/r/240299
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Note: You should make sure to put refinery/python on your PYTHONPATH.
#   export PYTHONPATH=$PYTHONPATH:/path/to/refinery/python

# Provenance: [WIP] Gerrit change https://gerrit.wikimedia.org/r/240299
# (Bug: T106253, Change-Id: Iefe6a185bfc1dff5512f902859dc444e9f139f96).

"""
Automatically deletes the hourly time bucketed old eventlogging
directories from HDFS.

Usage: refinery-drop-eventlogging-partitions [options]

Options:
    -h --help                           Show this help message and exit.
    -d --older-than-days=<days>         Drop data older than this number of days.  [default: 60]
    -l --location=<location>            Base HDFS location path of the eventlogging data.
    -v --verbose                        Turn on verbose debug logging.
    -n --dry-run                        Don't actually delete any data.
                                        Print the HDFS directory paths
                                        that will be deleted
"""
__author__ = 'Madhumitha Viswanathan <[email protected]>'

import datetime
import logging
import os
import re
import sys

from docopt import docopt

from refinery.util import HiveUtils, HdfsUtils


# This regex tells HiveUtils.partition_datetime_from_path how to extract
# just the date portion from a partition path.  The first match group is
# passed to datetime.datetime.strptime using DATE_FORMAT.
DATE_REGEX = re.compile(r'.*/hourly/(.+)$')

# Used to build a datetime object from the string matched by DATE_REGEX.
DATE_FORMAT = '%Y/%m/%d/%H'


def _parse_days(raw_days):
    """Convert the --older-than-days value to a non-negative int.

    Logs an error and exits with status 1 if the value is not an integer
    or is negative.  A negative value would move the deletion threshold
    into the future, which is never what the caller of a destructive
    tool intends.
    """
    try:
        days = int(raw_days)
    except (TypeError, ValueError):
        logging.error(
            "--older-than-days must be an integer, got '%s'. Aborting.",
            raw_days
        )
        sys.exit(1)

    if days < 0:
        logging.error(
            '--older-than-days must not be negative, got %d. Aborting.',
            days
        )
        sys.exit(1)

    return days


def _find_old_partition_paths(location, threshold):
    """Return the partition directories under location older than threshold.

    Lists every path matching <location>/*/hourly/*/*/*/* and keeps those
    whose datetime, parsed from the path, is strictly before threshold.
    Paths whose date cannot be parsed are logged and skipped.
    """
    # This glob is used to list out all partition paths in HDFS.
    partition_glob = os.path.join(location, '*', 'hourly', '*', '*', '*', '*')

    old_paths = []
    for partition_path in HdfsUtils.ls(partition_glob, include_children=False):
        try:
            partition_datetime = HiveUtils.partition_datetime_from_path(
                partition_path,
                DATE_REGEX,
                DATE_FORMAT
            )
        except ValueError as e:
            logging.error(
                'HiveUtils.partition_datetime_from_path could not parse date '
                'found in %s using pattern %s. Skipping. (%s)',
                partition_path, DATE_REGEX.pattern, e
            )
            continue

        # A falsy result is skipped as well; presumably that means the path
        # did not match DATE_REGEX at all -- TODO confirm in refinery.util.
        if partition_datetime and partition_datetime < threshold:
            old_paths.append(partition_path)

    return old_paths


def main():
    """Parse the command line and drop (or, with --dry-run, print) old partitions."""
    arguments = docopt(__doc__)
    location = arguments['--location']
    verbose = arguments['--verbose']
    dry_run = arguments['--dry-run']

    logging.basicConfig(
        level=logging.DEBUG if verbose else logging.INFO,
        format='%(asctime)s %(levelname)-6s %(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S'
    )

    days = _parse_days(arguments['--older-than-days'])

    # --location has no default, so docopt yields None when it is omitted.
    # Fail with a clear message instead of crashing further down.
    if location is None:
        logging.error('--location is required. Aborting.')
        sys.exit(1)

    if not HdfsUtils.validate_path(location):
        logging.error(
            "Location '%s' is not a valid HDFS path. "
            "Path must start with '/' or 'hdfs://'. Aborting.",
            location
        )
        sys.exit(1)

    # Delete partitions older than this.
    # NOTE(review): this is a naive local-time datetime; it is compared with
    # datetimes parsed from the partition paths, so it presumably expects the
    # host clock and the partition layout to share a timezone -- confirm.
    old_partition_datetime_threshold = (
        datetime.datetime.now() - datetime.timedelta(days=days)
    )

    partition_paths_to_delete = _find_old_partition_paths(
        location, old_partition_datetime_threshold
    )

    if not partition_paths_to_delete:
        logging.info('No eventlogging partition directories need to be removed')
        return

    # Delete any old HDFS data.
    joined_paths = ' '.join(partition_paths_to_delete)
    if dry_run:
        print('hdfs dfs -rm -R ' + joined_paths)
    else:
        logging.info(
            'Removing %d eventlogging partition directories from %s.',
            len(partition_paths_to_delete), location
        )
        HdfsUtils.rm(joined_paths)


if __name__ == '__main__':
    main()
