Madhuvishy has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/240299

Change subject: [WIP] Add script to drop old eventlogging partitions
......................................................................

[WIP] Add script to drop old eventlogging partitions

Bug: T106253
Change-Id: Iefe6a185bfc1dff5512f902859dc444e9f139f96
---
A bin/refinery-drop-eventlogging-partitions
1 file changed, 116 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/analytics/refinery 
refs/changes/99/240299/1

diff --git a/bin/refinery-drop-eventlogging-partitions 
b/bin/refinery-drop-eventlogging-partitions
new file mode 100755
index 0000000..1485c0c
--- /dev/null
+++ b/bin/refinery-drop-eventlogging-partitions
@@ -0,0 +1,116 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Note: You should make sure to put refinery/python on your PYTHONPATH.
#   export PYTHONPATH=$PYTHONPATH:/path/to/refinery/python

"""
Automatically deletes the hourly time bucketed old eventlogging
directories from HDFS.

Usage: refinery-drop-eventlogging-partitions [options]

Options:
    -h --help                           Show this help message and exit.
    -d --older-than-days=<days>         Drop data older than this number of days.  [default: 60]
    -l --location=<location>            Base HDFS location path of the eventlogging data.
    -v --verbose                        Turn on verbose debug logging.
    -n --dry-run                        Don't actually delete any data. Print the HDFS directory paths
                                        that will be deleted
"""
__author__ = 'Madhumitha Viswanathan <[email protected]>'

import datetime
import logging
import os
import re
import sys

from docopt import docopt

from refinery.util import HiveUtils, HdfsUtils


if __name__ == '__main__':
    # Parse CLI arguments (docopt reads the module docstring above).
    arguments = docopt(__doc__)
    days     = int(arguments['--older-than-days'])
    location = arguments['--location']
    verbose  = arguments['--verbose']
    dry_run  = arguments['--dry-run']

    log_level = logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(level=log_level,
                        format='%(asctime)s %(levelname)-6s %(message)s',
                        datefmt='%Y-%m-%dT%H:%M:%S')

    # --location is required; fail fast with a clear message instead of
    # passing None into HdfsUtils.validate_path / os.path.join below.
    if location is None:
        logging.error('No --location given.  Aborting.')
        sys.exit(1)

    if not HdfsUtils.validate_path(location):
        logging.error('Location \'{0}\' is not a valid HDFS path.  Path must start with \'/\' or \'hdfs://\'.  Aborting.'
            .format(location))
        sys.exit(1)

    # This glob will be used to list out all partition paths in HDFS.
    # Partition paths look like:
    #   <location>/<schema>/hourly/<year>/<month>/<day>/<hour>
    partition_glob = os.path.join(location, '*', 'hourly', '*', '*', '*', '*')

    # This regex tells HiveUtils partition_datetime_from_path
    # how to extract just the date portion from a partition path.
    # The first match group will be passed to datetime.datetime.strptime
    # using the below date_format.
    date_regex = re.compile(r'.*/hourly/(.+)$')

    # Used to extract a datetime object from the string
    # matched by date_regex in HiveUtils partition_datetime_from_path.
    date_format = '%Y/%m/%d/%H'

    # Delete partitions older than this.
    old_partition_datetime_threshold = datetime.datetime.now() - datetime.timedelta(days=days)

    partition_paths_to_delete = []

    # Loop through all the partition directory paths
    # and collect any that are old enough for deletion.
    for partition_path in HdfsUtils.ls(partition_glob, include_children=False):
        try:
            partition_datetime = HiveUtils.partition_datetime_from_path(
                partition_path,
                date_regex,
                date_format
            )
        except ValueError as e:
            # An unparseable path is skipped (logged) rather than aborting
            # the whole run — other partitions can still be cleaned up.
            logging.error(
                'HiveUtils.partition_datetime_from_path could not parse date found in {0} using pattern {1}. Skipping. ({2})'
                .format(partition_path, date_regex.pattern, e)
            )
            continue

        if partition_datetime and partition_datetime < old_partition_datetime_threshold:
            partition_paths_to_delete.append(partition_path)

    # Delete any old HDFS data.
    if partition_paths_to_delete:
        if dry_run:
            # Print the equivalent shell command instead of deleting.
            print('hdfs dfs -rm -R ' + ' '.join(partition_paths_to_delete))
        else:
            logging.info('Removing {0} eventlogging partition directories from {1}.'
                .format(len(partition_paths_to_delete), location)
            )
            HdfsUtils.rm(' '.join(partition_paths_to_delete))
    else:
        logging.info('No eventlogging partition directories need to be removed')

-- 
To view, visit https://gerrit.wikimedia.org/r/240299
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iefe6a185bfc1dff5512f902859dc444e9f139f96
Gerrit-PatchSet: 1
Gerrit-Project: analytics/refinery
Gerrit-Branch: master
Gerrit-Owner: Madhuvishy <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to