[MediaWiki-commits] [Gerrit] swift-thumb-stats: dump thumb stats from swift - change (operations/software)
Filippo Giunchedi has submitted this change and it was merged.

Change subject: swift-thumb-stats: dump thumb stats from swift
..

swift-thumb-stats: dump thumb stats from swift

basic script to process thumbs into JSON for later analysis

Change-Id: Iec2e5e5dee17e2bb29d5d4c0e334f0c2defbc961
---
A thumbstats/swift-thumb-stats
1 file changed, 194 insertions(+), 0 deletions(-)

Approvals:
  Filippo Giunchedi: Verified; Looks good to me, approved
  jenkins-bot: Verified

diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats
new file mode 100755
index 0000000..03e9e5e
--- /dev/null
+++ b/thumbstats/swift-thumb-stats
@@ -0,0 +1,194 @@
+#!/usr/bin/python
+
+# this script will scan all containers with thumbnails for the given account
+# and feed each thumbnail to a filter. At the end the result from each filter
+# is printed on standard output in JSON in an object like:
+# {'FooFilter': result, 'BarFilter': result}
+
+import argparse
+import collections
+import datetime
+import json
+import os
+import pprint
+import re
+import sys
+import threading
+
+import swiftclient
+
+CONTAINER_THUMB_RE = re.compile('-thumb(\.[a-f0-9][a-f0-9])?$')
+THUMB_RE = re.compile('/(?P<size>\d+)px-(?P<name>.*)$')
+
+
+class Thumb(object):
+    pass
+
+
+class Filter(object):
+    def process(self, thumb):
+        pass
+    def result(self):
+        pass
+
+
+class BytesPerSize(Filter):
+    """Size vs size breakdown."""
+    _bytes = {}
+    def process(self, thumb):
+        self._bytes[thumb.thumbsize] = \
+                self._bytes.setdefault(thumb.thumbsize, 0) + int(thumb.bytes)
+    def result(self):
+        return self._bytes
+    def __str__(self):
+        return "BytesPerSize"
+
+
+class CountPerSize(Filter):
+    """Size vs count breakdown."""
+
+    _count = {}
+    def process(self, thumb):
+        self._count[thumb.thumbsize] = \
+                self._count.setdefault(thumb.thumbsize, 0) + 1
+    def result(self):
+        return self._count
+    def __str__(self):
+        return "CountPerSize"
+
+
+class BytesByMonth(Filter):
+    """Year+month vs size vs bytes breakdown."""
+    _month = {}
+    def process(self, thumb):
+        key = thumb.last_modified[:7]
+        size = thumb.thumbsize
+        self._month[key] = self._month.setdefault(key, {})
+        self._month[key][size] = \
+                self._month[key].setdefault(size, 0) + int(thumb.bytes)
+    def result(self):
+        return self._month
+    def __str__(self):
+        return "BytesPerMonth"
+
+
+def iter_container(connection, name, limit=None):
+    """Iterate over the container contents."""
+
+    _, listing = connection.get_container(name, limit=limit)
+    while listing:
+        for container in listing:
+            yield container
+        marker = container['name']
+        _, listing = connection.get_container(name, limit=limit, marker=marker)
+
+
+def iter_thumbs(container):
+    """Iterate over the container contents and yield Thumb objects."""
+
+    for thumb in container:
+        m = THUMB_RE.search(thumb['name'])
+        if not m:
+            continue
+        t = Thumb()
+        t.name = m.group('name')
+        t.thumbsize = m.group('size')
+        t.filename = thumb['name']
+        t.bytes = thumb['bytes']
+        t.last_modified = thumb['last_modified']
+        t.hash = thumb['hash']
+        t.content_type = thumb['content_type']
+        yield t
+
+
+def _process_container(container, connection, filters):
+    """Iterate over the container thumbs and pass items to each filter."""
+
+    container_name = container['name']
+    thumbs = iter_thumbs(iter_container(connection, container_name))
+    start = datetime.datetime.utcnow()
+    for i, thumb in enumerate(thumbs):
+        thumb.container_name = container_name
+        for f in filters:
+            f.process(thumb)
+        if i and i % 10000 == 0:
+            now = datetime.datetime.utcnow()
+            elapsed = now - start
+            start = now
+            print >>sys.stderr, "%s: inserted 10000 records from %s (%s)" % (
+                    threading.current_thread().name, container_name, elapsed)
+
+
+def process_container(in_queue, connection, filters):
+    while True:
+        try:
+            container = in_queue.popleft()
+            _process_container(container, connection, filters)
+        except IndexError:
+            break
+
+
+def _join_threads(threads):
+    """Join the given threads while accepting KeyboardInterrupt."""
+
+    _threads = threads[:]
+    while _threads:
+        try:
+            for thread in _threads[:]:
+                if not thread.is_alive():
+                    _threads.remove(thread)
+                else:
+                    thread.join(timeout=0.1)
+        except KeyboardInterrupt:
+            break
+
+
+def thumb_containers(connection):
+    headers, containers = connection.get_account(full_listing=True)
+    for container in containers:
+        if
[MediaWiki-commits] [Gerrit] swift-thumb-stats: dump thumb stats from swift - change (operations/software)
Filippo Giunchedi has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/148997

Change subject: swift-thumb-stats: dump thumb stats from swift
..

swift-thumb-stats: dump thumb stats from swift

basic script to process thumbs into JSON for later analysis

Change-Id: Iec2e5e5dee17e2bb29d5d4c0e334f0c2defbc961
---
A thumbstats/swift-thumb-stats
1 file changed, 192 insertions(+), 0 deletions(-)

  git pull ssh://gerrit.wikimedia.org:29418/operations/software refs/changes/97/148997/1

diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats
new file mode 100755
index 0000000..304ceb5
--- /dev/null
+++ b/thumbstats/swift-thumb-stats
@@ -0,0 +1,192 @@
+#!/usr/bin/python
+
+# this script will scan all containers with thumbnails for the given account
+# and feed each thumbnail to a filter. At the end the result from each filter
+# is printed on standard output in JSON in an object like:
+# {'FooFilter': result, 'BarFilter': result}
+
+import argparse
+import collections
+import datetime
+import json
+import os
+import pprint
+import re
+import sys
+import threading
+
+import swiftclient
+
+CONTAINER_THUMB_RE = re.compile('-thumb(\.[a-f0-9][a-f0-9])?$')
+THUMB_RE = re.compile('/(?P<size>\d+)px-(?P<name>.*)$')
+
+
+class Thumb(object):
+    pass
+
+
+class Filter(object):
+    def process(self, thumb):
+        pass
+    def result(self):
+        pass
+
+
+class BytesPerSize(Filter):
+    _bytes = {}
+    def process(self, thumb):
+        self._bytes[thumb.thumbsize] = \
+                self._bytes.setdefault(thumb.thumbsize, 0) + int(thumb.bytes)
+    def result(self):
+        return self._bytes
+    def __str__(self):
+        return "BytesPerSize"
+
+
+class CountPerSize(Filter):
+    """Size vs count breakdown."""
+
+    _count = {}
+    def process(self, thumb):
+        self._count[thumb.thumbsize] = \
+                self._count.setdefault(thumb.thumbsize, 0) + 1
+    def result(self):
+        return self._count
+    def __str__(self):
+        return "CountPerSize"
+
+
+class BytesByMonth(Filter):
+    """Year+month vs size vs bytes breakdown."""
+    _month = {}
+    def process(self, thumb):
+        key = thumb.last_modified[:7]
+        size = thumb.thumbsize
+        self._month[key] = self._month.setdefault(key, {})
+        self._month[key][size] = \
+                self._month[key].setdefault(size, 0) + int(thumb.bytes)
+    def result(self):
+        return self._month
+    def __str__(self):
+        return "BytesPerMonth"
+
+
+def iter_container(connection, name, limit=None):
+    """Iterate over the container contents."""
+
+    _, listing = connection.get_container(name, limit=limit)
+    while listing:
+        for container in listing:
+            yield container
+        marker = container['name']
+        _, listing = connection.get_container(name, limit=limit, marker=marker)
+
+
+def iter_thumbs(container):
+    """Iterate over the container contents and yield Thumb objects."""
+
+    for thumb in container:
+        m = THUMB_RE.search(thumb['name'])
+        if not m:
+            continue
+        t = Thumb()
+        t.name = m.group('name')
+        t.thumbsize = m.group('size')
+        t.filename = thumb['name']
+        t.bytes = thumb['bytes']
+        t.last_modified = thumb['last_modified']
+        t.hash = thumb['hash']
+        t.content_type = thumb['content_type']
+        yield t
+
+
+def _process_container(container, connection, filters):
+    """Iterate over the container thumbs and pass items to each filter."""
+
+    container_name = container['name']
+    thumbs = iter_thumbs(iter_container(connection, container_name))
+    start = datetime.datetime.utcnow()
+    for i, thumb in enumerate(thumbs):
+        thumb.container_name = container_name
+        for f in filters:
+            f.process(thumb)
+        if i and i % 10000 == 0:
+            now = datetime.datetime.utcnow()
+            elapsed = now - start
+            start = now
+            print >>sys.stderr, "%s: inserted 10000 records from %s (%s)" % (
+                    threading.current_thread().name, container_name, elapsed)
+
+
+def process_container(in_queue, connection, filters):
+    while True:
+        try:
+            container = in_queue.popleft()
+            _process_container(container, connection, filters)
+        except IndexError:
+            break
+
+
+def _join_threads(threads):
+    """Join the given threads while accepting KeyboardInterrupt."""
+
+    _threads = threads[:]
+    while _threads:
+        try:
+            for thread in _threads[:]:
+                if not thread.is_alive():
+                    _threads.remove(thread)
+                else:
+                    thread.join(timeout=0.1)
+        except KeyboardInterrupt:
+            break
+
+
+def thumb_containers(connection):
+    headers, containers = connection.get_account(full_listing=True)
+    for container in containers:
+        if