[MediaWiki-commits] [Gerrit] swift-thumb-stats: dump thumb stats from swift - change (operations/software)

2014-08-13 Thread Filippo Giunchedi (Code Review)
Filippo Giunchedi has submitted this change and it was merged.

Change subject: swift-thumb-stats: dump thumb stats from swift
..


swift-thumb-stats: dump thumb stats from swift

basic script to process thumbs into JSON for later analysis

Change-Id: Iec2e5e5dee17e2bb29d5d4c0e334f0c2defbc961
---
A thumbstats/swift-thumb-stats
1 file changed, 194 insertions(+), 0 deletions(-)

Approvals:
  Filippo Giunchedi: Verified; Looks good to me, approved
  jenkins-bot: Verified



diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats
new file mode 100755
index 0000000..03e9e5e
--- /dev/null
+++ b/thumbstats/swift-thumb-stats
@@ -0,0 +1,194 @@
+#!/usr/bin/python
+
+# this script will scan all containers with thumbnails for the given account
+# and feed each thumbnail to a filter. At the end the result from each filter
+# is printed on standard output in JSON in an object like:
+#   {'FooFilter': result, 'BarFilter': result}
+
+import argparse
+import collections
+import datetime
+import json
+import os
+import pprint
+import re
+import sys
+import threading
+
+import swiftclient
+
+# Matches a container name ending in "-thumb", optionally followed by a dot
+# and two lowercase hex digits (e.g. "foo-thumb" or "foo-thumb.a3").
+CONTAINER_THUMB_RE = re.compile(r'-thumb(\.[a-f0-9][a-f0-9])?$')
+# Matches the trailing "/<digits>px-<rest>" part of an object name and
+# exposes it as the named groups 'size' and 'name' read by iter_thumbs().
+THUMB_RE = re.compile(r'/(?P<size>\d+)px-(?P<name>.*)$')
+
+
+class Thumb(object):
+    """Plain attribute holder describing a single thumbnail.
+
+    The class defines no attributes itself; they are assigned by whoever
+    creates the instance (see iter_thumbs()).
+    """
+
+
+class Filter(object):
+    """Base class for thumbnail filters.
+
+    Subclasses override process() to consume one thumb at a time and
+    result() to return whatever they accumulated.
+    """
+
+    def process(self, thumb):
+        """Consume one thumb; the base implementation does nothing."""
+        pass
+
+    def result(self):
+        """Return the accumulated result; the base implementation gives None."""
+        pass
+
+
+class BytesPerSize(Filter):
+    """Size vs size breakdown: total bytes per thumbnail size."""
+
+    # Maps thumb.thumbsize -> sum of int(thumb.bytes).
+    # NOTE(review): defined on the class, so every instance shares this one
+    # dict -- confirm that is intended before creating several instances.
+    _bytes = {}
+
+    def process(self, thumb):
+        """Add thumb.bytes to the running total kept for thumb.thumbsize."""
+        size = thumb.thumbsize
+        self._bytes[size] = self._bytes.get(size, 0) + int(thumb.bytes)
+
+    def result(self):
+        """Return the dict mapping thumbnail size to total bytes."""
+        return self._bytes
+
+    def __str__(self):
+        """Return the name of this filter."""
+        return 'BytesPerSize'
+
+
+class CountPerSize(Filter):
+    """Size vs count breakdown: number of thumbs per thumbnail size."""
+
+    # Maps thumb.thumbsize -> number of thumbs seen with that size.
+    # NOTE(review): defined on the class, so every instance shares this one
+    # dict -- confirm that is intended before creating several instances.
+    _count = {}
+
+    def process(self, thumb):
+        """Increment the counter kept for thumb.thumbsize."""
+        size = thumb.thumbsize
+        self._count[size] = self._count.get(size, 0) + 1
+
+    def result(self):
+        """Return the dict mapping thumbnail size to thumb count."""
+        return self._count
+
+    def __str__(self):
+        """Return the name of this filter."""
+        return 'CountPerSize'
+
+
+class BytesByMonth(Filter):
+    """Year+month vs size vs bytes breakdown."""
+
+    # Maps thumb.last_modified[:7] -> {thumb.thumbsize: total bytes}.
+    # NOTE(review): defined on the class, so every instance shares this one
+    # dict -- confirm that is intended before creating several instances.
+    _month = {}
+
+    def process(self, thumb):
+        """Add thumb.bytes to the total for its month and thumbnail size."""
+        # The first seven characters of last_modified are used as the
+        # year+month key (presumably "YYYY-MM"; confirm the listing format).
+        per_size = self._month.setdefault(thumb.last_modified[:7], {})
+        size = thumb.thumbsize
+        per_size[size] = per_size.get(size, 0) + int(thumb.bytes)
+
+    def result(self):
+        """Return the nested dict: month -> thumbnail size -> total bytes."""
+        return self._month
+
+    def __str__(self):
+        """Return the name this filter reports under."""
+        # NOTE(review): the reported name is "BytesPerMonth" although the
+        # class is BytesByMonth; kept as-is because consumers of the output
+        # may rely on it -- confirm before renaming.
+        return 'BytesPerMonth'
+
+
+def iter_container(connection, name, limit=None):
+    """Iterate over the container contents.
+
+    Calls connection.get_container(name, ...) repeatedly, passing the 'name'
+    of the last entry received as the marker for the next call, and yields
+    every listing entry until an empty listing comes back.
+    """
+    _, listing = connection.get_container(name, limit=limit)
+    while listing:
+        for entry in listing:
+            yield entry
+        # resume the listing right after the last entry of this page
+        marker = listing[-1]['name']
+        _, listing = connection.get_container(name, limit=limit,
+                                              marker=marker)
+
+
+def iter_thumbs(container):
+    """Iterate over the container contents and yield Thumb objects.
+
+    container is an iterable of listing entries (dicts with the keys 'name',
+    'bytes', 'last_modified', 'hash' and 'content_type'). Entries whose name
+    does not match THUMB_RE are skipped.
+    """
+    for entry in container:
+        match = THUMB_RE.search(entry['name'])
+        if not match:
+            continue
+        thumb = Thumb()
+        thumb.name = match.group('name')
+        thumb.thumbsize = match.group('size')
+        thumb.filename = entry['name']
+        thumb.bytes = entry['bytes']
+        thumb.last_modified = entry['last_modified']
+        thumb.hash = entry['hash']
+        thumb.content_type = entry['content_type']
+        yield thumb
+
+
+def _process_container(container, connection, filters, report_every=10000):
+    """Iterate over the container thumbs and pass items to each filter.
+
+    container is a listing entry whose 'name' key holds the container name,
+    connection is handed to iter_container(), and filters is a sequence of
+    objects exposing process(thumb). Every report_every thumbs a progress
+    line with the time elapsed since the previous report goes to stderr;
+    a falsy report_every disables the reporting.
+    """
+    # NOTE(review): the archived copy of this patch shows the interval as
+    # "1", apparently with digits lost in extraction; 10000 is a
+    # reconstruction -- confirm against the repository.
+    container_name = container['name']
+    thumbs = iter_thumbs(iter_container(connection, container_name))
+    last_report = datetime.datetime.utcnow()
+    for i, thumb in enumerate(thumbs):
+        thumb.container_name = container_name
+        for f in filters:
+            f.process(thumb)
+        if report_every and i and i % report_every == 0:
+            now = datetime.datetime.utcnow()
+            elapsed = now - last_report
+            last_report = now
+            sys.stderr.write("%s: inserted %d records from %s (%s)\n" % (
+                threading.current_thread().name, report_every,
+                container_name, elapsed))
+
+
+def process_container(in_queue, connection, filters):
+    """Drain in_queue, running _process_container() on every item.
+
+    in_queue must offer popleft() raising IndexError once empty (for example
+    a collections.deque). The function returns when the queue is exhausted.
+    """
+    while True:
+        # Only the pop is guarded: an IndexError raised while processing a
+        # container must not be mistaken for "queue is empty".
+        try:
+            container = in_queue.popleft()
+        except IndexError:
+            return
+        _process_container(container, connection, filters)
+
+
+def _join_threads(threads):
+    """Join the given threads while accepting KeyboardInterrupt.
+
+    Each live thread is joined with a short timeout in a polling loop rather
+    than with one blocking join(). The loop ends when no thread is alive any
+    more or when KeyboardInterrupt is raised; the caller's collection is not
+    modified.
+    """
+    pending = list(threads)
+    try:
+        while pending:
+            still_running = []
+            for thread in pending:
+                if thread.is_alive():
+                    thread.join(timeout=0.1)
+                    # re-checked on the next pass
+                    still_running.append(thread)
+            pending = still_running
+    except KeyboardInterrupt:
+        # stop waiting; the threads are left as they are
+        return
+
+
+def thumb_containers(connection):
+headers, containers = connection.get_account(full_listing=True)
+for container in containers:
+if 

[MediaWiki-commits] [Gerrit] swift-thumb-stats: dump thumb stats from swift - change (operations/software)

2014-07-24 Thread Filippo Giunchedi (Code Review)
Filippo Giunchedi has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/148997

Change subject: swift-thumb-stats: dump thumb stats from swift
..

swift-thumb-stats: dump thumb stats from swift

basic script to process thumbs into JSON for later analysis

Change-Id: Iec2e5e5dee17e2bb29d5d4c0e334f0c2defbc961
---
A thumbstats/swift-thumb-stats
1 file changed, 192 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/software 
refs/changes/97/148997/1

diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats
new file mode 100755
index 0000000..304ceb5
--- /dev/null
+++ b/thumbstats/swift-thumb-stats
@@ -0,0 +1,192 @@
+#!/usr/bin/python
+
+# this script will scan all containers with thumbnails for the given account
+# and feed each thumbnail to a filter. At the end the result from each filter
+# is printed on standard output in JSON in an object like:
+#   {'FooFilter': result, 'BarFilter': result}
+
+import argparse
+import collections
+import datetime
+import json
+import os
+import pprint
+import re
+import sys
+import threading
+
+import swiftclient
+
+# Matches a container name ending in "-thumb", optionally followed by a dot
+# and two lowercase hex digits (e.g. "foo-thumb" or "foo-thumb.a3").
+CONTAINER_THUMB_RE = re.compile(r'-thumb(\.[a-f0-9][a-f0-9])?$')
+# Matches the trailing "/<digits>px-<rest>" part of an object name and
+# exposes it as the named groups 'size' and 'name' read by iter_thumbs().
+THUMB_RE = re.compile(r'/(?P<size>\d+)px-(?P<name>.*)$')
+
+
+class Thumb(object):
+    """Plain attribute holder describing a single thumbnail.
+
+    The class defines no attributes itself; they are assigned by whoever
+    creates the instance (see iter_thumbs()).
+    """
+
+
+class Filter(object):
+    """Base class for thumbnail filters.
+
+    Subclasses override process() to consume one thumb at a time and
+    result() to return whatever they accumulated.
+    """
+
+    def process(self, thumb):
+        """Consume one thumb; the base implementation does nothing."""
+        pass
+
+    def result(self):
+        """Return the accumulated result; the base implementation gives None."""
+        pass
+
+
+class BytesPerSize(Filter):
+    """Total bytes per thumbnail size."""
+
+    # Maps thumb.thumbsize -> sum of int(thumb.bytes).
+    # NOTE(review): defined on the class, so every instance shares this one
+    # dict -- confirm that is intended before creating several instances.
+    _bytes = {}
+
+    def process(self, thumb):
+        """Add thumb.bytes to the running total kept for thumb.thumbsize."""
+        size = thumb.thumbsize
+        self._bytes[size] = self._bytes.get(size, 0) + int(thumb.bytes)
+
+    def result(self):
+        """Return the dict mapping thumbnail size to total bytes."""
+        return self._bytes
+
+    def __str__(self):
+        """Return the name of this filter."""
+        return 'BytesPerSize'
+
+
+class CountPerSize(Filter):
+    """Size vs count breakdown: number of thumbs per thumbnail size."""
+
+    # Maps thumb.thumbsize -> number of thumbs seen with that size.
+    # NOTE(review): defined on the class, so every instance shares this one
+    # dict -- confirm that is intended before creating several instances.
+    _count = {}
+
+    def process(self, thumb):
+        """Increment the counter kept for thumb.thumbsize."""
+        size = thumb.thumbsize
+        self._count[size] = self._count.get(size, 0) + 1
+
+    def result(self):
+        """Return the dict mapping thumbnail size to thumb count."""
+        return self._count
+
+    def __str__(self):
+        """Return the name of this filter."""
+        return 'CountPerSize'
+
+
+class BytesByMonth(Filter):
+    """Year+month vs size vs bytes breakdown."""
+
+    # Maps thumb.last_modified[:7] -> {thumb.thumbsize: total bytes}.
+    # NOTE(review): defined on the class, so every instance shares this one
+    # dict -- confirm that is intended before creating several instances.
+    _month = {}
+
+    def process(self, thumb):
+        """Add thumb.bytes to the total for its month and thumbnail size."""
+        # The first seven characters of last_modified are used as the
+        # year+month key (presumably "YYYY-MM"; confirm the listing format).
+        per_size = self._month.setdefault(thumb.last_modified[:7], {})
+        size = thumb.thumbsize
+        per_size[size] = per_size.get(size, 0) + int(thumb.bytes)
+
+    def result(self):
+        """Return the nested dict: month -> thumbnail size -> total bytes."""
+        return self._month
+
+    def __str__(self):
+        """Return the name this filter reports under."""
+        # NOTE(review): the reported name is "BytesPerMonth" although the
+        # class is BytesByMonth; kept as-is because consumers of the output
+        # may rely on it -- confirm before renaming.
+        return 'BytesPerMonth'
+
+
+def iter_container(connection, name, limit=None):
+    """Iterate over the container contents.
+
+    Calls connection.get_container(name, ...) repeatedly, passing the 'name'
+    of the last entry received as the marker for the next call, and yields
+    every listing entry until an empty listing comes back.
+    """
+    _, listing = connection.get_container(name, limit=limit)
+    while listing:
+        for entry in listing:
+            yield entry
+        # resume the listing right after the last entry of this page
+        marker = listing[-1]['name']
+        _, listing = connection.get_container(name, limit=limit,
+                                              marker=marker)
+
+
+def iter_thumbs(container):
+    """Iterate over the container contents and yield Thumb objects.
+
+    container is an iterable of listing entries (dicts with the keys 'name',
+    'bytes', 'last_modified', 'hash' and 'content_type'). Entries whose name
+    does not match THUMB_RE are skipped.
+    """
+    for entry in container:
+        match = THUMB_RE.search(entry['name'])
+        if not match:
+            continue
+        thumb = Thumb()
+        thumb.name = match.group('name')
+        thumb.thumbsize = match.group('size')
+        thumb.filename = entry['name']
+        thumb.bytes = entry['bytes']
+        thumb.last_modified = entry['last_modified']
+        thumb.hash = entry['hash']
+        thumb.content_type = entry['content_type']
+        yield thumb
+
+
+def _process_container(container, connection, filters, report_every=10000):
+    """Iterate over the container thumbs and pass items to each filter.
+
+    container is a listing entry whose 'name' key holds the container name,
+    connection is handed to iter_container(), and filters is a sequence of
+    objects exposing process(thumb). Every report_every thumbs a progress
+    line with the time elapsed since the previous report goes to stderr;
+    a falsy report_every disables the reporting.
+    """
+    # NOTE(review): the archived copy of this patch shows the interval as
+    # "1", apparently with digits lost in extraction; 10000 is a
+    # reconstruction -- confirm against the repository.
+    container_name = container['name']
+    thumbs = iter_thumbs(iter_container(connection, container_name))
+    last_report = datetime.datetime.utcnow()
+    for i, thumb in enumerate(thumbs):
+        thumb.container_name = container_name
+        for f in filters:
+            f.process(thumb)
+        if report_every and i and i % report_every == 0:
+            now = datetime.datetime.utcnow()
+            elapsed = now - last_report
+            last_report = now
+            sys.stderr.write("%s: inserted %d records from %s (%s)\n" % (
+                threading.current_thread().name, report_every,
+                container_name, elapsed))
+
+
+def process_container(in_queue, connection, filters):
+    """Drain in_queue, running _process_container() on every item.
+
+    in_queue must offer popleft() raising IndexError once empty (for example
+    a collections.deque). The function returns when the queue is exhausted.
+    """
+    while True:
+        # Only the pop is guarded: an IndexError raised while processing a
+        # container must not be mistaken for "queue is empty".
+        try:
+            container = in_queue.popleft()
+        except IndexError:
+            return
+        _process_container(container, connection, filters)
+
+
+def _join_threads(threads):
+    """Join the given threads while accepting KeyboardInterrupt.
+
+    Each live thread is joined with a short timeout in a polling loop rather
+    than with one blocking join(). The loop ends when no thread is alive any
+    more or when KeyboardInterrupt is raised; the caller's collection is not
+    modified.
+    """
+    pending = list(threads)
+    try:
+        while pending:
+            still_running = []
+            for thread in pending:
+                if thread.is_alive():
+                    thread.join(timeout=0.1)
+                    # re-checked on the next pass
+                    still_running.append(thread)
+            pending = still_running
+    except KeyboardInterrupt:
+        # stop waiting; the threads are left as they are
+        return
+
+
+def thumb_containers(connection):
+headers, containers = connection.get_account(full_listing=True)
+for container in containers:
+if