Mwalker has submitted this change and it was merged. Change subject: Banner screenshot job ......................................................................
Banner screenshot job Renders a banner using PhantomJS, for every campaign language, and stores the images to disk. TODO: * Correct high-load behavior by waiting for page load or aborting. * Add a switch to toggle between in-context rendering and clipping to div#centralNotice. * Debug empty files. * Null file if size is zero. Change-Id: Ia7dcdaea11fddf5d2746e7e06e58b5bc9b857147 --- A banner_screenshot/.gitignore A banner_screenshot/config.py.example A banner_screenshot/mediawiki/__init__.py A banner_screenshot/mediawiki/centralnotice.py A banner_screenshot/mediawiki/time_util.py A banner_screenshot/rasterize.js A banner_screenshot/shoot_banners 7 files changed, 227 insertions(+), 0 deletions(-) Approvals: Mwalker: Verified; Looks good to me, approved diff --git a/banner_screenshot/.gitignore b/banner_screenshot/.gitignore new file mode 100644 index 0000000..df81b2c --- /dev/null +++ b/banner_screenshot/.gitignore @@ -0,0 +1,2 @@ +*.pyc +config.py diff --git a/banner_screenshot/config.py.example b/banner_screenshot/config.py.example new file mode 100644 index 0000000..b7f64ab --- /dev/null +++ b/banner_screenshot/config.py.example @@ -0,0 +1,7 @@ +article_url = "http://en.wikipedia.org/wiki/Special:Random?banner=%(banner)s&uselang=%(lang)s&country=%(country)s" +centralnotice_mw_api = "http://meta.wikimedia.org/w/api.php" +phantomjs = "/usr/local/phantomjs/phantomjs" +banner_screenshots_dir = "/tmp/banner_screenshots" +banner_screenshot_format = "png" +crop_height = 500 +banner_name_regex = r'^B13_.*_(?P<lang>[a-z]{2})(?P<country>[A-Z0-9]{2})$' diff --git a/banner_screenshot/mediawiki/__init__.py b/banner_screenshot/mediawiki/__init__.py new file mode 100644 index 0000000..247139a --- /dev/null +++ b/banner_screenshot/mediawiki/__init__.py @@ -0,0 +1,22 @@ +''' +Dumb interface to the MediaWiki api. 
+''' + +import config + +import json + +def mw_call( args ): + import simplemediawiki + + wiki = simplemediawiki.MediaWiki( + config.centralnotice_mw_api, + user_agent='bot: fr-screenshots' + ) + result = wiki.call( args ) + if 'error' in result: + raise RuntimeError(json.dumps(result, indent=4).replace('\\n', '\n')) + val = result[ args['action'] ] + if 'list' in args: + val = val[ args['list'] ] + return val diff --git a/banner_screenshot/mediawiki/centralnotice.py b/banner_screenshot/mediawiki/centralnotice.py new file mode 100644 index 0000000..4133d29 --- /dev/null +++ b/banner_screenshot/mediawiki/centralnotice.py @@ -0,0 +1,56 @@ +''' +Interface to the MediaWiki CentralNotice api +''' + +from mediawiki import mw_call + +cached_campaigns = {} + +def get_banners( **kw ): + if 'campaign' in kw: + campaign = get_campaign( kw['campaign'] ) + return campaign['banners'].keys() + return get_allocations( **kw ) + +def get_campaign( campaign ): + #TODO: push caching down into mediawiki.mw_call, with optional invalidation + global cached_campaigns + if campaign in cached_campaigns: + return cached_campaigns[campaign] + + #if '__iter__' in campaign: return get_campaigns + result = mw_call( { + 'action': 'centralnoticequerycampaign', + 'campaign': campaign, + } ) + + if campaign in result: + cached_campaigns[campaign] = result[campaign] + return cached_campaigns[campaign] + +def get_campaigns( campaigns ): + #FIXME cache + return mw_call( { + 'action': 'centralnoticequerycampaign', + 'campaign': '|'.join( campaigns ), + } ) + +def get_allocations( project=None, language=None, country=None, anonymous=True, bucket='0' ): + result = mw_call( { + 'action': 'centralnoticeallocations', + 'project': project, + 'language': language, + 'country': country, + 'anonymous': anonymous, + 'bucket': bucket, + 'minimal': 'false' + } ) + return result['banners'] + +def get_campaign_logs( since=None ): + result = mw_call( { + 'action': 'query', + 'list': 'centralnoticelogs', + 'start': 
since, + } ) + return result['logs'] diff --git a/banner_screenshot/mediawiki/time_util.py b/banner_screenshot/mediawiki/time_util.py new file mode 100644 index 0000000..b02c2a6 --- /dev/null +++ b/banner_screenshot/mediawiki/time_util.py @@ -0,0 +1,17 @@ +from datetime import datetime, timedelta + +def str_time_offset(str_time=None, **delta_args): + if not str_time: + str_time = str_now() + time_time = datetime.strptime( str_time, '%Y%m%d%H%M%S' ) + str_time = ( time_time + timedelta( **delta_args )).strftime( '%Y%m%d%H%M%S' ) + return(str_time) + +def str_now(): + return( datetime.utcnow().strftime('%Y%m%d%H%M%S') ) + +def datetimefunix( unix_timestamp ): + return datetime.fromtimestamp(unix_timestamp) + +def strfunix( unix_timestamp ): + return datetime.fromtimestamp(unix_timestamp).strftime('%Y-%m-%d %H:%M') diff --git a/banner_screenshot/rasterize.js b/banner_screenshot/rasterize.js new file mode 100644 index 0000000..2bdd15a --- /dev/null +++ b/banner_screenshot/rasterize.js @@ -0,0 +1,41 @@ +var page = require('webpage').create(), + address, output, size; + +phantom.cookiesEnabled = true; + +if (phantom.args.length < 2 || phantom.args.length > 3) { + console.log('Usage: rasterize.js URL filename'); + phantom.exit(); +} else { + page.onError = function (msg, trace) { + console.log(msg); + trace.forEach(function(item) { + console.log(' ', item.file, ':', item.line); + }) + }; + address = phantom.args[0]; + output = phantom.args[1]; + //page.customHeaders = { 'Referer': address }; + page.viewportSize = { width: 1024, height: 728 }; + page.open(address, function (status) { + if (status !== 'success') { + console.log('Unable to load the address!'); + } else { + //console.log(JSON.stringify(phantom.cookies, null, 2)); + window.setTimeout(function () { + page.clipRect = page.evaluate(function() { + var cn = $('#centralNotice'); + return { + top: cn.offset().top, + left: cn.offset().left, + width: cn.width(), + height: cn.height() + }; + }); + 
console.log(page.clipRect.width + " x " + page.clipRect.height); + page.render(output); + phantom.exit(); + }, 1000); + } + }); +} diff --git a/banner_screenshot/shoot_banners b/banner_screenshot/shoot_banners new file mode 100755 index 0000000..17c78c4 --- /dev/null +++ b/banner_screenshot/shoot_banners @@ -0,0 +1,82 @@ +#!/usr/bin/env python + +import os +import os.path +import subprocess +import re +import sys + +import config +from mediawiki.centralnotice import get_campaign_logs +from mediawiki.time_util import str_time_offset + +JS_RENDER_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "rasterize.js") + +def reduce_banners(campaign_logs): + '''Return a map from banner names to most recent campaign settings.''' + banners = dict() + for entry in campaign_logs: + settings = entry['end'] + campaign_banners = settings['banners'] + + # we only need one country... + settings['country'] = "US" + if settings['geo'] == "1" and settings['countries']: + settings['country'] = settings['countries'][0] + + if hasattr(campaign_banners, 'keys'): + banners.update( + dict.fromkeys( + campaign_banners.keys(), settings + ) + ) + + return banners + +def get_screenshot_path(name, lang): + return os.path.join( + config.banner_screenshots_dir, + "%(banner)s/%(banner)s_%(lang)s.%(ext)s" % { + "banner": name, + "lang": lang, + "ext": config.banner_screenshot_format, + } + ) + +def banner_screenshot_exists(name, lang): + return os.path.exists(get_screenshot_path(name, lang)) + +def render(name, lang, country): + url = config.article_url % { "banner": name, "lang": lang, "country": country } + path = get_screenshot_path(name, lang) + dir = os.path.dirname(path) + if not os.path.exists(dir): + os.makedirs(dir) + + print "Fetching " + url + " into " + path + subprocess.check_call([config.phantomjs, JS_RENDER_SCRIPT, url, path]) + +def process_banners(): + banners = reduce_banners(get_campaign_logs(since=str_time_offset(days=-2))) + for name, campaign_settings in 
banners.items(): + country = "US" + m = re.match(config.banner_name_regex, name) + if m: + explicit_lang = m.group('lang') + if explicit_lang != "yy": + campaign_settings['languages'] = [ explicit_lang ] + explicit_country = m.group('country') + if explicit_country != "YY": + campaign_settings['country'] = explicit_country + + for lang in campaign_settings['languages']: + if not banner_screenshot_exists(name, lang): + render(name, lang, campaign_settings['country']) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + for name in sys.argv[1:]: + screenshot_banner(name) + else: + process_banners() -- To view, visit https://gerrit.wikimedia.org/r/37361 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ia7dcdaea11fddf5d2746e7e06e58b5bc9b857147 Gerrit-PatchSet: 13 Gerrit-Project: wikimedia/fundraising/tools Gerrit-Branch: master Gerrit-Owner: Adamw <awi...@wikimedia.org> Gerrit-Reviewer: Adamw <awi...@wikimedia.org> Gerrit-Reviewer: Katie Horn <kh...@wikimedia.org> Gerrit-Reviewer: Mwalker <mwal...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits