Gilles has uploaded a new change for review. https://gerrit.wikimedia.org/r/317522
Change subject: Query Mediawiki API when Thumbor 404s ...................................................................... Query Mediawiki API when Thumbor 404s Bug: T148410 This will allow us to handle redirects of thumbnail URLs the same way Mediawiki currently does through thumb.php. THis only logs what it would do at the moment, which will allow us to verify that the code is correct in production. I've also removed the varnish tbf functionality as it kept blocking requests overwealously. And corrected the wiki name in swift containers to match the default settings to the vm. Since these roles were written with a single vagrant wiki in mind, the logged redirect is incorrect and would need to compensate for the rewrite that happens in varnish at the moment, where /images/ is changed into /wiki/dev/ This won't be a problem in the multiwiki setup of production ,since the thumbnail URLs are the correct format expected by rewrite.py there, without the need for the hack at the varnish level to rewrite it. Change-Id: I7f8e93204d71bbaff3add556500a0d18df73922d --- M puppet/modules/role/templates/thumbor/local_repo.php.erb M puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py M puppet/modules/swift/templates/conf.php.erb M puppet/modules/swift/templates/proxy-server.conf.erb M puppet/modules/thumbor/manifests/init.pp M puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb M puppet/modules/thumbor/templates/varnish.vcl.erb 7 files changed, 168 insertions(+), 59 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/vagrant refs/changes/22/317522/1 diff --git a/puppet/modules/role/templates/thumbor/local_repo.php.erb b/puppet/modules/role/templates/thumbor/local_repo.php.erb index 56bb12e..fa72ed4 100644 --- a/puppet/modules/role/templates/thumbor/local_repo.php.erb +++ b/puppet/modules/role/templates/thumbor/local_repo.php.erb @@ -30,5 +30,5 @@ 'deletedDir' => $wgDeletedDirectory, 'deletedHashLevels' => $wgHashedUploadDirectory ? 3 : 0, 'supportsSha1URLs' => true, - 'wikiId' => 'wiki-en', + 'wikiId' => 'wiki-dev', ); \ No newline at end of file diff --git a/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py b/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py index 4d051ac..33a5eb7 100644 --- a/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py +++ b/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py @@ -4,6 +4,7 @@ # unit test is in test_rewrite.py. Tests are referenced by numbered comments. +import json import webob import webob.exc import re @@ -36,34 +37,148 @@ self.account = conf['account'].strip() self.thumbhost = conf['thumbhost'].strip() self.thumborhost = conf['thumborhost'].strip() if 'thumborhost' in conf else None - self.thumbor_wiki_list = [item.strip() for item in conf['thumbor_wiki_list'].split(',')] if 'thumbor_wiki_list' in conf else None + if 'thumbor_wiki_list' in conf: + self.thumbor_wiki_list = [item.strip() for item in conf['thumbor_wiki_list'].split(',')] + else: + self.thumbor_wiki_list = None self.user_agent = conf['user_agent'].strip() self.bind_port = conf['bind_port'].strip() - self.shard_container_list = [item.strip() for item in conf['shard_container_list'].split(',')] - # this parameter controls whether URLs sent to the thumbhost are sent as is (eg. upload/proj/lang/) or with the site/lang - # converted and only the path sent back (eg en.wikipedia/thumb). + self.shard_container_list = [ + item.strip() for item in conf['shard_container_list'].split(',')] + # this parameter controls whether URLs sent to the thumbhost are sent + # as is (eg. upload/proj/lang/) or with the site/lang converted and + # only the path sent back (eg en.wikipedia/thumb). self.backend_url_format = conf['backend_url_format'].strip() # asis, sitelang + self.tld = conf['tld'].strip() - def collectHttpStatusCodes(self, url, thumbor_thread, mediawiki_code): - self.logger.debug("Mediawiki: %d %s" % (mediawiki_code, url)) + def titleToEscapedFilename(self, title): + """ + Converts a namespaced Mediawiki title to the underlying filename + Will throw IndexError if provided title doesn't have the expected column + """ + parts = title.split(':') + unescaped_filename = parts[1] + return unescaped_filename.replace(' ', '_') - if thumbor_thread is None: + def redirectUrlFromRedirects(self, url, redirects): + """ + Replaces the filename in a thumbnail URL based on redirects information + obtained from the Mediawiki API + """ + redirected_url = url + + for redirect in redirects: + from_filename = self.titleToEscapedFilename(redirect['from']) + to_filename = self.titleToEscapedFilename(redirect['to']) + redirected_url = redirected_url.replace(from_filename, to_filename) + + return redirected_url + + def handleThumbor404(self, url, mediawiki_code, hostname): + """ + In case of 404 at the Thumbor level, we might be dealing with a + File page redirect. We query the mediawiki API to find out if + that's the case. + """ + self.logger.debug("handleThumbor404: %r %r %r" % ( + url, mediawiki_code, hostname + )) + + match = re.match( + r'^http://[^/]+/[^-/]+/[^/]+/thumb/[^/]+/[^/]+/(?P<filename>[^/]+)/.+', + url + ) + + if not match: + self.logger.warn("URL format could not be recogniwed: %s" % url) + self.collectHttpStatusCodes(url, None, 404, mediawiki_code, hostname) + return + + opener = urllib2.build_opener() + api_url = 'http://%s/w/api.php?action=query&format=json&redirects&titles=File:%s' % ( + hostname, match.group('filename') + ) + + self.logger.debug("Sending Mediawiki API request: %s" % api_url) + + try: + response = opener.open(api_url).read() + except urllib2.URLError, error: + self.logger.warn( + "Could not get redirects information from Mediawiki API: %s" % error.reason + ) + self.collectHttpStatusCodes(url, None, 404, mediawiki_code, hostname) + return + + self.logger.debug("Mediawiki API response: %s" % response) + data = json.loads(response) + if not ('query' in data + and 'redirects' in data['query'] + and len(data['query']['redirects']) > 0): + self.logger.debug( + "No redirects found in Mediawiki API, this is a legitimate 404" + ) + self.collectHttpStatusCodes(url, None, 404, mediawiki_code, hostname) return try: - # Waits for Thumbor if it took longer than Mediawiki to process the image - # Otherwise returns/throws exceptions immediately - thumbor_result = thumbor_thread.wait() - code = thumbor_result.getcode() - except urllib2.HTTPError, error: - code = error.code - except urllib2.URLError, error: - code = 503 + redirected_url = self.redirectUrlFromRedirects(url, data['query']['redirects']) + except IndexError: + self.logger.warn( + "Could not get extract escaped filename from API redirect information: %r" + % data['query']['redirects'] + ) + self.collectHttpStatusCodes(url, None, 404, mediawiki_code, hostname) + return + + self.logger.warn("Would have redirected %s to %s" % (url, redirected_url)) + + self.collectHttpStatusCodes( + url, + None, + 302, + mediawiki_code, + hostname + ) + + def collectHttpStatusCodes( + self, + url, + thumbor_thread, + thumbor_code, + mediawiki_code, + hostname + ): + self.logger.debug("Mediawiki: %d %s" % (mediawiki_code, url)) + + code = thumbor_code + + if thumbor_thread is not None: + try: + # Waits for Thumbor if it took longer than Mediawiki to process the image + # Otherwise returns/throws exceptions immediately + thumbor_result = thumbor_thread.wait() + code = thumbor_result.getcode() + except urllib2.HTTPError, error: + code = error.code + except urllib2.URLError, error: + code = 503 + + if code == 404 and hostname is not None: + self.logger.debug( + "Thumbor returned a 404, check if we were dealing with a redirect" + ) + self.handleThumbor404(url, mediawiki_code, hostname) + return self.logger.debug("Thumbor: %d %s" % (code, url)) if code != mediawiki_code: - self.logger.warn("HTTP status code mismatch. Mediawiki: %d Thumbor: %d URL: %s" % (mediawiki_code, code, url)) + self.logger.warn( + "HTTP status code mismatch. Mediawiki: %d Thumbor: %d URL: %s" % ( + mediawiki_code, code, url + ) + ) def handle404(self, reqorig, url, container, obj): """ @@ -71,6 +186,7 @@ host and returns it. Note also that the thumb host might write it out to Swift so it won't 404 next time. """ + hostname = None original_request_url = reqorig.url # go to the thumb media store for unknown files reqorig.host = self.thumbhost @@ -128,10 +244,10 @@ if(lang in ['mediawiki']): lang = 'www' proj = 'mediawiki' - hostname = '%s.%s.org' % (lang, proj) + hostname = '%s.%s.%s' % (lang, proj, self.tld) if(proj == 'wikipedia' and lang == 'sources'): - #yay special case - hostname = 'wikisource.org' + # yay special case + hostname = 'wikisource.%s' % self.tld # ok, replace the URL with just the part starting with thumb/ # take off the first two parts of the path (eg /wikipedia/commons/); make sure the string starts with a / encodedurl = 'http://%s/w/thumb_handler.php/%s' % (hostname, match.group('path')) @@ -146,6 +262,7 @@ # log the result of the match here to test and make sure it's sane before enabling the config match = re.match(r'^http://(?P<host>[^/]+)/(?P<proj>[^-/]+)/(?P<lang>[^/]+)/thumb/(?P<path>.+)', encodedurl) if match: + hostname = match.group('host') proj = match.group('proj') lang = match.group('lang') self.logger.warn("sitelang match has proj %s lang %s encodedurl %s" % (proj, lang, encodedurl)) @@ -161,9 +278,17 @@ # call Thumbor but don't wait for the result thumbor_thread = eventlet.spawn(thumbor_opener.open, thumbor_encodedurl) + self.logger.debug("Sending request to Mediawiki: %s" % encodedurl) + # ok, call the encoded url upcopy = opener.open(encodedurl) - self.collectHttpStatusCodes(original_request_url, thumbor_thread, upcopy.getcode()) + self.collectHttpStatusCodes( + original_request_url, + thumbor_thread, + None, + upcopy.getcode(), + hostname + ) except urllib2.HTTPError, error: # copy the urllib2 HTTPError into a webob HTTPError class as-is @@ -180,13 +305,25 @@ headers=error.hdrs.items()) resp = CopiedHTTPError() - self.collectHttpStatusCodes(original_request_url, thumbor_thread, resp.code) + self.collectHttpStatusCodes( + original_request_url, + thumbor_thread, + None, + resp.code, + hostname + ) return resp except urllib2.URLError, error: msg = 'There was a problem while contacting the image scaler: %s' % \ error.reason resp = webob.exc.HTTPServiceUnavailable(msg) - self.collectHttpStatusCodes(original_request_url, thumbor_thread, resp.code) + self.collectHttpStatusCodes( + original_request_url, + thumbor_thread, + None, + resp.code, + hostname + ) return resp # get the Content-Type. diff --git a/puppet/modules/swift/templates/conf.php.erb b/puppet/modules/swift/templates/conf.php.erb index 2ef281c..c74dd21 100644 --- a/puppet/modules/swift/templates/conf.php.erb +++ b/puppet/modules/swift/templates/conf.php.erb @@ -5,7 +5,7 @@ 'swiftAuthUrl' => '127.0.0.1:<%= scope['::swift::port'] %>/auth', 'swiftUser' => '<%= scope['::swift::project'] %>:<%= scope['::swift::user'] %>', 'swiftKey' => '<%= scope['::swift::key'] %>', - 'wikiId' => 'wiki-en', + 'wikiId' => 'wiki-dev', 'shardViaHashLevels' => array( 'local-public' => array( 'levels' => 2, 'base' => 16, 'repeat' => 1 ), 'local-thumb' => array( 'levels' => 2, 'base' => 16, 'repeat' => 1 ), diff --git a/puppet/modules/swift/templates/proxy-server.conf.erb b/puppet/modules/swift/templates/proxy-server.conf.erb index 21cbbf4..a449d10 100644 --- a/puppet/modules/swift/templates/proxy-server.conf.erb +++ b/puppet/modules/swift/templates/proxy-server.conf.erb @@ -34,15 +34,16 @@ # upload doesn't like our User-agent (Python-urllib/2.6), otherwise we could call it using urllib2.urlopen() user_agent = Mozilla/5.0 # this list is the containers that should be sharded -shard_container_list = wiki-en-local-public,wiki-en-local-thumb +shard_container_list = wiki-dev-local-public,wiki-dev-local-thumb # backend_url_format controls whether we pass the URL through to the thumbhost unmolested # or mangle it to be consumed by mediawiki. ms5 takes URLs unmolested, mediawiki wants them # transformed to something more palatable (specifically, turning http://upload/proj/lang/ into http://lang.proj/ # valid options are 'asis' (leave it alone) and 'sitelang' (change upload to lang.site.org) backend_url_format = sitelang +tld = local.wmftest.net:8080 paste.filter_factory = wmf.rewrite:filter_factory # thumbor integration thumborhost = 127.0.0.1:8888 -thumbor_wiki_list = wiki-en \ No newline at end of file +thumbor_wiki_list = wiki-dev \ No newline at end of file diff --git a/puppet/modules/thumbor/manifests/init.pp b/puppet/modules/thumbor/manifests/init.pp index 79fb13a..b2dd8b3 100644 --- a/puppet/modules/thumbor/manifests/init.pp +++ b/puppet/modules/thumbor/manifests/init.pp @@ -205,7 +205,7 @@ # create the sharded thumbnail containers ahead of time. exec { 'create-swift-thumbnail-containers': command => '/usr/local/bin/mwscript extensions/WikimediaMaintenance/filebackend/setZoneAccess.php --wiki wiki --backend swift-backend', - unless => "swift -A http://127.0.0.1:${port}/auth/v1.0 -U ${project}:${user} -K ${key} stat wiki-en-local-public.00 | grep -q wiki-en-local-public.00", + unless => "swift -A http://127.0.0.1:${port}/auth/v1.0 -U ${project}:${user} -K ${key} stat wiki-dev-local-public.00 | grep -q wiki-dev-local-public.00", require => [ Service[ 'swift-account-server', diff --git a/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb b/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb index 7ced536..de1ab06 100644 --- a/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb +++ b/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb @@ -32,8 +32,8 @@ SWIFT_USER = '<%= scope['::swift::project'] %>:<%= scope['::swift::user'] %>' SWIFT_KEY = '<%= scope['::swift::key'] %>' SWIFT_SHARDED_CONTAINERS = [ - 'wiki-en-local-public', - 'wiki-en-local-thumb' + 'wiki-dev-local-public', + 'wiki-dev-local-thumb' ] SWIFT_PATH_PREFIX = 'thumbor/' SWIFT_CONNECTION_TIMEOUT = 5 diff --git a/puppet/modules/thumbor/templates/varnish.vcl.erb b/puppet/modules/thumbor/templates/varnish.vcl.erb index ef59dfb..3721441 100644 --- a/puppet/modules/thumbor/templates/varnish.vcl.erb +++ b/puppet/modules/thumbor/templates/varnish.vcl.erb @@ -1,15 +1,6 @@ vcl 4.0; import xkey; -import tbf; import std; - -sub vcl_init { - tbf.open("/var/run/varnish", "mode=600;dbname=tbf.bdb;truncate"); -} - -sub vcl_fini { - tbf.close(); -} sub vcl_recv { set req.http.X-Forwarded-For = client.ip; @@ -37,7 +28,7 @@ # Swift expects /lang/project/foo if (req.url ~ "^/images/") { - set req.url = "/wiki/en/" + regsub(req.url, "^/images/(.*)", "\1"); + set req.url = "/wiki/dev/" + regsub(req.url, "^/images/(.*)", "\1"); } # Reject any methods that aren't expected to work in the context of thumbnails @@ -46,26 +37,6 @@ } return(hash); -} - -sub vcl_miss { - # 10 tokens to consume, refilled at a rate of 5/s - if (!tbf.rate("ip:" + client.ip, 1, 0.2s, 10)) { - return (synth(429, "Request rate exceeded")); - } -} - -sub vcl_deliver { - # Miss - if (obj.hits == 0) { - # If the request requires more than 1s of CPU time, make it count like n extra - # tokens for the tbf throttling, where n is the amount of CPU seconds minus 1 - if (!tbf.rate("ip:" + client.ip, (std.integer(resp.http.Processing-Utime, 1000) - 1000) / 1000, 0.2s, 10)) { - # tbf.rate() has to be inside an if statement - } - } - - # Let things go to the default-subs.vcl vcl_deliver } sub vcl_backend_response { -- To view, visit https://gerrit.wikimedia.org/r/317522 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7f8e93204d71bbaff3add556500a0d18df73922d Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/vagrant Gerrit-Branch: master Gerrit-Owner: Gilles <gdu...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits