Gilles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/317522

Change subject: Query Mediawiki API when Thumbor 404s
......................................................................

Query Mediawiki API when Thumbor 404s

Bug: T148410
This will allow us to handle redirects of thumbnail URLs the same way Mediawiki 
currently does through thumb.php. THis only logs what it would do at the 
moment, which will allow us to verify that the code is correct in production.

I've also removed the varnish tbf functionality as it kept blocking requests 
overwealously. And corrected the wiki name in swift containers to match the 
default settings to the vm.

Since these roles were written with a single vagrant wiki in mind, the logged 
redirect is incorrect and would need to
compensate for the rewrite that happens in varnish at the moment, where 
/images/ is changed into /wiki/dev/
This won't be a problem in the multiwiki setup of production ,since the 
thumbnail URLs are the correct format
expected by rewrite.py there, without the need for the hack at the varnish 
level to rewrite it.

Change-Id: I7f8e93204d71bbaff3add556500a0d18df73922d
---
M puppet/modules/role/templates/thumbor/local_repo.php.erb
M puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py
M puppet/modules/swift/templates/conf.php.erb
M puppet/modules/swift/templates/proxy-server.conf.erb
M puppet/modules/thumbor/manifests/init.pp
M puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb
M puppet/modules/thumbor/templates/varnish.vcl.erb
7 files changed, 168 insertions(+), 59 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/vagrant 
refs/changes/22/317522/1

diff --git a/puppet/modules/role/templates/thumbor/local_repo.php.erb 
b/puppet/modules/role/templates/thumbor/local_repo.php.erb
index 56bb12e..fa72ed4 100644
--- a/puppet/modules/role/templates/thumbor/local_repo.php.erb
+++ b/puppet/modules/role/templates/thumbor/local_repo.php.erb
@@ -30,5 +30,5 @@
     'deletedDir'        => $wgDeletedDirectory,
     'deletedHashLevels' => $wgHashedUploadDirectory ? 3 : 0,
     'supportsSha1URLs'  => true,
-    'wikiId'            => 'wiki-en',
+    'wikiId'            => 'wiki-dev',
 );
\ No newline at end of file
diff --git a/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py 
b/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py
index 4d051ac..33a5eb7 100644
--- a/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py
+++ b/puppet/modules/swift/files/SwiftMedia/wmf/rewrite.py
@@ -4,6 +4,7 @@
 
 # unit test is in test_rewrite.py. Tests are referenced by numbered comments.
 
+import json
 import webob
 import webob.exc
 import re
@@ -36,34 +37,148 @@
         self.account = conf['account'].strip()
         self.thumbhost = conf['thumbhost'].strip()
         self.thumborhost = conf['thumborhost'].strip() if 'thumborhost' in 
conf else None
-        self.thumbor_wiki_list = [item.strip() for item in 
conf['thumbor_wiki_list'].split(',')] if 'thumbor_wiki_list' in conf else None
+        if 'thumbor_wiki_list' in conf:
+            self.thumbor_wiki_list = [item.strip() for item in 
conf['thumbor_wiki_list'].split(',')]
+        else:
+            self.thumbor_wiki_list = None
         self.user_agent = conf['user_agent'].strip()
         self.bind_port = conf['bind_port'].strip()
-        self.shard_container_list = [item.strip() for item in 
conf['shard_container_list'].split(',')]
-        # this parameter controls whether URLs sent to the thumbhost are sent 
as is (eg. upload/proj/lang/) or with the site/lang
-        # converted  and only the path sent back (eg en.wikipedia/thumb).
+        self.shard_container_list = [
+            item.strip() for item in conf['shard_container_list'].split(',')]
+        # this parameter controls whether URLs sent to the thumbhost are sent
+        # as is (eg. upload/proj/lang/) or with the site/lang converted  and
+        # only the path sent back (eg en.wikipedia/thumb).
         self.backend_url_format = conf['backend_url_format'].strip()  # asis, 
sitelang
+        self.tld = conf['tld'].strip()
 
-    def collectHttpStatusCodes(self, url, thumbor_thread, mediawiki_code):
-        self.logger.debug("Mediawiki: %d %s" % (mediawiki_code, url))
+    def titleToEscapedFilename(self, title):
+        """
+        Converts a namespaced Mediawiki title to the underlying filename
+        Will throw IndexError if provided title doesn't have the expected 
column
+        """
+        parts = title.split(':')
+        unescaped_filename = parts[1]
+        return unescaped_filename.replace(' ', '_')
 
-        if thumbor_thread is None:
+    def redirectUrlFromRedirects(self, url, redirects):
+        """
+        Replaces the filename in a thumbnail URL based on redirects information
+        obtained from the Mediawiki API
+        """
+        redirected_url = url
+
+        for redirect in redirects:
+            from_filename = self.titleToEscapedFilename(redirect['from'])
+            to_filename = self.titleToEscapedFilename(redirect['to'])
+            redirected_url = redirected_url.replace(from_filename, to_filename)
+
+        return redirected_url
+
+    def handleThumbor404(self, url, mediawiki_code, hostname):
+        """
+        In case of 404 at the Thumbor level, we might be dealing with a
+        File page redirect. We query the mediawiki API to find out if
+        that's the case.
+        """
+        self.logger.debug("handleThumbor404: %r %r %r" % (
+            url, mediawiki_code, hostname
+        ))
+
+        match = re.match(
+            
r'^http://[^/]+/[^-/]+/[^/]+/thumb/[^/]+/[^/]+/(?P<filename>[^/]+)/.+',
+            url
+        )
+
+        if not match:
+            self.logger.warn("URL format could not be recogniwed: %s" % url)
+            self.collectHttpStatusCodes(url, None, 404, mediawiki_code, 
hostname)
+            return
+
+        opener = urllib2.build_opener()
+        api_url = 
'http://%s/w/api.php?action=query&format=json&redirects&titles=File:%s' % (
+            hostname, match.group('filename')
+        )
+
+        self.logger.debug("Sending Mediawiki API request: %s" % api_url)
+
+        try:
+            response = opener.open(api_url).read()
+        except urllib2.URLError, error:
+            self.logger.warn(
+                "Could not get redirects information from Mediawiki API: %s" % 
error.reason
+            )
+            self.collectHttpStatusCodes(url, None, 404, mediawiki_code, 
hostname)
+            return
+
+        self.logger.debug("Mediawiki API response: %s" % response)
+        data = json.loads(response)
+        if not ('query' in data
+            and 'redirects' in data['query']
+            and len(data['query']['redirects']) > 0):
+            self.logger.debug(
+                "No redirects found in Mediawiki API, this is a legitimate 404"
+            )
+            self.collectHttpStatusCodes(url, None, 404, mediawiki_code, 
hostname)
             return
 
         try:
-            # Waits for Thumbor if it took longer than Mediawiki to process 
the image
-            # Otherwise returns/throws exceptions immediately
-            thumbor_result = thumbor_thread.wait()
-            code = thumbor_result.getcode()
-        except urllib2.HTTPError, error:
-            code = error.code
-        except urllib2.URLError, error:
-            code = 503
+            redirected_url = self.redirectUrlFromRedirects(url, 
data['query']['redirects'])
+        except IndexError:
+            self.logger.warn(
+                "Could not get extract escaped filename from API redirect 
information: %r"
+                % data['query']['redirects']
+            )
+            self.collectHttpStatusCodes(url, None, 404, mediawiki_code, 
hostname)
+            return
+
+        self.logger.warn("Would have redirected %s to %s" % (url, 
redirected_url))
+
+        self.collectHttpStatusCodes(
+            url,
+            None,
+            302,
+            mediawiki_code,
+            hostname
+        )
+
+    def collectHttpStatusCodes(
+            self,
+            url,
+            thumbor_thread,
+            thumbor_code,
+            mediawiki_code,
+            hostname
+        ):
+        self.logger.debug("Mediawiki: %d %s" % (mediawiki_code, url))
+
+        code = thumbor_code
+
+        if thumbor_thread is not None:
+            try:
+                # Waits for Thumbor if it took longer than Mediawiki to 
process the image
+                # Otherwise returns/throws exceptions immediately
+                thumbor_result = thumbor_thread.wait()
+                code = thumbor_result.getcode()
+            except urllib2.HTTPError, error:
+                code = error.code
+            except urllib2.URLError, error:
+                code = 503
+
+            if code == 404 and hostname is not None:
+                self.logger.debug(
+                    "Thumbor returned a 404, check if we were dealing with a 
redirect"
+                )
+                self.handleThumbor404(url, mediawiki_code, hostname)
+                return
 
         self.logger.debug("Thumbor: %d %s" % (code, url))
 
         if code != mediawiki_code:
-            self.logger.warn("HTTP status code mismatch. Mediawiki: %d 
Thumbor: %d URL: %s" % (mediawiki_code, code, url))
+            self.logger.warn(
+                "HTTP status code mismatch. Mediawiki: %d Thumbor: %d URL: %s" 
% (
+                    mediawiki_code, code, url
+                )
+            )
 
     def handle404(self, reqorig, url, container, obj):
         """
@@ -71,6 +186,7 @@
         host and returns it. Note also that the thumb host might write it out
         to Swift so it won't 404 next time.
         """
+        hostname = None
         original_request_url = reqorig.url
         # go to the thumb media store for unknown files
         reqorig.host = self.thumbhost
@@ -128,10 +244,10 @@
                         if(lang in ['mediawiki']):
                             lang = 'www'
                             proj = 'mediawiki'
-                    hostname = '%s.%s.org' % (lang, proj)
+                    hostname = '%s.%s.%s' % (lang, proj, self.tld)
                     if(proj == 'wikipedia' and lang == 'sources'):
-                        #yay special case
-                        hostname = 'wikisource.org'
+                        # yay special case
+                        hostname = 'wikisource.%s' % self.tld
                     # ok, replace the URL with just the part starting with 
thumb/
                     # take off the first two parts of the path (eg 
/wikipedia/commons/); make sure the string starts with a /
                     encodedurl = 'http://%s/w/thumb_handler.php/%s' % 
(hostname, match.group('path'))
@@ -146,6 +262,7 @@
                 # log the result of the match here to test and make sure it's 
sane before enabling the config
                 match = 
re.match(r'^http://(?P<host>[^/]+)/(?P<proj>[^-/]+)/(?P<lang>[^/]+)/thumb/(?P<path>.+)',
 encodedurl)
                 if match:
+                    hostname = match.group('host')
                     proj = match.group('proj')
                     lang = match.group('lang')
                     self.logger.warn("sitelang match has proj %s lang %s 
encodedurl %s" % (proj, lang, encodedurl))
@@ -161,9 +278,17 @@
                     # call Thumbor but don't wait for the result
                     thumbor_thread = eventlet.spawn(thumbor_opener.open, 
thumbor_encodedurl)
 
+            self.logger.debug("Sending request to Mediawiki: %s" % encodedurl)
+
             # ok, call the encoded url
             upcopy = opener.open(encodedurl)
-            self.collectHttpStatusCodes(original_request_url, thumbor_thread, 
upcopy.getcode())
+            self.collectHttpStatusCodes(
+                original_request_url,
+                thumbor_thread,
+                None,
+                upcopy.getcode(),
+                hostname
+            )
         except urllib2.HTTPError, error:
             # copy the urllib2 HTTPError into a webob HTTPError class as-is
 
@@ -180,13 +305,25 @@
                         headers=error.hdrs.items())
 
             resp = CopiedHTTPError()
-            self.collectHttpStatusCodes(original_request_url, thumbor_thread, 
resp.code)
+            self.collectHttpStatusCodes(
+                original_request_url,
+                thumbor_thread,
+                None,
+                resp.code,
+                hostname
+            )
             return resp
         except urllib2.URLError, error:
             msg = 'There was a problem while contacting the image scaler: %s' 
% \
                   error.reason
             resp = webob.exc.HTTPServiceUnavailable(msg)
-            self.collectHttpStatusCodes(original_request_url, thumbor_thread, 
resp.code)
+            self.collectHttpStatusCodes(
+                original_request_url,
+                thumbor_thread,
+                None,
+                resp.code,
+                hostname
+            )
             return resp
 
         # get the Content-Type.
diff --git a/puppet/modules/swift/templates/conf.php.erb 
b/puppet/modules/swift/templates/conf.php.erb
index 2ef281c..c74dd21 100644
--- a/puppet/modules/swift/templates/conf.php.erb
+++ b/puppet/modules/swift/templates/conf.php.erb
@@ -5,7 +5,7 @@
        'swiftAuthUrl'       => '127.0.0.1:<%= scope['::swift::port'] %>/auth',
        'swiftUser'          => '<%= scope['::swift::project'] %>:<%= 
scope['::swift::user'] %>',
        'swiftKey'           => '<%= scope['::swift::key'] %>',
-       'wikiId'             => 'wiki-en',
+       'wikiId'             => 'wiki-dev',
        'shardViaHashLevels' => array(
                'local-public' => array( 'levels' => 2, 'base' => 16, 'repeat' 
=> 1 ),
                'local-thumb'  => array( 'levels' => 2, 'base' => 16, 'repeat' 
=> 1 ),
diff --git a/puppet/modules/swift/templates/proxy-server.conf.erb 
b/puppet/modules/swift/templates/proxy-server.conf.erb
index 21cbbf4..a449d10 100644
--- a/puppet/modules/swift/templates/proxy-server.conf.erb
+++ b/puppet/modules/swift/templates/proxy-server.conf.erb
@@ -34,15 +34,16 @@
 # upload doesn't like our User-agent (Python-urllib/2.6), otherwise we could 
call it using urllib2.urlopen()
 user_agent = Mozilla/5.0
 # this list is the containers that should be sharded
-shard_container_list = wiki-en-local-public,wiki-en-local-thumb
+shard_container_list = wiki-dev-local-public,wiki-dev-local-thumb
 # backend_url_format controls whether we pass the URL through to the thumbhost 
unmolested
 # or mangle it to be consumed by mediawiki.  ms5 takes URLs unmolested, 
mediawiki wants them
 # transformed to something more palatable (specifically, turning 
http://upload/proj/lang/ into http://lang.proj/
 # valid options are 'asis' (leave it alone) and 'sitelang' (change upload to 
lang.site.org)
 backend_url_format = sitelang
+tld = local.wmftest.net:8080
 
 paste.filter_factory = wmf.rewrite:filter_factory
 
 # thumbor integration
 thumborhost = 127.0.0.1:8888
-thumbor_wiki_list = wiki-en
\ No newline at end of file
+thumbor_wiki_list = wiki-dev
\ No newline at end of file
diff --git a/puppet/modules/thumbor/manifests/init.pp 
b/puppet/modules/thumbor/manifests/init.pp
index 79fb13a..b2dd8b3 100644
--- a/puppet/modules/thumbor/manifests/init.pp
+++ b/puppet/modules/thumbor/manifests/init.pp
@@ -205,7 +205,7 @@
     # create the sharded thumbnail containers ahead of time.
     exec { 'create-swift-thumbnail-containers':
         command   => '/usr/local/bin/mwscript 
extensions/WikimediaMaintenance/filebackend/setZoneAccess.php --wiki wiki 
--backend swift-backend',
-        unless    => "swift -A http://127.0.0.1:${port}/auth/v1.0 -U 
${project}:${user} -K ${key} stat wiki-en-local-public.00 | grep -q 
wiki-en-local-public.00",
+        unless    => "swift -A http://127.0.0.1:${port}/auth/v1.0 -U 
${project}:${user} -K ${key} stat wiki-dev-local-public.00 | grep -q 
wiki-dev-local-public.00",
         require   => [
             Service[
                 'swift-account-server',
diff --git a/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb 
b/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb
index 7ced536..de1ab06 100644
--- a/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb
+++ b/puppet/modules/thumbor/templates/20-thumbor-wikimedia.conf.erb
@@ -32,8 +32,8 @@
 SWIFT_USER = '<%= scope['::swift::project'] %>:<%= scope['::swift::user'] %>'
 SWIFT_KEY = '<%= scope['::swift::key'] %>'
 SWIFT_SHARDED_CONTAINERS = [
-    'wiki-en-local-public',
-    'wiki-en-local-thumb'
+    'wiki-dev-local-public',
+    'wiki-dev-local-thumb'
 ]
 SWIFT_PATH_PREFIX = 'thumbor/'
 SWIFT_CONNECTION_TIMEOUT = 5
diff --git a/puppet/modules/thumbor/templates/varnish.vcl.erb 
b/puppet/modules/thumbor/templates/varnish.vcl.erb
index ef59dfb..3721441 100644
--- a/puppet/modules/thumbor/templates/varnish.vcl.erb
+++ b/puppet/modules/thumbor/templates/varnish.vcl.erb
@@ -1,15 +1,6 @@
 vcl 4.0;
 import xkey;
-import tbf;
 import std;
-
-sub vcl_init {
-    tbf.open("/var/run/varnish", "mode=600;dbname=tbf.bdb;truncate");
-}
-
-sub vcl_fini {
-    tbf.close();
-}
 
 sub vcl_recv {
     set req.http.X-Forwarded-For = client.ip;
@@ -37,7 +28,7 @@
 
     # Swift expects /lang/project/foo
     if (req.url ~ "^/images/") {
-        set req.url = "/wiki/en/" + regsub(req.url, "^/images/(.*)", "\1");
+        set req.url = "/wiki/dev/" + regsub(req.url, "^/images/(.*)", "\1");
     }
 
     # Reject any methods that aren't expected to work in the context of 
thumbnails
@@ -46,26 +37,6 @@
     }
 
     return(hash);
-}
-
-sub vcl_miss {
-    # 10 tokens to consume, refilled at a rate of 5/s
-    if (!tbf.rate("ip:" + client.ip, 1, 0.2s, 10)) {
-        return (synth(429, "Request rate exceeded"));
-    }
-}
-
-sub vcl_deliver {
-    # Miss
-    if (obj.hits == 0) {
-        # If the request requires more than 1s of CPU time, make it count like 
n extra
-        # tokens for the tbf throttling, where n is the amount of CPU seconds 
minus 1
-        if (!tbf.rate("ip:" + client.ip, 
(std.integer(resp.http.Processing-Utime, 1000) - 1000) / 1000, 0.2s, 10)) {
-            # tbf.rate() has to be inside an if statement
-        }
-    }
-
-    # Let things go to the default-subs.vcl vcl_deliver
 }
 
 sub vcl_backend_response {

-- 
To view, visit https://gerrit.wikimedia.org/r/317522
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7f8e93204d71bbaff3add556500a0d18df73922d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/vagrant
Gerrit-Branch: master
Gerrit-Owner: Gilles <gdu...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to