Cscott has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/156013

Change subject: WIP: WTS: handle absolute URLs (/wiki/...) better.
......................................................................

WIP: WTS: handle absolute URLs (/wiki/...) better.

This helps with html2wt conversion of parser tests written for the PHP
parser, which use <a> tags with absolute links and without `typeof` attributes.

Change-Id: I9a535d07d19b4a9c3af677c301d217b889f8441f
---
M lib/wts.LinkHandler.js
1 file changed, 40 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/13/156013/1

diff --git a/lib/wts.LinkHandler.js b/lib/wts.LinkHandler.js
index b08c503..5b31603 100644
--- a/lib/wts.LinkHandler.js
+++ b/lib/wts.LinkHandler.js
@@ -1,6 +1,8 @@
 "use strict";
 
 require('./core-upgrade.js');
+var url = require('url');
+
 var Util = require('./mediawiki.Util.js').Util,
        DU = require('./mediawiki.DOMUtils.js').DOMUtils,
        pd = require('./mediawiki.parser.defines.js'),
@@ -34,6 +36,36 @@
        }
 };
 
+// Helper function for munging protocol-less absolute URLs:
+// If this URL is absolute, but doesn't contain a protocol,
+// try to find a localinterwiki protocol that would work.
+var getHref = function( env, node ) {
+       var href = node.getAttribute('href') || '';
+       if (/^\/[^\/]/.test(href)) {
+               // protocol-less but absolute.  let's find a base href
+               var i, bases = [], nhref;
+               env.conf.wiki.interwikiMap.forEach(function(interwikiInfo, 
prefix) {
+                       if (interwikiInfo.localinterwiki !== undefined &&
+                               interwikiInfo.url !== undefined) {
+                               // this is a possible base href
+                               bases.push(interwikiInfo.url);
+                       }
+               });
+               for (i=0; i<bases.length; i++) {
+                       // evaluate the url relative to this base
+                       nhref = url.resolve(bases[i], href);
+                       // can this match the pattern?
+                       var re = '^' +
+                               
bases[i].split('$1').map(Util.escapeRegExp).join('[\\s\\S]*') +
+                               '$';
+                       if (new RegExp(re).test(nhref)) {
+                               return nhref;
+                       }
+               }
+       }
+       return href;
+}
+
 // Helper function for getting RT data from the tokens
 var getLinkRoundTripData = function( env, node, state ) {
        var dp = DU.getDataParsoid( node );
@@ -54,7 +86,7 @@
                }
        }
 
-       var href = node.getAttribute('href') || '';
+       var href = getHref( env, node );
 
        // Save the token's "real" href for comparison
        rtData.href = href.replace( /^(\.\.?\/)+/, '' );
@@ -244,6 +276,10 @@
        // Get the rt data from the token and tplAttrs
        linkData = getLinkRoundTripData(env, node, state);
 
+       if (linkData.type === null && !node.querySelector('IMG')) {
+               linkData.type = 'mw:ExtLink';
+       }
+
        if ( linkData.type !== null && linkData.target.value !== null  ) {
                // We have a type and target info
 
@@ -257,7 +293,7 @@
                }
 
                var target = linkData.target,
-                       href = node.getAttribute('href') || '';
+                       href = getHref( env, node );
                if 
(/\b(mw:ExtLink|mw:PageProp\/Language)\b/.test(linkData.type)) {
                        var targetVal = target.fromsrc || true ? target.value : 
Util.decodeURI(target.value);
                        // Check if the href matches any of our interwiki URL 
patterns
@@ -479,7 +515,7 @@
                        if ( contentStr &&
                                        // Can we minimize this?
                                        ( target.value === contentStr  ||
-                                       node.getAttribute('href') === 
contentStr) &&
+                                       getHref( env, node ) === contentStr) &&
                                        // But preserve non-minimal encoding
                                        (target.modified || 
linkData.contentModified || dp.stx === 'url'))
                        {
@@ -560,7 +596,7 @@
                } else {
                        // href is already percent-encoded, etc., but it might 
contain
                        // spaces or other wikitext nasties.  escape the 
nasties.
-                       var hrefStr = 
escapeExtLinkURL(node.getAttribute('href'));
+                       var hrefStr = escapeExtLinkURL(getHref( env, node ));
                        cb( '[' + hrefStr + ' ' +
                                state.serializeLinkChildrenToString(node, 
this.wteHandlers.aHandler, false) +
                                ']', node );

-- 
To view, visit https://gerrit.wikimedia.org/r/156013
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9a535d07d19b4a9c3af677c301d217b889f8441f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Cscott <canan...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to