[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Update reverse interwiki map to prefer language prefixes over others

2017-10-23 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/384171 )

Change subject: Update reverse interwiki map to prefer language prefixes over 
others
..


Update reverse interwiki map to prefer language prefixes over others

* Updated a bunch of parser tests to reflect the change.
* For the T3636 parser test, added an html/parsoid section to eliminate
  a false wt2html failure.

Bug: T177784
Change-Id: I5cf93950a6da69263fb9da59fba2b33cc2e8931f
---
M lib/config/WikiConfig.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
3 files changed, 41 insertions(+), 21 deletions(-)

Approvals:
  jenkins-bot: Verified
  Arlolra: Looks good to me, approved



diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js
index e9f8e48..60dadf3 100644
--- a/lib/config/WikiConfig.js
+++ b/lib/config/WikiConfig.js
@@ -232,14 +232,12 @@
}
});
 
-   var cachedMatcher = null;
-   this.interWikiMatcher = function() {
-   if (cachedMatcher) {
-   return cachedMatcher;
-   }
-   var keys = [];
-   var patterns = [];
+   var updatePatterns = function(keys, patterns, filter) {
conf.interwikiMap.forEach(function(val, key) {
+   if (!filter(val)) {
+   return;
+   }
+
var url = val.url;
var protocolRelative = url.startsWith('//');
if (val.protorel !== undefined) {
@@ -271,6 +269,20 @@
patterns.push('^' + val.prefix + '%3A(.*?)');
}
});
+   };
+
+   var cachedMatcher = null;
+   this.interWikiMatcher = function() {
+   if (cachedMatcher) {
+   return cachedMatcher;
+   }
+   var keys = [];
+   var patterns = [];
+   // For html -> wt reverse mapping, prefer language interwiki 
prefixes
+   // over other interwiki prefixes. So, use "en" instead of 
"wikipedia"
+   // for English wikipedia interwiki links.
+   updatePatterns(keys, patterns, function(val) { return 
!!val.language; });
+   updatePatterns(keys, patterns, function(val) { return 
!val.language; });
var reString = '^(?:' + patterns.join('|') + ')$';
var regExp = new RegExp(reString, 'i');
var matchFunc = function(s) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 0c418d9..acd73e1 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -652,7 +652,6 @@
 add("html2wt", "Internal link with is link prefix", "Aðrir 
[[wiki/Söfnuður|mótmælendasöfnuðir]] og\n");
 add("html2wt", "Internal link with is link trail and link prefix", 
"[[wiki/Mótmælendatrú|xxxar]]\n[[wiki/Mótmælendatrú|mótmælendatrúar]]\n[[wiki/Söfnuður|mótmælendasöfnuður]]\n[[wiki/Söfnuður|mótmælendasöfnuðir]]\n[[wiki/Söfnuður|mótmælendasöfnuðirxxx]]\n");
 add("html2wt", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved", "[[wiki/Foo|  bar]]\n\n[[wiki/Foo|  ''bar'']]\n\n[http://wp.org 
foo]\n\n[http://wp.org ''foo'']\n");
-add("html2wt", "Interwiki link encoding conversion (T3636)", "* 
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n* 
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n");
 add("html2wt", "Interwiki link with fragment (T4130)", 
"[[meatball:SoftSecurity#foo|MeatBall:SoftSecurity#foo]]\n");
 add("html2wt", "Escaping of interlanguage links (T129218, T156308)", "Blah 
blah blah\n[[:es:Spanish]]\n[[:zh:Chinese| zh : Chinese ]]\n");
 add("html2wt", "Parsoid-specific test: Wikilinks with   should RT 
properly", "[/index.php?title=WW_II&action=edit&redlink=1 WW II]\n");
@@ -1158,8 +1157,8 @@
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[1,3,0]]", 
"[http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm ''La muerte de 
Casagemas'' (1901) en el sitio de ]\n");
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[4,0,3]]", "1jnda7a\n");
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[[1,2],2,4]]", 
"[http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm ''La muerte de 
Casagemas''1svf0oe (1901) en el sitio de ]mqtmyg6n94l9");
-add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped 
losslessly (T94723) [[[4]]]", "[[wikipedia:European_Robin|1rmduf6]]");
-add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped 
losslessly (T94723) [[[2]]]", "[[wikipedia:European_Robin|134iwocEuropean 
Robin]]");
+add("selser", "mw:ExtLink linking to a interwiki URL can b

[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Update reverse interwiki map to prefer language prefixes over others

2017-10-13 Thread Subramanya Sastry (Code Review)
Subramanya Sastry has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/384171 )

Change subject: Update reverse interwiki map to prefer language prefixes over 
others
..

Update reverse interwiki map to prefer language prefixes over others

* Updated a bunch of parser tests to reflect the change.

Bug: T177784
Change-Id: I5cf93950a6da69263fb9da59fba2b33cc2e8931f
---
M lib/config/WikiConfig.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
3 files changed, 38 insertions(+), 21 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/71/384171/1

diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js
index e9f8e48..52e1c4f 100644
--- a/lib/config/WikiConfig.js
+++ b/lib/config/WikiConfig.js
@@ -232,14 +232,12 @@
}
});
 
-   var cachedMatcher = null;
-   this.interWikiMatcher = function() {
-   if (cachedMatcher) {
-   return cachedMatcher;
-   }
-   var keys = [];
-   var patterns = [];
+   var updatePatterns = function(keys, patterns, filter) {
conf.interwikiMap.forEach(function(val, key) {
+   if (!filter(val)) {
+   return;
+   }
+
var url = val.url;
var protocolRelative = url.startsWith('//');
if (val.protorel !== undefined) {
@@ -271,6 +269,20 @@
patterns.push('^' + val.prefix + '%3A(.*?)');
}
});
+   }
+
+   var cachedMatcher = null;
+   this.interWikiMatcher = function() {
+   if (cachedMatcher) {
+   return cachedMatcher;
+   }
+   var keys = [];
+   var patterns = [];
+   // For html -> wt reverse mapping, prefer language interwiki 
prefixes
+   // over other interwiki prefixes. So, use "en" instead of 
"wikipedia"
+   // for English wikipedia interwiki links.
+   updatePatterns(keys, patterns, function(val) { return 
!!val.language; });
+   updatePatterns(keys, patterns, function(val) { return 
!val.language; });
var reString = '^(?:' + patterns.join('|') + ')$';
var regExp = new RegExp(reString, 'i');
var matchFunc = function(s) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 0c418d9..0cd0c61 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -358,6 +358,7 @@
 add("html2html", "Internal link with is link prefix", "Aðrir mótmælendasöfnuðir
 og\n");
 add("html2html", "Internal link with is link trail and link prefix", "xxxar\nmótmælendatrúar\nmótmælendasöfnuður\nmótmælendasöfnuðir\nmótmælendasöfnuðirxxx\n");
 add("html2html", "Parsoid-centric test: Whitespace in ext- and wiki-links 
should be preserved", "
  bar\n\n
  bar\n\nhttp://wp.org\"; 
data-parsoid='{\"targetOff\":59,\"contentOffsets\":[59,62],\"dsr\":[44,63,15,1]}'>foo\n\nhttp://wp.org\"; 
data-parsoid='{\"targetOff\":80,\"contentOffsets\":[80,87],\"dsr\":[65,88,15,1]}'>foo\n");
+add("html2html", "Interwiki link encoding conversion (T3636)\n+!! 
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we 
will prefer the :en: interwiki prefix over wikipedia:", " Wikipedia:ro:Olteniţa\n Wikipedia:ro:Olteniţa\n");
 add("html2html", "Space and question mark encoding in interlanguage links 
(T95473)", "Blah blah blah\nhttp://es.wikipedia.org/wiki/Foo_bar?\"; 
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"http://es.wikipedia.org/wiki/Foo_bar?\"},\"sa\":{\"href\":\"es:Foo_bar?\"},\"dsr\":[15,30,null,null]}'/>");
 add("html2html", "Parsoid-specific test: Wikilinks with   should RT 
properly", "[/index.php?title=WW_II&action=edit&redlink=1
 WW II]\n");
 add("html2html", " to ", "1\n2\n3\n");
@@ -652,7 +653,7 @@
 add("html2wt", "Internal link with is link prefix", "Aðrir 
[[wiki/Söfnuður|mótmælendasöfnuðir]] og\n");
 add("html2wt", "Internal link with is link trail and link prefix", 
"[[wiki/Mótmælendatrú|xxxar]]\n[[wiki/Mótmælendatrú|mótmælendatrúar]]\n[[wiki/Söfnuður|mótmælendasöfnuður]]\n[[wiki/Söfnuður|mótmælendasöfnuðir]]\n[[wiki/Söfnuður|mótmælendasöfnuðirxxx]]\n");
 add("html2wt", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved", "[[wiki/Foo|  bar]]\n\n[[wiki/Foo|  ''bar'']]\n\n[http://wp.org 
foo]\n\n[http://wp.org ''foo'']\n");
-add("html2wt", "Interwiki link encoding conversion (T3636)", "* 
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n* 
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n");
+add("html2wt", "Interwiki link encoding conversion (T3636)\n+!! 
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we 
will prefer the :en: