jenkins-bot has submitted this change and it was merged.
Change subject: Move dp.src handlers to their respective dom handlers
......................................................................
Move dp.src handlers to their respective dom handlers
* That these cases still remain in the encapsulated content handler
looks like a relic of 8939c692.
* This permits better fallback behaviour when the reuse test fails.
(see the entity encoding change in the blacklisted test).
* The newly failing test is because the case now falls to the figure
handler, which doesn't know about dp.src. That seems acceptable
though, since the output is the same as the html2wt test, where we've
removed the dp.src. Also, we're only adding the placeholder marking
so the content is uneditable. Presumably that means selser will take
care of it.
Change-Id: I430ee902a80dda199683937eae6dca02d50c3606
---
M lib/html2wt/DOMHandlers.js
M lib/html2wt/WikitextSerializer.js
M tests/parserTests-blacklist.js
3 files changed, 134 insertions(+), 148 deletions(-)
Approvals:
Subramanya Sastry: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/html2wt/DOMHandlers.js b/lib/html2wt/DOMHandlers.js
index e9edd17..cc4609b 100644
--- a/lib/html2wt/DOMHandlers.js
+++ b/lib/html2wt/DOMHandlers.js
@@ -528,6 +528,21 @@
!(/^mw:Includes\//.test(node.getAttribute('typeof'))));
}
+// Uneditable forms wrapped with mw:Placeholder tags OR unedited nowikis
+function emitPlaceholderSrc(node, state) {
+ var dp = DU.getDataParsoid(node);
+ if (/<nowiki\s*\/>/.test(dp.src)) {
+ state.hasSelfClosingNowikis = true;
+ }
+ // FIXME: Should this also check for tabs and plain space
+ // chars interspersed with newlines?
+ if (dp.src.match(/^\n+$/)) {
+ state.setSep((state.sep.src || '') + dp.src);
+ } else {
+ state.serializer.emitWikitext(dp.src, node);
+ }
+}
+
/**
* A map of `domHandler`s keyed on nodeNames.
*
@@ -964,6 +979,11 @@
var property = node.getAttribute('property');
var dp = DU.getDataParsoid(node);
+ if (dp.src !== undefined &&
+
/(^|\s)mw:Placeholder(\/\w*)?$/.test(type)) {
+ return emitPlaceholderSrc(node, state);
+ }
+
// Check for property before type so that page
properties with
// templated attrs roundtrip properly.
// Ex: {{DEFAULTSORT:{{echo|foo}} }}
@@ -1065,9 +1085,19 @@
},
span: {
handle: Promise.method(function(node, state, wrapperUnmodified)
{
+ var dp = DU.getDataParsoid(node);
var type = node.getAttribute('typeof');
+ var contentSrc = node.textContent || node.innerHTML;
if (isRecognizedSpanWrapper(type)) {
if (type === 'mw:Nowiki') {
+ // src seems to have been added to
nowiki spans for
+ // T44353 which looks like it predates
selser. We also
+ // seem to added src for self-closed
(currently metas)
+ // nowikis to preserve syntactic
variations. Possibly
+ // worth dropping for both cases.
+ if (dp.src !== undefined && contentSrc
=== dp.src) {
+ return emitPlaceholderSrc(node,
state);
+ }
state.emitChunk('<nowiki>', node);
return
Promise.reduce(Array.from(node.childNodes), function(_, child) {
if (DU.isElt(child)) {
@@ -1094,7 +1124,9 @@
} else if (/(?:^|\s)mw\:Entity/.test(type) &&
node.childNodes.length === 1) {
// handle a new mw:Entity (not handled
by selser) by
// serializing its children
- if (DU.isText(node.firstChild)) {
+ if (dp.src !== undefined && contentSrc
=== dp.srcContent) {
+
state.serializer.emitWikitext(dp.src, node);
+ } else if (DU.isText(node.firstChild)) {
state.emitChunk(
Util.entityEncodeAll(node.firstChild.nodeValue),
node.firstChild);
@@ -1103,7 +1135,9 @@
return
state.serializeChildren(node);
}
} else if
(/(^|\s)mw:Placeholder(\/\w*)?/.test(type)) {
- if (/(^|\s)mw:Placeholder(\s|$)/ &&
+ if (dp.src !== undefined) {
+ return emitPlaceholderSrc(node,
state);
+ } else if (/(^|\s)mw:Placeholder(\s|$)/
&&
node.childNodes.length === 1 &&
DU.isText(node.firstChild) &&
// See the DisplaySpace hack in
the urltext rule
@@ -1119,7 +1153,6 @@
}
}
} else {
- var dp = DU.getDataParsoid(node);
var kvs =
DU.getAttributeKVArray(node).filter(function(kv) {
return !/^data-parsoid/.test(kv.k) &&
!(kv.k === 'id' &&
/^mw[\w-]{2,}$/.test(kv.v));
@@ -1323,156 +1356,110 @@
/**
* Function returning `domHandler`s for nodes with encapsulated content.
*/
-var _getEncapsulatedContentHandler = function(n) {
- var self = this;
- var env = this.env;
- var dp = DU.getDataParsoid(n);
- var typeOf, dataMw;
+var _getEncapsulatedContentHandler = function() {
+ return {
+ handle: Promise.method(function(node, state, wrapperUnmodified)
{
+ var env = state.env;
+ var self = state.serializer;
+ var dp = DU.getDataParsoid(node);
+ var dataMw = DU.getDataMw(node);
+ var p;
+ var typeOf = node.getAttribute('typeof') || '';
+ if (/(?:^|\s)mw:Transclusion(?=$|\s)/.test(typeOf)) {
+ if (dataMw.parts) {
+ p = self._buildTemplateWT(node,
dataMw.parts);
+ } else if (dp.src) {
+ env.log("error", "data-mw missing in: "
+ node.outerHTML);
+ p = Promise.resolve(dp.src);
+ } else {
+ throw new Error("Cannot serialize
transclusion without data-mw.parts or data-parsoid.src.");
+ }
+ } else if (/(?:^|\s)mw:Param(?=$|\s)/.test(typeOf)) {
+ if (dp.src) {
+ p = Promise.resolve(dp.src);
+ } else {
+ throw new Error("No source for
params.");
+ }
+ } else if (/(?:^|\s)mw:Extension\//.test(typeOf)) {
+ if (!dataMw.name && !dp.src) {
+ // If there was no typeOf name, and no
dp.src, try getting
+ // the name out of the mw:Extension
type. This will
+ // generate an empty extension tag, but
it's better than
+ // just an error.
+ var extGivenName =
typeOf.replace(/(?:^|\s)mw:Extension\/([^\s]+)/, '$1');
+ if (extGivenName) {
+ env.log('error', 'no data-mw
name for extension in: ', node.outerHTML);
+ dataMw.name = extGivenName;
+ }
+ }
+ if (dataMw.name) {
+ var nativeExt =
env.conf.wiki.extensionTags.get(dataMw.name.toLowerCase());
+ if (nativeExt &&
nativeExt.serialHandler && nativeExt.serialHandler.handle) {
+ p =
nativeExt.serialHandler.handle(node, state, wrapperUnmodified);
+ } else {
+ p =
self.defaultExtensionHandler(node, state);
+ }
+ } else if (dp.src) {
+ env.log('error', 'data-mw missing in: '
+ node.outerHTML);
+ p = Promise.resolve(dp.src);
+ } else {
+ throw new Error('Cannot serialize
extension without data-mw.name or data-parsoid.src.');
+ }
+ } else {
+ throw new Error('Should never reach here');
+ }
+ return p.then(function(s) {
+ state.singleLineContext.disable();
+ self.emitWikitext(handleListPrefix(node, state)
+ s, node);
+ state.singleLineContext.pop();
+ return DU.skipOverEncapsulatedContent(node);
+ });
+ }),
+ sepnls: {
+ // XXX: This is questionable, as the template can expand
+ // to newlines too. Which default should we pick for new
+ // content? We don't really want to make separator
+ // newlines in HTML significant for the semantics of the
+ // template content.
+ before: function(node, otherNode, state) {
+ var env = state.env;
+ var typeOf = node.getAttribute('typeof') || '';
+ var dataMw = DU.getDataMw(node);
+ var dp = DU.getDataParsoid(node);
- if (DU.isFirstEncapsulationWrapperNode(n)) {
- return {
- handle: Promise.method(function(node, state,
wrapperUnmodified) {
- var p;
- typeOf = node.getAttribute('typeof') || '';
- if
(/(?:^|\s)mw:Transclusion(?=$|\s)/.test(typeOf)) {
- dataMw = DU.getDataMw(node);
- if (dataMw.parts) {
- p = self._buildTemplateWT(node,
dataMw.parts);
- } else if (dp.src) {
- env.log("error", "data-mw
missing in: " + node.outerHTML);
- p = Promise.resolve(dp.src);
- } else {
- throw new Error("Cannot
serialize transclusion without data-mw.parts or data-parsoid.src.");
- }
- } else if
(/(?:^|\s)mw:Param(?=$|\s)/.test(typeOf)) {
- if (dp.src) {
- p = Promise.resolve(dp.src);
- } else {
- throw new Error("No source for
params.");
- }
- } else if
(/(?:^|\s)mw:Extension\//.test(typeOf)) {
- dataMw = DU.getDataMw(node);
- if (!dataMw.name && !dp.src) {
- // If there was no typeOf name,
and no dp.src, try getting
- // the name out of the
mw:Extension type. This will
- // generate an empty extension
tag, but it's better than
- // just an error.
- var extGivenName =
typeOf.replace(/(?:^|\s)mw:Extension\/([^\s]+)/, '$1');
- if (extGivenName) {
- env.log('error', 'no
data-mw name for extension in: ', node.outerHTML);
- dataMw.name =
extGivenName;
- }
- }
+ // Handle native extension constraints.
+ if (/(?:^|\s)mw:Extension\//.test(typeOf) &&
+ // Only apply to plain
extension tags.
+
!/(?:^|\s)mw:Transclusion(?:\s|$)/.test(typeOf)) {
if (dataMw.name) {
var nativeExt =
env.conf.wiki.extensionTags.get(dataMw.name.toLowerCase());
- if (nativeExt &&
nativeExt.serialHandler && nativeExt.serialHandler.handle) {
- p =
nativeExt.serialHandler.handle(node, state, wrapperUnmodified);
- } else {
- p =
self.defaultExtensionHandler(node, state);
+ if (nativeExt &&
nativeExt.serialHandler && nativeExt.serialHandler.before) {
+ var ret =
nativeExt.serialHandler.before(node, otherNode, state);
+ if (ret !== null) {
return ret; }
}
- } else if (dp.src) {
- env.log('error', 'data-mw
missing in: ' + node.outerHTML);
- p = Promise.resolve(dp.src);
- } else {
- throw new Error('Cannot
serialize extension without data-mw.name or data-parsoid.src.');
}
- } else {
- throw new Error('Should never reach
here');
}
- return p.then(function(s) {
- state.singleLineContext.disable();
-
self.emitWikitext(handleListPrefix(node, state) + s, node);
- state.singleLineContext.pop();
- return
DU.skipOverEncapsulatedContent(node);
- });
- }),
- sepnls: {
- // XXX: This is questionable, as the template
can expand
- // to newlines too. Which default should we
pick for new
- // content? We don't really want to make
separator
- // newlines in HTML significant for the
semantics of the
- // template content.
- before: function(node, otherNode, state) {
- typeOf = node.getAttribute('typeof') ||
'';
- // Handle native extension constraints.
- if
(/(?:^|\s)mw:Extension\//.test(typeOf) &&
- // Only apply to plain
extension tags.
-
!/(?:^|\s)mw:Transclusion(?:\s|$)/.test(typeOf)) {
- dataMw = DU.getDataMw(node);
- if (dataMw.name) {
- var nativeExt =
env.conf.wiki.extensionTags.get(dataMw.name.toLowerCase());
- if (nativeExt &&
nativeExt.serialHandler && nativeExt.serialHandler.before) {
- var ret =
nativeExt.serialHandler.before(node, otherNode, state);
- if (ret !==
null) { return ret; }
- }
- }
+ // If this content came from a
multi-part-template-block
+ // use the first node in that block for
determining
+ // newline constraints.
+ if (dp.firstWikitextNode) {
+ var nodeName =
dp.firstWikitextNode.toLowerCase();
+ var h = tagHandlers.get(nodeName);
+ if (!h && dp.stx === 'html' && nodeName
!== 'a') {
+ h = htmlElementHandler;
}
-
- // If this content came from a
multi-part-template-block
- // use the first node in that block for
determining
- // newline constraints.
- var nodeDP = DU.getDataParsoid(node);
- if (nodeDP.firstWikitextNode) {
- var nodeName =
nodeDP.firstWikitextNode.toLowerCase();
- var h =
tagHandlers.get(nodeName);
- if (!h && nodeDP.stx === 'html'
&& nodeName !== 'a') {
- h = htmlElementHandler;
- }
- if (h && h.sepnls &&
h.sepnls.before) {
- return
h.sepnls.before(node, otherNode, state);
- }
+ if (h && h.sepnls && h.sepnls.before) {
+ return h.sepnls.before(node,
otherNode, state);
}
+ }
- // default behavior
- return { min: 0, max: 2 };
- },
+ // default behavior
+ return { min: 0, max: 2 };
},
- };
- }
-
- if (dp.src !== undefined) {
- // Uneditable forms wrapped with mw:Placeholder tags
- // OR unedited nowikis
- typeOf = n.getAttribute('typeof') || '';
- if (/(^|\s)mw:Placeholder(\/\w*)?$/.test(typeOf) ||
- (typeOf === "mw:Nowiki" && n.textContent ===
dp.src)) {
- // implement generic src round-tripping:
- // return src, and drop the generated content
- return {
- handle: Promise.method(function(node, state,
wrapperUnmodified) {
- if (/<nowiki\s*\/>/.test(dp.src)) {
- state.hasSelfClosingNowikis =
true;
- }
- // FIXME: Should this also check for
tabs and plain space
- // chars interspersed with newlines?
- if (dp.src.match(/^\n+$/)) {
- state.setSep((state.sep.src ||
'') + dp.src);
- } else {
- self.emitWikitext(dp.src, node);
- }
- }),
- };
- }
-
- // Entities
- if (typeOf === "mw:Entity" && n.childNodes.length === 1) {
- var contentSrc = n.textContent || n.innerHTML;
- return {
- handle: Promise.method(function(node, state,
wrapperUnmodified) {
- if (contentSrc === dp.srcContent) {
- self.emitWikitext(dp.src, node);
- } else {
- self.emitWikitext(contentSrc,
node);
- }
- }),
- };
- }
- }
-
- return null;
+ },
+ };
};
-
/**
* Just the handle for the htmlElementHandler defined below.
diff --git a/lib/html2wt/WikitextSerializer.js
b/lib/html2wt/WikitextSerializer.js
index c7b9e4c..2dca594 100644
--- a/lib/html2wt/WikitextSerializer.js
+++ b/lib/html2wt/WikitextSerializer.js
@@ -79,9 +79,6 @@
var WSP = WikitextSerializer.prototype;
-// Tag handlers
-WSP._getEncapsulatedContentHandler =
DOMHandlers._getEncapsulatedContentHandler;
-
// Used in multiple tag handlers, and hence added as top-level properties
// - linkHandler is used by <a> and <link>
// - figureHandler is used by <figure> and by <a>.linkHandler above
@@ -697,15 +694,16 @@
WSP._getDOMHandler = function(node) {
if (!node || !DU.isElt(node)) { return {}; }
- var handler = this._getEncapsulatedContentHandler(node);
- if (handler !== null) { return handler; }
+ if (DU.isFirstEncapsulationWrapperNode(node)) {
+ return DOMHandlers._getEncapsulatedContentHandler();
+ }
var dp = DU.getDataParsoid(node);
var nodeName = node.nodeName.toLowerCase();
// If available, use a specialized handler for serializing
// to the specialized syntactic form of the tag.
- handler = tagHandlers.get(nodeName + '_' + dp.stx);
+ var handler = tagHandlers.get(nodeName + '_' + dp.stx);
// Unless a specialized handler is available, use the HTML handler
// for html-stx tags. But, <a> tags should never serialize as HTML.
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index f6d1503..b4d8ac4 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -336,6 +336,7 @@
add("wt2wt", "Templates: HTML Tag: 9. Multiple template-generated attributes",
"<div {{echo|1=id=\"v1\" title=\"foo\"}} title=\"foo\">bar</div>");
add("wt2wt", "Templates: HTML Tables: 5. Proper fostering of categories from
inside", "[[Category:foo1]]\n<table><tr><td>foo</td></tr></table>\n<!--Two
categories (Bug
50330)-->\n[[Category:bar1]]\n[[Category:bar2]]\n<table><tr><td>foo</td></tr></table>");
add("wt2wt", "Allow empty links in image captions (Bug 60753)",
"[[File:Foobar.jpg|thumb|Caption
[[Link1]]\n<nowiki>[[]]</nowiki>\n[[Link2]]\n]]");
+add("wt2wt", "Image with multiple attributes from the same template",
"[[File:Foobar.jpg|right|Caption text]]");
add("wt2wt", "Image with multiple widths -- use last",
"[[File:Foobar.jpg|300px|caption]]\n");
add("wt2wt", "Render invalid page names as plain text (bug 51090)",
"[[./../foo|bar]]\n[[foo�|bar]]\n[[foo/.|bar]]\n[[foo/..|bar]]\n<nowiki>[[foo~~~bar]]</nowiki>\n[[foo>bar]]\n[[foo[bar]]\n[[.]]\n[[..]]\n[[foo././bar]]\n\n[[{{echo|./../foo}}<nowiki>|bar]]</nowiki>\n[[{{echo|foo/.}}<nowiki>|bar]]</nowiki>\n[[{{echo|foo/..}}<nowiki>|bar]]</nowiki>\n[[{{echo|foo~~~~bar}}<nowiki>]]</nowiki>\n[[{{echo|foo>bar}}<nowiki>]]</nowiki>\n[[{{echo|foo././bar}}<nowiki>]]</nowiki>\n[[{{echo|foo{bar}}<nowiki>]]</nowiki>\n[[{{echo|foo}bar}}<nowiki>]]</nowiki>\n[[{{echo|foo[bar}}<nowiki>]]</nowiki>\n[[{{echo|foo]bar}}<nowiki>]]</nowiki>\n[[{{echo|foo<bar}}<nowiki>]]</nowiki>\n");
add("wt2wt", "Handling of sections up to level 6 and beyond", "= Level 1
Heading=\n== Level 2 Heading==\n=== Level 3 Heading===\n==== Level 4
Heading====\n===== Level 5 Heading=====\n====== Level 6
Heading======\n======<nowiki>= Level 7
Heading=</nowiki>======\n======<nowiki>== Level 8
Heading==</nowiki>======\n======<nowiki>=== Level 9
Heading===</nowiki>======\n======<nowiki>==== Level 10
Heading====</nowiki>======\n");
@@ -361,7 +362,7 @@
add("wt2wt", "Fuzz testing: Parser24", "{{{|\n<u class=\"|\"
{{{{SSSll!!!!!!!VVVV)]]][[Special:*xxxxxxx--><noinclude>}}}}>\n<br
style=\"onmouseover='alert(document.cookie);' \" />\n\nMOVE YOUR MOUSE CURSOR
OVER THIS TEXT\n{|\n\n|\n|}");
add("wt2wt", "Inline wiki vs wiki block nesting", "'''Bold paragraph'''\n\nNew
wiki paragraph\n");
add("wt2wt", "Mixing markup for italics and bold",
"'<nowiki/>''bold'<nowiki/>'''''bold''bolditalics'''''\n");
-add("wt2wt", "Illegal character references (T106578)", "; Null: �\n; FF:
\n; CR: \n; Control (low): \n; Control (high):  Ÿ\n;
Surrogate: ��\n; This is an okay astral character: 💩");
+add("wt2wt", "Illegal character references (T106578)", "; Null: �\n; FF:
\n; CR: 
\n; Control (low): \n; Control (high):  Ÿ\n;
Surrogate: ��\n; This is an okay astral character: 💩");
add("wt2wt", "Image with page parameter", "[[File:LoremIpsum.djvu]]\n");
add("wt2wt", "Don't fall for the self-closing div", "<div>hello world</div>");
add("wt2wt", "Parsing of overlapping (improperly nested) inline html tags",
"<span><s>x</span>\n");
--
To view, visit https://gerrit.wikimedia.org/r/280760
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I430ee902a80dda199683937eae6dca02d50c3606
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Tim Starling <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits