jenkins-bot has submitted this change and it was merged. Change subject: T95039: encode/decode arbitrary data in comments. ......................................................................
T95039: encode/decode arbitrary data in comments. Use HTML5 entity encoding to ensure that arbitrary data can be represented in comments without triggering premature end of comment. Do a little bit of entity encoding on the wikitext side as well to allow the user to author the string "-->" inside a comment. They probably won't want to (!) but we want to avoid exposing arbitrary content restrictions to DOM consumers (like Visual Editor). If you were wondering, "-->" is represented as "-->" in wikitext. We also normalize HTML entities during DOM diff, which ensures that we don't dirty diff even if VE does the entity encoding inside comments differently than we do. Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8 --- M lib/XMLSerializer.js M lib/dom.computeDSR.js M lib/dom.migrateTrailingNLs.js M lib/dom.wrapTemplates.js M lib/ext.core.PreHandler.js M lib/mediawiki.DOMDiff.js M lib/mediawiki.DOMUtils.js M lib/mediawiki.tokenizer.peg.js M lib/mediawiki.tokenizer.utils.js M lib/pegTokenizer.pegjs.txt M lib/wts.separators.js M lib/wts.utils.js M tests/parserTests-blacklist.js M tests/parserTests.txt M tests/roundtrip-test.js 15 files changed, 166 insertions(+), 97 deletions(-) Approvals: Subramanya Sastry: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/XMLSerializer.js b/lib/XMLSerializer.js index f8ef3b9..8bcd85e 100644 --- a/lib/XMLSerializer.js +++ b/lib/XMLSerializer.js @@ -150,7 +150,13 @@ case TEXT_NODE: return cb(node.data.replace(/[<&]/g, _xmlEncoder)); case COMMENT_NODE: - return cb( "<!--" + node.data.replace(/-->/g, '-->') + "-->"); + // According to + // http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml + // we could throw an exception here if node.data would not create + // a "well-formed" XML comment. But we use entity encoding when + // we create the comment node to ensure that node.data will always + // be okay; see DOMUtils.encodeComment(). + return cb( "<!--" + node.data + "-->"); default: cb('??' + node.nodeName); } diff --git a/lib/dom.computeDSR.js b/lib/dom.computeDSR.js index e638a16..2ad6b84 100644 --- a/lib/dom.computeDSR.js +++ b/lib/dom.computeDSR.js @@ -246,7 +246,8 @@ } } else if (cType === node.COMMENT_NODE) { if (ce !== null) { - cs = ce - child.data.length - 7; // 7 chars for "<!--" and "-->" + // decode html entities & reencode as wikitext to find length + cs = ce - DU.decodedCommentLength(child); } } else if (cType === node.ELEMENT_NODE) { var cTypeOf = child.getAttribute("typeof"), @@ -482,7 +483,7 @@ if (nType === node.TEXT_NODE) { newCE = newCE + sibling.data.length + DU.indentPreDSRCorrection(sibling); } else if (nType === node.COMMENT_NODE) { - newCE = newCE + sibling.data.length + 7; + newCE = newCE + DU.decodedCommentLength(sibling); } else if (nType === node.ELEMENT_NODE) { var siblingDP = DU.getDataParsoid( sibling ); if (siblingDP.dsr && siblingDP.tsr && siblingDP.dsr[0] <= newCE && e !== null) { diff --git a/lib/dom.migrateTrailingNLs.js b/lib/dom.migrateTrailingNLs.js index dde4394..caed4ea 100644 --- a/lib/dom.migrateTrailingNLs.js +++ b/lib/dom.migrateTrailingNLs.js @@ -80,7 +80,7 @@ if (DU.isComment(n)) { firstEltToMigrate = n; // <!--comment--> - tsrCorrection += (7 + n.data.length); + tsrCorrection += DU.decodedCommentLength(n); } else { if (n.data.match(/^\s*\n\s*$/)) { foundNL = true; diff --git a/lib/dom.wrapTemplates.js b/lib/dom.wrapTemplates.js index 86c31c3..ba71475 100644 --- a/lib/dom.wrapTemplates.js +++ b/lib/dom.wrapTemplates.js @@ -100,7 +100,8 @@ if (DU.isText(n)) { offset += n.data.length; } else { - offset += n.data.length + 7; + // A comment + offset += DU.decodedCommentLength(n); } n = n.previousSibling; } @@ -111,7 +112,8 @@ } if (dsr && typeof (dsr[1]) === 'number') { - var len = DU.isText(endNode) ? endNode.data.length : endNode.data.length + 7; + var len = DU.isText(endNode) ? endNode.data.length : + DU.decodedCommentLength(endNode); dsr = [dsr[1] + offset, dsr[1] + offset + len]; } diff --git a/lib/ext.core.PreHandler.js b/lib/ext.core.PreHandler.js index db3895b..8af8dfb 100644 --- a/lib/ext.core.PreHandler.js +++ b/lib/ext.core.PreHandler.js @@ -64,8 +64,9 @@ * --------------------------------------------------------------------------*/ -var Util = require('./mediawiki.Util.js').Util, - defines = require('./mediawiki.parser.defines.js'); +var DU = require('./mediawiki.DOMUtils.js').DOMUtils, + Util = require('./mediawiki.Util.js').Util, + defines = require('./mediawiki.parser.defines.js'); // define some constructor shortcuts var CommentTk = defines.CommentTk, @@ -310,7 +311,9 @@ function getUpdatedPreTSR(tsr, token) { var tc = token.constructor; if (tc === CommentTk) { - tsr = token.dataAttribs.tsr ? token.dataAttribs.tsr[1] : (tsr === -1 ? -1 : token.value.length + 7 + tsr); + // comment length has 7 added for "<!--" and "-->" deliminters + // (see DU.decodedCommentLength() -- but that takes a node not a token) + tsr = token.dataAttribs.tsr ? token.dataAttribs.tsr[1] : (tsr === -1 ? -1 : DU.decodeComment(token.value).length + 7 + tsr); } else if (tc === SelfclosingTagTk) { // meta-tag (cannot compute) tsr = -1; diff --git a/lib/mediawiki.DOMDiff.js b/lib/mediawiki.DOMDiff.js index 97ac597..1bd2dfc 100644 --- a/lib/mediawiki.DOMDiff.js +++ b/lib/mediawiki.DOMDiff.js @@ -122,13 +122,15 @@ DDP.treeEquals = function (nodeA, nodeB, deep) { if ( nodeA.nodeType !== nodeB.nodeType ) { return false; - } else if (DU.isText(nodeA) || DU.isComment(nodeA)) { + } else if (DU.isText(nodeA)) { // In the past we've had bugs where we let non-primitive strings // leak into our DOM. Safety first: console.assert(nodeA.nodeValue === nodeA.nodeValue.valueOf()); console.assert(nodeB.nodeValue === nodeB.nodeValue.valueOf()); // ok, now do the comparison. return nodeA.nodeValue === nodeB.nodeValue; + } else if (DU.isComment(nodeA)) { + return DU.decodeComment(nodeA.data) === DU.decodeComment(nodeB.data); } else if (DU.isElt(nodeA)) { // Compare node name and attribute length if (nodeA.nodeName !== nodeB.nodeName || diff --git a/lib/mediawiki.DOMUtils.js b/lib/mediawiki.DOMUtils.js index dc5fb61..a3b3788 100644 --- a/lib/mediawiki.DOMUtils.js +++ b/lib/mediawiki.DOMUtils.js @@ -2067,6 +2067,93 @@ dp.sa[ name ] = origVal; } dp.a[ name ] = val; + }, + + // Comment encoding/decoding. + // + // * Some relevant phab tickets: T94055, T70146, T60184, T95039 + // + // The wikitext comment production is very simple: <!-- starts a comment, + // and --> ends a comment. This means we can have almost anything as the + // contents of a comment (except the string "-->", but see below), including + // several things that are not valid in HTML5 comments: + // + // * For one, the html5 comment parsing algorithm [0] leniently accepts + // --!> as a closing comment tag, which differs from the php+tidy combo. + // + // * If the comment's data matches /^-?>/, html5 will end the comment. + // For example, <!-->stuff<--> breaks up as + // <!--> (the comment) followed by, stuff<--> (as text). + // + // * Finally, comment data shouldn't contain two consecutive hyphen-minus + // characters (--), nor end in a hyphen-minus character (/-$/) as defined + // in the spec [1]. + // + // We work around all these problems by using HTML entity encoding inside + // the comment body. The characters -, >, and & must be encoded in order + // to prevent premature termination of the comment by one of the cases + // above. Encoding other characters is optional; all entities will be + // decoded during wikitext serialization. + // + // In order to allow *arbitrary* content inside a wikitext comment, + // including the forbidden string "-->" we also do some minimal entity + // decoding on the wikitext. We are also limited by our inability + // to encode DSR attributes on the comment node, so our wikitext entity + // decoding must be 1-to-1: that is, there must be a unique "decoded" + // string for every wikitext sequence, and for every decoded string there + // must be a unique wikitext which creates it. + // + // The basic idea here is to replace every string ab*c with the string with + // one more b in it. This creates a string with no instance of "ac", + // so you can use 'ac' to encode one more code point. In this case + // a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to + // encode "-->" (which is otherwise unspeakable in wikitext). + // + // Note that any user content which does not match the regular + // expression /--(>|&(amp;)*gt;)/ is unchanged in its wikitext + // representation, as shown in the first two examples below. + // + // User-authored comment text Wikitext HTML5 DOM + // -------------------------- ------------- ---------------------- + // & - > & - > & + > + // Use > here Use > here Use &gt; here + // --> --> ++> + // --> --&gt; ++&gt; + // --&gt; --&amp;gt; ++&amp;gt; + // + // [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state + // [1] http://www.w3.org/TR/html5/syntax.html#comments + + // Map a wikitext-escaped comment to an HTML DOM-escaped comment. + encodeComment: function(comment) { + // Undo wikitext escaping to obtain "true value" of comment. + var trueValue = comment + .replace(/--&(amp;)*gt;/g, Util.decodeEntities); + // Now encode '-', '>' and '&' in the "true value" as HTML entities, + // so that they can be safely embedded in an HTML comment. + // This part doesn't have to map strings 1-to-1. + return trueValue + .replace(/[->&]/g, Util.entityEncodeAll); + }, + + // Map an HTML DOM-escaped comment to a wikitext-escaped comment. + decodeComment: function(comment) { + // Undo HTML entity escaping to obtain "true value" of comment. + var trueValue = Util.decodeEntities(comment); + // ok, now encode this "true value" of the comment in such a way + // that the string "-->" never shows up. (See above.) + return trueValue + .replace(/--(&(amp;)*gt;|>)/g, function(s) { + return s==='-->' ? '-->' : '--&'+s.slice(3); + }); + }, + + // Utility function: we often need to know the wikitext DSR length for + // an HTML DOM comment value. + decodedCommentLength: function(node) { + console.assert(DU.isComment(node)); + // Add 7 for the "<!--" and "-->" delimiters in wikitext. + return DU.decodeComment(node.data).length + 7; } }; diff --git a/lib/mediawiki.tokenizer.peg.js b/lib/mediawiki.tokenizer.peg.js index 5971bee..c7063a2 100644 --- a/lib/mediawiki.tokenizer.peg.js +++ b/lib/mediawiki.tokenizer.peg.js @@ -26,6 +26,7 @@ * in the head of pegTokenizer.pegjs.txt. */ var pegIncludes = { + DOMUtils: require('./mediawiki.DOMUtils.js').DOMUtils, Util: require('./mediawiki.Util.js').Util, defines: require('./mediawiki.parser.defines.js'), tu: require('./mediawiki.tokenizer.utils.js'), diff --git a/lib/mediawiki.tokenizer.utils.js b/lib/mediawiki.tokenizer.utils.js index 82a42f5..6620811 100644 --- a/lib/mediawiki.tokenizer.utils.js +++ b/lib/mediawiki.tokenizer.utils.js @@ -12,11 +12,6 @@ EndTagTk = defines.EndTagTk, CommentTk = defines.CommentTk; - -var replaceMatchWithSpaces = function( match ) { - return " ".repeat( match.length ); -}; - var tu = { flattenIfArray: function(e) { @@ -296,37 +291,6 @@ } else { return null; } - }, - - // Comment normalizing - // - // * Some relevant phab tickets: T94055, T70146, T60184. - // - // * For one, the html5 comment parsing algorithm [0] leniently accepts - // --!> as a closing comment tag, which differs from the php+tidy combo. - // We suppress that behaviour at the cost of introducing a dirty diff, - // but selser should take care of it. - // - // * If the comment's data matches /^-?>/, html5 will end the comment, - // introducing roundtrip diffs. For example, <!-->stuff<--> breaks up as - // <!--> (the comment) followed by, stuff<--> (as text). - // - // * Finally, comment data shouldn't contain two consecutive hyphen-minus - // characters (--), nor end in a hyphen-minus character (/-$/) as defined - // in the spec [1]. Normalize to this syntax in order to emit valid XML. - // - // [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state - // [1] http://www.w3.org/TR/html5/syntax.html#comments - - normalizeComment: function( comment ) { - return comment - // suppress html5 comment start (dash)? ending - .replace(/^(-?>)/, replaceMatchWithSpaces) - // normalize double hyphens, with the bonus of - // suppressing the html5 end bang state (--!>) - .replace(/(-{2,})/g, replaceMatchWithSpaces) - // normalize away extra ending hyphens - .replace(/(-+)$/, replaceMatchWithSpaces); } }; diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt index 030b0d9..42790cb 100644 --- a/lib/pegTokenizer.pegjs.txt +++ b/lib/pegTokenizer.pegjs.txt @@ -7,6 +7,7 @@ { var pegIncludes = options.pegIncludes, + DU = pegIncludes.DOMUtils, Util = pegIncludes.Util, PegTokenizer = pegIncludes.PegTokenizer, defines = pegIncludes.defines, @@ -379,13 +380,13 @@ // but, as always, things around here are a little more complicated. // // We accept the same comments, but because we emit them as HTML comments -// instead of deleting them, we have to sanitize the contents of the -// comments. We currently do this in a way which loses information. See -// the normalizeComment helper for further details. +// instead of deleting them, we have to encode the data to ensure that +// we always emit a valid HTML5 comment. See the encodeComment helper +// for further details. comment = '<!--' c:( !"-->" c:. { return c; } )* ('-->' / eof) { - var data = tu.normalizeComment( c.join('') ); + var data = DU.encodeComment( c.join('') ); return [new CommentTk( data, { tsr: [peg$reportedPos, peg$currPos] } )]; } diff --git a/lib/wts.separators.js b/lib/wts.separators.js index 05626b6..72a4985 100644 --- a/lib/wts.separators.js +++ b/lib/wts.separators.js @@ -573,7 +573,7 @@ correction; if (typeof (endDsr) === 'number') { if (DU.isComment(prevNode)) { - correction = prevNode.nodeValue.length + 7; + correction = DU.decodedCommentLength(prevNode); } else { correction = prevNode.nodeValue.length; } diff --git a/lib/wts.utils.js b/lib/wts.utils.js index 8ab8274..b845e52 100644 --- a/lib/wts.utils.js +++ b/lib/wts.utils.js @@ -14,7 +14,7 @@ }, commentWT: function (comment) { - return '<!--' + comment.replace(/-->/, '-->') + '-->'; + return '<!--' + DU.decodeComment(comment) + '-->'; }, /** diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js index 974a4c7..68fa053 100644 --- a/tests/parserTests-blacklist.js +++ b/tests/parserTests-blacklist.js @@ -219,7 +219,7 @@ add("wt2html", "Parser hook: empty input using terminated empty elements (bug 2374)", "<p data-parsoid='{\"dsr\":[0,18,0,0]}'><tag foo=bar/>text</p>"); add("wt2html", "Parser hook: basic arguments using terminated empty elements (bug 2374)", "<p data-parsoid='{\"dsr\":[0,70,0,0]}'><tag width=200 height = \"100\" depth = '50' square/>\nother stuff\n</tag></p>"); add("wt2html", "Parser hook: static parser hook not inside a comment", "<p data-parsoid='{\"dsr\":[0,61,0,0]}'><statictag>hello, world</statictag>\n<statictag action=flush/></p>"); -add("wt2html", "Parser hook: static parser hook inside a comment", "<!-- <statictag>hello, world</statictag> -->\n<p data-parsoid='{\"dsr\":[45,70,0,0]}'><statictag action=flush/></p>"); +add("wt2html", "Parser hook: static parser hook inside a comment", "<!-- <statictag>hello, world</statictag> -->\n<p data-parsoid='{\"dsr\":[45,70,0,0]}'><statictag action=flush/></p>"); add("wt2html", "Sanitizer: Closing of closed but not open tags", ""); add("wt2html", "Sanitizer: Closing of closed but not open table tags", "Table not started"); add("wt2html", "Sanitizer: Escaping of spaces, multibyte characters, colons & other stuff in id=\"\"", "<p data-parsoid='{\"dsr\":[0,45,0,0]}'><span id=\"æ: v\" data-parsoid='{\"stx\":\"html\",\"dsr\":[0,27,16,7]}'>byte</span><a rel=\"mw:WikiLink\" href=\"./Main%20Page#æ:_v\" data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Main%20Page#æ:_v\"},\"sa\":{\"href\":\"#æ: v\"},\"dsr\":[27,45,8,2]}'>backlink</a></p>"); @@ -420,7 +420,6 @@ add("wt2wt", "Non-word characters don't terminate tag names (bug 17663, 40670, 52022)", "<blockquote|>a\n\n<b→> doesn't terminate </b→>\n\n<bä> doesn't terminate </bä>\n\n<boo> doesn't terminate </boo>\n\n<s.foo> doesn't terminate </s.foo>\n\n<sub-ID#1>\n"); add("wt2wt", "Non-word characters don't terminate tag names + tidy", "<blockquote|>a\n\n<b→> doesn't terminate </b→>\n\n<bä> doesn't terminate </bä>\n\n<boo> doesn't terminate </boo>\n\n<s.foo> doesn't terminate </s.foo>\n\n<sub-ID#1>\n"); add("wt2wt", "Isolated close tags should be treated as literal text (bug 52760)", "\n<s.foo>s\n"); -add("wt2wt", "Comment semantics: unclosed comment at end", "<!--This comment will run out to the end of the document-->"); add("wt2wt", "<nowiki> inside <pre> (bug 13238)", "<pre>\n<nowiki>\n</pre>\n<pre>\n<nowiki></nowiki>\n</pre>\n<pre><nowiki><nowiki></nowiki>Foo<nowiki></nowiki></nowiki></pre>"); add("wt2wt", "<nowiki> and <pre> preference (first one wins)", "<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n\n<nowiki>\n<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n"); add("wt2wt", "Templates: Indent-Pre: 1f: Wrapping should be based on expanded content", "{{echo| }}a\n\n{{echo|\n }}a\n\n{{echo|\n b}}\n\n{{echo|a\n }}b\n\n{{echo|a\n}}\n b\n"); @@ -921,7 +920,6 @@ add("html2wt", "Comment test 4", "asdfjkl\n"); add("html2wt", "Comment spacing", "a\n\n b \n\nc\n"); add("html2wt", "Comment whitespace", ""); -add("html2wt", "Comment semantics: unclosed comment at end", ""); add("html2wt", "Comment in template title", "FOO\n"); add("html2wt", "Comment on its own line post-expand", "a\n\nb\n"); add("html2wt", "Comment on its own line post-expand with non-significant whitespace", "a\n\nb\n"); @@ -1773,7 +1771,6 @@ add("selser", "Isolated close tags should be treated as literal text (bug 52760) [3,2]", "i8usmsqui9kke29\n\n<s.foo>s</s>"); add("selser", "Isolated close tags should be treated as literal text (bug 52760) [2,0]", "zskmdh55uiuxflxr\n\n<s.foo>s</s>"); add("selser", "Isolated close tags should be treated as literal text (bug 52760) [3,0]", "<s.foo>s</s>"); -add("selser", "Comment semantics: unclosed comment at end 5", "<!--This comment will run out to the end of the document-->"); add("selser", "Empty lines between lines with block tags [0,0,0,0,4,[4],0,4,2,4,2,0,0,3,3,0,0,[4]]", "<div></div>\n\n\n0jr2knw69qode7b9\n\nak4f0xoun9bv5cdi\n\n4us29365j9f9lik9\n\n3sh6psvknz3hm2t9\n\n8z2ywhrg9wl8fr\n\nz7s9at3qyq2xogvi\n\nb\n\n<div>b</div>\n<div>h2r4miiukcoez5mi</div>"); add("selser", "Empty lines between lines with block tags [0,4,2,2,0,4,0,[2],0,[4],3,0,0,0,0,2,3,0]", "<div></div>g72sdt9b9996bt9\n\nau765cdgwcsg7gb9\n\n\n\nqhrfek8hsgkz9f6r\n<div></div>rw83ofcbk55vzpvi\n\nvky2au8c7nmnp14ib\n<div>eu25mfmjxi0sh5mi</div>\n\n<div>b</div>d\n\nvxxu49npeng66r\n\n<div>e</div>"); add("selser", "<nowiki> inside <pre> (bug 13238) [4,2,2,0,0]", "ptjux9vq9zarlik9\n\ny7rebotz1jo1dcxr\nnb7mtl9jj0jm7vi<pre>\n<nowiki></nowiki>\n</pre>\n<pre><nowiki><nowiki></nowiki>Foo<nowiki></nowiki></nowiki></pre>"); @@ -2598,7 +2595,7 @@ add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body [[2,0,0],2,0]", "3i2udpyjqgehr529A <ref> <b> foo </ref> B C\n\n04ufs7jtgivsra4i\n\n<references />"); add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body [0,4,0]", "A <ref> <b> foo </ref> B C\n\nexksatqzwpgsc3di<references />"); add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body [[0,0,4],3,0]", "A <ref> <b> foo </ref>3ds74snl9wly2e29<references />"); -add("selser", "Ref: 12. ref-tags act as trailing newline migration barrier [4,[2],3,2,4,[4],3,0]", "m5deo7xhmiaoflxr\n\nqhnq99nstl8oajora\n\nf16ctvb87p919k9\n\nb<!--the newline at the end of this line stays inside the p-tag--> <ref />\n<ref />\n\ni5sgdy5g0bb7qfr\n\nlppjadx2v8q41jor\n<references />"); +add("selser", "Ref: 12. ref-tags act as trailing newline migration barrier [4,[2],3,2,4,[4],3,0]", "m5deo7xhmiaoflxr\n\nqhnq99nstl8oajora\n\nf16ctvb87p919k9\n\nb<!--the newline at the end of this line stays inside the p tag--> <ref />\n<ref />\n\ni5sgdy5g0bb7qfr\n\nlppjadx2v8q41jor\n<references />"); add("selser", "Ref: 19. ref-tags with identical name encodings should get identical indexes [[2,0,4,0],0,0]", "0dpatm9gsbqk6gvi1 <ref name=\"a & b\">foo</ref>k0yidhh5u2b21emi<ref name=\"a & b\" />\n\n<references />"); add("selser", "Ref: 19. ref-tags with identical name encodings should get identical indexes [2,0,0]", "zmmmjlha7xim5cdi\n\n1 <ref name=\"a & b\">foo</ref> 2 <ref name=\"a & b\" />\n\n<references />"); add("selser", "Ref: 19. ref-tags with identical name encodings should get identical indexes [1,0,0]", "1 <ref name=\"a & b\">foo</ref> 2 <ref name=\"a & b\" />\n\n<references />"); diff --git a/tests/parserTests.txt b/tests/parserTests.txt index b4ed614..fd6a3c4 100644 --- a/tests/parserTests.txt +++ b/tests/parserTests.txt @@ -1585,34 +1585,28 @@ !! test Comment semantics and delimiters -!! options -parsoid=wt2html,html2html !! wikitext <!-- --><!----><!-----><!------> !! html/php !! html/parsoid -<!-- --><!----><!-- --><!-- --> +<!-- --><!----><!-----><!------> !! end !! test Comment semantics and delimiters, redux -!! options -parsoid=wt2html,html2html !! wikitext <!-- In SGML every "foo" here would actually show up in the text -- foo -- bar -- foo -- funky huh? ... --> !! html/php !! html/parsoid -<!-- In SGML every "foo" here would actually show up in the text foo bar - foo funky huh? ... --> +<!-- In SGML every "foo" here would actually show up in the text -- foo -- bar +-- foo -- funky huh? ... --> !! end !! test Comment semantics and delimiters: directors cut -!! options -parsoid=wt2html,html2html !! wikitext <!-- ... However we like to keep things simple and somewhat XML-ish so we eat everything starting with < followed by !-- until the first -- and > we see, @@ -1622,53 +1616,64 @@ <p>--> </p> !! html/parsoid -<!-- ... However we like to keep things simple and somewhat XML-ish so we eat -everything starting with < followed by ! until the first and > we see, -that wouldn't be valid XML however, since in XML has to terminate a comment +<!-- ... However we like to keep things simple and somewhat XML-ish so we eat +everything starting with < followed by !-- until the first -- and > we see, +that wouldn't be valid XML however, since in XML -- has to terminate a comment --><p>--></p> !! end !! test Comment semantics: nesting -!! options -parsoid=wt2html,html2html !! wikitext <!--<!-- no, we're not going to do anything fancy here -->--> !! html/php <p>--> </p> !! html/parsoid -<!--<! no, we're not going to do anything fancy here --><p>--></p> +<!--<!-- no, we're not going to do anything fancy here --><p>--></p> !! end +# Parsoid closes the unclosed comment, even if it means a slight +# round-trip diff. !! test Comment semantics: unclosed comment at end +!! options +parsoid=wt2html,html2html !! wikitext <!--This comment will run out to the end of the document -!! html +!! html/php +!! html/parsoid +<!--This comment will run out to the end of the document--> !! end !! test -Normalize comments to play nice with XML and browsers -!! options -parsoid=wt2html,html2html +Comment semantics: normalize comments to play nice with XML and browsers !! wikitext <!-- Browsers --!> think this is closed --> <!--> This would normally be text --> <!---> As would this --> <!-- XML doesn't like trailing dashes --------> <!-- Nor doubled hyphens -- anywhere in the data --> +But this is not a comment. !! html/php -<p><br /> +<p>But this is not a comment. </p> !! html/parsoid -<!-- Browsers !> think this is closed --> -<!-- This would normally be text --> -<!-- As would this --> -<!-- XML doesn't like trailing dashes --> -<!-- Nor doubled hyphens anywhere in the data --> +<!-- Browsers --!> think this is closed --> +<!--> This would normally be text --> +<!---> As would this --> +<!-- XML doesn't like trailing dashes --------> +<!-- Nor doubled hyphens -- anywhere in the data --> +<p>But this is not a comment.</p> +!! end +!! test +Comment semantics: round-trip even text which contains encoded --> +!! wikitext +<!-- hello & goodbye - > --> --&gt; --&xx --> +!! html/parsoid +<!-- hello & goodbye - > --> --&gt; --&xx --> !! end !! test @@ -3166,9 +3171,9 @@ !! wikitext [[Category:foo]] <!-- No pre-wrapping --> {{echo| [[Category:foo]]}} <!-- No pre-wrapping --> -!! html - <link rel="mw:PageProp/Category" href="./Category:Foo"> <!-- No pre-wrapping --> -<span about="#mwt1" typeof="mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":" [[Category:foo]]"}},"i":0}}]}'> </span><link rel="mw:PageProp/Category" href="./Category:Foo" about="#mwt1"> <!-- No pre-wrapping --> +!! html/parsoid + <link rel="mw:PageProp/Category" href="./Category:Foo"> <!-- No pre-wrapping --> +<span about="#mwt1" typeof="mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":" [[Category:foo]]"}},"i":0}}]}'> </span><link rel="mw:PageProp/Category" href="./Category:Foo" about="#mwt1"> <!-- No pre-wrapping --> !! end !! test @@ -20400,21 +20405,19 @@ !!test Ref: 12. ref-tags act as trailing newline migration barrier -!!options -parsoid !! wikitext -<!--the newline at the end of this line moves out of the p-tag-->a +<!--the newline at the end of this line moves out of the p tag-->a -b<!--the newline at the end of this line stays inside the p-tag--> <ref /> +b<!--the newline at the end of this line stays inside the p tag--> <ref /> <ref /> c <references /> -!! html -<!--the newline at the end of this line moves out of the p-tag--><p>a</p> +!! html/parsoid +<!--the newline at the end of this line moves out of the p tag--><p>a</p> -<p>b<!--the newline at the end of this line stays inside the p-tag--> <span about="#mwt2" class="reference" id="cite_ref-1" rel="dc:references" typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a href="#cite_note-1">[1]</a></span> +<p>b<!--the newline at the end of this line stays inside the p tag--> <span about="#mwt2" class="reference" id="cite_ref-1" rel="dc:references" typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a href="#cite_note-1">[1]</a></span> <span about="#mwt4" class="reference" id="cite_ref-2" rel="dc:references" typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a href="#cite_note-2">[2]</a></span></p> <p>c</p> diff --git a/tests/roundtrip-test.js b/tests/roundtrip-test.js index 5e8ded9..32d00a8 100755 --- a/tests/roundtrip-test.js +++ b/tests/roundtrip-test.js @@ -208,7 +208,9 @@ var diff = currentOffset - targetRange.start; while ( precedingNodes.length > 0 && diff > 0 ) { var n = precedingNodes.pop(); - var len = n.nodeValue.length + (n.nodeType === c.COMMENT_NODE ? 7 : 0); + var len = DU.isComment(n) ? + DU.decodedCommentLength(n) : + n.nodeValue.length; if ( len > diff ) { break; } @@ -248,10 +250,10 @@ precedingNodes = []; } else if ( c.nodeType === c.TEXT_NODE || c.nodeType === c.COMMENT_NODE ) { if ( currentOffset && ( currentOffset < targetRange.end ) ) { - currentOffset += c.nodeValue.length; - if ( c.nodeType === c.COMMENT_NODE ) { - // Add the length of the '<!--' and '--> bits - currentOffset += 7; + if (DU.isComment(c)) { + currentOffset += DU.decodedCommentLength(c); + } else { + currentOffset += c.nodeValue.length; } if ( currentOffset >= targetRange.end ) { waitingForEndMatch = false; -- To view, visit https://gerrit.wikimedia.org/r/202058 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8 Gerrit-PatchSet: 13 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Catrope <roan.katt...@gmail.com> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: Esanders <esand...@wikimedia.org> Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com> Gerrit-Reviewer: Santhosh <santhosh.thottin...@gmail.com> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits