jenkins-bot has submitted this change and it was merged.

Change subject: T95039: encode/decode arbitrary data in comments.
......................................................................


T95039: encode/decode arbitrary data in comments.

Use HTML5 entity encoding to ensure that arbitrary data can be represented
in comments without triggering premature end of comment.

Do a little bit of entity encoding on the wikitext side as well to allow the
user to author the string "-->" inside a comment.  They probably won't want
to (!) but we want to avoid exposing arbitrary content restrictions to
DOM consumers (like Visual Editor).

If you were wondering, "-->" is represented as "-->" in wikitext.

We also normalize HTML entities during DOM diff, which ensures that
we don't dirty diff even if VE does the entity encoding inside comments
differently than we do.

Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8
---
M lib/XMLSerializer.js
M lib/dom.computeDSR.js
M lib/dom.migrateTrailingNLs.js
M lib/dom.wrapTemplates.js
M lib/ext.core.PreHandler.js
M lib/mediawiki.DOMDiff.js
M lib/mediawiki.DOMUtils.js
M lib/mediawiki.tokenizer.peg.js
M lib/mediawiki.tokenizer.utils.js
M lib/pegTokenizer.pegjs.txt
M lib/wts.separators.js
M lib/wts.utils.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
M tests/roundtrip-test.js
15 files changed, 166 insertions(+), 97 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/XMLSerializer.js b/lib/XMLSerializer.js
index f8ef3b9..8bcd85e 100644
--- a/lib/XMLSerializer.js
+++ b/lib/XMLSerializer.js
@@ -150,7 +150,13 @@
        case TEXT_NODE:
                return cb(node.data.replace(/[<&]/g, _xmlEncoder));
        case COMMENT_NODE:
-               return cb( "<!--" + node.data.replace(/-->/g, '--&gt;') + 
"-->");
+               // According to
+               // http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml
+               // we could throw an exception here if node.data would not 
create
+               // a "well-formed" XML comment.  But we use entity encoding when
+               // we create the comment node to ensure that node.data will 
always
+               // be okay; see DOMUtils.encodeComment().
+               return cb( "<!--" + node.data + "-->");
        default:
                cb('??' + node.nodeName);
        }
diff --git a/lib/dom.computeDSR.js b/lib/dom.computeDSR.js
index e638a16..2ad6b84 100644
--- a/lib/dom.computeDSR.js
+++ b/lib/dom.computeDSR.js
@@ -246,7 +246,8 @@
                        }
                } else if (cType === node.COMMENT_NODE) {
                        if (ce !== null) {
-                               cs = ce - child.data.length - 7; // 7 chars for 
"<!--" and "-->"
+                               // decode html entities & reencode as wikitext 
to find length
+                               cs = ce - DU.decodedCommentLength(child);
                        }
                } else if (cType === node.ELEMENT_NODE) {
                        var cTypeOf = child.getAttribute("typeof"),
@@ -482,7 +483,7 @@
                                        if (nType === node.TEXT_NODE) {
                                                newCE = newCE + 
sibling.data.length + DU.indentPreDSRCorrection(sibling);
                                        } else if (nType === node.COMMENT_NODE) 
{
-                                               newCE = newCE + 
sibling.data.length + 7;
+                                               newCE = newCE + 
DU.decodedCommentLength(sibling);
                                        } else if (nType === node.ELEMENT_NODE) 
{
                                                var siblingDP = 
DU.getDataParsoid( sibling );
                                                if (siblingDP.dsr && 
siblingDP.tsr && siblingDP.dsr[0] <= newCE && e !== null) {
diff --git a/lib/dom.migrateTrailingNLs.js b/lib/dom.migrateTrailingNLs.js
index dde4394..caed4ea 100644
--- a/lib/dom.migrateTrailingNLs.js
+++ b/lib/dom.migrateTrailingNLs.js
@@ -80,7 +80,7 @@
                        if (DU.isComment(n)) {
                                firstEltToMigrate = n;
                                // <!--comment-->
-                               tsrCorrection += (7 + n.data.length);
+                               tsrCorrection += DU.decodedCommentLength(n);
                        } else {
                                if (n.data.match(/^\s*\n\s*$/)) {
                                        foundNL = true;
diff --git a/lib/dom.wrapTemplates.js b/lib/dom.wrapTemplates.js
index 86c31c3..ba71475 100644
--- a/lib/dom.wrapTemplates.js
+++ b/lib/dom.wrapTemplates.js
@@ -100,7 +100,8 @@
                        if (DU.isText(n)) {
                                offset += n.data.length;
                        } else {
-                               offset += n.data.length + 7;
+                               // A comment
+                               offset += DU.decodedCommentLength(n);
                        }
                        n = n.previousSibling;
                }
@@ -111,7 +112,8 @@
                }
 
                if (dsr && typeof (dsr[1]) === 'number') {
-                       var len = DU.isText(endNode) ? endNode.data.length : 
endNode.data.length + 7;
+                       var len = DU.isText(endNode) ? endNode.data.length :
+                               DU.decodedCommentLength(endNode);
                        dsr = [dsr[1] + offset, dsr[1] + offset + len];
                }
 
diff --git a/lib/ext.core.PreHandler.js b/lib/ext.core.PreHandler.js
index db3895b..8af8dfb 100644
--- a/lib/ext.core.PreHandler.js
+++ b/lib/ext.core.PreHandler.js
@@ -64,8 +64,9 @@
 
  * --------------------------------------------------------------------------*/
 
-var Util = require('./mediawiki.Util.js').Util,
-    defines = require('./mediawiki.parser.defines.js');
+var DU = require('./mediawiki.DOMUtils.js').DOMUtils,
+       Util = require('./mediawiki.Util.js').Util,
+       defines = require('./mediawiki.parser.defines.js');
 
 // define some constructor shortcuts
 var CommentTk = defines.CommentTk,
@@ -310,7 +311,9 @@
 function getUpdatedPreTSR(tsr, token) {
        var tc = token.constructor;
        if (tc === CommentTk) {
-               tsr = token.dataAttribs.tsr ? token.dataAttribs.tsr[1] : (tsr 
=== -1 ? -1 : token.value.length + 7 + tsr);
+               // comment length has 7 added for "<!--" and "-->" deliminters
+               // (see DU.decodedCommentLength() -- but that takes a node not 
a token)
+               tsr = token.dataAttribs.tsr ? token.dataAttribs.tsr[1] : (tsr 
=== -1 ? -1 : DU.decodeComment(token.value).length + 7 + tsr);
        } else if (tc === SelfclosingTagTk) {
                // meta-tag (cannot compute)
                tsr = -1;
diff --git a/lib/mediawiki.DOMDiff.js b/lib/mediawiki.DOMDiff.js
index 97ac597..1bd2dfc 100644
--- a/lib/mediawiki.DOMDiff.js
+++ b/lib/mediawiki.DOMDiff.js
@@ -122,13 +122,15 @@
 DDP.treeEquals = function (nodeA, nodeB, deep) {
        if ( nodeA.nodeType !== nodeB.nodeType ) {
                return false;
-       } else if (DU.isText(nodeA) || DU.isComment(nodeA)) {
+       } else if (DU.isText(nodeA)) {
                // In the past we've had bugs where we let non-primitive strings
                // leak into our DOM.  Safety first:
                console.assert(nodeA.nodeValue === nodeA.nodeValue.valueOf());
                console.assert(nodeB.nodeValue === nodeB.nodeValue.valueOf());
                // ok, now do the comparison.
                return nodeA.nodeValue === nodeB.nodeValue;
+       } else if (DU.isComment(nodeA)) {
+               return DU.decodeComment(nodeA.data) === 
DU.decodeComment(nodeB.data);
        } else if (DU.isElt(nodeA)) {
                // Compare node name and attribute length
                if (nodeA.nodeName !== nodeB.nodeName ||
diff --git a/lib/mediawiki.DOMUtils.js b/lib/mediawiki.DOMUtils.js
index dc5fb61..a3b3788 100644
--- a/lib/mediawiki.DOMUtils.js
+++ b/lib/mediawiki.DOMUtils.js
@@ -2067,6 +2067,93 @@
                        dp.sa[ name ] = origVal;
                }
                dp.a[ name ] = val;
+       },
+
+       // Comment encoding/decoding.
+       //
+       //  * Some relevant phab tickets: T94055, T70146, T60184, T95039
+       //
+       // The wikitext comment production is very simple: <!-- starts a 
comment,
+       // and --> ends a comment.  This means we can have almost anything as 
the
+       // contents of a comment (except the string "-->", but see below), 
including
+       // several things that are not valid in HTML5 comments:
+       //
+       //  * For one, the html5 comment parsing algorithm [0] leniently accepts
+       //    --!> as a closing comment tag, which differs from the php+tidy 
combo.
+       //
+       //  * If the comment's data matches /^-?>/, html5 will end the comment.
+       //    For example, <!-->stuff<--> breaks up as
+       //    <!--> (the comment) followed by, stuff<--> (as text).
+       //
+       //  * Finally, comment data shouldn't contain two consecutive 
hyphen-minus
+       //    characters (--), nor end in a hyphen-minus character (/-$/) as 
defined
+       //    in the spec [1].
+       //
+       // We work around all these problems by using HTML entity encoding 
inside
+       // the comment body.  The characters -, >, and & must be encoded in 
order
+       // to prevent premature termination of the comment by one of the cases
+       // above.  Encoding other characters is optional; all entities will be
+       // decoded during wikitext serialization.
+       //
+       // In order to allow *arbitrary* content inside a wikitext comment,
+       // including the forbidden string "-->" we also do some minimal entity
+       // decoding on the wikitext.  We are also limited by our inability
+       // to encode DSR attributes on the comment node, so our wikitext entity
+       // decoding must be 1-to-1: that is, there must be a unique "decoded"
+       // string for every wikitext sequence, and for every decoded string 
there
+       // must be a unique wikitext which creates it.
+       //
+       // The basic idea here is to replace every string ab*c with the string 
with
+       // one more b in it.  This creates a string with no instance of "ac",
+       // so you can use 'ac' to encode one more code point.  In this case
+       // a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to
+       // encode "-->" (which is otherwise unspeakable in wikitext).
+       //
+       // Note that any user content which does not match the regular
+       // expression /--(>|&(amp;)*gt;)/ is unchanged in its wikitext
+       // representation, as shown in the first two examples below.
+       //
+       // User-authored comment text    Wikitext       HTML5 DOM
+       // --------------------------    -------------  ----------------------
+       // & - >                         & - >          &amp; &#43; &gt;
+       // Use &gt; here                 Use &gt; here  Use &amp;gt; here
+       // -->                           --&gt;         &#43;&#43;&gt;
+       // --&gt;                        --&amp;gt;     &#43;&#43;&amp;gt;
+       // --&amp;gt;                    --&amp;amp;gt; &#43;&#43;&amp;amp;gt;
+       //
+       // [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state
+       // [1] http://www.w3.org/TR/html5/syntax.html#comments
+
+       // Map a wikitext-escaped comment to an HTML DOM-escaped comment.
+       encodeComment: function(comment) {
+               // Undo wikitext escaping to obtain "true value" of comment.
+               var trueValue = comment
+                       .replace(/--&(amp;)*gt;/g, Util.decodeEntities);
+               // Now encode '-', '>' and '&' in the "true value" as HTML 
entities,
+               // so that they can be safely embedded in an HTML comment.
+               // This part doesn't have to map strings 1-to-1.
+               return trueValue
+                       .replace(/[->&]/g, Util.entityEncodeAll);
+       },
+
+       // Map an HTML DOM-escaped comment to a wikitext-escaped comment.
+       decodeComment: function(comment) {
+               // Undo HTML entity escaping to obtain "true value" of comment.
+               var trueValue = Util.decodeEntities(comment);
+               // ok, now encode this "true value" of the comment in such a way
+               // that the string "-->" never shows up.  (See above.)
+               return trueValue
+                       .replace(/--(&(amp;)*gt;|>)/g, function(s) {
+                               return s==='-->' ? '--&gt;' : 
'--&amp;'+s.slice(3);
+                       });
+       },
+
+       // Utility function: we often need to know the wikitext DSR length for
+       // an HTML DOM comment value.
+       decodedCommentLength: function(node) {
+               console.assert(DU.isComment(node));
+               // Add 7 for the "<!--" and "-->" delimiters in wikitext.
+               return DU.decodeComment(node.data).length + 7;
        }
 
 };
diff --git a/lib/mediawiki.tokenizer.peg.js b/lib/mediawiki.tokenizer.peg.js
index 5971bee..c7063a2 100644
--- a/lib/mediawiki.tokenizer.peg.js
+++ b/lib/mediawiki.tokenizer.peg.js
@@ -26,6 +26,7 @@
  * in the head of pegTokenizer.pegjs.txt.
  */
 var pegIncludes = {
+               DOMUtils: require('./mediawiki.DOMUtils.js').DOMUtils,
                Util: require('./mediawiki.Util.js').Util,
                defines: require('./mediawiki.parser.defines.js'),
                tu: require('./mediawiki.tokenizer.utils.js'),
diff --git a/lib/mediawiki.tokenizer.utils.js b/lib/mediawiki.tokenizer.utils.js
index 82a42f5..6620811 100644
--- a/lib/mediawiki.tokenizer.utils.js
+++ b/lib/mediawiki.tokenizer.utils.js
@@ -12,11 +12,6 @@
        EndTagTk = defines.EndTagTk,
        CommentTk = defines.CommentTk;
 
-
-var replaceMatchWithSpaces = function( match ) {
-       return " ".repeat( match.length );
-};
-
 var tu = {
 
        flattenIfArray: function(e) {
@@ -296,37 +291,6 @@
                } else {
                        return null;
                }
-       },
-
-       // Comment normalizing
-       //
-       //  * Some relevant phab tickets: T94055, T70146, T60184.
-       //
-       //  * For one, the html5 comment parsing algorithm [0] leniently accepts
-       //    --!> as a closing comment tag, which differs from the php+tidy 
combo.
-       //    We suppress that behaviour at the cost of introducing a dirty 
diff,
-       //    but selser should take care of it.
-       //
-       //  * If the comment's data matches /^-?>/, html5 will end the comment,
-       //    introducing roundtrip diffs. For example, <!-->stuff<--> breaks 
up as
-       //    <!--> (the comment) followed by, stuff<--> (as text).
-       //
-       //  * Finally, comment data shouldn't contain two consecutive 
hyphen-minus
-       //    characters (--), nor end in a hyphen-minus character (/-$/) as 
defined
-       //    in the spec [1]. Normalize to this syntax in order to emit valid 
XML.
-       //
-       // [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state
-       // [1] http://www.w3.org/TR/html5/syntax.html#comments
-
-       normalizeComment: function( comment ) {
-               return comment
-                       // suppress html5 comment start (dash)? ending
-                       .replace(/^(-?>)/, replaceMatchWithSpaces)
-                       // normalize double hyphens, with the bonus of
-                       // suppressing the html5 end bang state (--!>)
-                       .replace(/(-{2,})/g, replaceMatchWithSpaces)
-                       // normalize away extra ending hyphens
-                       .replace(/(-+)$/, replaceMatchWithSpaces);
        }
 
 };
diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index 030b0d9..42790cb 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -7,6 +7,7 @@
 {
 
     var pegIncludes = options.pegIncludes,
+        DU = pegIncludes.DOMUtils,
         Util = pegIncludes.Util,
         PegTokenizer = pegIncludes.PegTokenizer,
         defines = pegIncludes.defines,
@@ -379,13 +380,13 @@
 // but, as always, things around here are a little more complicated.
 //
 // We accept the same comments, but because we emit them as HTML comments
-// instead of deleting them, we have to sanitize the contents of the
-// comments. We currently do this in a way which loses information. See
-// the normalizeComment helper for further details.
+// instead of deleting them, we have to encode the data to ensure that
+// we always emit a valid HTML5 comment.  See the encodeComment helper
+// for further details.
 
 comment
     = '<!--' c:( !"-->" c:. { return c; } )* ('-->' / eof) {
-        var data = tu.normalizeComment( c.join('') );
+        var data = DU.encodeComment( c.join('') );
         return [new CommentTk( data, { tsr: [peg$reportedPos, peg$currPos] } 
)];
     }
 
diff --git a/lib/wts.separators.js b/lib/wts.separators.js
index 05626b6..72a4985 100644
--- a/lib/wts.separators.js
+++ b/lib/wts.separators.js
@@ -573,7 +573,7 @@
                                        correction;
                                if (typeof (endDsr) === 'number') {
                                        if (DU.isComment(prevNode)) {
-                                               correction = 
prevNode.nodeValue.length + 7;
+                                               correction = 
DU.decodedCommentLength(prevNode);
                                        } else {
                                                correction = 
prevNode.nodeValue.length;
                                        }
diff --git a/lib/wts.utils.js b/lib/wts.utils.js
index 8ab8274..b845e52 100644
--- a/lib/wts.utils.js
+++ b/lib/wts.utils.js
@@ -14,7 +14,7 @@
        },
 
        commentWT: function (comment) {
-               return '<!--' + comment.replace(/-->/, '--&gt;') + '-->';
+               return '<!--' + DU.decodeComment(comment) + '-->';
        },
 
        /**
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 974a4c7..68fa053 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -219,7 +219,7 @@
 add("wt2html", "Parser hook: empty input using terminated empty elements (bug 
2374)", "<p data-parsoid='{\"dsr\":[0,18,0,0]}'>&lt;tag foo=bar/>text</p>");
 add("wt2html", "Parser hook: basic arguments using terminated empty elements 
(bug 2374)", "<p data-parsoid='{\"dsr\":[0,70,0,0]}'>&lt;tag width=200 height = 
\"100\" depth = '50' square/>\nother stuff\n&lt;/tag></p>");
 add("wt2html", "Parser hook: static parser hook not inside a comment", "<p 
data-parsoid='{\"dsr\":[0,61,0,0]}'>&lt;statictag>hello, 
world&lt;/statictag>\n&lt;statictag action=flush/></p>");
-add("wt2html", "Parser hook: static parser hook inside a comment", "<!-- 
<statictag>hello, world</statictag> -->\n<p 
data-parsoid='{\"dsr\":[45,70,0,0]}'>&lt;statictag action=flush/></p>");
+add("wt2html", "Parser hook: static parser hook inside a comment", "<!-- 
<statictag&#x3E;hello, world</statictag&#x3E; -->\n<p 
data-parsoid='{\"dsr\":[45,70,0,0]}'>&lt;statictag action=flush/></p>");
 add("wt2html", "Sanitizer: Closing of closed but not open tags", "");
 add("wt2html", "Sanitizer: Closing of closed but not open table tags", "Table 
not started");
 add("wt2html", "Sanitizer: Escaping of spaces, multibyte characters, colons & 
other stuff in id=\"\"", "<p data-parsoid='{\"dsr\":[0,45,0,0]}'><span id=\"æ: 
v\" data-parsoid='{\"stx\":\"html\",\"dsr\":[0,27,16,7]}'>byte</span><a 
rel=\"mw:WikiLink\" href=\"./Main%20Page#æ:_v\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Main%20Page#æ:_v\"},\"sa\":{\"href\":\"#æ:
 v\"},\"dsr\":[27,45,8,2]}'>backlink</a></p>");
@@ -420,7 +420,6 @@
 add("wt2wt", "Non-word characters don't terminate tag names (bug 17663, 40670, 
52022)", "<blockquote|>a\n\n<b→> doesn't terminate </b→>\n\n<bä> doesn't 
terminate </bä>\n\n<boo> doesn't terminate </boo>\n\n<s.foo> doesn't terminate 
</s.foo>\n\n<sub-ID#1>\n");
 add("wt2wt", "Non-word characters don't terminate tag names + tidy", 
"<blockquote|>a\n\n<b→> doesn't terminate </b→>\n\n<bä> doesn't terminate 
</bä>\n\n<boo> doesn't terminate </boo>\n\n<s.foo> doesn't terminate 
</s.foo>\n\n<sub-ID#1>\n");
 add("wt2wt", "Isolated close tags should be treated as literal text (bug 
52760)", "\n<s.foo>s\n");
-add("wt2wt", "Comment semantics: unclosed comment at end", "<!--This comment 
will run out to the end of the document-->");
 add("wt2wt", "<nowiki> inside <pre> (bug 13238)", 
"<pre>\n<nowiki>\n</pre>\n<pre>\n<nowiki></nowiki>\n</pre>\n<pre><nowiki>&lt;nowiki&gt;</nowiki>Foo<nowiki></nowiki></nowiki></pre>");
 add("wt2wt", "<nowiki> and <pre> preference (first one wins)", 
"<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n\n<nowiki>\n<pre>\n&lt;nowiki&gt;\n</pre>\n</nowiki>\n</pre>\n");
 add("wt2wt", "Templates: Indent-Pre: 1f: Wrapping should be based on expanded 
content", "{{echo| }}a\n\n{{echo|\n }}a\n\n{{echo|\n b}}\n\n{{echo|a\n 
}}b\n\n{{echo|a\n}}\n b\n");
@@ -921,7 +920,6 @@
 add("html2wt", "Comment test 4", "asdfjkl\n");
 add("html2wt", "Comment spacing", "a\n\n  b \n\nc\n");
 add("html2wt", "Comment whitespace", "");
-add("html2wt", "Comment semantics: unclosed comment at end", "");
 add("html2wt", "Comment in template title", "FOO\n");
 add("html2wt", "Comment on its own line post-expand", "a\n\nb\n");
 add("html2wt", "Comment on its own line post-expand with non-significant 
whitespace", "a\n\nb\n");
@@ -1773,7 +1771,6 @@
 add("selser", "Isolated close tags should be treated as literal text (bug 
52760) [3,2]", "i8usmsqui9kke29\n\n<s.foo>s</s>");
 add("selser", "Isolated close tags should be treated as literal text (bug 
52760) [2,0]", "zskmdh55uiuxflxr\n\n<s.foo>s</s>");
 add("selser", "Isolated close tags should be treated as literal text (bug 
52760) [3,0]", "<s.foo>s</s>");
-add("selser", "Comment semantics: unclosed comment at end 5", "<!--This 
comment will run out to the end of the document-->");
 add("selser", "Empty lines between lines with block tags 
[0,0,0,0,4,[4],0,4,2,4,2,0,0,3,3,0,0,[4]]", 
"<div></div>\n\n\n0jr2knw69qode7b9\n\nak4f0xoun9bv5cdi\n\n4us29365j9f9lik9\n\n3sh6psvknz3hm2t9\n\n8z2ywhrg9wl8fr\n\nz7s9at3qyq2xogvi\n\nb\n\n<div>b</div>\n<div>h2r4miiukcoez5mi</div>");
 add("selser", "Empty lines between lines with block tags 
[0,4,2,2,0,4,0,[2],0,[4],3,0,0,0,0,2,3,0]", 
"<div></div>g72sdt9b9996bt9\n\nau765cdgwcsg7gb9\n\n\n\nqhrfek8hsgkz9f6r\n<div></div>rw83ofcbk55vzpvi\n\nvky2au8c7nmnp14ib\n<div>eu25mfmjxi0sh5mi</div>\n\n<div>b</div>d\n\nvxxu49npeng66r\n\n<div>e</div>");
 add("selser", "<nowiki> inside <pre> (bug 13238) [4,2,2,0,0]", 
"ptjux9vq9zarlik9\n\ny7rebotz1jo1dcxr\nnb7mtl9jj0jm7vi<pre>\n<nowiki></nowiki>\n</pre>\n<pre><nowiki><nowiki></nowiki>Foo<nowiki></nowiki></nowiki></pre>");
@@ -2598,7 +2595,7 @@
 add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body 
[[2,0,0],2,0]", "3i2udpyjqgehr529A <ref> <b> foo </ref> B 
C\n\n04ufs7jtgivsra4i\n\n<references />");
 add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body 
[0,4,0]", "A <ref> <b> foo </ref> B C\n\nexksatqzwpgsc3di<references />");
 add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body 
[[0,0,4],3,0]", "A <ref> <b> foo </ref>3ds74snl9wly2e29<references />");
-add("selser", "Ref: 12. ref-tags act as trailing newline migration barrier 
[4,[2],3,2,4,[4],3,0]", 
"m5deo7xhmiaoflxr\n\nqhnq99nstl8oajora\n\nf16ctvb87p919k9\n\nb<!--the newline 
at the end of this line stays inside the p-tag--> <ref />\n<ref 
/>\n\ni5sgdy5g0bb7qfr\n\nlppjadx2v8q41jor\n<references />");
+add("selser", "Ref: 12. ref-tags act as trailing newline migration barrier 
[4,[2],3,2,4,[4],3,0]", 
"m5deo7xhmiaoflxr\n\nqhnq99nstl8oajora\n\nf16ctvb87p919k9\n\nb<!--the newline 
at the end of this line stays inside the p tag--> <ref />\n<ref 
/>\n\ni5sgdy5g0bb7qfr\n\nlppjadx2v8q41jor\n<references />");
 add("selser", "Ref: 19. ref-tags with identical name encodings should get 
identical indexes [[2,0,4,0],0,0]", "0dpatm9gsbqk6gvi1 <ref name=\"a & 
b\">foo</ref>k0yidhh5u2b21emi<ref name=\"a &amp; b\" />\n\n<references />");
 add("selser", "Ref: 19. ref-tags with identical name encodings should get 
identical indexes [2,0,0]", "zmmmjlha7xim5cdi\n\n1 <ref name=\"a & 
b\">foo</ref> 2 <ref name=\"a &amp; b\" />\n\n<references />");
 add("selser", "Ref: 19. ref-tags with identical name encodings should get 
identical indexes [1,0,0]", "1 <ref name=\"a & b\">foo</ref> 2 <ref name=\"a 
&amp; b\" />\n\n<references />");
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index b4ed614..fd6a3c4 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -1585,34 +1585,28 @@
 
 !! test
 Comment semantics and delimiters
-!! options
-parsoid=wt2html,html2html
 !! wikitext
 <!-- --><!----><!-----><!------>
 !! html/php
 
 !! html/parsoid
-<!-- --><!----><!-- --><!--  -->
+<!-- --><!----><!--&#x2D;--><!--&#x2D;&#x2D;-->
 !! end
 
 !! test
 Comment semantics and delimiters, redux
-!! options
-parsoid=wt2html,html2html
 !! wikitext
 <!-- In SGML every "foo" here would actually show up in the text -- foo -- bar
 -- foo -- funky huh? ... -->
 !! html/php
 
 !! html/parsoid
-<!-- In SGML every "foo" here would actually show up in the text    foo    bar
-   foo    funky huh? ... -->
+<!-- In SGML every "foo" here would actually show up in the text &#x2D;&#x2D; 
foo &#x2D;&#x2D; bar
+&#x2D;&#x2D; foo &#x2D;&#x2D; funky huh? ... -->
 !! end
 
 !! test
 Comment semantics and delimiters: directors cut
-!! options
-parsoid=wt2html,html2html
 !! wikitext
 <!-- ... However we like to keep things simple and somewhat XML-ish so we eat
 everything starting with < followed by !-- until the first -- and > we see,
@@ -1622,53 +1616,64 @@
 <p>--&gt;
 </p>
 !! html/parsoid
-<!-- ... However we like to keep things simple and somewhat XML-ish so we eat
-everything starting with < followed by !   until the first    and > we see,
-that wouldn't be valid XML however, since in XML    has to terminate a comment
+<!-- ... However we like to keep things simple and somewhat XML&#x2D;ish so we 
eat
+everything starting with < followed by !&#x2D;&#x2D; until the first 
&#x2D;&#x2D; and &#x3E; we see,
+that wouldn't be valid XML however, since in XML &#x2D;&#x2D; has to terminate 
a comment
 --><p>--></p>
 !! end
 
 !! test
 Comment semantics: nesting
-!! options
-parsoid=wt2html,html2html
 !! wikitext
 <!--<!-- no, we're not going to do anything fancy here -->-->
 !! html/php
 <p>--&gt;
 </p>
 !! html/parsoid
-<!--<!   no, we're not going to do anything fancy here --><p>--></p>
+<!--<!&#x2D;&#x2D; no, we're not going to do anything fancy here --><p>--></p>
 !! end
 
+# Parsoid closes the unclosed comment, even if it means a slight
+# round-trip diff.
 !! test
 Comment semantics: unclosed comment at end
+!! options
+parsoid=wt2html,html2html
 !! wikitext
 <!--This comment will run out to the end of the document
-!! html
+!! html/php
 
+!! html/parsoid
+<!--This comment will run out to the end of the document-->
 !! end
 
 !! test
-Normalize comments to play nice with XML and browsers
-!! options
-parsoid=wt2html,html2html
+Comment semantics: normalize comments to play nice with XML and browsers
 !! wikitext
 <!-- Browsers --!> think this is closed -->
 <!--> This would normally be text -->
 <!---> As would this -->
 <!-- XML doesn't like trailing dashes -------->
 <!-- Nor doubled hyphens -- anywhere in the data -->
+But this is not a comment.
 !! html/php
-<p><br />
+<p>But this is not a comment.
 </p>
 !! html/parsoid
-<!-- Browsers   !> think this is closed -->
-<!--  This would normally be text -->
-<!--   As would this -->
-<!-- XML doesn't like trailing dashes       -->
-<!-- Nor doubled hyphens    anywhere in the data -->
+<!-- Browsers &#x2D;&#x2D;!&#x3E; think this is closed -->
+<!--&#x3E; This would normally be text -->
+<!--&#x2D;&#x3E; As would this -->
+<!-- XML doesn't like trailing dashes &#x2D;&#x2D;&#x2D;&#x2D;&#x2D;&#x2D;-->
+<!-- Nor doubled hyphens &#x2D;&#x2D; anywhere in the data -->
+<p>But this is not a comment.</p>
+!! end
 
+!! test
+Comment semantics: round-trip even text which contains encoded -->
+!! wikitext
+<!-- hello & goodbye - > --&gt; --&amp;gt; --&xx -->
+!! html/parsoid
+<!-- hello &#x26; goodbye &#x2D; &#x3E; &#x2D;&#x2D;&#x3E; 
&#x2D;&#x2D;&#x26;gt; &#x2D;&#x2D;&#x26;xx -->
 !! end
 
 !! test
@@ -3166,9 +3171,9 @@
 !! wikitext
  [[Category:foo]] <!-- No pre-wrapping -->
 {{echo| [[Category:foo]]}} <!-- No pre-wrapping -->
-!! html
- <link rel="mw:PageProp/Category" href="./Category:Foo"> <!-- No pre-wrapping 
-->
-<span about="#mwt1" typeof="mw:Transclusion" 
data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"
 [[Category:foo]]"}},"i":0}}]}'> </span><link rel="mw:PageProp/Category" 
href="./Category:Foo" about="#mwt1"> <!-- No pre-wrapping -->
+!! html/parsoid
+ <link rel="mw:PageProp/Category" href="./Category:Foo"> <!-- No 
pre&#x2D;wrapping -->
+<span about="#mwt1" typeof="mw:Transclusion" 
data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"
 [[Category:foo]]"}},"i":0}}]}'> </span><link rel="mw:PageProp/Category" 
href="./Category:Foo" about="#mwt1"> <!-- No pre&#x2D;wrapping -->
 !! end
 
 !! test
@@ -20400,21 +20405,19 @@
 
 !!test
 Ref: 12. ref-tags act as trailing newline migration barrier
-!!options
-parsoid
 !! wikitext
-<!--the newline at the end of this line moves out of the p-tag-->a
+<!--the newline at the end of this line moves out of the p tag-->a
 
-b<!--the newline at the end of this line stays inside the p-tag--> <ref />
+b<!--the newline at the end of this line stays inside the p tag--> <ref />
 <ref />
 
 c
 <references />
-!! html
-<!--the newline at the end of this line moves out of the p-tag--><p>a</p>
+!! html/parsoid
+<!--the newline at the end of this line moves out of the p tag--><p>a</p>
 
 
-<p>b<!--the newline at the end of this line stays inside the p-tag--> <span 
about="#mwt2" class="reference" id="cite_ref-1" rel="dc:references" 
typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a 
href="#cite_note-1">[1]</a></span>
+<p>b<!--the newline at the end of this line stays inside the p tag--> <span 
about="#mwt2" class="reference" id="cite_ref-1" rel="dc:references" 
typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a 
href="#cite_note-1">[1]</a></span>
 <span about="#mwt4" class="reference" id="cite_ref-2" rel="dc:references" 
typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a 
href="#cite_note-2">[2]</a></span></p>
 
 <p>c</p>
diff --git a/tests/roundtrip-test.js b/tests/roundtrip-test.js
index 5e8ded9..32d00a8 100755
--- a/tests/roundtrip-test.js
+++ b/tests/roundtrip-test.js
@@ -208,7 +208,9 @@
                                                var diff = currentOffset - 
targetRange.start;
                                                while ( precedingNodes.length > 
0 && diff > 0 ) {
                                                        var n = 
precedingNodes.pop();
-                                                       var len = 
n.nodeValue.length + (n.nodeType === c.COMMENT_NODE ? 7 : 0);
+                                                       var len = 
DU.isComment(n) ?
+                                                               
DU.decodedCommentLength(n) :
+                                                               
n.nodeValue.length;
                                                        if ( len > diff ) {
                                                                break;
                                                        }
@@ -248,10 +250,10 @@
                                precedingNodes = [];
                        } else if ( c.nodeType === c.TEXT_NODE || c.nodeType 
=== c.COMMENT_NODE ) {
                                if ( currentOffset && ( currentOffset < 
targetRange.end ) ) {
-                                       currentOffset += c.nodeValue.length;
-                                       if ( c.nodeType === c.COMMENT_NODE ) {
-                                               // Add the length of the '<!--' 
and '--> bits
-                                               currentOffset += 7;
+                                       if (DU.isComment(c)) {
+                                               currentOffset += 
DU.decodedCommentLength(c);
+                                       } else {
+                                               currentOffset += 
c.nodeValue.length;
                                        }
                                        if ( currentOffset >= targetRange.end ) 
{
                                                waitingForEndMatch = false;

-- 
To view, visit https://gerrit.wikimedia.org/r/202058
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8
Gerrit-PatchSet: 13
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Cscott <canan...@wikimedia.org>
Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: Catrope <roan.katt...@gmail.com>
Gerrit-Reviewer: Cscott <canan...@wikimedia.org>
Gerrit-Reviewer: Esanders <esand...@wikimedia.org>
Gerrit-Reviewer: Nikerabbit <niklas.laxst...@gmail.com>
Gerrit-Reviewer: Santhosh <santhosh.thottin...@gmail.com>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to