Cscott has uploaded a new change for review. https://gerrit.wikimedia.org/r/202058
Change subject: T95039: encode/decode arbitrary data in comments. ...................................................................... T95039: encode/decode arbitrary data in comments. Use HTML5 entity encoding to ensure that arbitrary data can be represented in comments without triggering premature end of comment. Do a little bit entity encoding on the wikitext side as well to allow the user to author the string "-->" inside a comment. They probably won't want to (!) but we want to avoid exposing arbitrary content restrictions to DOM consumers (like Visual Editor). If you were wondering, "-->" is represented as "-->" in wikitext. Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8 --- M lib/dom.computeDSR.js M lib/mediawiki.tokenizer.utils.js M lib/pegTokenizer.pegjs.txt M lib/wts.utils.js M tests/parserTests-blacklist.js M tests/parserTests.txt 6 files changed, 113 insertions(+), 66 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/58/202058/1 diff --git a/lib/dom.computeDSR.js b/lib/dom.computeDSR.js index 89110a1..763fea3 100644 --- a/lib/dom.computeDSR.js +++ b/lib/dom.computeDSR.js @@ -2,6 +2,7 @@ var Consts = require('./mediawiki.wikitext.constants.js').WikitextConstants, DU = require('./mediawiki.DOMUtils.js').DOMUtils, + TU = require('./mediawiki.tokenizer.utils.js'), Util = require('./mediawiki.Util.js').Util, dumpDOM = require('./dom.dumper.js').dumpDOM; @@ -249,7 +250,10 @@ } } else if (cType === node.COMMENT_NODE) { if (ce !== null) { - cs = ce - child.data.length - 7; // 7 chars for "<!--" and "-->" + // decode html entities & reencode as wikitext to get the + // length of the data. The subtract 7 chars for the + // "<!--" and "-->" + cs = ce - TU.decodeComment(child.data).length - 7; } } else if (cType === node.ELEMENT_NODE) { var cTypeOf = child.getAttribute("typeof"), diff --git a/lib/mediawiki.tokenizer.utils.js b/lib/mediawiki.tokenizer.utils.js index 0f2aec3..4c8b6a7 100644 --- a/lib/mediawiki.tokenizer.utils.js +++ b/lib/mediawiki.tokenizer.utils.js @@ -12,10 +12,7 @@ EndTagTk = defines.EndTagTk, CommentTk = defines.CommentTk; - -var replaceMatchWithSpaces = function( match ) { - return " ".repeat( match.length ); -}; +var Util = require('./mediawiki.Util.js').Util; var tu = { @@ -298,35 +295,80 @@ } }, - // Comment normalizing + // Comment encoding/decoding. // - // * Some relevant phab tickets: T94055, T70146, T60184. + // * Some relevant phab tickets: T94055, T70146, T60184, T95039 + // + // The wikitext comment production is very simple: <!-- starts a comment, + // and --> ends a comment. This means we can have almost anything as the + // contents of a comment (except the string "-->", but see below), including + // several things that are not valid in HTML5 comments: // // * For one, the html5 comment parsing algorithm [0] leniently accepts // --!> as a closing comment tag, which differs from the php+tidy combo. - // We suppress that behaviour at the cost of introducing a dirty diff, - // but selser should take care of it. // - // * If the comment's data matches /^-?>/, html5 will end the comment, - // introducing roundtrip diffs. For example, <!-->stuff<--> breaks up as + // * If the comment's data matches /^-?>/, html5 will end the comment. + // For example, <!-->stuff<--> breaks up as // <!--> (the comment) followed by, stuff<--> (as text). // // * Finally, comment data shouldn't contain two consecutive hyphen-minus // characters (--), nor end in a hyphen-minus character (/-$/) as defined - // in the spec [1]. Normalize to this syntax in order to emit valid XML. + // in the spec [1]. + // + // We work around all these problems by using HTML entity encoding inside + // the comment body. The characters -, >, and & must be encoded in order + // to prevent premature termination of the comment by one of the cases + // above. Encoding other characters is optional; all entities will be + // decoded during wikitext serialization. + // + // In order to allow *arbitrary* content inside a wikitext comment, + // including the forbidden string "-->" we also do some minimal entity + // decoding on the wikitext. We are also limited by our inability + // to encode DSR attributes on the comment node, so our wikitext entity + // decoding must be 1-to-1: that is, there must be a unique "decoded" + // string for every wikitext sequence, and for every decoded string there + // must be a unique wikitext which creates it. + // + // The basic idea here is to replace every string ab*c with the string with + // one more b in it. This creates a string with no instance of "ac", + // so you can use 'ac' to encode one more code point. In this case + // a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to + // encode "-->" (which is otherwise unspeakable in wikitext). + // Note that wikitext does not encode any string that does not match + // /--&(amp;)*>/ or /-->/. + // + // So, here are some examples: + // User-authored comment text Wikitext HTML5 DOM + // & - > & - > & + > + // --> --> ++> + // --> --&gt; ++&gt; + // --&gt; --&amp;gt; ++&amp;gt; // // [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state // [1] http://www.w3.org/TR/html5/syntax.html#comments - normalizeComment: function( comment ) { - return comment - // suppress html5 comment start (dash)? ending - .replace(/^(-?>)/, replaceMatchWithSpaces) - // normalize double hyphens, with the bonus of - // suppressing the html5 end bang state (--!>) - .replace(/(-{2,})/g, replaceMatchWithSpaces) - // normalize away extra ending hyphens - .replace(/(-+)$/, replaceMatchWithSpaces); + // Map a wikitext-escaped comment to an HTML DOM-escaped comment. + encodeComment: function( comment ) { + // Undo wikitext escaping to obtain "true value" of comment. + var trueValue = comment + .replace(/--&(amp;)*gt;/g, Util.decodeEntities); + // Now encode '-', '>' and '&' in the "true value" as HTML entities, + // so that they can be safely embedded in an HTML comment. + // This part doesn't have to map strings 1-to-1. + return trueValue + .replace(/[->&]/g, Util.entityEncodeAll); + }, + + // Map an HTML DOM-escaped comment to a wikitext-escaped comment. + decodeComment: function( comment ) { + // Undo HTML entity escaping to obtain "true value" of comment. + var trueValue = Util.decodeEntities(comment); + // ok, now encode this "true value" of the comment in such a way + // that the string "-->" never shows up. (See above.) + return trueValue + .replace(/--(&(amp;)*gt;|>)/g, function(s) { + return s==='-->' ? '-->' : '--&'+s.slice(3); + }); } }; diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt index 18ffb98..1e6cac2 100644 --- a/lib/pegTokenizer.pegjs.txt +++ b/lib/pegTokenizer.pegjs.txt @@ -379,13 +379,13 @@ // but, as always, things around here are a little more complicated. // // We accept the same comments, but because we emit them as HTML comments -// instead of deleting them, we have to sanitize the contents of the -// comments. We currently do this in a way which loses information. See -// the normalizeComment helper for further details. +// instead of deleting them, we have to encode the data to ensure that +// we always emit a valid HTML5 comment. See the encodeComment helper +// for further details. comment = '<!--' c:( !"-->" c:. { return c; } )* ('-->' / eof) { - var data = tu.normalizeComment( c.join('') ); + var data = tu.encodeComment( c.join('') ); return [new CommentTk( data, { tsr: [peg$reportedPos, peg$currPos] } )]; } diff --git a/lib/wts.utils.js b/lib/wts.utils.js index 843593a..a1f4bd3 100644 --- a/lib/wts.utils.js +++ b/lib/wts.utils.js @@ -1,6 +1,7 @@ "use strict"; var DU = require('./mediawiki.DOMUtils.js').DOMUtils; +var TU = require('./mediawiki.tokenizer.utils.js'); var WTSUtils = { isValidSep: function(sep) { @@ -14,7 +15,7 @@ }, commentWT: function (comment) { - return '<!--' + comment.replace(/-->/, '-->') + '-->'; + return '<!--' + TU.decodeComment(comment) + '-->'; }, /** diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js index 0fdbdb5..ef7b22a 100644 --- a/tests/parserTests-blacklist.js +++ b/tests/parserTests-blacklist.js @@ -219,7 +219,7 @@ add("wt2html", "Parser hook: empty input using terminated empty elements (bug 2374)", "<p data-parsoid='{\"dsr\":[0,18,0,0]}'><tag foo=bar/>text</p>"); add("wt2html", "Parser hook: basic arguments using terminated empty elements (bug 2374)", "<p data-parsoid='{\"dsr\":[0,70,0,0]}'><tag width=200 height = \"100\" depth = '50' square/>\nother stuff\n</tag></p>"); add("wt2html", "Parser hook: static parser hook not inside a comment", "<p data-parsoid='{\"dsr\":[0,61,0,0]}'><statictag>hello, world</statictag>\n<statictag action=flush/></p>"); -add("wt2html", "Parser hook: static parser hook inside a comment", "<!-- <statictag>hello, world</statictag> -->\n<p data-parsoid='{\"dsr\":[45,70,0,0]}'><statictag action=flush/></p>"); +add("wt2html", "Parser hook: static parser hook inside a comment", "<!-- <statictag>hello, world</statictag> -->\n<p data-parsoid='{\"dsr\":[45,70,0,0]}'><statictag action=flush/></p>"); add("wt2html", "Sanitizer: Closing of closed but not open tags", ""); add("wt2html", "Sanitizer: Closing of closed but not open table tags", "Table not started"); add("wt2html", "Sanitizer: Escaping of spaces, multibyte characters, colons & other stuff in id=\"\"", "<p data-parsoid='{\"dsr\":[0,45,0,0]}'><span id=\"æ: v\" data-parsoid='{\"stx\":\"html\",\"dsr\":[0,27,16,7]}'>byte</span><a rel=\"mw:WikiLink\" href=\"./Main%20Page#æ:_v\" data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Main%20Page#æ:_v\"},\"sa\":{\"href\":\"#æ: v\"},\"dsr\":[27,45,8,2]}'>backlink</a></p>"); @@ -420,7 +420,6 @@ add("wt2wt", "Non-word characters don't terminate tag names (bug 17663, 40670, 52022)", "<blockquote|>a\n\n<b→> doesn't terminate </b→>\n\n<bä> doesn't terminate </bä>\n\n<boo> doesn't terminate </boo>\n\n<s.foo> doesn't terminate </s.foo>\n\n<sub-ID#1>\n"); add("wt2wt", "Non-word characters don't terminate tag names + tidy", "<blockquote|>a\n\n<b→> doesn't terminate </b→>\n\n<bä> doesn't terminate </bä>\n\n<boo> doesn't terminate </boo>\n\n<s.foo> doesn't terminate </s.foo>\n\n<sub-ID#1>\n"); add("wt2wt", "Isolated close tags should be treated as literal text (bug 52760)", "\n<s.foo>s\n"); -add("wt2wt", "Comment semantics: unclosed comment at end", "<!--This comment will run out to the end of the document-->"); add("wt2wt", "<nowiki> inside <pre> (bug 13238)", "<pre>\n<nowiki>\n</pre>\n<pre>\n<nowiki></nowiki>\n</pre>\n<pre><nowiki><nowiki></nowiki>Foo<nowiki></nowiki></nowiki></pre>"); add("wt2wt", "<nowiki> and <pre> preference (first one wins)", "<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n\n<nowiki>\n<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n"); add("wt2wt", "Templates: Indent-Pre: 1f: Wrapping should be based on expanded content", "{{echo| }}a\n\n{{echo|\n }}a\n\n{{echo|\n b}}\n\n{{echo|a\n }}b\n\n{{echo|a\n}}\n b\n"); @@ -921,7 +920,6 @@ add("html2wt", "Comment test 4", "asdfjkl\n"); add("html2wt", "Comment spacing", "a\n\n b \n\nc\n"); add("html2wt", "Comment whitespace", ""); -add("html2wt", "Comment semantics: unclosed comment at end", ""); add("html2wt", "Comment in template title", "FOO\n"); add("html2wt", "Comment on its own line post-expand", "a\n\nb\n"); add("html2wt", "Comment on its own line post-expand with non-significant whitespace", "a\n\nb\n"); @@ -1773,7 +1771,6 @@ add("selser", "Isolated close tags should be treated as literal text (bug 52760) [3,2]", "i8usmsqui9kke29\n\n<s.foo>s</s>"); add("selser", "Isolated close tags should be treated as literal text (bug 52760) [2,0]", "zskmdh55uiuxflxr\n\n<s.foo>s</s>"); add("selser", "Isolated close tags should be treated as literal text (bug 52760) [3,0]", "<s.foo>s</s>"); -add("selser", "Comment semantics: unclosed comment at end 5", "<!--This comment will run out to the end of the document-->"); add("selser", "Empty lines between lines with block tags [0,0,0,0,4,[4],0,4,2,4,2,0,0,3,3,0,0,[4]]", "<div></div>\n\n\n0jr2knw69qode7b9\n\nak4f0xoun9bv5cdi\n\n4us29365j9f9lik9\n\n3sh6psvknz3hm2t9\n\n8z2ywhrg9wl8fr\n\nz7s9at3qyq2xogvi\n\nb\n\n<div>b</div>\n<div>h2r4miiukcoez5mi</div>"); add("selser", "Empty lines between lines with block tags [0,4,2,2,0,4,0,[2],0,[4],3,0,0,0,0,2,3,0]", "<div></div>g72sdt9b9996bt9\n\nau765cdgwcsg7gb9\n\n\n\nqhrfek8hsgkz9f6r\n<div></div>rw83ofcbk55vzpvi\n\nvky2au8c7nmnp14ib\n<div>eu25mfmjxi0sh5mi</div>\n\n<div>b</div>d\n\nvxxu49npeng66r\n\n<div>e</div>"); add("selser", "<nowiki> inside <pre> (bug 13238) [4,2,2,0,0]", "ptjux9vq9zarlik9\n\ny7rebotz1jo1dcxr\nnb7mtl9jj0jm7vi<pre>\n<nowiki></nowiki>\n</pre>\n<pre><nowiki><nowiki></nowiki>Foo<nowiki></nowiki></nowiki></pre>"); @@ -2598,7 +2595,7 @@ add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body [[2,0,0],2,0]", "3i2udpyjqgehr529A <ref> <b> foo </ref> B C\n\n04ufs7jtgivsra4i\n\n<references />"); add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body [0,4,0]", "A <ref> <b> foo </ref> B C\n\nexksatqzwpgsc3di<references />"); add("selser", "Ref: 10. Unclosed HTML tags should not leak out of ref-body [[0,0,4],3,0]", "A <ref> <b> foo </ref>3ds74snl9wly2e29<references />"); -add("selser", "Ref: 12. ref-tags act as trailing newline migration barrier [4,[2],3,2,4,[4],3,0]", "m5deo7xhmiaoflxr\n\nqhnq99nstl8oajora\n\nf16ctvb87p919k9\n\nb<!--the newline at the end of this line stays inside the p-tag--> <ref />\n<ref />\n\ni5sgdy5g0bb7qfr\n\nlppjadx2v8q41jor\n<references />"); +add("selser", "Ref: 12. ref-tags act as trailing newline migration barrier [4,[2],3,2,4,[4],3,0]", "m5deo7xhmiaoflxr\n\nqhnq99nstl8oajora\n\nf16ctvb87p919k9\n\nb<!--the newline at the end of this line stays inside the p tag--> <ref />\n<ref />\n\ni5sgdy5g0bb7qfr\n\nlppjadx2v8q41jor\n<references />"); add("selser", "Ref: 19. ref-tags with identical name encodings should get identical indexes [[2,0,4,0],0,0]", "0dpatm9gsbqk6gvi1 <ref name=\"a & b\">foo</ref>k0yidhh5u2b21emi<ref name=\"a & b\" />\n\n<references />"); add("selser", "Ref: 19. ref-tags with identical name encodings should get identical indexes [2,0,0]", "zmmmjlha7xim5cdi\n\n1 <ref name=\"a & b\">foo</ref> 2 <ref name=\"a & b\" />\n\n<references />"); add("selser", "Ref: 19. ref-tags with identical name encodings should get identical indexes [1,0,0]", "1 <ref name=\"a & b\">foo</ref> 2 <ref name=\"a & b\" />\n\n<references />"); diff --git a/tests/parserTests.txt b/tests/parserTests.txt index 15a8c08..4295bb3 100644 --- a/tests/parserTests.txt +++ b/tests/parserTests.txt @@ -1585,34 +1585,28 @@ !! test Comment semantics and delimiters -!! options -parsoid=wt2html,html2html !! wikitext <!-- --><!----><!-----><!------> !! html/php !! html/parsoid -<!-- --><!----><!-- --><!-- --> +<!-- --><!----><!-----><!------> !! end !! test Comment semantics and delimiters, redux -!! options -parsoid=wt2html,html2html !! wikitext <!-- In SGML every "foo" here would actually show up in the text -- foo -- bar -- foo -- funky huh? ... --> !! html/php !! html/parsoid -<!-- In SGML every "foo" here would actually show up in the text foo bar - foo funky huh? ... --> +<!-- In SGML every "foo" here would actually show up in the text -- foo -- bar +-- foo -- funky huh? ... --> !! end !! test Comment semantics and delimiters: directors cut -!! options -parsoid=wt2html,html2html !! wikitext <!-- ... However we like to keep things simple and somewhat XML-ish so we eat everything starting with < followed by !-- until the first -- and > we see, @@ -1622,53 +1616,64 @@ <p>--> </p> !! html/parsoid -<!-- ... However we like to keep things simple and somewhat XML-ish so we eat -everything starting with < followed by ! until the first and > we see, -that wouldn't be valid XML however, since in XML has to terminate a comment +<!-- ... However we like to keep things simple and somewhat XML-ish so we eat +everything starting with < followed by !-- until the first -- and > we see, +that wouldn't be valid XML however, since in XML -- has to terminate a comment --><p>--></p> !! end !! test Comment semantics: nesting -!! options -parsoid=wt2html,html2html !! wikitext <!--<!-- no, we're not going to do anything fancy here -->--> !! html/php <p>--> </p> !! html/parsoid -<!--<! no, we're not going to do anything fancy here --><p>--></p> +<!--<!-- no, we're not going to do anything fancy here --><p>--></p> !! end +# Parsoid closes the unclosed comment, even if it means a slight +# round-trip diff. !! test Comment semantics: unclosed comment at end +!! options +parsoid=wt2html,html2html !! wikitext <!--This comment will run out to the end of the document -!! html +!! html/php +!! html/parsoid +<!--This comment will run out to the end of the document--> !! end !! test -Normalize comments to play nice with XML and browsers -!! options -parsoid=wt2html,html2html +Comment semantics: normalize comments to play nice with XML and browsers !! wikitext <!-- Browsers --!> think this is closed --> <!--> This would normally be text --> <!---> As would this --> <!-- XML doesn't like trailing dashes --------> <!-- Nor doubled hyphens -- anywhere in the data --> +But this is not a comment. !! html/php -<p><br /> +<p>But this is not a comment. </p> !! html/parsoid -<!-- Browsers !> think this is closed --> -<!-- This would normally be text --> -<!-- As would this --> -<!-- XML doesn't like trailing dashes --> -<!-- Nor doubled hyphens anywhere in the data --> +<!-- Browsers --!> think this is closed --> +<!--> This would normally be text --> +<!---> As would this --> +<!-- XML doesn't like trailing dashes --------> +<!-- Nor doubled hyphens -- anywhere in the data --> +<p>But this is not a comment.</p> +!! end +!! test +Comment semantics: round-trip even text which contains encoded --> +!! wikitext +<!-- hello & goodbye - > --> --&gt; --&xx --> +!! html/parsoid +<!-- hello & goodbye - > --> --&gt; --&xx --> !! end !! test @@ -3166,9 +3171,9 @@ !! wikitext [[Category:foo]] <!-- No pre-wrapping --> {{echo| [[Category:foo]]}} <!-- No pre-wrapping --> -!! html - <link rel="mw:PageProp/Category" href="./Category:Foo"> <!-- No pre-wrapping --> -<span about="#mwt1" typeof="mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":" [[Category:foo]]"}},"i":0}}]}'> </span><link rel="mw:PageProp/Category" href="./Category:Foo" about="#mwt1"> <!-- No pre-wrapping --> +!! html/parsoid + <link rel="mw:PageProp/Category" href="./Category:Foo"> <!-- No pre-wrapping --> +<span about="#mwt1" typeof="mw:Transclusion" data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":" [[Category:foo]]"}},"i":0}}]}'> </span><link rel="mw:PageProp/Category" href="./Category:Foo" about="#mwt1"> <!-- No pre-wrapping --> !! end !! test @@ -20394,21 +20399,19 @@ !!test Ref: 12. ref-tags act as trailing newline migration barrier -!!options -parsoid !! wikitext -<!--the newline at the end of this line moves out of the p-tag-->a +<!--the newline at the end of this line moves out of the p tag-->a -b<!--the newline at the end of this line stays inside the p-tag--> <ref /> +b<!--the newline at the end of this line stays inside the p tag--> <ref /> <ref /> c <references /> -!! html -<!--the newline at the end of this line moves out of the p-tag--><p>a</p> +!! html/parsoid +<!--the newline at the end of this line moves out of the p tag--><p>a</p> -<p>b<!--the newline at the end of this line stays inside the p-tag--> <span about="#mwt2" class="reference" id="cite_ref-1" rel="dc:references" typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a href="#cite_note-1">[1]</a></span> +<p>b<!--the newline at the end of this line stays inside the p tag--> <span about="#mwt2" class="reference" id="cite_ref-1" rel="dc:references" typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a href="#cite_note-1">[1]</a></span> <span about="#mwt4" class="reference" id="cite_ref-2" rel="dc:references" typeof="mw:Extension/ref" data-mw='{"name":"ref","attrs":{}}'><a href="#cite_note-2">[2]</a></span></p> <p>c</p> -- To view, visit https://gerrit.wikimedia.org/r/202058 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Cscott <canan...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits