[MediaWiki-commits] [Gerrit] T95039: encode/decode arbitrary data in comments. - change (mediawiki...parsoid)

2015-04-07 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: T95039: encode/decode arbitrary data in comments.
..


T95039: encode/decode arbitrary data in comments.

Use HTML5 entity encoding to ensure that arbitrary data can be represented
in comments without triggering premature end of comment.

Do a little bit of entity encoding on the wikitext side as well to allow the
user to author the string --> inside a comment.  They probably won't want
to (!) but we want to avoid exposing arbitrary content restrictions to
DOM consumers (like Visual Editor).

If you were wondering, --> is represented as --&gt; in wikitext.

We also normalize HTML entities during DOM diff, which ensures that
we don't dirty diff even if VE does the entity encoding inside comments
differently than we do.

Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8
---
M lib/XMLSerializer.js
M lib/dom.computeDSR.js
M lib/dom.migrateTrailingNLs.js
M lib/dom.wrapTemplates.js
M lib/ext.core.PreHandler.js
M lib/mediawiki.DOMDiff.js
M lib/mediawiki.DOMUtils.js
M lib/mediawiki.tokenizer.peg.js
M lib/mediawiki.tokenizer.utils.js
M lib/pegTokenizer.pegjs.txt
M lib/wts.separators.js
M lib/wts.utils.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
M tests/roundtrip-test.js
15 files changed, 166 insertions(+), 97 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/XMLSerializer.js b/lib/XMLSerializer.js
index f8ef3b9..8bcd85e 100644
--- a/lib/XMLSerializer.js
+++ b/lib/XMLSerializer.js
@@ -150,7 +150,13 @@
case TEXT_NODE:
return cb(node.data.replace(/[<&]/g, _xmlEncoder));
case COMMENT_NODE:
-   return cb('<!--' + node.data.replace(/--/g, '--&gt;') + '-->');
+   // According to
+   // http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml
+   // we could throw an exception here if node.data would not 
create
+   // a well-formed XML comment.  But we use entity encoding when
+   // we create the comment node to ensure that node.data will 
always
+   // be okay; see DOMUtils.encodeComment().
+   return cb('<!--' + node.data + '-->');
default:
cb('??' + node.nodeName);
}
diff --git a/lib/dom.computeDSR.js b/lib/dom.computeDSR.js
index e638a16..2ad6b84 100644
--- a/lib/dom.computeDSR.js
+++ b/lib/dom.computeDSR.js
@@ -246,7 +246,8 @@
}
} else if (cType === node.COMMENT_NODE) {
if (ce !== null) {
-   cs = ce - child.data.length - 7; // 7 chars for <!-- and -->
+   // decode html entities & re-encode as wikitext to find length
+   cs = ce - DU.decodedCommentLength(child);
}
} else if (cType === node.ELEMENT_NODE) {
var cTypeOf = child.getAttribute(typeof),
@@ -482,7 +483,7 @@
if (nType === node.TEXT_NODE) {
newCE = newCE + 
sibling.data.length + DU.indentPreDSRCorrection(sibling);
} else if (nType === node.COMMENT_NODE) 
{
-   newCE = newCE + 
sibling.data.length + 7;
+   newCE = newCE + 
DU.decodedCommentLength(sibling);
} else if (nType === node.ELEMENT_NODE) 
{
var siblingDP = 
DU.getDataParsoid( sibling );
if (siblingDP.dsr  
siblingDP.tsr  siblingDP.dsr[0] = newCE  e !== null) {
diff --git a/lib/dom.migrateTrailingNLs.js b/lib/dom.migrateTrailingNLs.js
index dde4394..caed4ea 100644
--- a/lib/dom.migrateTrailingNLs.js
+++ b/lib/dom.migrateTrailingNLs.js
@@ -80,7 +80,7 @@
if (DU.isComment(n)) {
firstEltToMigrate = n;
// <!--comment-->
-   tsrCorrection += (7 + n.data.length);
+   tsrCorrection += DU.decodedCommentLength(n);
} else {
if (n.data.match(/^\s*\n\s*$/)) {
foundNL = true;
diff --git a/lib/dom.wrapTemplates.js b/lib/dom.wrapTemplates.js
index 86c31c3..ba71475 100644
--- a/lib/dom.wrapTemplates.js
+++ b/lib/dom.wrapTemplates.js
@@ -100,7 +100,8 @@
if (DU.isText(n)) {
offset += n.data.length;
} else {
-   offset += n.data.length + 7;
+   // A comment
+   offset += DU.decodedCommentLength(n);
 

[MediaWiki-commits] [Gerrit] T95039: encode/decode arbitrary data in comments. - change (mediawiki...parsoid)

2015-04-06 Thread Cscott (Code Review)
Cscott has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/202058

Change subject: T95039: encode/decode arbitrary data in comments.
..

T95039: encode/decode arbitrary data in comments.

Use HTML5 entity encoding to ensure that arbitrary data can be represented
in comments without triggering premature end of comment.

Do a little bit of entity encoding on the wikitext side as well to allow the
user to author the string --> inside a comment.  They probably won't want
to (!) but we want to avoid exposing arbitrary content restrictions to
DOM consumers (like Visual Editor).

If you were wondering, --> is represented as --&gt; in wikitext.

Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8
---
M lib/dom.computeDSR.js
M lib/mediawiki.tokenizer.utils.js
M lib/pegTokenizer.pegjs.txt
M lib/wts.utils.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
6 files changed, 113 insertions(+), 66 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/58/202058/1

diff --git a/lib/dom.computeDSR.js b/lib/dom.computeDSR.js
index 89110a1..763fea3 100644
--- a/lib/dom.computeDSR.js
+++ b/lib/dom.computeDSR.js
@@ -2,6 +2,7 @@
 
 var Consts = require('./mediawiki.wikitext.constants.js').WikitextConstants,
DU = require('./mediawiki.DOMUtils.js').DOMUtils,
+   TU = require('./mediawiki.tokenizer.utils.js'),
Util = require('./mediawiki.Util.js').Util,
dumpDOM = require('./dom.dumper.js').dumpDOM;
 
@@ -249,7 +250,10 @@
}
} else if (cType === node.COMMENT_NODE) {
if (ce !== null) {
-   cs = ce - child.data.length - 7; // 7 chars for <!-- and -->
+   // decode html entities & re-encode as wikitext to get the
+   // length of the data.  Then subtract 7 chars for the
+   // <!-- and -->
+   cs = ce - TU.decodeComment(child.data).length - 7;
}
} else if (cType === node.ELEMENT_NODE) {
var cTypeOf = child.getAttribute(typeof),
diff --git a/lib/mediawiki.tokenizer.utils.js b/lib/mediawiki.tokenizer.utils.js
index 0f2aec3..4c8b6a7 100644
--- a/lib/mediawiki.tokenizer.utils.js
+++ b/lib/mediawiki.tokenizer.utils.js
@@ -12,10 +12,7 @@
EndTagTk = defines.EndTagTk,
CommentTk = defines.CommentTk;
 
-
-var replaceMatchWithSpaces = function( match ) {
-   return  .repeat( match.length );
-};
+var Util = require('./mediawiki.Util.js').Util;
 
 var tu = {
 
@@ -298,35 +295,80 @@
}
},
 
-   // Comment normalizing
+   // Comment encoding/decoding.
//
-   //  * Some relevant phab tickets: T94055, T70146, T60184.
+   //  * Some relevant phab tickets: T94055, T70146, T60184, T95039
+   //
+   // The wikitext comment production is very simple: <!-- starts a comment,
+   // and --> ends a comment.  This means we can have almost anything as the
+   // contents of a comment (except the string -->, but see below), including
+   // several things that are not valid in HTML5 comments:
//
//  * For one, the html5 comment parsing algorithm [0] leniently accepts
//--!> as a closing comment tag, which differs from the php+tidy combo.
-   //We suppress that behaviour at the cost of introducing a dirty 
diff,
-   //but selser should take care of it.
//
-   //  * If the comment's data matches /^-?>/, html5 will end the comment,
-   //introducing roundtrip diffs. For example, <!-->stuff--> breaks up as
+   //  * If the comment's data matches /^-?>/, html5 will end the comment.
+   //For example, <!-->stuff--> breaks up as
//<!--> (the comment) followed by, stuff--> (as text).
//
//  * Finally, comment data shouldn't contain two consecutive 
hyphen-minus
//characters (--), nor end in a hyphen-minus character (/-$/) as 
defined
-   //in the spec [1]. Normalize to this syntax in order to emit valid 
XML.
+   //in the spec [1].
+   //
+   // We work around all these problems by using HTML entity encoding inside
+   // the comment body.  The characters -, >, and & must be encoded in order
+   // to prevent premature termination of the comment by one of the cases
+   // above.  Encoding other characters is optional; all entities will be
+   // decoded during wikitext serialization.
+   //
+   // In order to allow *arbitrary* content inside a wikitext comment,
+   // including the forbidden string --> we also do some minimal entity
+   // decoding on the wikitext.  We are also limited by our inability
+   // to encode DSR attributes on the comment node, so our wikitext entity
+   // decoding must