[MediaWiki-commits] [Gerrit] Provide section-offsets for immediate children of <body> - change (mediawiki...parsoid)

jenkins-bot (Code Review) Mon, 18 May 2015 10:36:38 -0700

jenkins-bot has submitted this change and it was merged.

Change subject: Provide section-offsets for immediate children of <body>
......................................................................



Provide section-offsets for immediate children of <body>

 * Changes xmlserializer.serializeToString and DU.serializeNode to return
   an object with the str and, if options.captureOffsets, the html offsets
   of top-level nodes.

 * Added mocha tests to capture expectations about offsets computation.

Bug: T96279
Change-Id: Id74988d3ef39078fbfea72359884b75290da041b
---
M api/routes.js
M lib/XMLSerializer.js
M lib/ext.Cite.js
M lib/mediawiki.DOMUtils.js
M tests/mocha/api.js
A tests/mocha/xmlserializer.js
M tests/parse.js
M tests/parserTests.js
M tests/roundtrip-test.js
9 files changed, 171 insertions(+), 60 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  GWicke: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/api/routes.js b/api/routes.js
index 9e07508..7722383 100644
--- a/api/routes.js
+++ b/api/routes.js
@@ -116,7 +116,7 @@
                        var headNodes = "";
                        for (var i = 0; i < hNodes.length; i++) {
                                if (hNodes[i].nodeName.toLowerCase() === 
'base') {
-                                       headNodes += 
DU.serializeNode(hNodes[i]);
+                                       headNodes += 
DU.serializeNode(hNodes[i]).str;
                                        break;
                                }
                        }
@@ -124,7 +124,7 @@
                        var bNodes = doc.body.childNodes;
                        var bodyNodes = "";
                        for (i = 0; i < bNodes.length; i++) {
-                               bodyNodes += DU.serializeNode(bNodes[i]);
+                               bodyNodes += DU.serializeNode(bNodes[i]).str;
                        }
 
                        var htmlSpeChars = apiUtils.htmlSpecialChars(out);
@@ -331,10 +331,9 @@
                function sendRes(doc) {
                        var contentType = 
'text/html;profile=mediawiki.org/specs/html/1.0.0;charset=utf-8';
                        var output;
-                       if ( v2 && v2.format === "pagebundle" ) {
-                               var dpScriptElt = 
doc.getElementById('mw-data-parsoid');
-                               dpScriptElt.parentNode.removeChild(dpScriptElt);
-                               output = DU.serializeNode( res.local('body') ? 
doc.body : doc );
+                       if (v2 && v2.format === 'pagebundle') {
+                               var out = DU.extractDpAndSerialize(doc, 
res.local('body'));
+                               output = out.str;
                                apiUtils.jsonResponse(res, env, {
                                        // revid: 12345 (maybe?),
                                        html: {
@@ -342,12 +341,12 @@
                                                body: output
                                        },
                                        "data-parsoid": {
-                                               headers: { 'content-type': 
dpScriptElt.getAttribute('type') },
-                                               body: JSON.parse( 
dpScriptElt.text )
+                                               headers: { 'content-type': 
out.type },
+                                               body: out.dp
                                        }
                                });
                        } else {
-                               output = DU.serializeNode( res.local('body') ? 
doc.body : doc );
+                               output = DU.serializeNode(res.local('body') ? 
doc.body : doc).str;
                                apiUtils.setHeader(res, env, 'content-type', 
contentType);
                                apiUtils.endResponse(res, env, output);
                        }
@@ -693,7 +692,7 @@
                var p = TemplateRequest.setPageSrcInfo(env, target, oldid).then(
                        parse.bind( null, env, req, res )
                ).then(function( doc ) {
-                       doc = DU.parseHTML( DU.serializeNode(doc) );
+                       doc = DU.parseHTML(DU.serializeNode(doc).str);
                        var comment = 
doc.createComment('rtSelserEditTestComment');
                        doc.body.appendChild(comment);
                        return roundTripDiff( env, req, res, true, doc );
diff --git a/lib/XMLSerializer.js b/lib/XMLSerializer.js
index d068353..da33174 100644
--- a/lib/XMLSerializer.js
+++ b/lib/XMLSerializer.js
@@ -3,8 +3,6 @@
  */
 'use strict';
 
-var htmlns = 'http://www.w3.org/1999/xhtml';
-
 // nodeType constants
 var ELEMENT_NODE = 1;
 var ATTRIBUTE_NODE = 2;
@@ -41,7 +39,7 @@
        param: true,
        source: true,
        track: true,
-       wbr: true
+       wbr: true,
 };
 
 /**
@@ -55,14 +53,14 @@
        noembed: true,
        noframes: true,
        plaintext: true,
-       noscript: true
+       noscript: true,
 };
 // Elements that strip leading newlines
 // 
http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#html-fragment-serialization-algorithm
 var newlineStrippingElements = {
        pre: true,
        textarea: true,
-       listing: true
+       listing: true,
 };
 
 
@@ -81,15 +79,15 @@
 
 function serializeToString(node, options, accum) {
        var child;
-       switch (node.nodeType){
+       switch (node.nodeType) {
        case ELEMENT_NODE:
                child = node.firstChild;
                var attrs = node.attributes;
                var len = attrs.length;
                var nodeName = node.tagName.toLowerCase();
                var localName = node.localName;
-               accum('<' + localName);
-               for (var i = 0;i < len;i++) {
+               accum('<' + localName, node);
+               for (var i = 0; i < len; i++) {
                        var attr = attrs.item(i);
                        var singleQuotes, doubleQuotes;
                        var useSingleQuotes = false;
@@ -99,15 +97,17 @@
                                        (attr.value.match(/'/g) || []).length) {
                                // use single quotes
                                accum(' ' + attr.name + "='"
-                                               + attr.value.replace(/[<&']/g, 
_xmlEncoder) + "'");
+                                               + attr.value.replace(/[<&']/g, 
_xmlEncoder) + "'",
+                                               node);
                        } else {
                                // use double quotes
                                accum(' ' + attr.name + '="'
-                                               + attr.value.replace(/[<&"]/g, 
_xmlEncoder) + '"');
+                                               + attr.value.replace(/[<&"]/g, 
_xmlEncoder) + '"',
+                                               node);
                        }
                }
                if (child || !emptyElements[nodeName]) {
-                       accum('>');
+                       accum('>', node, 'start');
                        // if is cdata child node
                        if (hasRawContent[nodeName]) {
                                // TODO: perform context-sensitive escaping?
@@ -118,7 +118,7 @@
                                // * < to \3c in <style>
                                // ...
                                if (child) {
-                                       accum(child.data);
+                                       accum(child.data, node);
                                }
                        } else {
                                if (child && newlineStrippingElements[localName]
@@ -128,16 +128,16 @@
                                         * Text node whose character data has 
as its first
                                         * character a U+000A LINE FEED (LF) 
character, then
                                         * append a U+000A LINE FEED (LF) 
character. */
-                                       accum('\n');
+                                       accum('\n', node);
                                }
                                while (child) {
                                        serializeToString(child, options, 
accum);
                                        child = child.nextSibling;
                                }
                        }
-                       accum('</' + localName + '>');
-               }else {
-                       accum('/>');
+                       accum('</' + localName + '>', node, 'end');
+               } else {
+                       accum('/>', node);
                }
                return;
        case DOCUMENT_NODE:
@@ -149,7 +149,7 @@
                }
                return;
        case TEXT_NODE:
-               return accum(node.data.replace(/[<&]/g, _xmlEncoder));
+               return accum(node.data.replace(/[<&]/g, _xmlEncoder), node);
        case COMMENT_NODE:
                // According to
                // http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml
@@ -157,18 +157,57 @@
                // a "well-formed" XML comment.  But we use entity encoding when
                // we create the comment node to ensure that node.data will 
always
                // be okay; see DOMUtils.encodeComment().
-               return accum('<!--' + node.data + '-->');
+               return accum('<!--' + node.data + '-->', node);
        default:
-               accum('??' + node.nodeName);
+               accum('??' + node.nodeName, node);
        }
 }
+
+// These two are from DU but there's a cyclic dependency.
+function isElt(node) {
+       return node.nodeType === ELEMENT_NODE;
+}
+function isBody(node) {
+       return isElt(node) && node.nodeName === "BODY";
+}
+
+var accumOffsets = function(out, bit, node, flag) {
+       if (isBody(node)) {
+               out.str += bit;
+               if (flag === 'start') {
+                       out.start = out.str.length;
+               } else if (flag === 'end') {
+                       out.start = null;
+                       out.uid = null;
+               }
+       } else if (!isElt(node) || out.start === null || 
!isBody(node.parentNode)) {
+               // In case you're wondering, out.start may never be set if body
+               // isn't a child of the node passed to serializeToString, or if 
it
+               // is the node itself but options.innerXML is true.
+               out.str += bit;
+               if (out.uid !== null) {
+                       out.offsets[out.uid].html[1] += bit.length;
+               }
+       } else {
+               // Template siblings don't have ids,
+               // so associate them with preceding content.
+               out.uid = node.getAttribute('id') || out.uid;
+               console.assert(out.uid !== null);
+               if (!out.offsets.hasOwnProperty(out.uid)) {
+                       var dt = out.str.length - out.start;
+                       out.offsets[out.uid] = { html: [dt, dt] };
+               }
+               out.str += bit;
+               out.offsets[out.uid].html[1] += bit.length;
+       }
+};
 
 function XMLSerializer() {}
 
 XMLSerializer.prototype.serializeToString = function(node, options) {
-       var buf = '';
-       var accum = function(bit) { buf += bit; };
-
+       var out = { str: '', start: null, offsets: {}, uid: null };
+       var accum = options.captureOffsets ?
+               accumOffsets.bind(null, out) : function(bit) { out.str += bit; 
};
        if (options.innerXML) {
                var children = node.childNodes;
                for (var i = 0; i < children.length; i++) {
@@ -177,7 +216,7 @@
        } else {
                serializeToString(node, options, accum);
        }
-       return buf;
+       return out;
 };
 
 
diff --git a/lib/ext.Cite.js b/lib/ext.Cite.js
index 6ac2632..d0a627c 100644
--- a/lib/ext.Cite.js
+++ b/lib/ext.Cite.js
@@ -5,8 +5,8 @@
 'use strict';
 require('./core-upgrade.js');
 
-var Util = require( './mediawiki.Util.js' ).Util;
-var DU = require( './mediawiki.DOMUtils.js').DOMUtils;
+var Util = require('./mediawiki.Util.js').Util;
+var DU = require('./mediawiki.DOMUtils.js').DOMUtils;
 var coreutil = require('util');
 var defines = require('./mediawiki.parser.defines.js');
 var entities = require('entities');
@@ -432,7 +432,7 @@
        } else {
                DU.storeDataParsoid( span, dataParsoid );
                DU.storeDataMw( span, dataMW );
-               refsInReferencesHTML.push( DU.serializeNode(span), "\n" );
+               refsInReferencesHTML.push(DU.serializeNode(span).str, '\n');
        }
 
        // Keep the first content to compare multiple <ref>s with the same name.
diff --git a/lib/mediawiki.DOMUtils.js b/lib/mediawiki.DOMUtils.js
index ce0ab1f..dd5f03e 100644
--- a/lib/mediawiki.DOMUtils.js
+++ b/lib/mediawiki.DOMUtils.js
@@ -2178,28 +2178,23 @@
  * result in more compact output than the standard double-quoted serialization.
  *
  * @param {Node} doc
- * @param {Object} options: flags smartQuote, innerHTML
- * @returns {string}
+ * @param {Object} options: flags smartQuote, innerXML, captureOffsets
+ * @returns {Object}
  */
 DOMUtils.serializeNode = function(doc, options) {
-       var html;
-       if (!options) {
-               options = {
-                       smartQuote: true,
-                       innerXML: false
-               };
+       if (!options) { options = {}; }
+       if (!options.hasOwnProperty('smartQuote')) {
+               options.smartQuote = true;
        }
        if (doc.nodeName === '#document') {
-               html = XMLSerializer.serializeToString(doc.documentElement, 
options);
-       } else {
-               html = XMLSerializer.serializeToString(doc, options);
+               doc = doc.documentElement;
        }
-       // ensure there's a doctype for documents
-       if (!options.innerXML && (doc.nodeName === '#document' || 
/^html$/i.test(doc.nodeName))) {
-               html = '<!DOCTYPE html>\n' + html;
+       var res = XMLSerializer.serializeToString(doc, options);
+       // Ensure there's a doctype for documents.
+       if (!options.innerXML && /^html$/i.test(doc.nodeName)) {
+               res.str = '<!DOCTYPE html>\n' + res.str;
        }
-
-       return html;
+       return res;
 };
 
 /**
@@ -2224,7 +2219,7 @@
                };
        }
        options.innerXML = true;
-       return this.serializeNode(node, options);
+       return this.serializeNode(node, options).str;
 };
 
 /**
@@ -2586,7 +2581,7 @@
                                return env.pipelineFactory.parse(
                                        env, env.page.src
                                ).then(function(doc) {
-                                       env.page.dom = 
DU.parseHTML(DU.serializeNode(doc)).body;
+                                       env.page.dom = 
DU.parseHTML(DU.serializeNode(doc).str).body;
                                }, function(err) {
                                        env.log('error', 'Error while parsing 
original DOM.');
                                });
@@ -2611,6 +2606,25 @@
        }).nodify(cb);
 };
 
+// Pull the data-parsoid script element out of the doc before serializing.
+DOMUtils.extractDpAndSerialize = function(doc, justBody) {
+       var dpScriptElt = doc.getElementById('mw-data-parsoid');
+       dpScriptElt.parentNode.removeChild(dpScriptElt);
+       var options = { captureOffsets: true };
+       var out = DU.serializeNode(justBody ? doc.body : doc, options);
+       out.dp = JSON.parse(dpScriptElt.text);
+       out.type = dpScriptElt.getAttribute('type');
+       // Add the wt offsets.
+       Object.keys(out.offsets).forEach(function(key) {
+               var dsr = out.dp.ids[key].dsr;
+               if (Util.isValidDSR(dsr)) {
+                       out.offsets[key].wt = dsr.slice(0, 2);
+               }
+       });
+       out.dp.sectionOffsets = out.offsets;
+       return out;
+};
+
 
 if (typeof module === "object") {
        module.exports.DOMUtils = DOMUtils;
diff --git a/tests/mocha/api.js b/tests/mocha/api.js
index e8de620..bd8c7d3 100644
--- a/tests/mocha/api.js
+++ b/tests/mocha/api.js
@@ -277,6 +277,18 @@
                                .end(done);
                        });
 
+                       it('should include captured offsets', function(done) {
+                               request(api)
+                               .get('v2/' + mockHost + 
'/pagebundle/Main_Page/1')
+                               .expect(200)
+                               .expect(function(res) {
+                                       res.body.should.have.property('html');
+                                       
res.body.should.have.property('data-parsoid');
+                                       
res.body['data-parsoid'].body.should.have.property('sectionOffsets');
+                               })
+                               .end(done);
+                       });
+
                }); // end wt2html
 
                describe("html2wt", function() {
diff --git a/tests/mocha/xmlserializer.js b/tests/mocha/xmlserializer.js
new file mode 100644
index 0000000..22186f6
--- /dev/null
+++ b/tests/mocha/xmlserializer.js
@@ -0,0 +1,47 @@
+/*global describe, it*/
+'use strict';
+
+var domino = require('domino');
+var XMLSerializer = require('../../lib/XMLSerializer.js');
+
+var xmlserializer = new XMLSerializer();
+
+describe('XML Serializer', function() {
+       it('should capture html offsets while serializing', function() {
+               var html = '<html><head><title>hi</title><body>' +
+                               '<div id="123">ok<div 
id="234">nope</div></div>' +
+                               '\n\n<!--comment--><div 
id="345">end</div></body></html>';
+               var doc = domino.createDocument(html);
+               var options = {
+                       smartQuote: true,
+                       innerXML: false,
+                       captureOffsets: true,
+               };
+               var ret = xmlserializer.serializeToString(doc, options);
+               ret.should.have.property('offsets');
+               ret.offsets.should.have.property('123');
+               ret.offsets['123'].html.should.eql([0, 62]);
+               ret.offsets.should.not.have.property('234');
+               ret.offsets.should.have.property('345');
+               ret.offsets['345'].html.should.eql([62, 85]);
+       });
+       it('should handle templates properly while capturing offsets', 
function() {
+               var html = '<html><head><title>hi</title><body>' +
+                       '<p about="#mwt1" typeof="mw:Transclusion" 
id="mwAQ">a</p>' +
+                       '<p about="#mwt1">b</p>' +
+                       '<p id="mwAg">c</p>' +
+                       '</body></html>';
+               var doc = domino.createDocument(html);
+               var options = {
+                       smartQuote: true,
+                       innerXML: false,
+                       captureOffsets: true,
+               };
+               var ret = xmlserializer.serializeToString(doc, options);
+               ret.should.have.property('offsets');
+               ret.offsets.should.have.property('mwAQ');
+               ret.offsets.should.have.property('mwAg');
+               ret.offsets.mwAQ.html.should.eql([0, 79]);
+               ret.offsets.mwAg.html.should.eql([79, 97]);
+       });
+});
diff --git a/tests/parse.js b/tests/parse.js
index 3f99cb1..dd146c1 100755
--- a/tests/parse.js
+++ b/tests/parse.js
@@ -165,11 +165,11 @@
                                // used in Parsoid JS API, return document
                                out = doc;
                        } else {
-                               out = DU.serializeNode(doc);
+                               out = DU.serializeNode(doc).str;
                        }
                        return { trailingNL: true, out: out };
                } else {
-                       return startsAtHTML(argv, env, DU.serializeNode(doc));
+                       return startsAtHTML(argv, env, 
DU.serializeNode(doc).str);
                }
        });
 };
diff --git a/tests/parserTests.js b/tests/parserTests.js
index 4125bdd..da8e2fe 100755
--- a/tests/parserTests.js
+++ b/tests/parserTests.js
@@ -979,7 +979,7 @@
                        // so we can maybe skip them later
                        testTasks.push( function( body, cb ) {
                                // Cache parsed HTML
-                               item.cachedBODY = DU.parseHTML( 
DU.serializeNode( body ) ).body;
+                               item.cachedBODY = 
DU.parseHTML(DU.serializeNode(body).str).body;
 
                                // - In wt2html mode, pass through original DOM
                                //   so that it is serialized just once.
@@ -1023,7 +1023,7 @@
                // Save the modified DOM so we can re-test it later
                // Always serialize to string and reparse before passing to 
selser/wt2wt
                testTasks.push( function( body, cb ) {
-                       item.changedHTMLStr = DU.serializeNode( body );
+                       item.changedHTMLStr = DU.serializeNode(body).str;
                        cb( null, DU.parseHTML( item.changedHTMLStr ).body );
                } );
        } else if ( mode === 'wt2wt' ) {
diff --git a/tests/roundtrip-test.js b/tests/roundtrip-test.js
index ba99e12..50e7182 100755
--- a/tests/roundtrip-test.js
+++ b/tests/roundtrip-test.js
@@ -456,7 +456,7 @@
                origOut = res ? res.nodes : [];
                for (k = 0; k < origOut.length; k++) {
                        // node need not be an element always!
-                       origOrigHTML += DU.serializeNode(origOut[k], { 
smartQuote: false });
+                       origOrigHTML += DU.serializeNode(origOut[k], { 
smartQuote: false }).str;
                }
                // Normalize away <br/>'s added by Parsoid because of newlines 
in wikitext
                origHTML = 
normalizeWS(DU.formatHTML(DU.normalizeOut(origOrigHTML)));
@@ -465,7 +465,7 @@
                newOut = res ? res.nodes : [];
                for (k = 0; k < newOut.length; k++) {
                        // node need not be an element always!
-                       origNewHTML += DU.serializeNode(newOut[k], { 
smartQuote: false });
+                       origNewHTML += DU.serializeNode(newOut[k], { 
smartQuote: false }).str;
                }
                // Normalize away <br/>'s added by Parsoid because of newlines 
in wikitext
                newHTML = 
normalizeWS(DU.formatHTML(DU.normalizeOut(origNewHTML)));

-- 
To view, visit https://gerrit.wikimedia.org/r/209253
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id74988d3ef39078fbfea72359884b75290da041b
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: Cscott <canan...@wikimedia.org>
Gerrit-Reviewer: GWicke <gwi...@wikimedia.org>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Provide section-offsets for immediate children of <body> - change (mediawiki...parsoid)

Reply via email to