jenkins-bot has submitted this change and it was merged. Change subject: Provide section-offsets for immediate children of <body> ......................................................................
Provide section-offsets for immediate children of <body> * Changes xmlserializer.serializeToString and DU.serializeNode to return an object with the str and, if options.captureOffsets, the html offsets of top-level nodes. * Added mocha tests to capture expectations about offsets computation. Bug: T96279 Change-Id: Id74988d3ef39078fbfea72359884b75290da041b --- M api/routes.js M lib/XMLSerializer.js M lib/ext.Cite.js M lib/mediawiki.DOMUtils.js M tests/mocha/api.js A tests/mocha/xmlserializer.js M tests/parse.js M tests/parserTests.js M tests/roundtrip-test.js 9 files changed, 171 insertions(+), 60 deletions(-) Approvals: Subramanya Sastry: Looks good to me, approved GWicke: Looks good to me, but someone else must approve jenkins-bot: Verified diff --git a/api/routes.js b/api/routes.js index 9e07508..7722383 100644 --- a/api/routes.js +++ b/api/routes.js @@ -116,7 +116,7 @@ var headNodes = ""; for (var i = 0; i < hNodes.length; i++) { if (hNodes[i].nodeName.toLowerCase() === 'base') { - headNodes += DU.serializeNode(hNodes[i]); + headNodes += DU.serializeNode(hNodes[i]).str; break; } } @@ -124,7 +124,7 @@ var bNodes = doc.body.childNodes; var bodyNodes = ""; for (i = 0; i < bNodes.length; i++) { - bodyNodes += DU.serializeNode(bNodes[i]); + bodyNodes += DU.serializeNode(bNodes[i]).str; } var htmlSpeChars = apiUtils.htmlSpecialChars(out); @@ -331,10 +331,9 @@ function sendRes(doc) { var contentType = 'text/html;profile=mediawiki.org/specs/html/1.0.0;charset=utf-8'; var output; - if ( v2 && v2.format === "pagebundle" ) { - var dpScriptElt = doc.getElementById('mw-data-parsoid'); - dpScriptElt.parentNode.removeChild(dpScriptElt); - output = DU.serializeNode( res.local('body') ? doc.body : doc ); + if (v2 && v2.format === 'pagebundle') { + var out = DU.extractDpAndSerialize(doc, res.local('body')); + output = out.str; apiUtils.jsonResponse(res, env, { // revid: 12345 (maybe?), html: { @@ -342,12 +341,12 @@ body: output }, "data-parsoid": { - headers: { 'content-type': dpScriptElt.getAttribute('type') }, - body: JSON.parse( dpScriptElt.text ) + headers: { 'content-type': out.type }, + body: out.dp } }); } else { - output = DU.serializeNode( res.local('body') ? doc.body : doc ); + output = DU.serializeNode(res.local('body') ? doc.body : doc).str; apiUtils.setHeader(res, env, 'content-type', contentType); apiUtils.endResponse(res, env, output); } @@ -693,7 +692,7 @@ var p = TemplateRequest.setPageSrcInfo(env, target, oldid).then( parse.bind( null, env, req, res ) ).then(function( doc ) { - doc = DU.parseHTML( DU.serializeNode(doc) ); + doc = DU.parseHTML(DU.serializeNode(doc).str); var comment = doc.createComment('rtSelserEditTestComment'); doc.body.appendChild(comment); return roundTripDiff( env, req, res, true, doc ); diff --git a/lib/XMLSerializer.js b/lib/XMLSerializer.js index d068353..da33174 100644 --- a/lib/XMLSerializer.js +++ b/lib/XMLSerializer.js @@ -3,8 +3,6 @@ */ 'use strict'; -var htmlns = 'http://www.w3.org/1999/xhtml'; - // nodeType constants var ELEMENT_NODE = 1; var ATTRIBUTE_NODE = 2; @@ -41,7 +39,7 @@ param: true, source: true, track: true, - wbr: true + wbr: true, }; /** @@ -55,14 +53,14 @@ noembed: true, noframes: true, plaintext: true, - noscript: true + noscript: true, }; // Elements that strip leading newlines // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#html-fragment-serialization-algorithm var newlineStrippingElements = { pre: true, textarea: true, - listing: true + listing: true, }; @@ -81,15 +79,15 @@ function serializeToString(node, options, accum) { var child; - switch (node.nodeType){ + switch (node.nodeType) { case ELEMENT_NODE: child = node.firstChild; var attrs = node.attributes; var len = attrs.length; var nodeName = node.tagName.toLowerCase(); var localName = node.localName; - accum('<' + localName); - for (var i = 0;i < len;i++) { + accum('<' + localName, node); + for (var i = 0; i < len; i++) { var attr = attrs.item(i); var singleQuotes, doubleQuotes; var useSingleQuotes = false; @@ -99,15 +97,17 @@ (attr.value.match(/'/g) || []).length) { // use single quotes accum(' ' + attr.name + "='" - + attr.value.replace(/[<&']/g, _xmlEncoder) + "'"); + + attr.value.replace(/[<&']/g, _xmlEncoder) + "'", + node); } else { // use double quotes accum(' ' + attr.name + '="' - + attr.value.replace(/[<&"]/g, _xmlEncoder) + '"'); + + attr.value.replace(/[<&"]/g, _xmlEncoder) + '"', + node); } } if (child || !emptyElements[nodeName]) { - accum('>'); + accum('>', node, 'start'); // if is cdata child node if (hasRawContent[nodeName]) { // TODO: perform context-sensitive escaping? @@ -118,7 +118,7 @@ // * < to \3c in <style> // ... if (child) { - accum(child.data); + accum(child.data, node); } } else { if (child && newlineStrippingElements[localName] @@ -128,16 +128,16 @@ * Text node whose character data has as its first * character a U+000A LINE FEED (LF) character, then * append a U+000A LINE FEED (LF) character. */ - accum('\n'); + accum('\n', node); } while (child) { serializeToString(child, options, accum); child = child.nextSibling; } } - accum('</' + localName + '>'); - }else { - accum('/>'); + accum('</' + localName + '>', node, 'end'); + } else { + accum('/>', node); } return; case DOCUMENT_NODE: @@ -149,7 +149,7 @@ } return; case TEXT_NODE: - return accum(node.data.replace(/[<&]/g, _xmlEncoder)); + return accum(node.data.replace(/[<&]/g, _xmlEncoder), node); case COMMENT_NODE: // According to // http://www.w3.org/TR/DOM-Parsing/#dfn-concept-serialize-xml @@ -157,18 +157,57 @@ // a "well-formed" XML comment. But we use entity encoding when // we create the comment node to ensure that node.data will always // be okay; see DOMUtils.encodeComment(). - return accum('<!--' + node.data + '-->'); + return accum('<!--' + node.data + '-->', node); default: - accum('??' + node.nodeName); + accum('??' + node.nodeName, node); } } + +// These two are from DU but there's a cyclic dependency. +function isElt(node) { + return node.nodeType === ELEMENT_NODE; +} +function isBody(node) { + return isElt(node) && node.nodeName === "BODY"; +} + +var accumOffsets = function(out, bit, node, flag) { + if (isBody(node)) { + out.str += bit; + if (flag === 'start') { + out.start = out.str.length; + } else if (flag === 'end') { + out.start = null; + out.uid = null; + } + } else if (!isElt(node) || out.start === null || !isBody(node.parentNode)) { + // In case you're wondering, out.start may never be set if body + // isn't a child of the node passed to serializeToString, or if it + // is the node itself but options.innerXML is true. + out.str += bit; + if (out.uid !== null) { + out.offsets[out.uid].html[1] += bit.length; + } + } else { + // Template siblings don't have ids, + // so associate them with preceding content. + out.uid = node.getAttribute('id') || out.uid; + console.assert(out.uid !== null); + if (!out.offsets.hasOwnProperty(out.uid)) { + var dt = out.str.length - out.start; + out.offsets[out.uid] = { html: [dt, dt] }; + } + out.str += bit; + out.offsets[out.uid].html[1] += bit.length; + } +}; function XMLSerializer() {} XMLSerializer.prototype.serializeToString = function(node, options) { - var buf = ''; - var accum = function(bit) { buf += bit; }; - + var out = { str: '', start: null, offsets: {}, uid: null }; + var accum = options.captureOffsets ? + accumOffsets.bind(null, out) : function(bit) { out.str += bit; }; if (options.innerXML) { var children = node.childNodes; for (var i = 0; i < children.length; i++) { @@ -177,7 +216,7 @@ } else { serializeToString(node, options, accum); } - return buf; + return out; }; diff --git a/lib/ext.Cite.js b/lib/ext.Cite.js index 6ac2632..d0a627c 100644 --- a/lib/ext.Cite.js +++ b/lib/ext.Cite.js @@ -5,8 +5,8 @@ 'use strict'; require('./core-upgrade.js'); -var Util = require( './mediawiki.Util.js' ).Util; -var DU = require( './mediawiki.DOMUtils.js').DOMUtils; +var Util = require('./mediawiki.Util.js').Util; +var DU = require('./mediawiki.DOMUtils.js').DOMUtils; var coreutil = require('util'); var defines = require('./mediawiki.parser.defines.js'); var entities = require('entities'); @@ -432,7 +432,7 @@ } else { DU.storeDataParsoid( span, dataParsoid ); DU.storeDataMw( span, dataMW ); - refsInReferencesHTML.push( DU.serializeNode(span), "\n" ); + refsInReferencesHTML.push(DU.serializeNode(span).str, '\n'); } // Keep the first content to compare multiple <ref>s with the same name. diff --git a/lib/mediawiki.DOMUtils.js b/lib/mediawiki.DOMUtils.js index ce0ab1f..dd5f03e 100644 --- a/lib/mediawiki.DOMUtils.js +++ b/lib/mediawiki.DOMUtils.js @@ -2178,28 +2178,23 @@ * result in more compact output than the standard double-quoted serialization. * * @param {Node} doc - * @param {Object} options: flags smartQuote, innerHTML - * @returns {string} + * @param {Object} options: flags smartQuote, innerXML, captureOffsets + * @returns {Object} */ DOMUtils.serializeNode = function(doc, options) { - var html; - if (!options) { - options = { - smartQuote: true, - innerXML: false - }; + if (!options) { options = {}; } + if (!options.hasOwnProperty('smartQuote')) { + options.smartQuote = true; } if (doc.nodeName === '#document') { - html = XMLSerializer.serializeToString(doc.documentElement, options); - } else { - html = XMLSerializer.serializeToString(doc, options); + doc = doc.documentElement; } - // ensure there's a doctype for documents - if (!options.innerXML && (doc.nodeName === '#document' || /^html$/i.test(doc.nodeName))) { - html = '<!DOCTYPE html>\n' + html; + var res = XMLSerializer.serializeToString(doc, options); + // Ensure there's a doctype for documents. + if (!options.innerXML && /^html$/i.test(doc.nodeName)) { + res.str = '<!DOCTYPE html>\n' + res.str; } - - return html; + return res; }; /** @@ -2224,7 +2219,7 @@ }; } options.innerXML = true; - return this.serializeNode(node, options); + return this.serializeNode(node, options).str; }; /** @@ -2586,7 +2581,7 @@ return env.pipelineFactory.parse( env, env.page.src ).then(function(doc) { - env.page.dom = DU.parseHTML(DU.serializeNode(doc)).body; + env.page.dom = DU.parseHTML(DU.serializeNode(doc).str).body; }, function(err) { env.log('error', 'Error while parsing original DOM.'); }); @@ -2611,6 +2606,25 @@ }).nodify(cb); }; +// Pull the data-parsoid script element out of the doc before serializing. +DOMUtils.extractDpAndSerialize = function(doc, justBody) { + var dpScriptElt = doc.getElementById('mw-data-parsoid'); + dpScriptElt.parentNode.removeChild(dpScriptElt); + var options = { captureOffsets: true }; + var out = DU.serializeNode(justBody ? doc.body : doc, options); + out.dp = JSON.parse(dpScriptElt.text); + out.type = dpScriptElt.getAttribute('type'); + // Add the wt offsets. + Object.keys(out.offsets).forEach(function(key) { + var dsr = out.dp.ids[key].dsr; + if (Util.isValidDSR(dsr)) { + out.offsets[key].wt = dsr.slice(0, 2); + } + }); + out.dp.sectionOffsets = out.offsets; + return out; +}; + if (typeof module === "object") { module.exports.DOMUtils = DOMUtils; diff --git a/tests/mocha/api.js b/tests/mocha/api.js index e8de620..bd8c7d3 100644 --- a/tests/mocha/api.js +++ b/tests/mocha/api.js @@ -277,6 +277,18 @@ .end(done); }); + it('should include captured offsets', function(done) { + request(api) + .get('v2/' + mockHost + '/pagebundle/Main_Page/1') + .expect(200) + .expect(function(res) { + res.body.should.have.property('html'); + res.body.should.have.property('data-parsoid'); + res.body['data-parsoid'].body.should.have.property('sectionOffsets'); + }) + .end(done); + }); + }); // end wt2html describe("html2wt", function() { diff --git a/tests/mocha/xmlserializer.js b/tests/mocha/xmlserializer.js new file mode 100644 index 0000000..22186f6 --- /dev/null +++ b/tests/mocha/xmlserializer.js @@ -0,0 +1,47 @@ +/*global describe, it*/ +'use strict'; + +var domino = require('domino'); +var XMLSerializer = require('../../lib/XMLSerializer.js'); + +var xmlserializer = new XMLSerializer(); + +describe('XML Serializer', function() { + it('should capture html offsets while serializing', function() { + var html = '<html><head><title>hi</title><body>' + + '<div id="123">ok<div id="234">nope</div></div>' + + '\n\n<!--comment--><div id="345">end</div></body></html>'; + var doc = domino.createDocument(html); + var options = { + smartQuote: true, + innerXML: false, + captureOffsets: true, + }; + var ret = xmlserializer.serializeToString(doc, options); + ret.should.have.property('offsets'); + ret.offsets.should.have.property('123'); + ret.offsets['123'].html.should.eql([0, 62]); + ret.offsets.should.not.have.property('234'); + ret.offsets.should.have.property('345'); + ret.offsets['345'].html.should.eql([62, 85]); + }); + it('should handle templates properly while capturing offsets', function() { + var html = '<html><head><title>hi</title><body>' + + '<p about="#mwt1" typeof="mw:Transclusion" id="mwAQ">a</p>' + + '<p about="#mwt1">b</p>' + + '<p id="mwAg">c</p>' + + '</body></html>'; + var doc = domino.createDocument(html); + var options = { + smartQuote: true, + innerXML: false, + captureOffsets: true, + }; + var ret = xmlserializer.serializeToString(doc, options); + ret.should.have.property('offsets'); + ret.offsets.should.have.property('mwAQ'); + ret.offsets.should.have.property('mwAg'); + ret.offsets.mwAQ.html.should.eql([0, 79]); + ret.offsets.mwAg.html.should.eql([79, 97]); + }); +}); diff --git a/tests/parse.js b/tests/parse.js index 3f99cb1..dd146c1 100755 --- a/tests/parse.js +++ b/tests/parse.js @@ -165,11 +165,11 @@ // used in Parsoid JS API, return document out = doc; } else { - out = DU.serializeNode(doc); + out = DU.serializeNode(doc).str; } return { trailingNL: true, out: out }; } else { - return startsAtHTML(argv, env, DU.serializeNode(doc)); + return startsAtHTML(argv, env, DU.serializeNode(doc).str); } }); }; diff --git a/tests/parserTests.js b/tests/parserTests.js index 4125bdd..da8e2fe 100755 --- a/tests/parserTests.js +++ b/tests/parserTests.js @@ -979,7 +979,7 @@ // so we can maybe skip them later testTasks.push( function( body, cb ) { // Cache parsed HTML - item.cachedBODY = DU.parseHTML( DU.serializeNode( body ) ).body; + item.cachedBODY = DU.parseHTML(DU.serializeNode(body).str).body; // - In wt2html mode, pass through original DOM // so that it is serialized just once. @@ -1023,7 +1023,7 @@ // Save the modified DOM so we can re-test it later // Always serialize to string and reparse before passing to selser/wt2wt testTasks.push( function( body, cb ) { - item.changedHTMLStr = DU.serializeNode( body ); + item.changedHTMLStr = DU.serializeNode(body).str; cb( null, DU.parseHTML( item.changedHTMLStr ).body ); } ); } else if ( mode === 'wt2wt' ) { diff --git a/tests/roundtrip-test.js b/tests/roundtrip-test.js index ba99e12..50e7182 100755 --- a/tests/roundtrip-test.js +++ b/tests/roundtrip-test.js @@ -456,7 +456,7 @@ origOut = res ? res.nodes : []; for (k = 0; k < origOut.length; k++) { // node need not be an element always! - origOrigHTML += DU.serializeNode(origOut[k], { smartQuote: false }); + origOrigHTML += DU.serializeNode(origOut[k], { smartQuote: false }).str; } // Normalize away <br/>'s added by Parsoid because of newlines in wikitext origHTML = normalizeWS(DU.formatHTML(DU.normalizeOut(origOrigHTML))); @@ -465,7 +465,7 @@ newOut = res ? res.nodes : []; for (k = 0; k < newOut.length; k++) { // node need not be an element always! - origNewHTML += DU.serializeNode(newOut[k], { smartQuote: false }); + origNewHTML += DU.serializeNode(newOut[k], { smartQuote: false }).str; } // Normalize away <br/>'s added by Parsoid because of newlines in wikitext newHTML = normalizeWS(DU.formatHTML(DU.normalizeOut(origNewHTML))); -- To view, visit https://gerrit.wikimedia.org/r/209253 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Id74988d3ef39078fbfea72359884b75290da041b Gerrit-PatchSet: 8 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: GWicke <gwi...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits