Marcoil has uploaded a new change for review. https://gerrit.wikimedia.org/r/184359
Change subject: WIP: Parallelize all template expansions to other Parsoid instances. ...................................................................... WIP: Parallelize all template expansions to other Parsoid instances. TODO: Fix a problem that appears when, in a large page, there are multiple instances of the same template and params very near each other. A synthetic test would be Foo {{echo|a}} Bar {{echo|a}} Baz {{echo|a}} This example sometimes presents minor DSR warnings, but when parsing [[en:Euro]], there are 3 {{Citation needed}} with the same params that are output with the same about id and the same data-parsoid, creating much bigger problems. Change-Id: I983d02e32f8c93ac43ad2f0d498c7a720bcbeb23 --- M api/routes.js M lib/ext.Cite.js M lib/ext.core.TemplateHandler.js M lib/ext.core.TokenStreamPatcher.js M lib/mediawiki.ApiRequest.js M lib/mediawiki.parser.js 6 files changed, 99 insertions(+), 24 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/59/184359/1 diff --git a/api/routes.js b/api/routes.js index c228ee3..8206f0c 100644 --- a/api/routes.js +++ b/api/routes.js @@ -365,9 +365,13 @@ env.page.name = ''; } return new Promise(function( resolve, reject ) { - var pipelineType = (res.local("format") === "tokens")? - 'text/x-mediawiki/tokens' : 'text/x-mediawiki/full', - parser = env.pipelineFactory.getPipeline(pipelineType); + var parser; + if (res.local('format') === 'tokens') { + parser = env.pipelineFactory.getPipeline('text/x-mediawiki/tokens', + {parallelizeTemplates: false}); + } else { + parser = env.pipelineFactory.getPipeline('text/x-mediawiki/full'); + } parser.once('document', function( doc ) { // Don't cache requests when wt is set in case somebody uses // GET for wikitext parsing diff --git a/lib/ext.Cite.js b/lib/ext.Cite.js index 7d1c704..a8e01da 100644 --- a/lib/ext.Cite.js +++ b/lib/ext.Cite.js @@ -45,7 +45,8 @@ opts.parentCB({tokens: opts.res, async: true}); // Wrap templates always - opts.pipelineOpts = Util.extendProps({}, opts.pipelineOpts, { wrapTemplates: true }); + opts.pipelineOpts = Util.extendProps({}, opts.pipelineOpts, + { wrapTemplates: true, parallelizeTemplates: false }); var tsr = extToken.dataAttribs.tsr; opts.srcOffsets = [ tsr[0]+tagWidths[0]+leadingWS.length, tsr[1]-tagWidths[1] ]; diff --git a/lib/ext.core.TemplateHandler.js b/lib/ext.core.TemplateHandler.js index fc194b8..c366edf 100644 --- a/lib/ext.core.TemplateHandler.js +++ b/lib/ext.core.TemplateHandler.js @@ -18,6 +18,7 @@ TemplateRequest = require('./mediawiki.ApiRequest.js').TemplateRequest, api = require('./mediawiki.ApiRequest.js'), PreprocessorRequest = api.PreprocessorRequest, + ParsoidTokensRequest = api.ParsoidTokensRequest, Util = require('./mediawiki.Util.js').Util, DU = require('./mediawiki.DOMUtils.js').DOMUtils, async = require('async'); @@ -135,25 +136,34 @@ cb({ tokens: toks }); } else { - // Use a TokenAccumulator to divide the template processing - // in two parts: The child part will take care of the main - // template element (including parameters) and the sibling - // will process the returned template expansion - state.accum = new TokenAccumulator(this.manager, cb); - accumReceiveToksFromSibling = state.accum.receiveToksFromSibling.bind(state.accum); - accumReceiveToksFromChild = state.accum.receiveToksFromChild.bind(state.accum); - var srcHandler = state.srcCB.bind( - this, state, frame, - accumReceiveToksFromSibling, - { name: templateName, attribs: [], cacheKey: text }); + // console.warn(process.pid, "target:", tgt.target, "parallelize:", this.options.parallelizeTemplates); + if (this.options.parallelizeTemplates) { + this.fetchExpandedTpl(env.page.name || 'Main_Page', + text, ParsoidTokensRequest, + cb, this.mergeExternalExpandedTpl.bind( + this, text, state.wrappedObjectId, + Util.clone(state.token.dataAttribs.tsr), cb)); + } else { + // Use a TokenAccumulator to divide the template processing + // in two parts: The child part will take care of the main + // template element (including parameters) and the sibling + // will process the returned template expansion + state.accum = new TokenAccumulator(this.manager, cb); + accumReceiveToksFromSibling = state.accum.receiveToksFromSibling.bind(state.accum); + accumReceiveToksFromChild = state.accum.receiveToksFromChild.bind(state.accum); + var srcHandler = state.srcCB.bind( + this, state, frame, + accumReceiveToksFromSibling, + { name: templateName, attribs: [], cacheKey: text }); + // Process the main template element + this._encapsulateTemplate(state, + accumReceiveToksFromChild); - // Process the main template element - this._encapsulateTemplate(state, - accumReceiveToksFromChild); - // Fetch and process the template expansion - this.fetchExpandedTpl( env.page.name || '', + // Fetch and process the template expansion + this.fetchExpandedTpl( env.page.name || '', text, PreprocessorRequest, accumReceiveToksFromSibling, srcHandler); + } } } else { // We don't perform recursive template expansion- something @@ -768,7 +778,8 @@ // but it's not be the correct solution // for cases with significant start-of-line // chars inserted after "\n". - noPre: true + noPre: true, + parallelizeTemplates: false }, srcOffsets: [ srcStart, srcEnd ], documentCB: function (html) { @@ -1150,6 +1161,62 @@ } }; +/** + * After a template expansion has returned from a different Parsoid instance, + * treat it so it can be incorporated into this pipeline correctly. + * + */ +TemplateHandler.prototype.mergeExternalExpandedTpl = function (text, wrappedObjectId, tsr, cb, err, data) { + console.warn("id", wrappedObjectId, "returned for tsr:", tsr); + var tokens = Util.JSONToTokens(data), + token, chunk; + + // console.warn("\ntokens start:", JSON.stringify(tokens)); + // Remove the ending EOFTk + tokens = Util.stripEOFTkfromTokens(tokens); + // Remove the surrounding <p> if it's not the template expansion itself + token = tokens[0]; + if (token.constructor === defines.TagTk && token.name === "p") { + // console.warn("\tremoving p:", token); + tokens.splice(0, 1); + } + token = tokens.last(); + if (token.constructor === defines.EndTagTk && token.name === "p") { + // console.warn("\tremoving p:", token); + tokens.pop(); + } + + // Set the correct tsr + token = tokens[0]; + if (typeof token === "object" && token.dataAttribs) { + token.dataAttribs.tsr = tsr; + } + token = tokens.last(); + if (typeof token === "object" && token.dataAttribs) { + token.dataAttribs.tsr = [null, tsr ? tsr[1] : null]; + } + + // Set the correct about id + for (var i in tokens) { + token = tokens[i]; + // console.warn("\ttoken:", JSON.stringify(token)); + if (typeof token === "object" && token.attribs) { + for (var a in token.attribs) { + if (token.attribs[a].k === 'about') { + token.attribs[a].v = '#' + wrappedObjectId; + } + } + } + } + chunk = { + tokens: tokens, + async: false + }; + chunk.rank = this.rank; + console.warn("tokens end:", JSON.stringify(tokens)); + cb(chunk); +}; + /* ********************** Template argument expansion ****************** */ /** diff --git a/lib/ext.core.TokenStreamPatcher.js b/lib/ext.core.TokenStreamPatcher.js index 4b91bbd..3bdcf35 100644 --- a/lib/ext.core.TokenStreamPatcher.js +++ b/lib/ext.core.TokenStreamPatcher.js @@ -57,8 +57,8 @@ this.sol = true; }; -TokenStreamPatcher.prototype.onNewline = function(token) { - this.srcOffset = (token.dataAttribs.tsr || [null,null])[1]; +TokenStreamPatcher.prototype.onNewline = function(token, mgr, prevToken) { + this.srcOffset = ((token.dataAttribs && token.dataAttribs.tsr) || [null,null])[1]; this.sol = true; return {tokens: [token]}; }; diff --git a/lib/mediawiki.ApiRequest.js b/lib/mediawiki.ApiRequest.js index 05513621..d99cbea 100644 --- a/lib/mediawiki.ApiRequest.js +++ b/lib/mediawiki.ApiRequest.js @@ -932,7 +932,7 @@ this.env.tp('Parsoid tokens ', this.text, JSON.stringify(data)); this._processListeners(err, data); -} +}; if (typeof module === "object") { module.exports.ConfigRequest = ConfigRequest; diff --git a/lib/mediawiki.parser.js b/lib/mediawiki.parser.js index dae65fe..bdfc32a 100644 --- a/lib/mediawiki.parser.js +++ b/lib/mediawiki.parser.js @@ -206,6 +206,9 @@ if (options.wrapTemplates === undefined) { options.wrapTemplates = true; } + if (options.parallelizeTemplates === undefined) { + options.parallelizeTemplates = true; + } if (options.cacheKey === undefined) { options.cacheKey = null; -- To view, visit https://gerrit.wikimedia.org/r/184359 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I983d02e32f8c93ac43ad2f0d498c7a720bcbeb23 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: parallel_tpl Gerrit-Owner: Marcoil <marc...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits