Marcoil has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/184359

Change subject: WIP: Parallelize all template expansions to other Parsoid 
instances.
......................................................................

WIP: Parallelize all template expansions to other Parsoid instances.

TODO: Fix a problem that appears when, in a large page, there are
multiple instances of the same template and params very near each
other. A synthetic test would be

Foo {{echo|a}}
Bar {{echo|a}}
Baz {{echo|a}}

This example sometimes presents minor DSR warnings, but when
parsing [[en:Euro]], there are 3 {{Citation needed}} with the same
params that are output with the same about id and the same
data-parsoid, creating much bigger problems.

Change-Id: I983d02e32f8c93ac43ad2f0d498c7a720bcbeb23
---
M api/routes.js
M lib/ext.Cite.js
M lib/ext.core.TemplateHandler.js
M lib/ext.core.TokenStreamPatcher.js
M lib/mediawiki.ApiRequest.js
M lib/mediawiki.parser.js
6 files changed, 99 insertions(+), 24 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/59/184359/1

diff --git a/api/routes.js b/api/routes.js
index c228ee3..8206f0c 100644
--- a/api/routes.js
+++ b/api/routes.js
@@ -365,9 +365,13 @@
                        env.page.name = '';
                }
                return new Promise(function( resolve, reject ) {
-                       var pipelineType = (res.local("format") === "tokens")?
-                                       'text/x-mediawiki/tokens' : 
'text/x-mediawiki/full',
-                               parser = 
env.pipelineFactory.getPipeline(pipelineType);
+                       var parser;
+                       if (res.local('format') === 'tokens') {
+                               parser = 
env.pipelineFactory.getPipeline('text/x-mediawiki/tokens',
+                                       {parallelizeTemplates: false});
+                       } else {
+                               parser = 
env.pipelineFactory.getPipeline('text/x-mediawiki/full');
+                       }
                        parser.once('document', function( doc ) {
                                // Don't cache requests when wt is set in case 
somebody uses
                                // GET for wikitext parsing
diff --git a/lib/ext.Cite.js b/lib/ext.Cite.js
index 7d1c704..a8e01da 100644
--- a/lib/ext.Cite.js
+++ b/lib/ext.Cite.js
@@ -45,7 +45,8 @@
                opts.parentCB({tokens: opts.res, async: true});
 
                // Wrap templates always
-               opts.pipelineOpts = Util.extendProps({}, opts.pipelineOpts, { 
wrapTemplates: true });
+               opts.pipelineOpts = Util.extendProps({}, opts.pipelineOpts,
+                       { wrapTemplates: true, parallelizeTemplates: false });
 
                var tsr = extToken.dataAttribs.tsr;
                opts.srcOffsets = [ tsr[0]+tagWidths[0]+leadingWS.length, 
tsr[1]-tagWidths[1] ];
diff --git a/lib/ext.core.TemplateHandler.js b/lib/ext.core.TemplateHandler.js
index fc194b8..c366edf 100644
--- a/lib/ext.core.TemplateHandler.js
+++ b/lib/ext.core.TemplateHandler.js
@@ -18,6 +18,7 @@
        TemplateRequest = require('./mediawiki.ApiRequest.js').TemplateRequest,
        api = require('./mediawiki.ApiRequest.js'),
        PreprocessorRequest = api.PreprocessorRequest,
+       ParsoidTokensRequest = api.ParsoidTokensRequest,
        Util = require('./mediawiki.Util.js').Util,
        DU = require('./mediawiki.DOMUtils.js').DOMUtils,
        async = require('async');
@@ -135,25 +136,34 @@
 
                                cb({ tokens: toks });
                        } else {
-                               // Use a TokenAccumulator to divide the 
template processing
-                               // in two parts: The child part will take care 
of the main
-                               // template element (including parameters) and 
the sibling
-                               // will process the returned template expansion
-                               state.accum = new 
TokenAccumulator(this.manager, cb);
-                               accumReceiveToksFromSibling = 
state.accum.receiveToksFromSibling.bind(state.accum);
-                               accumReceiveToksFromChild = 
state.accum.receiveToksFromChild.bind(state.accum);
-                               var srcHandler = state.srcCB.bind(
-                                               this, state, frame,
-                                               accumReceiveToksFromSibling,
-                                               { name: templateName, attribs: 
[], cacheKey: text });
+                               // console.warn(process.pid, "target:", 
tgt.target, "parallelize:", this.options.parallelizeTemplates);
+                               if (this.options.parallelizeTemplates) {
+                                       this.fetchExpandedTpl(env.page.name || 
'Main_Page',
+                                               text, ParsoidTokensRequest,
+                                               cb, 
this.mergeExternalExpandedTpl.bind(
+                                                       this, text, 
state.wrappedObjectId,
+                                                       
Util.clone(state.token.dataAttribs.tsr), cb));
+                               } else {
+                                       // Use a TokenAccumulator to divide the 
template processing
+                                       // in two parts: The child part will 
take care of the main
+                                       // template element (including 
parameters) and the sibling
+                                       // will process the returned template 
expansion
+                                       state.accum = new 
TokenAccumulator(this.manager, cb);
+                                       accumReceiveToksFromSibling = 
state.accum.receiveToksFromSibling.bind(state.accum);
+                                       accumReceiveToksFromChild = 
state.accum.receiveToksFromChild.bind(state.accum);
+                                       var srcHandler = state.srcCB.bind(
+                                                       this, state, frame,
+                                                       
accumReceiveToksFromSibling,
+                                                       { name: templateName, 
attribs: [], cacheKey: text });
+                                       // Process the main template element
+                                       this._encapsulateTemplate(state,
+                                               accumReceiveToksFromChild);
 
-                               // Process the main template element
-                               this._encapsulateTemplate(state,
-                                       accumReceiveToksFromChild);
-                               // Fetch and process the template expansion
-                               this.fetchExpandedTpl( env.page.name || '',
+                                       // Fetch and process the template 
expansion
+                                       this.fetchExpandedTpl( env.page.name || 
'',
                                                text, PreprocessorRequest,
                                                accumReceiveToksFromSibling, 
srcHandler);
+                               }
                        }
                } else {
                        // We don't perform recursive template expansion- 
something
@@ -768,7 +778,8 @@
                                // but it's not be the correct solution
                                // for cases with significant start-of-line
                                // chars inserted after "\n".
-                               noPre: true
+                               noPre: true,
+                               parallelizeTemplates: false
                        },
                        srcOffsets: [ srcStart, srcEnd ],
                        documentCB: function (html) {
@@ -1150,6 +1161,62 @@
        }
 };
 
+/**
+ * After a template expansion has returned from a different Parsoid instance,
+ * treat it so it can be incorporated into this pipeline correctly.
+ *
+ */
+TemplateHandler.prototype.mergeExternalExpandedTpl = function (text, 
wrappedObjectId, tsr, cb, err, data) {
+       console.warn("id", wrappedObjectId, "returned for tsr:", tsr);
+       var tokens = Util.JSONToTokens(data),
+               token, chunk;
+
+       // console.warn("\ntokens start:", JSON.stringify(tokens));
+       // Remove the ending EOFTk
+       tokens = Util.stripEOFTkfromTokens(tokens);
+       // Remove the surrounding <p> if it's not the template expansion itself
+       token = tokens[0];
+       if (token.constructor === defines.TagTk && token.name === "p") {
+       //      console.warn("\tremoving p:", token);
+               tokens.splice(0, 1);
+       }
+       token = tokens.last();
+       if (token.constructor === defines.EndTagTk && token.name === "p") {
+       //      console.warn("\tremoving p:", token);
+               tokens.pop();
+       }
+
+       // Set the correct tsr
+       token = tokens[0];
+       if (typeof token === "object" && token.dataAttribs) {
+               token.dataAttribs.tsr = tsr;
+       }
+       token = tokens.last();
+       if (typeof token === "object" && token.dataAttribs) {
+               token.dataAttribs.tsr = [null, tsr ? tsr[1] : null];
+       }
+
+       // Set the correct about id
+       for (var i in tokens) {
+               token = tokens[i];
+       //      console.warn("\ttoken:", JSON.stringify(token));
+               if (typeof token === "object" && token.attribs) {
+                       for (var a in token.attribs) {
+                               if (token.attribs[a].k === 'about') {
+                                       token.attribs[a].v = '#' + 
wrappedObjectId;
+                               }
+                       }
+               }
+       }
+       chunk = {
+               tokens: tokens,
+               async: false
+       };
+       chunk.rank = this.rank;
+       console.warn("tokens end:", JSON.stringify(tokens));
+       cb(chunk);
+};
+
 /* ********************** Template argument expansion ****************** */
 
 /**
diff --git a/lib/ext.core.TokenStreamPatcher.js 
b/lib/ext.core.TokenStreamPatcher.js
index 4b91bbd..3bdcf35 100644
--- a/lib/ext.core.TokenStreamPatcher.js
+++ b/lib/ext.core.TokenStreamPatcher.js
@@ -57,8 +57,8 @@
        this.sol = true;
 };
 
-TokenStreamPatcher.prototype.onNewline = function(token) {
-       this.srcOffset = (token.dataAttribs.tsr || [null,null])[1];
+TokenStreamPatcher.prototype.onNewline = function(token, mgr, prevToken) {
+       this.srcOffset = ((token.dataAttribs && token.dataAttribs.tsr) || 
[null,null])[1];
        this.sol = true;
        return {tokens: [token]};
 };
diff --git a/lib/mediawiki.ApiRequest.js b/lib/mediawiki.ApiRequest.js
index 05513621..d99cbea 100644
--- a/lib/mediawiki.ApiRequest.js
+++ b/lib/mediawiki.ApiRequest.js
@@ -932,7 +932,7 @@
        this.env.tp('Parsoid tokens ', this.text, JSON.stringify(data));
 
        this._processListeners(err, data);
-}
+};
 
 if (typeof module === "object") {
        module.exports.ConfigRequest = ConfigRequest;
diff --git a/lib/mediawiki.parser.js b/lib/mediawiki.parser.js
index dae65fe..bdfc32a 100644
--- a/lib/mediawiki.parser.js
+++ b/lib/mediawiki.parser.js
@@ -206,6 +206,9 @@
        if (options.wrapTemplates === undefined) {
                options.wrapTemplates = true;
        }
+       if (options.parallelizeTemplates === undefined) {
+               options.parallelizeTemplates = true;
+       }
 
        if (options.cacheKey === undefined) {
                options.cacheKey = null;

-- 
To view, visit https://gerrit.wikimedia.org/r/184359
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I983d02e32f8c93ac43ad2f0d498c7a720bcbeb23
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: parallel_tpl
Gerrit-Owner: Marcoil <marc...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to