jenkins-bot has submitted this change and it was merged.

Change subject: Some more tweaks to the parsing limits patch
......................................................................
Some more tweaks to the parsing limits patch * Enforce resource limits in emitChunk because the backtracking cache is enabled selectively in the tokenizer. * Reduce list item default limit to 30K * Increase transclusion limit to 10K. Change-Id: Ib21b152b239b1262778aa9dcb0081ceaccf35867 --- M lib/config/ParsoidConfig.js M lib/wt2html/pegTokenizer.pegjs.txt M lib/wt2html/tokenizer.utils.js 3 files changed, 33 insertions(+), 14 deletions(-) Approvals: Arlolra: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/config/ParsoidConfig.js b/lib/config/ParsoidConfig.js index 976042a..4a5182a 100644 --- a/lib/config/ParsoidConfig.js +++ b/lib/config/ParsoidConfig.js @@ -77,22 +77,23 @@ parsoidCacheReq: 0, }, - // SSS FIXME: Somewhat arbitrary numbers for starters. - // If these limits are breached, we return a http 500. + // Somewhat arbitrary numbers for starters. + // If these limits are breached, we return a http 413 (Payload too large) limits: { wt2html: { // We won't handle pages beyond this size maxWikitextSize: 1000000, // 1M // Max list items per page - maxListItems: 40000, + maxListItems: 30000, // Max table cells per page maxTableCells: 30000, // Max transclusions per page - maxTransclusions: 5000, + maxTransclusions: 10000, + // DISABLED for now // Max images per page maxImages: 1000, diff --git a/lib/wt2html/pegTokenizer.pegjs.txt b/lib/wt2html/pegTokenizer.pegjs.txt index 9524262..07e5c4a 100644 --- a/lib/wt2html/pegTokenizer.pegjs.txt +++ b/lib/wt2html/pegTokenizer.pegjs.txt @@ -50,12 +50,20 @@ // Shift tsr of all tokens by the pipeline offset Util.shiftTokenTSR(tokens, options.pipelineOffset); env.log("trace/peg", options.pegTokenizer.pipelineId, "----> ", tokens); + + var i; + var n = tokens.length; + + // Enforce parsing resource limits + for (i = 0; i < n; i++) { + tu.enforceParserResourceLimits(env, tokens[i]); + } + // limit the size of individual chunks var chunkLimit = 100000; - if (tokens.length > chunkLimit) { - var i = 0; - var l = 
tokens.length; - while (i < l) { + if (n > chunkLimit) { + i = 0; + while (i < n) { options.cb(tokens.slice(i, i + chunkLimit)); i += chunkLimit; } @@ -658,7 +666,6 @@ // again. params.unshift(new KV(tu.flattenIfArray(target.tokens), '', target.srcOffsets)); var obj = new SelfclosingTagTk('template', params, { tsr: tsrOffsets(), src: text() }); - env.bumpParserResourceUse('transclusion'); return obj; } @@ -1394,7 +1401,6 @@ var tsr = tsrOffsets('start'); tsr[1] += bullets.length; var li = new TagTk('listItem', [], { tsr: tsr }); - env.bumpParserResourceUse('listItem'); li.bullets = bullets; return [ li, c ]; } @@ -1416,7 +1422,6 @@ var tsr = tsrOffsets('start'); tsr[1] += bullets.length; var li = new TagTk('listItem', [], { tsr: tsr }); - env.bumpParserResourceUse('listItem'); li.bullets = bullets; return tu.flattenIfArray([li, tbl || [], s || []]); } @@ -1442,7 +1447,6 @@ li1.bullets.push(";"); // TSR: -1 for the intermediate ":" var li2 = new TagTk('listItem', [], { tsr: [cpos - 1, cpos], stx: 'row' }); - env.bumpParserResourceUse('listItem', 2); li2.bullets = bullets.slice(); li2.bullets.push(":"); @@ -1672,7 +1676,6 @@ tagEndPos:("" { return endOffset(); }) td:nested_block_in_table* { - env.bumpParserResourceUse('tableCell'); return tu.buildTableTokens("td", "|", arg, [startOffset(), tagEndPos], endOffset(), td); } @@ -1711,7 +1714,6 @@ } return true; } d:nested_block_in_table { return d; } )* { - env.bumpParserResourceUse('tableCell'); return tu.buildTableTokens("th", "!", arg, [startOffset(), tagEndPos], endOffset(), c); } diff --git a/lib/wt2html/tokenizer.utils.js b/lib/wt2html/tokenizer.utils.js index 986454c..e746129 100644 --- a/lib/wt2html/tokenizer.utils.js +++ b/lib/wt2html/tokenizer.utils.js @@ -306,6 +306,22 @@ } }, + enforceParserResourceLimits: function(env, token) { + if (token && (token.constructor === TagTk || token.constructor === SelfclosingTagTk)) { + switch (token.name) { + case 'listItem': + env.bumpParserResourceUse('listItem'); + break; 
+ case 'template': + env.bumpParserResourceUse('transclusion'); + break; + case 'td': + case 'th': + env.bumpParserResourceUse('tableCell'); + break; + } + } + }, }; -- To view, visit https://gerrit.wikimedia.org/r/258469 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ib21b152b239b1262778aa9dcb0081ceaccf35867 Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits