Subramanya Sastry has uploaded a new change for review. https://gerrit.wikimedia.org/r/51802
Change subject: WIP: Incomplete; Higher priority ext content parsing. ...................................................................... WIP: Incomplete; Higher priority ext content parsing. * Committing expt. work in progress. Will continue Monday. Change-Id: I82e29701875dde7702c504416f4a9487387dbe19 --- M js/lib/mediawiki.tokenizer.peg.js M js/lib/pegTokenizer.pegjs.txt 2 files changed, 69 insertions(+), 6 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/02/51802/1 diff --git a/js/lib/mediawiki.tokenizer.peg.js b/js/lib/mediawiki.tokenizer.peg.js index a9eb410..e05a6a2 100644 --- a/js/lib/mediawiki.tokenizer.peg.js +++ b/js/lib/mediawiki.tokenizer.peg.js @@ -200,9 +200,14 @@ */ PegTokenizer.prototype.tokenize = function( text, production ) { try { - var args = { cb: null, pegTokenizer: this, srcOffset: 0, env: this.env }; - return this.tokenizer.tokenize(text, production || "start", args); + var toks = []; + var args = { cb: function(r) { toks = toks.concat(r); }, pegTokenizer: this, srcOffset: 0, env: this.env }; + // console.warn("text:" + JSON.stringify(text)); + this.tokenizer.tokenize(text, production || "start", args); + return toks; } catch ( e ) { + // console.warn("exc: " + e); + // console.warn("trace: " + e.stack); return false; } }; diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt index 2bbadc5..f127166 100644 --- a/js/lib/pegTokenizer.pegjs.txt +++ b/js/lib/pegTokenizer.pegjs.txt @@ -164,6 +164,18 @@ } } + function charSequence(prefix, c, numChars) { + if (numChars && numChars > 0) { + var buf = [prefix]; + for (var i = 0; i < numChars; i++) { + buf.push(c); + } + return buf.join(''); + } else { + return prefix; + } + } + /* End static utilities */ /* @@ -298,6 +310,8 @@ // text start position var textStart = 0; + + var currExtensionTag = null; // Define block-level tags in JS, so we can use toLowerCase to match tags // case-independently. This would be quite ugly (and possibly slower) if @@ -631,7 +645,8 @@ inline_element = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; } - & '<' ( comment / xmlish_tag ) + & '<' xmlish_tag + / & '<' comment /// & '{' ( & '{{{{{' template / tplarg / template ) / & '{' tplarg_or_template /// & '{' ( tplarg / template ) @@ -1299,19 +1314,62 @@ * and limit the content parsing to this section. */ xmlish_tag = nowiki - / t:generic_tag { + / t2:(t:generic_tag { var tagName = t.name; + // TagTk and SelfclosingTagTk if (t.constructor !== EndTagTk && !html5_tag_names[tagName] && !html_old_names[tagName]) { if (t.constructor === TagTk) { - var restOfInput = input.substring(t.dataAttribs.tsr[0]); - var tagContent = restOfInput.match(new RegExp("^(.|\n)*?</\s*" + tagName + ">", "m")); + var tsr0 = t.dataAttribs.tsr[0], + restOfInput = input.substring(tsr0), + tagContent = restOfInput.match(new RegExp("^(.|\n)*?(</\s*" + tagName + ">)", "m")); t.dataAttribs.src = tagContent ? tagContent[0] : restOfInput; + + // Replace extension content with dummy content so it matches the + // rule following this match and can be tokenized independently (if required). + // This is just a trick to tokenize ref content with higher priority. + var startTagLen = pos-tsr0, + extContentLen = t.dataAttribs.src.length - startTagLen - tagContent[2].length, + extContent = t.dataAttribs.src.substring(startTagLen, startTagLen+extContentLen); + + t.dataAttribs.extContentOffset = pos; + t.dataAttribs.extContent = extContent; + input = input.slice(0,pos) + charSequence('', '#', extContentLen) + input.slice(pos+extContentLen); } else { t.dataAttribs.src = input.substring(t.dataAttribs.tsr[0], t.dataAttribs.tsr[1]); } + t.dataAttribs.isExt = true; } + + currExtensionTag = t; return t; + }) ( + '#'+ { + // Should only match if we tricked the tokenizer + return currExtensionTag.dataAttribs.isExt && currExtensionTag.constructor !== SelfclosingTagTk; + } + / & { + // Should not match if we tricked the tokenizer + return !currExtensionTag.dataAttribs.isExt || currExtensionTag.constructor === SelfclosingTagTk; + } + ) { + var ret = t2; + if (t2.dataAttribs.isExt && t2.name === 'ref') { + // Parse ref-content, strip eof, and shift tsr + var extContentToks = (new PegTokenizer(pegArgs.env)).tokenize(t2.dataAttribs.extContent); + extContentToks = Util.stripEOFTkfromTokens(extContentToks); + Util.shiftTokenTSR(extContentToks, t2.dataAttribs.extContentOffset); + + // Clear temporary state + t2.dataAttribs.isExt = undefined; + t2.dataAttribs.extContent = undefined; + + ret = [t2].concat(extContentToks); + } + + currExtensionTag = null; + + return ret; } /* -- To view, visit https://gerrit.wikimedia.org/r/51802 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I82e29701875dde7702c504416f4a9487387dbe19 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits