[MediaWiki-commits] [Gerrit] WIP: Incomplete; Higher priority ext content parsing. - change (mediawiki...Parsoid)

Subramanya Sastry (Code Review) Fri, 01 Mar 2013 15:22:07 -0800

Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/51802



Change subject: WIP: Incomplete; Higher priority ext content parsing.
......................................................................

WIP: Incomplete; Higher priority ext content parsing.

* Committing expt. work in progress. Will continue Monday.

Change-Id: I82e29701875dde7702c504416f4a9487387dbe19
---
M js/lib/mediawiki.tokenizer.peg.js
M js/lib/pegTokenizer.pegjs.txt
2 files changed, 69 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/02/51802/1

diff --git a/js/lib/mediawiki.tokenizer.peg.js b/js/lib/mediawiki.tokenizer.peg.js
index a9eb410..e05a6a2 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -200,9 +200,14 @@
  */
 PegTokenizer.prototype.tokenize = function( text, production ) {
        try {
-               var args = { cb: null, pegTokenizer: this, srcOffset: 0, env: this.env };
-               return this.tokenizer.tokenize(text, production || "start", args);
+               var toks = [];
+               var args = { cb: function(r) { toks = toks.concat(r); }, pegTokenizer: this, srcOffset: 0, env: this.env };
+               // console.warn("text:" + JSON.stringify(text));
+               this.tokenizer.tokenize(text, production || "start", args);
+               return toks;
        } catch ( e ) {
+               // console.warn("exc: " + e);
+               // console.warn("trace: " + e.stack);
                return false;
        }
 };
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index 2bbadc5..f127166 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -164,6 +164,18 @@
         }
     }
 
+    function charSequence(prefix, c, numChars) {
+        if (numChars && numChars > 0) {
+            var buf = [prefix];
+            for (var i = 0; i < numChars; i++) {
+                buf.push(c);
+            }
+            return buf.join('');
+        } else {
+            return prefix;
+        }
+    }
+
     /* End static utilities */
 
     /*
@@ -298,6 +310,8 @@
 
     // text start position
     var textStart = 0;
+
+    var currExtensionTag = null;
 
     // Define block-level tags in JS, so we can use toLowerCase to match tags
     // case-independently. This would be quite ugly (and possibly slower) if
@@ -631,7 +645,8 @@
 
 inline_element
   = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
-      & '<' ( comment / xmlish_tag )
+      & '<' xmlish_tag
+    / & '<' comment
     /// & '{' ( & '{{{{{' template / tplarg / template )
     / & '{' tplarg_or_template
     /// & '{' ( tplarg / template )
@@ -1299,19 +1314,62 @@
  * and limit the content parsing to this section. */
 
 xmlish_tag = nowiki
-    / t:generic_tag {
+    / t2:(t:generic_tag {
         var tagName = t.name;
+
         // TagTk and SelfclosingTagTk
         if (t.constructor !== EndTagTk && !html5_tag_names[tagName] && !html_old_names[tagName]) {
             if (t.constructor === TagTk) {
-                var restOfInput = input.substring(t.dataAttribs.tsr[0]);
-                var tagContent = restOfInput.match(new RegExp("^(.|\n)*?</\s*" + tagName + ">", "m"));
+                var tsr0 = t.dataAttribs.tsr[0],
+                    restOfInput = input.substring(tsr0),
+                    tagContent = restOfInput.match(new RegExp("^(.|\n)*?(</\s*" + tagName + ">)", "m"));
                 t.dataAttribs.src = tagContent ? tagContent[0] : restOfInput;
+
+                // Replace extension content with dummy content so it matches the
+                // rule following this match and can be tokenized independently (if required).
+                // This is just a trick to tokenize ref content with higher priority.
+                var startTagLen = pos-tsr0,
+                    extContentLen = t.dataAttribs.src.length - startTagLen - tagContent[2].length,
+                    extContent = t.dataAttribs.src.substring(startTagLen, startTagLen+extContentLen);
+
+                t.dataAttribs.extContentOffset = pos;
+                t.dataAttribs.extContent = extContent;
+                input = input.slice(0,pos) + charSequence('', '#', extContentLen) + input.slice(pos+extContentLen);
             } else {
                 t.dataAttribs.src = input.substring(t.dataAttribs.tsr[0], t.dataAttribs.tsr[1]);
             }
+            t.dataAttribs.isExt = true;
         }
+
+        currExtensionTag = t;
         return t;
+    }) (
+      '#'+ {
+        // Should only match if we tricked the tokenizer
+        return currExtensionTag.dataAttribs.isExt && currExtensionTag.constructor !== SelfclosingTagTk;
+      }
+      / &  {
+        // Should not match if we tricked the tokenizer
+        return !currExtensionTag.dataAttribs.isExt || currExtensionTag.constructor === SelfclosingTagTk;
+      }
+    ) {
+        var ret = t2;
+        if (t2.dataAttribs.isExt && t2.name === 'ref') {
+            // Parse ref-content, strip eof, and shift tsr
+            var extContentToks = (new PegTokenizer(pegArgs.env)).tokenize(t2.dataAttribs.extContent);
+            extContentToks = Util.stripEOFTkfromTokens(extContentToks);
+            Util.shiftTokenTSR(extContentToks, t2.dataAttribs.extContentOffset);
+
+            // Clear temporary state
+            t2.dataAttribs.isExt = undefined;
+            t2.dataAttribs.extContent = undefined;
+
+            ret = [t2].concat(extContentToks);
+        }
+
+        currExtensionTag = null;
+
+        return ret;
     }
 
 /*

-- 
To view, visit https://gerrit.wikimedia.org/r/51802
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I82e29701875dde7702c504416f4a9487387dbe19
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] WIP: Incomplete; Higher priority ext content parsing. - change (mediawiki...Parsoid)

Reply via email to