[MediaWiki-commits] [Gerrit] More performance tweaks / tokenizer cleanup - change (mediawiki...parsoid)

GWicke (Code Review) Tue, 29 Apr 2014 13:58:34 -0700

GWicke has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/130483


Change subject: More performance tweaks / tokenizer cleanup
......................................................................

More performance tweaks / tokenizer cleanup

* Remove old and unused 'new String' logic
* Simplify, optimize and partially inline the inline_breaks production
* Remove some dead code from the tokenizer

Change-Id: Idccadc5ef891d3b622b316060ed537a18fcac772
---
M lib/mediawiki.HTML5TreeBuilder.node.js
M lib/mediawiki.tokenizer.peg.js
M lib/pegTokenizer.pegjs.txt
3 files changed, 80 insertions(+), 182 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/83/130483/1

diff --git a/lib/mediawiki.HTML5TreeBuilder.node.js 
b/lib/mediawiki.HTML5TreeBuilder.node.js
index c52d4fe..19d665c 100644
--- a/lib/mediawiki.HTML5TreeBuilder.node.js
+++ b/lib/mediawiki.HTML5TreeBuilder.node.js
@@ -141,12 +141,6 @@
                                                || self.lastToken.name !== 
'pre';
        switch( token.constructor ) {
                case String:
-                       // note that we sometimes add 'dataAttrib' and 'get' 
fields to
-                       // string objects, making them non-primitive.
-                       // ("git grep 'new String'" for more details)
-                       // we strip that information from the tokens here so we 
don't
-                       // end up with non-primitive strings in the DOM.
-                       token = token.valueOf(); // convert token to primitive 
string.
                        if ( token.match(/^[ \t\r\n\f]+$/) && 
isNotPrecededByPre ) {
                                // Treat space characters specially so that the 
tree builder
                                // doesn't apply the foster parenting algorithm
diff --git a/lib/mediawiki.tokenizer.peg.js b/lib/mediawiki.tokenizer.peg.js
index 361556b..a8d88b6 100644
--- a/lib/mediawiki.tokenizer.peg.js
+++ b/lib/mediawiki.tokenizer.peg.js
@@ -253,27 +253,32 @@
  * handle the end marker.
  */
 PegTokenizer.prototype.inline_breaks = function (input, pos, stops ) {
+       var c = input[pos];
+       if (!/[=|!\}\{:\r\n\]<]/.test(c)) {
+               return false;
+       }
+
        var counters = stops.counters;
-       switch( input[pos] ) {
+       switch( c ) {
                case '=':
                        return stops.onStack( 'equal' ) ||
                                ( counters.h &&
-                                       ( pos === input.length - 1 ||
-                                         input.substr( pos + 1 )
-                                               // possibly more equals 
followed by spaces or comments
-                                               .match(/^=*(?:[ 
\t]|<\!--(?:(?!-->)[^])*-->)*(?:[\r\n]|$)/) !== null )
-                               ) || null;
+                                       ( pos === input.length - 1
+                                         // possibly more equals followed by 
spaces or comments
+                                         || /^=*(?:[ 
\t]|<\!--(?:(?!-->)[^])*-->)*(?:[\r\n]|$)/
+                                               .test(input.substr( pos + 1 )))
+                               );
                case '|':
                        return stops.onStack('pipe') ||
                                //counters.template ||
                                counters.linkdesc || (
                                        stops.onStack('table') && (
                                                counters.tableCellArg || (
-                                                       pos < input.length - 1 
&& input[pos+1].match(/[}|]/) !== null
+                                                       pos < input.length - 1
+                                                       && 
/[}|]/.test(input[pos+1])
                                                )
                                        )
-                               ) ||
-                               null;
+                               );
                case '{':
                        // {{!}} pipe templates..
                        return (
@@ -286,41 +291,36 @@
                                                                
counters.tableCellArg
                                                        )
                                                )
-                                       ) && input.substr( pos, 5 ) === '{{!}}' 
|| null;
+                                       ) && input.substr( pos, 5 ) === '{{!}}';
                case "!":
-                       return stops.onStack( 'table' ) && input[pos + 1] === 
"!" ||
-                               null;
+                       return stops.onStack( 'table' ) && input[pos + 1] === 
"!";
                case "}":
-                       return counters.template && input[pos + 1] === "}" || 
null;
+                       return counters.template && input[pos + 1] === "}";
                case ":":
                        return counters.colon &&
                                ! stops.onStack( 'extlink' ) &&
-                               ! counters.linkdesc || null;
+                               ! counters.linkdesc;
                case "\r":
                        return stops.onStack( 'table' ) &&
-                               input.substr(pos).match(/\r\n?\s*[!|]/) !== 
null ||
-                               null;
+                               /\r\n?\s*[!|]/.test(input.substr(pos));
                case "\n":
                        //console.warn(JSON.stringify(input.substr(pos, 5)), 
stops);
-                       return ( stops.onStack( 'table' ) &&
+                       return stops.onStack( 'table' ) &&
                                // allow leading whitespace in tables
-                               input.substr(pos, 200).match( /^\n\s*[!|]/ ) ) 
||
+                               /^\n\s*[!|]/.test(input.substr(pos, 200));
                                // break on table-like syntax when the table 
stop is not
                                // enabled. XXX: see if this can be improved
                                //input.substr(pos, 200).match( /^\n[!|]/ ) ||
-                               null;
                case "]":
                        return stops.onStack( 'extlink' ) ||
-                               ( counters.linkdesc && input[pos + 1] === ']' ) 
||
-                               null;
+                               ( counters.linkdesc && input[pos + 1] === ']' );
                case "<":
                        return ( counters.pre &&  input.substr( pos, 6 ) === 
'<pre>' ) ||
                                ( counters.noinclude && input.substr(pos, 12) 
=== '</noinclude>' ) ||
                                ( counters.includeonly && input.substr(pos, 14) 
=== '</includeonly>' ) ||
-                               ( counters.onlyinclude && input.substr(pos, 14) 
=== '</onlyinclude>' ) ||
-                               null;
+                               ( counters.onlyinclude && input.substr(pos, 14) 
=== '</onlyinclude>' );
                default:
-                       return null;
+                       return false;
        }
 };
 
diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index a6be990..9e08e79 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -47,6 +47,8 @@
         CommentTk = defines.CommentTk,
         EOFTk = defines.EOFTk;
 
+    var inline_breaks = pegArgs.pegTokenizer.inline_breaks;
+
 
     var flattenIfArray = function(e) {
         function internal_flatten(e, res) {
@@ -81,12 +83,6 @@
         }
     };
 
-    var protocol_regexp = 
/^(?:\/\/|(?:ftp|git|gopher|https?|ircs?|mms|nntp|svn|telnet|worldwind)\:\/\/|(?:mailto|news)\:)/i;
 // mailto and news are special
-
-    var match_protocol = function ( string ) {
-        return string.match( protocol_regexp );
-    };
-
     var flatten_stringlist = function ( c ) {
         var out = [],
             text = '';
@@ -113,13 +109,13 @@
     };
 
     // Debug print with global switch
-    var dp = function ( msg ) {
-        if ( false ) {
-            console.warn(msg);
-        }
-    };
+    //var dp = function ( msg ) {
+    //    if ( false ) {
+    //        console.warn(msg);
+    //    }
+    //};
 
-    var pp = function ( s ) { return JSON.stringify(s, null, 2); };
+    //var pp = function ( s ) { return JSON.stringify(s, null, 2); };
 
     // Simple string formatting using '%s'
     var sprintf = function ( format ) {
@@ -127,41 +123,6 @@
         return format.replace(/%s/g, function () {
             return args.length ? args.shift() : '';
         });
-    };
-
-
-    /**
-     * Determine if a string represents a valid ISBN-10 or ISBN-13 identifier
-     *
-     * @static
-     * @method
-     * @param {string} isbn: The string to check
-     * @returns {Boolean}: True if valid ISBN, false otherwise.
-     */
-    var isValidISBN = function ( isbn ) {
-        var i = 0, checksum = 0;
-
-        isbn = isbn.toUpperCase().replace(/[^\dX]/g, '');
-        return [10, 13].indexOf(isbn.length) !== -1;
-
-        // XXX: The following code is short-circuited because it is stricter
-        // than the standard parser:
-
-        switch (isbn.length) {
-        case 10:
-            for (i = 0; i < 9; i++) {
-                checksum += parseInt(isbn[i], 10) * (10 - i);
-            }
-            checksum += '0123456789X'.indexOf(isbn[9]);
-            return (checksum % 11 === 0);
-        case 13:
-            for (i = 0; i < 13; i++) {
-                /* jshint bitwise:false */
-                checksum += parseInt(isbn[i], 10) * ((i & 1) ? 3 : 1);
-            }
-            return (checksum % 10 === 0) && (/^97[89]/.test(isbn));
-        }
-        return false;
     };
 
 
@@ -251,58 +212,27 @@
         this._updateStackKey();
     };
     SyntaxStops.prototype._updateCounterKey = function ( ) {
-        var counters = [];
+        var counters = '';
         for ( var k in this.counters ) {
             if ( this.counters[k] > 0 ) {
-                counters.push(k);
+                counters += 'c' + k;
             }
         }
-        this._counterKey = JSON.stringify(counters);
+        this._counterKey = counters;
         this.key = this._counterKey + this._stackKey;
     };
     SyntaxStops.prototype._updateStackKey = function ( ) {
-        var stackStops = [];
+        var stackStops = '';
         for ( var k in this.stacks ) {
             if ( this.onStack( k )  ) {
-                stackStops.push(k);
+                stackStops += 's' + k;
             }
         }
-        this._stackKey = JSON.stringify(stackStops);
+        this._stackKey = stackStops;
         this.key = this._counterKey + this._stackKey;
     };
 
     var stops = new SyntaxStops();
-
-    // Start position of top-level block
-    // Could also provide positions for lower-level blocks using a stack.
-    var blockStart = 0;
-
-    // Start position of generic tag production
-    var tagStartPos = 0;
-
-    // Stack of source positions
-    var posStack = {
-        positions: {},
-        push: function( key, pos ) {
-            if ( this.positions[key] === undefined ) {
-                this.positions[key] = [pos];
-            } else {
-                this.positions[key].push( pos );
-            }
-            return true;
-        },
-        pop: function( key, pos ) {
-            var pk = this.positions[key];
-            if ( pk === undefined || ! pk.length ) {
-                throw "Tried to pop unknown position for " + key;
-            } else {
-                return [ pk.pop(), pos ];
-            }
-        }
-    };
-
-    // cache the input length
-    var inputLength = input.length;
 
     // Current extension/include tag being parsed.
     var currExtTag = null;
@@ -461,7 +391,6 @@
 
         // Trick the tokenizer into ending parsing
         input = parsedInput;
-        inputLength = pos;
 
         // console.warn("Yield @pos: " + newOffset + "; input len: " + 
newInput.length);
 
@@ -499,22 +428,11 @@
       if ( Array.isArray(b) && b.length ) {
         b = flattenIfArray(b);
         var bs = b[0];
-        if ( bs.constructor === String && bs.attribs === undefined ) {
-            /*jshint -W053 */
-            // we need to make a non-primitive string in order to add 
properties
-            b[0] = new String( bs );
-            /*jshint +W053 */
-            bs = b[0];
-        }
-        if (bs.dataAttribs === undefined) {
+        if ( bs.constructor !== String && bs.dataAttribs === undefined) {
             bs.dataAttribs = {};
         }
         tokens = b;
-    } else if (b.constructor === String && b.attribs === undefined) {
-        b = new String( b );
-        if (b.dataAttribs === undefined) {
-            b.dataAttribs = {};
-        }
+    } else if (b.constructor === String) {
         tokens = [b];
     }
 
@@ -643,15 +561,14 @@
  * in nested inline productions.
  */
 inline_breaks
-  = & [=|!}{:\r\n\]<]
-    & {
+  = & {
         //console.warn('ilbf: ' + input.substr(pos, 5) );
         //if ( null !== pegArgs.parser.inline_breaks( input, pos, stops ) ) {
         //    console.warn('ilb break: ' + pp(input.substr(pos, 5)) );
         //} else {
         //    console.warn('ilb no break: ' + pp(input.substr(pos, 5)) );
         //}
-        return null !== pegArgs.pegTokenizer.inline_breaks( input, pos, stops 
);
+        return inline_breaks( input, pos, stops );
       }
 
 pre_start = "<" pre_tag_name (' '+ [^>]*)? ">"
@@ -663,16 +580,19 @@
   }
 
 inlineline
-  = c:(urltext / !inline_breaks !pre_start (inline_element / [^\r\n]))+ {
+  = c:(urltext 
+          / !{ return inline_breaks( input, pos, stops ); } // inline_breaks 
+            !pre_start (inline_element / [^\r\n]))+ {
       //console.warn('inlineline out:' + pp(c) + input.substr(pos0, pos));
       return flatten_stringlist( c );
   }
 
 inline_element
   = //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
-      & '<' nowiki
-    / & '<' xmlish_tag
-    / & '<' comment
+    & '<' ( nowiki
+          / xmlish_tag
+          / comment
+          )
     /// & '{' ( & '{{{{{' template / tplarg / template )
     / & '{' tplarg_or_template_or_broken
     / & '}' broken_template
@@ -727,7 +647,7 @@
                         spc
                         ]);
       }
-    / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false; } { return 
null; }
+    / & { stops.dec('h'); return false; } { return null; }
     ) { return r; }
 
 comment
@@ -768,37 +688,37 @@
 
 autolink
   = ! { return stops.onStack('extlink'); }
-    (urllink / autoref / isbn)
-
-urllink
-  = target:autourl {
-      // Special case handling for trailing parentheses: remove from link if
-      // there is no opening parenthesis in the link
-      if ( Array.isArray(target) ) {
-          var end = target[target.length - 1];
-          if ( !/[(]/.test( target[0] ) &&
-               end.constructor === String &&
-               /[)]$/.test( end )
-          ) {
-              target.pop();
-              pos--;
-          }
-      } else {
-          if (!/[(]/.test(target) && /[)]$/.test(target)) {
-              target = target.substr(0, target.length - 1);
-              pos--;
-          }
-      }
-      var res = [ new SelfclosingTagTk( 'urllink', [new KV('href', target)], { 
tsr: [pos0, pos] } ) ];
-      return res;
-  }
+    (
+      // urllink, inlined
+      target:autourl {
+        // Special case handling for trailing parentheses: remove from link if
+        // there is no opening parenthesis in the link
+        if ( Array.isArray(target) ) {
+            var end = target[target.length - 1];
+            if ( !/[(]/.test( target[0] ) &&
+                 end.constructor === String &&
+                 /[)]$/.test( end )
+            ) {
+                target.pop();
+                pos--;
+            }
+        } else {
+            if (!/[(]/.test(target) && /[)]$/.test(target)) {
+                target = target.substr(0, target.length - 1);
+                pos--;
+            }
+        }
+        var res = [ new SelfclosingTagTk( 'urllink', [new KV('href', target)], 
{ tsr: [pos0, pos] } ) ];
+          return res;
+      } 
+    / autoref 
+    / isbn)
 
 extlink
   = ! { return stops.onStack('extlink'); } // extlink cannot be nested
     (
         "["
         & { return stops.push('extlink', true); }
-        //target:urllink
         target:extlink_preprocessor_text
         & { return Util.isProtocolValid( target, pegArgs.env ); }
         sp:( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
@@ -871,11 +791,6 @@
         return [ input.substring( pos0, pos ) ];
     }
 
-    // Don't validate in the tokenizer, since the PHP parser does not either.
-    //if (!isValidISBN(isbn)) {
-    //    return null;
-    //}
-
     return [
         new SelfclosingTagTk( 'extlink', [
            new KV('href', 'Special:BookSources/' + isbncode),
@@ -924,7 +839,7 @@
 url
   = proto:url_protocol
     addr:( ipv6_address / ipv4_address )?
-    path:(  ( !inline_breaks
+    path:(  ( !{ return inline_breaks( input, pos, stops ); } // inline_breaks
               c:no_punctuation_char
               { return c; }
             )
@@ -945,7 +860,7 @@
 autourl
   = proto:url_protocol
     addr:( ipv6_address / ipv4_address )?
-    path:(  ( !inline_breaks
+    path:(  ( !{ return inline_breaks( input, pos, stops ); } // inline_breaks
               c:no_punctuation_char
               { return c; }
             )
@@ -1903,7 +1818,7 @@
         return [ li1 ].concat( c, [ li2 ], d || '' );
     }
   // Fall-back case to clear the colon flag
-  / & { return true; } { stops.counters.colon = 0; return null; }
+  / & { stops.counters.colon = 0; return false; }
 
 
 list_char = [*#:;]
@@ -2211,17 +2126,6 @@
 // Old version
 //text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
 
-// Experimental tweaked version: avoid expensive single-char substrings
-// This did not bring the expected performance boost, however.
-//text = [A-Za-z0-9,._ -] {
-//            textStart = pos;
-//
-//            var res = input.substr(textStart - 1, inputLength)
-//                        .match(/[A-Za-z0-9,._ -]+/)[0];
-//            pos = pos + (res.length - 1);
-//            return res;
-//       }
-
 htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
     //return "&" + c.join('') + ";";
     var m = "&" + c.join('') + ";",
@@ -2444,10 +2348,10 @@
   }
 
 // Start of file
-sof = & { return pos === 0; } { return true; }
+sof = & { return pos === 0; }
 
 // End of file
-eof = & { return pos === input.length; } { return true; }
+eof = & { return pos === input.length; }
 
 newline = '\n' / '\r\n'
 

-- 
To view, visit https://gerrit.wikimedia.org/r/130483
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idccadc5ef891d3b622b316060ed537a18fcac772
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <gwi...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] More performance tweaks / tokenizer cleanup - change (mediawiki...parsoid)

Reply via email to