GWicke has uploaded a new change for review.
https://gerrit.wikimedia.org/r/130483
Change subject: More performance tweaks / tokenizer cleanup
......................................................................
More performance tweaks / tokenizer cleanup
* Remove old and unused 'new String' logic
* simplify, optimize and partially inline inline_breaks
* Remove some dead code from the tokenizer
Change-Id: Idccadc5ef891d3b622b316060ed537a18fcac772
---
M lib/mediawiki.HTML5TreeBuilder.node.js
M lib/mediawiki.tokenizer.peg.js
M lib/pegTokenizer.pegjs.txt
3 files changed, 80 insertions(+), 182 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid
refs/changes/83/130483/1
diff --git a/lib/mediawiki.HTML5TreeBuilder.node.js
b/lib/mediawiki.HTML5TreeBuilder.node.js
index c52d4fe..19d665c 100644
--- a/lib/mediawiki.HTML5TreeBuilder.node.js
+++ b/lib/mediawiki.HTML5TreeBuilder.node.js
@@ -141,12 +141,6 @@
|| self.lastToken.name !==
'pre';
switch( token.constructor ) {
case String:
- // note that we sometimes add 'dataAttrib' and 'get'
fields to
- // string objects, making them non-primitive.
- // ("git grep 'new String'" for more details)
- // we strip that information from the tokens here so we
don't
- // end up with non-primitive strings in the DOM.
- token = token.valueOf(); // convert token to primitive
string.
if ( token.match(/^[ \t\r\n\f]+$/) &&
isNotPrecededByPre ) {
// Treat space characters specially so that the
tree builder
// doesn't apply the foster parenting algorithm
diff --git a/lib/mediawiki.tokenizer.peg.js b/lib/mediawiki.tokenizer.peg.js
index 361556b..a8d88b6 100644
--- a/lib/mediawiki.tokenizer.peg.js
+++ b/lib/mediawiki.tokenizer.peg.js
@@ -253,27 +253,32 @@
* handle the end marker.
*/
PegTokenizer.prototype.inline_breaks = function (input, pos, stops ) {
+ var c = input[pos];
+ if (!/[=|!\}\{:\r\n\]<]/.test(c)) {
+ return false;
+ }
+
var counters = stops.counters;
- switch( input[pos] ) {
+ switch( c ) {
case '=':
return stops.onStack( 'equal' ) ||
( counters.h &&
- ( pos === input.length - 1 ||
- input.substr( pos + 1 )
- // possibly more equals
followed by spaces or comments
- .match(/^=*(?:[
\t]|<\!--(?:(?!-->)[^])*-->)*(?:[\r\n]|$)/) !== null )
- ) || null;
+ ( pos === input.length - 1
+ // possibly more equals followed by
spaces or comments
+ || /^=*(?:[
\t]|<\!--(?:(?!-->)[^])*-->)*(?:[\r\n]|$)/
+ .test(input.substr( pos + 1 )))
+ );
case '|':
return stops.onStack('pipe') ||
//counters.template ||
counters.linkdesc || (
stops.onStack('table') && (
counters.tableCellArg || (
- pos < input.length - 1
&& input[pos+1].match(/[}|]/) !== null
+ pos < input.length - 1
+ &&
/[}|]/.test(input[pos+1])
)
)
- ) ||
- null;
+ );
case '{':
// {{!}} pipe templates..
return (
@@ -286,41 +291,36 @@
counters.tableCellArg
)
)
- ) && input.substr( pos, 5 ) === '{{!}}'
|| null;
+ ) && input.substr( pos, 5 ) === '{{!}}';
case "!":
- return stops.onStack( 'table' ) && input[pos + 1] ===
"!" ||
- null;
+ return stops.onStack( 'table' ) && input[pos + 1] ===
"!";
case "}":
- return counters.template && input[pos + 1] === "}" ||
null;
+ return counters.template && input[pos + 1] === "}";
case ":":
return counters.colon &&
! stops.onStack( 'extlink' ) &&
- ! counters.linkdesc || null;
+ ! counters.linkdesc;
case "\r":
return stops.onStack( 'table' ) &&
- input.substr(pos).match(/\r\n?\s*[!|]/) !==
null ||
- null;
+ /\r\n?\s*[!|]/.test(input.substr(pos));
case "\n":
//console.warn(JSON.stringify(input.substr(pos, 5)),
stops);
- return ( stops.onStack( 'table' ) &&
+ return stops.onStack( 'table' ) &&
// allow leading whitespace in tables
- input.substr(pos, 200).match( /^\n\s*[!|]/ ) )
||
+ /^\n\s*[!|]/.test(input.substr(pos, 200));
// break on table-like syntax when the table
stop is not
// enabled. XXX: see if this can be improved
//input.substr(pos, 200).match( /^\n[!|]/ ) ||
- null;
case "]":
return stops.onStack( 'extlink' ) ||
- ( counters.linkdesc && input[pos + 1] === ']' )
||
- null;
+ ( counters.linkdesc && input[pos + 1] === ']' );
case "<":
return ( counters.pre && input.substr( pos, 6 ) ===
'<pre>' ) ||
( counters.noinclude && input.substr(pos, 12)
=== '</noinclude>' ) ||
( counters.includeonly && input.substr(pos, 14)
=== '</includeonly>' ) ||
- ( counters.onlyinclude && input.substr(pos, 14)
=== '</onlyinclude>' ) ||
- null;
+ ( counters.onlyinclude && input.substr(pos, 14)
=== '</onlyinclude>' );
default:
- return null;
+ return false;
}
};
diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index a6be990..9e08e79 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -47,6 +47,8 @@
CommentTk = defines.CommentTk,
EOFTk = defines.EOFTk;
+ var inline_breaks = pegArgs.pegTokenizer.inline_breaks;
+
var flattenIfArray = function(e) {
function internal_flatten(e, res) {
@@ -81,12 +83,6 @@
}
};
- var protocol_regexp =
/^(?:\/\/|(?:ftp|git|gopher|https?|ircs?|mms|nntp|svn|telnet|worldwind)\:\/\/|(?:mailto|news)\:)/i;
// mailto and news are special
-
- var match_protocol = function ( string ) {
- return string.match( protocol_regexp );
- };
-
var flatten_stringlist = function ( c ) {
var out = [],
text = '';
@@ -113,13 +109,13 @@
};
// Debug print with global switch
- var dp = function ( msg ) {
- if ( false ) {
- console.warn(msg);
- }
- };
+ //var dp = function ( msg ) {
+ // if ( false ) {
+ // console.warn(msg);
+ // }
+ //};
- var pp = function ( s ) { return JSON.stringify(s, null, 2); };
+ //var pp = function ( s ) { return JSON.stringify(s, null, 2); };
// Simple string formatting using '%s'
var sprintf = function ( format ) {
@@ -127,41 +123,6 @@
return format.replace(/%s/g, function () {
return args.length ? args.shift() : '';
});
- };
-
-
- /**
- * Determine if a string represents a valid ISBN-10 or ISBN-13 identifier
- *
- * @static
- * @method
- * @param {string} isbn: The string to check
- * @returns {Boolean}: True if valid ISBN, false otherwise.
- */
- var isValidISBN = function ( isbn ) {
- var i = 0, checksum = 0;
-
- isbn = isbn.toUpperCase().replace(/[^\dX]/g, '');
- return [10, 13].indexOf(isbn.length) !== -1;
-
- // XXX: The following code is short-circuited because it is stricter
- // than the standard parser:
-
- switch (isbn.length) {
- case 10:
- for (i = 0; i < 9; i++) {
- checksum += parseInt(isbn[i], 10) * (10 - i);
- }
- checksum += '0123456789X'.indexOf(isbn[9]);
- return (checksum % 11 === 0);
- case 13:
- for (i = 0; i < 13; i++) {
- /* jshint bitwise:false */
- checksum += parseInt(isbn[i], 10) * ((i & 1) ? 3 : 1);
- }
- return (checksum % 10 === 0) && (/^97[89]/.test(isbn));
- }
- return false;
};
@@ -251,58 +212,27 @@
this._updateStackKey();
};
SyntaxStops.prototype._updateCounterKey = function ( ) {
- var counters = [];
+ var counters = '';
for ( var k in this.counters ) {
if ( this.counters[k] > 0 ) {
- counters.push(k);
+ counters += 'c' + k;
}
}
- this._counterKey = JSON.stringify(counters);
+ this._counterKey = counters;
this.key = this._counterKey + this._stackKey;
};
SyntaxStops.prototype._updateStackKey = function ( ) {
- var stackStops = [];
+ var stackStops = '';
for ( var k in this.stacks ) {
if ( this.onStack( k ) ) {
- stackStops.push(k);
+ stackStops += 's' + k;
}
}
- this._stackKey = JSON.stringify(stackStops);
+ this._stackKey = stackStops;
this.key = this._counterKey + this._stackKey;
};
var stops = new SyntaxStops();
-
- // Start position of top-level block
- // Could also provide positions for lower-level blocks using a stack.
- var blockStart = 0;
-
- // Start position of generic tag production
- var tagStartPos = 0;
-
- // Stack of source positions
- var posStack = {
- positions: {},
- push: function( key, pos ) {
- if ( this.positions[key] === undefined ) {
- this.positions[key] = [pos];
- } else {
- this.positions[key].push( pos );
- }
- return true;
- },
- pop: function( key, pos ) {
- var pk = this.positions[key];
- if ( pk === undefined || ! pk.length ) {
- throw "Tried to pop unknown position for " + key;
- } else {
- return [ pk.pop(), pos ];
- }
- }
- };
-
- // cache the input length
- var inputLength = input.length;
// Current extension/include tag being parsed.
var currExtTag = null;
@@ -461,7 +391,6 @@
// Trick the tokenizer into ending parsing
input = parsedInput;
- inputLength = pos;
// console.warn("Yield @pos: " + newOffset + "; input len: " +
newInput.length);
@@ -499,22 +428,11 @@
if ( Array.isArray(b) && b.length ) {
b = flattenIfArray(b);
var bs = b[0];
- if ( bs.constructor === String && bs.attribs === undefined ) {
- /*jshint -W053 */
- // we need to make a non-primitive string in order to add
properties
- b[0] = new String( bs );
- /*jshint +W053 */
- bs = b[0];
- }
- if (bs.dataAttribs === undefined) {
+ if ( bs.constructor !== String && bs.dataAttribs === undefined) {
bs.dataAttribs = {};
}
tokens = b;
- } else if (b.constructor === String && b.attribs === undefined) {
- b = new String( b );
- if (b.dataAttribs === undefined) {
- b.dataAttribs = {};
- }
+ } else if (b.constructor === String) {
tokens = [b];
}
@@ -643,15 +561,14 @@
* in nested inline productions.
*/
inline_breaks
- = & [=|!}{:\r\n\]<]
- & {
+ = & {
//console.warn('ilbf: ' + input.substr(pos, 5) );
//if ( null !== pegArgs.parser.inline_breaks( input, pos, stops ) ) {
// console.warn('ilb break: ' + pp(input.substr(pos, 5)) );
//} else {
// console.warn('ilb no break: ' + pp(input.substr(pos, 5)) );
//}
- return null !== pegArgs.pegTokenizer.inline_breaks( input, pos, stops
);
+ return inline_breaks( input, pos, stops );
}
pre_start = "<" pre_tag_name (' '+ [^>]*)? ">"
@@ -663,16 +580,19 @@
}
inlineline
- = c:(urltext / !inline_breaks !pre_start (inline_element / [^\r\n]))+ {
+ = c:(urltext
+ / !{ return inline_breaks( input, pos, stops ); } // inline_breaks
+ !pre_start (inline_element / [^\r\n]))+ {
//console.warn('inlineline out:' + pp(c) + input.substr(pos0, pos));
return flatten_stringlist( c );
}
inline_element
= //& { dp('inline_element enter' + input.substr(pos, 10)); return true; }
- & '<' nowiki
- / & '<' xmlish_tag
- / & '<' comment
+ & '<' ( nowiki
+ / xmlish_tag
+ / comment
+ )
/// & '{' ( & '{{{{{' template / tplarg / template )
/ & '{' tplarg_or_template_or_broken
/ & '}' broken_template
@@ -727,7 +647,7 @@
spc
]);
}
- / & { /* dp('nomatch exit h'); */ stops.dec('h'); return false; } { return
null; }
+ / & { stops.dec('h'); return false; } { return null; }
) { return r; }
comment
@@ -768,37 +688,37 @@
autolink
= ! { return stops.onStack('extlink'); }
- (urllink / autoref / isbn)
-
-urllink
- = target:autourl {
- // Special case handling for trailing parentheses: remove from link if
- // there is no opening parenthesis in the link
- if ( Array.isArray(target) ) {
- var end = target[target.length - 1];
- if ( !/[(]/.test( target[0] ) &&
- end.constructor === String &&
- /[)]$/.test( end )
- ) {
- target.pop();
- pos--;
- }
- } else {
- if (!/[(]/.test(target) && /[)]$/.test(target)) {
- target = target.substr(0, target.length - 1);
- pos--;
- }
- }
- var res = [ new SelfclosingTagTk( 'urllink', [new KV('href', target)], {
tsr: [pos0, pos] } ) ];
- return res;
- }
+ (
+ // urllink, inlined
+ target:autourl {
+ // Special case handling for trailing parentheses: remove from link if
+ // there is no opening parenthesis in the link
+ if ( Array.isArray(target) ) {
+ var end = target[target.length - 1];
+ if ( !/[(]/.test( target[0] ) &&
+ end.constructor === String &&
+ /[)]$/.test( end )
+ ) {
+ target.pop();
+ pos--;
+ }
+ } else {
+ if (!/[(]/.test(target) && /[)]$/.test(target)) {
+ target = target.substr(0, target.length - 1);
+ pos--;
+ }
+ }
+ var res = [ new SelfclosingTagTk( 'urllink', [new KV('href', target)],
{ tsr: [pos0, pos] } ) ];
+ return res;
+ }
+ / autoref
+ / isbn)
extlink
= ! { return stops.onStack('extlink'); } // extlink cannot be nested
(
"["
& { return stops.push('extlink', true); }
- //target:urllink
target:extlink_preprocessor_text
& { return Util.isProtocolValid( target, pegArgs.env ); }
sp:( space / [\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000] )*
@@ -871,11 +791,6 @@
return [ input.substring( pos0, pos ) ];
}
- // Don't validate in the tokenizer, since the PHP parser does not either.
- //if (!isValidISBN(isbn)) {
- // return null;
- //}
-
return [
new SelfclosingTagTk( 'extlink', [
new KV('href', 'Special:BookSources/' + isbncode),
@@ -924,7 +839,7 @@
url
= proto:url_protocol
addr:( ipv6_address / ipv4_address )?
- path:( ( !inline_breaks
+ path:( ( !{ return inline_breaks( input, pos, stops ); } // inline_breaks
c:no_punctuation_char
{ return c; }
)
@@ -945,7 +860,7 @@
autourl
= proto:url_protocol
addr:( ipv6_address / ipv4_address )?
- path:( ( !inline_breaks
+ path:( ( !{ return inline_breaks( input, pos, stops ); } // inline_breaks
c:no_punctuation_char
{ return c; }
)
@@ -1903,7 +1818,7 @@
return [ li1 ].concat( c, [ li2 ], d || '' );
}
// Fall-back case to clear the colon flag
- / & { return true; } { stops.counters.colon = 0; return null; }
+ / & { stops.counters.colon = 0; return false; }
list_char = [*#:;]
@@ -2211,17 +2126,6 @@
// Old version
//text = t:[A-Za-z0-9,._ "?!\t-]+ { return t.join('') }
-// Experimental tweaked version: avoid expensive single-char substrings
-// This did not bring the expected performance boost, however.
-//text = [A-Za-z0-9,._ -] {
-// textStart = pos;
-//
-// var res = input.substr(textStart - 1, inputLength)
-// .match(/[A-Za-z0-9,._ -]+/)[0];
-// pos = pos + (res.length - 1);
-// return res;
-// }
-
htmlentity = "&" c:[#0-9a-zA-Z]+ ";" {
//return "&" + c.join('') + ";";
var m = "&" + c.join('') + ";",
@@ -2444,10 +2348,10 @@
}
// Start of file
-sof = & { return pos === 0; } { return true; }
+sof = & { return pos === 0; }
// End of file
-eof = & { return pos === input.length; } { return true; }
+eof = & { return pos === input.length; }
newline = '\n' / '\r\n'
--
To view, visit https://gerrit.wikimedia.org/r/130483
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Idccadc5ef891d3b622b316060ed537a18fcac772
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits