Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/161108
Change subject: WIP: Make table_lines return tokens after each row.
......................................................................
WIP: Make table_lines return tokens after each row.
* Still some failing tests ... productions need additional tweaking.
* This doesn't seem to affect parsability of the large table.
I can see the HTML tree builder getting the stream of processed
tokens (try parsing with --trace html). So, I suspect we have to
look elsewhere to fix memory issues. I was able to parse about
1800+ lines earlier in 50+ secs and the same is true even now.
Nothing has changed. When I go up to 2200+ lines, things get
really slow. Let me run it with --trace html next to see
what comes up.
Change-Id: I84aab52a04269ce58c274ad3f279600b358e8e14
---
M lib/pegTokenizer.pegjs.txt
1 file changed, 43 insertions(+), 29 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid
refs/changes/08/161108/1
diff --git a/lib/pegTokenizer.pegjs.txt b/lib/pegTokenizer.pegjs.txt
index 4b63049..e558e88 100644
--- a/lib/pegTokenizer.pegjs.txt
+++ b/lib/pegTokenizer.pegjs.txt
@@ -849,14 +849,14 @@
// This production is identical to the 'inline' fragment except
// that tables are allowed inside image captions.
link_text_fragment
- = c:((sol table_lines) / urltext
- / (!inline_breaks
- !pre_start
- r:( inline_element / '[' text_char+ ']' / . ) {
- return r;
- })
+ = c:((sol full_table)
+ / urltext
+ / (!inline_breaks
+ !pre_start
+ r:( inline_element / '[' text_char+ ']' / . ) { return r; }
+ )
)+ {
- //console.warn('inline out:' + pp(c));
+ // console.warn('inline out:' +
JSON.stringify(tu.flatten_stringlist(c)));
return tu.flatten_stringlist( c );
}
@@ -1511,14 +1511,14 @@
* Fürstenberg.
*/
hacky_dl_uses = bullets:":"+
- c:table_lines
+ tbl:full_table
s:space* // Do we really need to RT this?
&comment_space_eolf
{
// Leave bullets as an array -- list handler expects this
var li = new TagTk( 'listItem', [], { tsr: [peg$reportedPos,
peg$reportedPos + bullets.length] } );
li.bullets = bullets;
- return tu.flattenIfArray([li, c || [], s || []]);
+ return tu.flattenIfArray([li, tbl || [], s || []]);
}
dtdd
@@ -1566,37 +1566,52 @@
* directly to block_lines.
*********************************************************************/
+full_table
+ = // & { console.warn('enter table_lines: ' + input.substr(peg$currPos,
20)); return true; }
+ (! inline_breaks / & '{{!}}' )
+ r:(
+ & { return stops.push('table', true); }
+ tbl:(
+ table_start_tag optionalNewlines
+ (sol table_content_line optionalNewlines)*
+ sol table_end_tag)
+ {
+ stops.pop('table');
+ return tbl;
+ }
+ / & { return stops.pop('table'); }
+ ) { return r; }
+
table_lines
- = //& { console.warn('enter table_lines: ' + input.substr(peg$currPos, 20));
return true; }
+ = // & { console.warn('enter table_lines: ' + input.substr(peg$currPos,
20)); return true; }
(! inline_breaks / & '{{!}}' )
r:(
& { return stops.push('table', true); }
tl:table_line
- tls:(
- nls:optionalNewlines
- s:sol
- tl2:table_line { return nls.concat(s, tl2); }
- )*
+ nls:optionalNewlines
{
stops.pop('table');
//console.warn('table_lines: ' + pp(tl.concat(tls)));
- return tl.concat( tls );
+ return tl.concat(nls);
}
/ & { return stops.pop('table'); }
) { return r; }
// This production assumes start-of-line position!
table_line
- = (space / comment)* (table_start_tag
- / table_heading_tags
- / table_row_tag
- / table_data_tags
- / table_caption_tag
- / table_end_tag)
+ = table_start_tag
+ / table_content_line
+ / table_end_tag
+table_content_line = (space / comment)* (
+ table_heading_tags
+ / table_row_tag
+ / table_data_tags
+ / table_caption_tag
+ )
table_start_tag
- = b:"{" p:pipe
+ = sc:(space / comment)* b:"{" p:pipe
// ok to normalize away stray |} on rt (see bug 57360)
& { return stops.push('table', false); }
ta:(generic_attribute / broken_table_attribute_name_char)*
@@ -1613,7 +1628,7 @@
tblStart.attribs = ta;
}
- return [tblStart];
+ return sc.concat([tblStart]);
}
table_caption_tag
@@ -1627,9 +1642,8 @@
.concat([new EndTagTk('caption')]);
}
-
table_row_tag
- = //& { console.warn("table row enter @" + input.substr(peg$currPos, 30));
return true; }
+ = // & { console.warn("table row enter @" + input.substr(peg$currPos, 30));
return true; }
// avoid recursion via nested_block_in_table
! { return stops.onStack('tableDataBlock'); }
p:pipe dashes:"-"+
@@ -1696,7 +1710,7 @@
}
table_data_tag
- = //& { dp("table_data enter, pos=" + peg$currPos +
input.substr(peg$currPos,10)); return true; }
+ = // & { console.warn("table_data_tag enter @" + input.substr(peg$currPos,
30)); return true; }
! "}"
arg:row_syntax_table_args?
//& { console.warn("past attrib, pos=" + peg$currPos +
input.substr(peg$currPos,10)); return true; }
@@ -1740,14 +1754,14 @@
/ & { return stops.pop('th'); }
table_end_tag
- = space* p:pipe b:"}" {
+ = sc:(space / comment)* p:pipe b:"}" {
var tblEnd = new EndTagTk( 'table', [], { tsr: [peg$reportedPos,
peg$currPos] } );
if (p !== "|") {
// p+"<brace-char>" is triggering some bug in pegJS
// I cannot even use that expression in the comment!
tblEnd.dataAttribs.endTagSrc = p+b;
}
- return [ tblEnd ];
+ return sc.concat([tblEnd]);
}
/**
--
To view, visit https://gerrit.wikimedia.org/r/161108
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I84aab52a04269ce58c274ad3f279600b358e8e14
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits