jenkins-bot has submitted this change and it was merged. Change subject: Preprocessor: Don't allow unclosed extension tags (matching until end of input) ......................................................................
Preprocessor: Don't allow unclosed extension tags (matching until end of input) I think it's saner to treat this as invalid syntax, and output the mismatched tag code verbatim. The current behavior is particularly annoying for <ref> tags, which often swallow everything afterwards. This does not affect HTML tags, though. Assuming Tidy is enabled, they are still auto-closed at the end of the page content. Related to T17712 and T58306. I think this brings the PHP parser closer to Parsoid's interpretation. It reduces performance somewhat in the worst case, though. Testing with https://phabricator.wikimedia.org/F3245989 (a 1 MB page starting with 3000 opening tags of 15 different types), parsing time rises from ~0.2 seconds to ~1.1 seconds on my setup. We go from O(N) to O(kN), where N is bytes of input and k is the number of types of tags present on the page. Maximum k shouldn't exceed 30 or so in reasonable setups (depends on installed extensions, it's 20 on English Wikipedia). To consider: * Should we keep previous behavior for unclosed <includeonly> / <noinclude>? This would be particularly disruptive for these if someone relied on the old behavior, and they're already special-cased in places. * Unclosed <pre> tags are now treated as HTML tags, and are still displayed as preformatted text, but without suppressing wikitext formatting. Change-Id: Ia2f24dbfb3567c4b0778761585e6c0303d11ddd0 --- M includes/parser/Preprocessor_DOM.php M includes/parser/Preprocessor_Hash.php M tests/parser/parserTests.txt M tests/phpunit/includes/parser/PreprocessorTest.php 4 files changed, 31 insertions(+), 13 deletions(-) Approvals: Tim Starling: Looks good to me, approved jenkins-bot: Verified diff --git a/includes/parser/Preprocessor_DOM.php b/includes/parser/Preprocessor_DOM.php index 4ca3a87..817f153 100644 --- a/includes/parser/Preprocessor_DOM.php +++ b/includes/parser/Preprocessor_DOM.php @@ -237,6 +237,8 @@ $inHeading = false; // True if there are no more greater-than (>) signs right of $i $noMoreGT = false; + // Map of tag name => true if there are no more closing tags of given type right of $i + $noMoreClosingTag = array(); // True to ignore all input up to the next <onlyinclude> $findOnlyinclude = $enableOnlyinclude; // Do a line-start run without outputting an LF character @@ -457,17 +459,21 @@ } else { $attrEnd = $tagEndPos; // Find closing tag - if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", + if ( + !isset( $noMoreClosingTag[$name] ) && + preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) ) { $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 ); $i = $matches[0][1] + strlen( $matches[0][0] ); $close = '<close>' . htmlspecialchars( $matches[0][0] ) . '</close>'; } else { - // No end tag -- let it run out to the end of the text. - $inner = substr( $text, $tagEndPos + 1 ); - $i = $lengthText; - $close = ''; + // No end tag -- don't match the tag, treat opening tag as literal and resume parsing. + $i = $tagEndPos + 1; + $accum .= htmlspecialchars( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) ); + // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>... + $noMoreClosingTag[$name] = true; + continue; } } // <includeonly> and <noinclude> just become <ignore> tags diff --git a/includes/parser/Preprocessor_Hash.php b/includes/parser/Preprocessor_Hash.php index 50eaefb..28c49fd 100644 --- a/includes/parser/Preprocessor_Hash.php +++ b/includes/parser/Preprocessor_Hash.php @@ -160,6 +160,8 @@ $inHeading = false; // True if there are no more greater-than (>) signs right of $i $noMoreGT = false; + // Map of tag name => true if there are no more closing tags of given type right of $i + $noMoreClosingTag = array(); // True to ignore all input up to the next <onlyinclude> $findOnlyinclude = $enableOnlyinclude; // Do a line-start run without outputting an LF character @@ -380,17 +382,21 @@ } else { $attrEnd = $tagEndPos; // Find closing tag - if ( preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", + if ( + !isset( $noMoreClosingTag[$name] ) && + preg_match( "/<\/" . preg_quote( $name, '/' ) . "\s*>/i", $text, $matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 ) ) { $inner = substr( $text, $tagEndPos + 1, $matches[0][1] - $tagEndPos - 1 ); $i = $matches[0][1] + strlen( $matches[0][0] ); $close = $matches[0][0]; } else { - // No end tag -- let it run out to the end of the text. - $inner = substr( $text, $tagEndPos + 1 ); - $i = $lengthText; - $close = null; + // No end tag -- don't match the tag, treat opening tag as literal and resume parsing. + $i = $tagEndPos + 1; + $accum->addLiteral( substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) ); + // Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>... + $noMoreClosingTag[$name] = true; + continue; } } // <includeonly> and <noinclude> just become <ignore> tags diff --git a/tests/parser/parserTests.txt b/tests/parser/parserTests.txt index cd2b769..438fe31 100644 --- a/tests/parser/parserTests.txt +++ b/tests/parser/parserTests.txt @@ -2520,7 +2520,6 @@ </p> !! end -## PHP parser discards the "<pre " string !! test Handle broken pre-like tags (bug 64025) !! options @@ -2531,8 +2530,13 @@ <table><pre </table> !! html/php <pre>x</pre> -<table><pre></pre></table> +<table><pre </table> +!! html/php+tidy +<pre> +x +</pre> +<p><pre</p> !! html/parsoid <pre about="#mwt1" typeof="mw:Transclusion" data-parsoid='{"a":{"<pre":null},"sa":{"<pre":""},"stx":"html","pi":[[{"k":"1","spc":["","","",""]}]]}' data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"<pre <pre>x</pre>"}},"i":0}}]}'>x</pre> @@ -10876,6 +10880,8 @@ !! wikitext <includeonly> !! html +<p><includeonly> +</p> !! end ## We used to, but no longer wt2wt this test since the default serializer diff --git a/tests/phpunit/includes/parser/PreprocessorTest.php b/tests/phpunit/includes/parser/PreprocessorTest.php index 1ebba1a..b940230 100644 --- a/tests/phpunit/includes/parser/PreprocessorTest.php +++ b/tests/phpunit/includes/parser/PreprocessorTest.php @@ -48,7 +48,7 @@ array( "<noinclude> Foo bar </noinclude>", "<root><ignore><noinclude></ignore> Foo bar <ignore></noinclude></ignore></root>" ), array( "<noinclude>\n{{Foo}}\n</noinclude>", "<root><ignore><noinclude></ignore>\n<template lineStart=\"1\"><title>Foo</title></template>\n<ignore></noinclude></ignore></root>" ), array( "<noinclude>\n{{Foo}}\n</noinclude>\n", "<root><ignore><noinclude></ignore>\n<template lineStart=\"1\"><title>Foo</title></template>\n<ignore></noinclude></ignore>\n</root>" ), - array( "<gallery>foo bar", "<root><ext><name>gallery</name><attr></attr><inner>foo bar</inner></ext></root>" ), + array( "<gallery>foo bar", "<root><gallery>foo bar</root>" ), array( "<{{foo}}>", "<root><<template><title>foo</title></template>></root>" ), array( "<{{{foo}}}>", "<root><<tplarg><title>foo</title></tplarg>></root>" ), array( "<gallery></gallery</gallery>", "<root><ext><name>gallery</name><attr></attr><inner></gallery</inner><close></gallery></close></ext></root>" ), -- To view, visit https://gerrit.wikimedia.org/r/255258 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ia2f24dbfb3567c4b0778761585e6c0303d11ddd0 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Bartosz Dziewoński <matma....@gmail.com> Gerrit-Reviewer: Bartosz Dziewoński <matma....@gmail.com> Gerrit-Reviewer: Brion VIBBER <br...@wikimedia.org> Gerrit-Reviewer: Cscott <canan...@wikimedia.org> Gerrit-Reviewer: GWicke <gwi...@wikimedia.org> Gerrit-Reviewer: Jackmcbarn <jackmcb...@gmail.com> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits