jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/391348 )
Change subject: Use Remex in Sanitizer::stripAllTags() ...................................................................... Use Remex in Sanitizer::stripAllTags() Using a real HTML tokenizer fixes bugs when < or > appear in attribute values. The old implementation used delimiterReplace(), which didn't handle this case: > print Sanitizer::stripAllTags( '<p data-foo="a<b>c">Hello</p>' ); c">Hello We also can't use PHP's built-in strip_tags() because it doesn't handle <?php and <? correctly: > print strip_tags('1<span class="<?php">2</span>3'); 1 > print strip_tags('1<span class="<?">2</span>3'); 1 Bug: T179978 Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e --- M autoload.php A includes/parser/RemexStripTagHandler.php M includes/parser/Sanitizer.php M tests/phpunit/includes/parser/SanitizerTest.php 4 files changed, 57 insertions(+), 12 deletions(-) Approvals: jenkins-bot: Verified Jforrester: Looks good to me, approved diff --git a/autoload.php b/autoload.php index 2f6fbda..3f5a3c1 100644 --- a/autoload.php +++ b/autoload.php @@ -1218,6 +1218,7 @@ 'RefreshLinks' => __DIR__ . '/maintenance/refreshLinks.php', 'RefreshLinksJob' => __DIR__ . '/includes/jobqueue/jobs/RefreshLinksJob.php', 'RegexlikeReplacer' => __DIR__ . '/includes/libs/replacers/RegexlikeReplacer.php', + 'RemexStripTagHandler' => __DIR__ . '/includes/parser/RemexStripTagHandler.php', 'RemoveInvalidEmails' => __DIR__ . '/maintenance/removeInvalidEmails.php', 'RemoveUnusedAccounts' => __DIR__ . '/maintenance/removeUnusedAccounts.php', 'RenameDbPrefix' => __DIR__ . '/maintenance/renameDbPrefix.php', diff --git a/includes/parser/RemexStripTagHandler.php b/includes/parser/RemexStripTagHandler.php new file mode 100644 index 0000000..2839147 --- /dev/null +++ b/includes/parser/RemexStripTagHandler.php @@ -0,0 +1,40 @@ +<?php + +use RemexHtml\Tokenizer\Attributes; +use RemexHtml\Tokenizer\TokenHandler; +use RemexHtml\Tokenizer\Tokenizer; + +/** + * @internal + */ +class RemexStripTagHandler implements TokenHandler { + private $text = ''; + public function getResult() { + return $this->text; + } + + function startDocument( Tokenizer $t, $fns, $fn ) { + // Do nothing. + } + function endDocument( $pos ) { + // Do nothing. + } + function error( $text, $pos ) { + // Do nothing. + } + function characters( $text, $start, $length, $sourceStart, $sourceLength ) { + $this->text .= substr( $text, $start, $length ); + } + function startTag( $name, Attributes $attrs, $selfClose, $sourceStart, $sourceLength ) { + // Do nothing. + } + function endTag( $name, $sourceStart, $sourceLength ) { + // Do nothing. + } + function doctype( $name, $public, $system, $quirks, $sourceStart, $sourceLength ) { + // Do nothing. + } + function comment( $text, $sourceStart, $sourceLength ) { + // Do nothing. + } +} diff --git a/includes/parser/Sanitizer.php b/includes/parser/Sanitizer.php index 4c99677..7c9f563 100644 --- a/includes/parser/Sanitizer.php +++ b/includes/parser/Sanitizer.php @@ -1967,17 +1967,22 @@ * Warning: this return value must be further escaped for literal * inclusion in HTML output as of 1.10! * - * @param string $text HTML fragment + * @param string $html HTML fragment * @return string */ - static function stripAllTags( $text ) { - # Actual <tags> - $text = StringUtils::delimiterReplace( '<', '>', '', $text ); + static function stripAllTags( $html ) { + // Use RemexHtml to tokenize $html and extract the text + $handler = new RemexStripTagHandler; + $tokenizer = new RemexHtml\Tokenizer\Tokenizer( $handler, $html, [ + 'ignoreErrors' => true, + // don't ignore char refs, we want them to be decoded + 'ignoreNulls' => true, + 'skipPreprocess' => true, + ] ); + $tokenizer->execute(); + $text = $handler->getResult(); - # Normalize &entities and whitespace - $text = self::decodeCharReferences( $text ); $text = self::normalizeWhitespace( $text ); - return $text; } diff --git a/tests/phpunit/includes/parser/SanitizerTest.php b/tests/phpunit/includes/parser/SanitizerTest.php index 269575b..d7e72e1 100644 --- a/tests/phpunit/includes/parser/SanitizerTest.php +++ b/tests/phpunit/includes/parser/SanitizerTest.php @@ -530,11 +530,10 @@ [ '<p id="one">Foo</p><p id="two">Bar</p>', 'FooBar' ], [ "<p>Foo</p>\n<p>Bar</p>", 'Foo Bar' ], [ '<p>Hello <strong> world café</p>', 'Hello <strong> world cafĂ©' ], - // This one is broken, see T179978 - //[ - // '<p><small data-foo=\'bar"<baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>', - // 'Bar Whee!' - //], + [ + '<p><small data-foo=\'bar"<baz>quux\'><a href="./Foo">Bar</a></small> Whee!</p>', + 'Bar Whee!' + ], [ '1<span class="<?php">2</span>3', '123' ], [ '1<span class="<?">2</span>3', '123' ], ]; -- To view, visit https://gerrit.wikimedia.org/r/391348 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I53b98e6c877c00c03ff110914168b398559c9c3e Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Catrope <r...@wikimedia.org> Gerrit-Reviewer: Catrope <r...@wikimedia.org> Gerrit-Reviewer: Jackmcbarn <jackmcb...@gmail.com> Gerrit-Reviewer: Jforrester <jforres...@wikimedia.org> Gerrit-Reviewer: TTO <at.li...@live.com.au> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits