C. Scott Ananian has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/340225 )
Change subject: T159174: Strip U+0000 in wikitext ...................................................................... T159174: Strip U+0000 in wikitext U+0000 is not allowed in HTML5, there's no reason to allow it in wikitext. It simplifies our code if we can just strip them at the start. Strip in PST as well so they don't sneak into our database either. Tweaked the EXT_LINK URLs to account for the fact that invalid characters get transformed into U+FFFD when using Preprocessor_DOM. See https://github.com/wikimedia/mediawiki/commit/73649741ed1e2f557aec22a485598b199fdd2d09 for context on that change. Change-Id: I3f67e92b61aacc87a40c3662085c84d1dac08bfb --- M includes/parser/Parser.php M languages/LanguageConverter.php M tests/parser/extraParserTests.txt 3 files changed, 12 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/25/340225/1 diff --git a/includes/parser/Parser.php b/includes/parser/Parser.php index 86aa06a..5f64c92 100644 --- a/includes/parser/Parser.php +++ b/includes/parser/Parser.php @@ -89,13 +89,15 @@ # Everything except bracket, space, or control characters # \p{Zs} is unicode 'separator, space' category. It covers the space 0x20 # as well as U+3000 is IDEOGRAPHIC SPACE for T21052 - const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}]'; + # \x{FFFD} is the Unicode replacement character, which Preprocessor_DOM + # uses to replace invalid HTML characters. + const EXT_LINK_URL_CLASS = '[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]'; # Simplified expression to match an IPv4 or IPv6 address, or # at least one character of a host name (embeds EXT_LINK_URL_CLASS) - const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}])'; + const EXT_LINK_ADDR = '(?:[0-9.]+|\\[(?i:[0-9a-f:.]+)\\]|[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}])'; # RegExp to make image URLs (embeds IPv6 part of EXT_LINK_ADDR) // @codingStandardsIgnoreStart Generic.Files.LineLength - const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}]+) + const EXT_IMAGE_REGEX = '/^(http:\/\/|https:\/\/)((?:\\[(?i:[0-9a-f:.]+)\\])?[^][<>"\\x00-\\x20\\x7F\p{Zs}\x{FFFD}]+) \\/([A-Za-z0-9_.,~%\\-+&;#*?!=()@\\x80-\\xFF]+)\\.((?i)gif|png|jpg|jpeg)$/Sxu'; // @codingStandardsIgnoreEnd @@ -264,7 +266,7 @@ $this->mUrlProtocols = wfUrlProtocols(); $this->mExtLinkBracketedRegex = '/\[(((?i)' . $this->mUrlProtocols . ')' . self::EXT_LINK_ADDR . - self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F]*?)\]/Su'; + self::EXT_LINK_URL_CLASS . '*)\p{Zs}*([^\]\\x00-\\x08\\x0a-\\x1F\\x{FFFD}]*?)\]/Su'; if ( isset( $conf['preprocessorClass'] ) ) { $this->mPreprocessorClass = $conf['preprocessorClass']; } elseif ( defined( 'HPHP_VERSION' ) ) { @@ -417,6 +419,8 @@ $text = strtr( $text, "\x7f", "?" ); $magicScopeVariable = $this->lock(); } + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); $this->startParse( $title, $options, self::OT_HTML, $clearState ); @@ -4462,6 +4466,9 @@ $this->startParse( $title, $options, self::OT_WIKI, $clearState ); $this->setUser( $user ); + // Strip U+0000 NULL (T159174) + $text = str_replace( "\000", '', $text ); + // We still normalize line endings for backwards-compatibility // with other code that just calls PST, but this should already // be handled in TextContent subclasses diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index 06fec44..8607d59 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -380,6 +380,7 @@ $literalBlob = ''; // Guard against delimiter nulls in the input + // (should never happen: see T159174) $text = str_replace( "\000", '', $text ); $markupMatches = null; diff --git a/tests/parser/extraParserTests.txt b/tests/parser/extraParserTests.txt index a48087e..8d042d7 100644 --- a/tests/parser/extraParserTests.txt +++ b/tests/parser/extraParserTests.txt Binary files differ -- To view, visit https://gerrit.wikimedia.org/r/340225 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I3f67e92b61aacc87a40c3662085c84d1dac08bfb Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits