Thiemo Mättig (WMDE) has uploaded a new change for review. https://gerrit.wikimedia.org/r/203112
Change subject: Simplify StringNormalizer regex ...................................................................... Simplify StringNormalizer regex This does not change any semantics. All it does is swap the order of the two regexes. Before: 1. Remove spaces and controls from the start and end. 2. Replace all remaining controls with spaces. After: 1. Replace all controls with spaces. 2. Trim spaces from the start and end. Change-Id: I5be82205d0c6a819b0a0e9d04416c9b6c589f1ab --- M lib/includes/StringNormalizer.php M lib/includes/store/sql/TermSqlIndex.php 2 files changed, 8 insertions(+), 9 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/12/203112/1 diff --git a/lib/includes/StringNormalizer.php b/lib/includes/StringNormalizer.php index 5bead8c..3439506 100644 --- a/lib/includes/StringNormalizer.php +++ b/lib/includes/StringNormalizer.php @@ -87,11 +87,11 @@ public function trimWhitespace( $inputString ) { $inputString = $this->trimBadChars( $inputString ); - // \p{Z} - whitespace - // \p{Cc} - control chars // WARNING: *any* invalid UTF8 sequence causes preg_replace to return an empty string. - $trimmed = preg_replace( '/^(\p{Z}|\p{Cc})+|(\p{Z}|\p{Cc})+$/u', '', $inputString ); - $trimmed = preg_replace( '/[\p{Cc}]+/u', ' ', $trimmed ); + // \p{Cc} only includes general control characters. + $trimmed = preg_replace( '/\p{Cc}+/u', ' ', $inputString ); + // \p{Z} includes all whitespace characters and invisible separators. 
+ $trimmed = preg_replace( '/^\p{Z}+|\p{Z}+$/u', '', $trimmed ); return $trimmed; } diff --git a/lib/includes/store/sql/TermSqlIndex.php b/lib/includes/store/sql/TermSqlIndex.php index c26f724..345742a 100644 --- a/lib/includes/store/sql/TermSqlIndex.php +++ b/lib/includes/store/sql/TermSqlIndex.php @@ -968,12 +968,11 @@ wfWarn( "Unicode normalization failed for `$text`" ); } - // \p{Z} - whitespace - // \p{C} - control chars // WARNING: *any* invalid UTF8 sequence causes preg_replace to return an empty string. - $strippedText = $nfcText; - $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u', ' ', $strippedText ); - $strippedText = preg_replace( '/^[\p{Z}]+|[\p{Z}]+$/u', '', $strippedText ); + // Control character classes excluding private use areas. + $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u', ' ', $nfcText ); + // \p{Z} includes all whitespace characters and invisible separators. + $strippedText = preg_replace( '/^\p{Z}+|\p{Z}+$/u', '', $strippedText ); if ( $strippedText === '' ) { // NOTE: This happens when there is only whitespace in the string. -- To view, visit https://gerrit.wikimedia.org/r/203112 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5be82205d0c6a819b0a0e9d04416c9b6c589f1ab Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits