Thiemo Mättig (WMDE) has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/203112

Change subject: Simplify StringNormalizer regex
......................................................................

Simplify StringNormalizer regex

This does not change any semantics. All it does is switch the order of
the two regex replacements.

Before:
1. Remove spaces and controls from the start and end.
2. Replace all remaining controls with spaces.

After:
1. Replace all controls with spaces.
2. Trim spaces from the start and end.

Change-Id: I5be82205d0c6a819b0a0e9d04416c9b6c589f1ab
---
M lib/includes/StringNormalizer.php
M lib/includes/store/sql/TermSqlIndex.php
2 files changed, 8 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/12/203112/1

diff --git a/lib/includes/StringNormalizer.php 
b/lib/includes/StringNormalizer.php
index 5bead8c..3439506 100644
--- a/lib/includes/StringNormalizer.php
+++ b/lib/includes/StringNormalizer.php
@@ -87,11 +87,11 @@
        public function trimWhitespace( $inputString ) {
                $inputString = $this->trimBadChars( $inputString );
 
-               // \p{Z} - whitespace
-               // \p{Cc} - control chars
                // WARNING: *any* invalid UTF8 sequence causes preg_replace to 
return an empty string.
-               $trimmed = preg_replace( 
'/^(\p{Z}|\p{Cc})+|(\p{Z}|\p{Cc})+$/u', '', $inputString );
-               $trimmed = preg_replace( '/[\p{Cc}]+/u', ' ', $trimmed );
+               // \p{Cc} only includes general control characters.
+               $trimmed = preg_replace( '/\p{Cc}+/u', ' ', $inputString );
+               // \p{Z} includes all whitespace characters and invisible 
separators.
+               $trimmed = preg_replace( '/^\p{Z}+|\p{Z}+$/u', '', $trimmed );
                return $trimmed;
        }
 
diff --git a/lib/includes/store/sql/TermSqlIndex.php 
b/lib/includes/store/sql/TermSqlIndex.php
index c26f724..345742a 100644
--- a/lib/includes/store/sql/TermSqlIndex.php
+++ b/lib/includes/store/sql/TermSqlIndex.php
@@ -968,12 +968,11 @@
                        wfWarn( "Unicode normalization failed for `$text`" );
                }
 
-               // \p{Z} - whitespace
-               // \p{C} - control chars
                // WARNING: *any* invalid UTF8 sequence causes preg_replace to 
return an empty string.
-               $strippedText = $nfcText;
-               $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u', 
' ', $strippedText );
-               $strippedText = preg_replace( '/^[\p{Z}]+|[\p{Z}]+$/u', '', 
$strippedText );
+               // Control character classes excluding private use areas.
+               $strippedText = preg_replace( '/[\p{Cc}\p{Cf}\p{Cn}\p{Cs}]+/u', 
' ', $nfcText );
+               // \p{Z} includes all whitespace characters and invisible 
separators.
+               $strippedText = preg_replace( '/^\p{Z}+|\p{Z}+$/u', '', 
$strippedText );
 
                if ( $strippedText === '' ) {
                        // NOTE: This happens when there is only whitespace in 
the string.

-- 
To view, visit https://gerrit.wikimedia.org/r/203112
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5be82205d0c6a819b0a0e9d04416c9b6c589f1ab
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to