[MediaWiki-commits] [Gerrit] Move more escaping logic to Escaper - change (mediawiki...CirrusSearch)
jenkins-bot has submitted this change and it was merged. Change subject: Move more escaping logic to Escaper .. Move more escaping logic to Escaper Change-Id: I626e9dc5edd63ff4666dbc38d220d6fffcfa2055 --- M includes/Search/Escaper.php M includes/Searcher.php 2 files changed, 126 insertions(+), 125 deletions(-) Approvals: Chad: Looks good to me, approved jenkins-bot: Verified diff --git a/includes/Search/Escaper.php b/includes/Search/Escaper.php index 466d09b..f7ec1e5 100644 --- a/includes/Search/Escaper.php +++ b/includes/Search/Escaper.php @@ -42,6 +42,116 @@ return $text; } + /** +* Make sure the the query string part is well formed by escaping some syntax that we don't +* want users to get direct access to and making sure quotes are balanced. +* These special characters _aren't_ escaped: +* * and ?: Do a wildcard search against the stemmed text which isn't strictly a good +* idea but this is so rarely used that adding extra code to flip prefix searches into +* real prefix searches isn't really worth it. +* ~: Do a fuzzy match against the stemmed text which isn't strictly a good idea but it +* gets the job done and fuzzy matches are a really rarely used feature to be creating an +* extra index for. +* ": Perform a phrase search for the quoted term. If the "s aren't balanced we insert one +* at the end of the term to make sure elasticsearch doesn't barf at us. +*/ + public function fixupQueryStringPart( $string ) { + $profiler = new ProfileSection( __METHOD__ ); + + // Escape characters that can be escaped with \\ + $string = preg_replace( '/( + \(| (?# no user supplied groupings) + \)| + \{| (?# no exclusive range queries) + }| + \[| (?# no inclusive range queries either) + ]| + \^| (?# no user supplied boosts at this point, though I cant think why) + :| (?# no specifying your own fields) + \\\(?!") (?# the only acceptable escaping is for quotes) + )/x', '\\\$1', $string ); + // Forward slash escaping doesn't work properly in all environments so we just eat them. 
Nom. + $string = str_replace( '/', ' ', $string ); + + // Elasticsearch's query strings can't abide unbalanced quotes + return $this->balanceQuotes( $string ); + } + + /** +* Make sure that all operators and lucene syntax is used correctly in the query string +* and store if this is a fuzzy query. +* If it isn't then the syntax escaped so it becomes part of the query text. +* @return array(string, boolean) (fixedup query string, is this a fuzzy query?) +*/ + public function fixupWholeQueryString( $string ) { + $profiler = new ProfileSection( __METHOD__ ); + + // Be careful when editing this method because the ordering of the replacements matters. + + // Escape ~ that don't follow a term or a quote + $string = preg_replace_callback( '/(?<![\w"])~/', + 'CirrusSearch\Search\Escaper::escapeBadSyntax', $string ); + + // Remove ? and * that don't follow a term. These are slow so we turned them off and escaping isn't working + $string = preg_replace( '/(?<![\w])([?*])/', '', $string ); + + // Reduce token ranges to bare tokens without the < or > + $string = preg_replace( '/(?:<|>)([^\s])/', '$1', $string ); + + // Turn bad fuzzy searches into searches that contain a ~ and set $this->fuzzyQuery for good ones. + $fuzzyQuery = false; + $string = preg_replace_callback( '/(?<leading>\w)~(?<trailing>\S*)/', + function ( $matches ) use ( $fuzzyQuery ) { + if ( preg_match( '/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/', $matches[ 'trailing' ] ) ) { + $fuzzyQuery = true; + return $matches[ 0 ]; + } else { + return $matches[ 'leading' ] . '\\~' . + preg_replace( '/(?!)~/', '\~', $matches[ 'trailing' ] ); + } + }, $string ); + + // Turn bad proximity searches into searches that contain a ~ + $string = preg_replace_callback( '/"~(?<trailing>\S*)/', function ( $matches ) { +
[MediaWiki-commits] [Gerrit] Move more escaping logic to Escaper - change (mediawiki...CirrusSearch)
Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/148990 Change subject: Move more escaping logic to Escaper .. Move more escaping logic to Escaper Change-Id: I626e9dc5edd63ff4666dbc38d220d6fffcfa2055 --- M includes/Search/Escaper.php M includes/Searcher.php 2 files changed, 126 insertions(+), 125 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/90/148990/1 diff --git a/includes/Search/Escaper.php b/includes/Search/Escaper.php index 466d09b..f7ec1e5 100644 --- a/includes/Search/Escaper.php +++ b/includes/Search/Escaper.php @@ -42,6 +42,116 @@ return $text; } + /** +* Make sure the the query string part is well formed by escaping some syntax that we don't +* want users to get direct access to and making sure quotes are balanced. +* These special characters _aren't_ escaped: +* * and ?: Do a wildcard search against the stemmed text which isn't strictly a good +* idea but this is so rarely used that adding extra code to flip prefix searches into +* real prefix searches isn't really worth it. +* ~: Do a fuzzy match against the stemmed text which isn't strictly a good idea but it +* gets the job done and fuzzy matches are a really rarely used feature to be creating an +* extra index for. +* ": Perform a phrase search for the quoted term. If the "s aren't balanced we insert one +* at the end of the term to make sure elasticsearch doesn't barf at us. +*/ + public function fixupQueryStringPart( $string ) { + $profiler = new ProfileSection( __METHOD__ ); + + // Escape characters that can be escaped with \\ + $string = preg_replace( '/( + \(| (?# no user supplied groupings) + \)| + \{| (?# no exclusive range queries) + }| + \[| (?# no inclusive range queries either) + ]| + \^| (?# no user supplied boosts at this point, though I cant think why) + :| (?# no specifying your own fields) + \\\(?!") 
(?# the only acceptable escaping is for quotes) + )/x', '\\\$1', $string ); + // Forward slash escaping doesn't work properly in all environments so we just eat them. Nom. + $string = str_replace( '/', ' ', $string ); + + // Elasticsearch's query strings can't abide unbalanced quotes + return $this->balanceQuotes( $string ); + } + + /** +* Make sure that all operators and lucene syntax is used correctly in the query string +* and store if this is a fuzzy query. +* If it isn't then the syntax escaped so it becomes part of the query text. +* @return array(string, boolean) (fixedup query string, is this a fuzzy query?) +*/ + public function fixupWholeQueryString( $string ) { + $profiler = new ProfileSection( __METHOD__ ); + + // Be careful when editing this method because the ordering of the replacements matters. + + // Escape ~ that don't follow a term or a quote + $string = preg_replace_callback( '/(?<![\w"])~/', + 'CirrusSearch\Search\Escaper::escapeBadSyntax', $string ); + + // Remove ? and * that don't follow a term. These are slow so we turned them off and escaping isn't working + $string = preg_replace( '/(?<![\w])([?*])/', '', $string ); + + // Reduce token ranges to bare tokens without the < or > + $string = preg_replace( '/(?:<|>)([^\s])/', '$1', $string ); + + // Turn bad fuzzy searches into searches that contain a ~ and set $this->fuzzyQuery for good ones. + $fuzzyQuery = false; + $string = preg_replace_callback( '/(?<leading>\w)~(?<trailing>\S*)/', + function ( $matches ) use ( $fuzzyQuery ) { + if ( preg_match( '/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/', $matches[ 'trailing' ] ) ) { + $fuzzyQuery = true; + return $matches[ 0 ]; + } else { + return $matches[ 'leading' ] . '\\~' . + preg_replace( '/(?!)~/', '\~', $matches[ 'trailing' ] ); + } + }, $string ); + + // Turn bad proximity searches into searches that contain a ~ + $string =