[MediaWiki-commits] [Gerrit] Move more escaping logic to Escaper - change (mediawiki...CirrusSearch)

2014-07-25 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: Move more escaping logic to Escaper
..


Move more escaping logic to Escaper

Change-Id: I626e9dc5edd63ff4666dbc38d220d6fffcfa2055
---
M includes/Search/Escaper.php
M includes/Searcher.php
2 files changed, 126 insertions(+), 125 deletions(-)

Approvals:
  Chad: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/Search/Escaper.php b/includes/Search/Escaper.php
index 466d09b..f7ec1e5 100644
--- a/includes/Search/Escaper.php
+++ b/includes/Search/Escaper.php
@@ -42,6 +42,116 @@
return $text;
}
 
+   /**
+* Make sure the query string part is well formed by escaping some 
syntax that we don't
+* want users to get direct access to and making sure quotes are 
balanced.
+* These special characters _aren't_ escaped:
+* * and ?: Do a wildcard search against the stemmed text which isn't 
strictly a good
+* idea but this is so rarely used that adding extra code to flip 
prefix searches into
+* real prefix searches isn't really worth it.
+* ~: Do a fuzzy match against the stemmed text which isn't strictly a 
good idea but it
+* gets the job done and fuzzy matches are a really rarely used feature 
to be creating an
+* extra index for.
+* ": Perform a phrase search for the quoted term.  If the "s aren't 
balanced we insert one
+* at the end of the term to make sure elasticsearch doesn't barf at us.
+*/
+   public function fixupQueryStringPart( $string ) {
+   $profiler = new ProfileSection( __METHOD__ );
+
+   // Escape characters that can be escaped with \\
+   $string = preg_replace( '/(
+   \(| (?# no user supplied groupings)
+   \)|
+   \{| (?# no exclusive range queries)
+   }|
+   \[| (?# no inclusive range queries either)
+   ]|
+   \^| (?# no user supplied boosts at this 
point, though I cant think why)
+   :|  (?# no specifying your own 
fields)
+   \\\(?!) (?# the only acceptable escaping is 
for quotes)
+   )/x', '\\\$1', $string );
+   // Forward slash escaping doesn't work properly in all 
environments so we just eat them.   Nom.
+   $string = str_replace( '/', ' ', $string );
+
+   // Elasticsearch's query strings can't abide unbalanced quotes
+   return $this-balanceQuotes( $string );
+   }
+
+   /**
+* Make sure that all operators and lucene syntax is used correctly in 
the query string
+* and store if this is a fuzzy query.
+* If it isn't then the syntax is escaped so it becomes part of the query 
text.
+* @return array(string, boolean) (fixedup query string, is this a 
fuzzy query?)
+*/
+   public function fixupWholeQueryString( $string ) {
+   $profiler = new ProfileSection( __METHOD__ );
+
+   // Be careful when editing this method because the ordering of 
the replacements matters.
+
+   // Escape ~ that don't follow a term or a quote
+   $string = preg_replace_callback( '/(?![\w])~/',
+   'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
+
+   // Remove ? and * that don't follow a term.  These are slow so 
we turned them off and escaping isn't working
+   $string = preg_replace( '/(?![\w])([?*])/', '', $string );
+
+   // Reduce token ranges to bare tokens without the < or >
+   $string = preg_replace( '/(?:|)([^\s])/', '$1', $string );
+
+   // Turn bad fuzzy searches into searches that contain a ~ and 
set $this->fuzzyQuery for good ones.
+   $fuzzyQuery = false;
+   $string = preg_replace_callback( 
'/(?leading\w)~(?trailing\S*)/',
+   function ( $matches ) use ( $fuzzyQuery ) {
+   if ( preg_match( 
'/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/', $matches[ 'trailing' ] ) ) {
+   $fuzzyQuery = true;
+   return $matches[ 0 ];
+   } else {
+   return $matches[ 'leading' ] . '\\~' .
+   preg_replace( '/(?!)~/', 
'\~', $matches[ 'trailing' ] );
+   }
+   }, $string );
+
+   // Turn bad proximity searches into searches that contain a ~
+   $string = preg_replace_callback( '/~(?trailing\S*)/', 
function ( $matches ) {
+   

[MediaWiki-commits] [Gerrit] Move more escaping logic to Escaper - change (mediawiki...CirrusSearch)

2014-07-24 Thread Manybubbles (Code Review)
Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/148990

Change subject: Move more escaping logic to Escaper
..

Move more escaping logic to Escaper

Change-Id: I626e9dc5edd63ff4666dbc38d220d6fffcfa2055
---
M includes/Search/Escaper.php
M includes/Searcher.php
2 files changed, 126 insertions(+), 125 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/90/148990/1

diff --git a/includes/Search/Escaper.php b/includes/Search/Escaper.php
index 466d09b..f7ec1e5 100644
--- a/includes/Search/Escaper.php
+++ b/includes/Search/Escaper.php
@@ -42,6 +42,116 @@
return $text;
}
 
+   /**
+* Make sure the query string part is well formed by escaping some 
syntax that we don't
+* want users to get direct access to and making sure quotes are 
balanced.
+* These special characters _aren't_ escaped:
+* * and ?: Do a wildcard search against the stemmed text which isn't 
strictly a good
+* idea but this is so rarely used that adding extra code to flip 
prefix searches into
+* real prefix searches isn't really worth it.
+* ~: Do a fuzzy match against the stemmed text which isn't strictly a 
good idea but it
+* gets the job done and fuzzy matches are a really rarely used feature 
to be creating an
+* extra index for.
+* ": Perform a phrase search for the quoted term.  If the "s aren't 
balanced we insert one
+* at the end of the term to make sure elasticsearch doesn't barf at us.
+*/
+   public function fixupQueryStringPart( $string ) {
+   $profiler = new ProfileSection( __METHOD__ );
+
+   // Escape characters that can be escaped with \\
+   $string = preg_replace( '/(
+   \(| (?# no user supplied groupings)
+   \)|
+   \{| (?# no exclusive range queries)
+   }|
+   \[| (?# no inclusive range queries either)
+   ]|
+   \^| (?# no user supplied boosts at this 
point, though I cant think why)
+   :|  (?# no specifying your own 
fields)
+   \\\(?!) (?# the only acceptable escaping is 
for quotes)
+   )/x', '\\\$1', $string );
+   // Forward slash escaping doesn't work properly in all 
environments so we just eat them.   Nom.
+   $string = str_replace( '/', ' ', $string );
+
+   // Elasticsearch's query strings can't abide unbalanced quotes
+   return $this-balanceQuotes( $string );
+   }
+
+   /**
+* Make sure that all operators and lucene syntax is used correctly in 
the query string
+* and store if this is a fuzzy query.
+* If it isn't then the syntax is escaped so it becomes part of the query 
text.
+* @return array(string, boolean) (fixedup query string, is this a 
fuzzy query?)
+*/
+   public function fixupWholeQueryString( $string ) {
+   $profiler = new ProfileSection( __METHOD__ );
+
+   // Be careful when editing this method because the ordering of 
the replacements matters.
+
+   // Escape ~ that don't follow a term or a quote
+   $string = preg_replace_callback( '/(?![\w])~/',
+   'CirrusSearch\Search\Escaper::escapeBadSyntax', $string 
);
+
+   // Remove ? and * that don't follow a term.  These are slow so 
we turned them off and escaping isn't working
+   $string = preg_replace( '/(?![\w])([?*])/', '', $string );
+
+   // Reduce token ranges to bare tokens without the < or >
+   $string = preg_replace( '/(?:|)([^\s])/', '$1', $string );
+
+   // Turn bad fuzzy searches into searches that contain a ~ and 
set $this->fuzzyQuery for good ones.
+   $fuzzyQuery = false;
+   $string = preg_replace_callback( 
'/(?leading\w)~(?trailing\S*)/',
+   function ( $matches ) use ( $fuzzyQuery ) {
+   if ( preg_match( 
'/^(?:|0|(?:0?\.[0-9]+)|(?:1(?:\.0)?))$/', $matches[ 'trailing' ] ) ) {
+   $fuzzyQuery = true;
+   return $matches[ 0 ];
+   } else {
+   return $matches[ 'leading' ] . '\\~' .
+   preg_replace( '/(?!)~/', 
'\~', $matches[ 'trailing' ] );
+   }
+   }, $string );
+
+   // Turn bad proximity searches into searches that contain a ~
+   $string =