Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/94373


Change subject: Term containing * match against unstemmed text
......................................................................

Term containing * match against unstemmed text

Matching terms containing * against stemmed text didn't work too well
because Elasticsearch didn't stem the terms so things like pi*les would
match nothing while pi*le would match "pickles".  This is truly
backwards.

Bug: 56163
Change-Id: I1e1a56616409e0ebcf84117287bd11087044bab5
---
M includes/CirrusSearchSearcher.php
1 file changed, 70 insertions(+), 23 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/73/94373/1

diff --git a/includes/CirrusSearchSearcher.php 
b/includes/CirrusSearchSearcher.php
index ce72ae6..0461c67 100644
--- a/includes/CirrusSearchSearcher.php
+++ b/includes/CirrusSearchSearcher.php
@@ -170,36 +170,43 @@
                );
                $this->filters = $filters;
                wfProfileOut( __METHOD__ . '-other-filters' );
-               wfProfileIn( __METHOD__ . '-find-phrase-queries-and-escape' );
-               $query = array();
-               $matches = array();
-               $offset = 0;
-               while ( preg_match( 
'/(?<main>"([^"]+)"(?:~[0-9]+)?)(?<fuzzy>~)?/',
-                               $term, $matches, PREG_OFFSET_CAPTURE, $offset ) 
) {
-                       $startOffset = $matches[ 0 ][ 1 ];
-                       if ( $startOffset > $offset ) {
-                               $query[] = self::fixupQueryStringPart( substr( 
$term, $offset, $startOffset - $offset ) );
-                       }
+               wfProfileIn( __METHOD__ . '-switch-phrase-queries-to-plain' );
+               $query = self::replacePartsOfQuery( $term, 
'/(?<main>"([^"]+)"(?:~[0-9]+)?)(?<fuzzy>~)?/',
+                       function ( $matches ) use ( $showRedirects ) {
+                               $main = 
CirrusSearchSearcher::fixupQueryStringPart( $matches[ 'main' ][ 0 ] );
+                               if ( !isset( $matches[ 'fuzzy' ] ) ) {
+                                       $main = 
CirrusSearchSearcher::switchSearchToExact( $main, $showRedirects );
+                               }
+                               return array( 'escaped' => $main );
+                       } );
+               wfProfileOut( __METHOD__ . '-find-phrase-queries' );
+               wfProfileIn( __METHOD__ . '-switch-prefix-to-plain' );
+               $query = self::replaceAllPartsOfQuery( $query, 
'/\w*\*(?:\w*\*?)*/',
+                       function ( $matches ) use ( $showRedirects ) {
+                               $term = 
CirrusSearchSearcher::fixupQueryStringPart( $matches[ 0 ][ 0 ] );
+                               return array( 'escaped' => 
CirrusSearchSearcher::switchSearchToExact( $term, $showRedirects ) );
+                       } );
+               wfProfileOut( __METHOD__ . '-switch-phrase-queries-to-plain' );
 
-                       $main = self::fixupQueryStringPart( $matches[ 'main' ][ 
0 ] );
-                       if ( isset( $matches[ 'fuzzy' ] ) ) {
-                               $query[] = $main;
-                       } else {
-                               $main = $main;
-                               $exact = join( ' OR ', 
self::buildFullTextSearchFields( $showRedirects, ".plain:$main" ) );
-                               $query[] = "($exact)";
+               wfProfileIn( __METHOD__ . '-escape' );
+               $escapedQuery = array();
+               foreach ( $query as $queryPart ) {
+                       if ( isset( $queryPart[ 'escaped' ] ) ) {
+                               $escapedQuery[] = $queryPart[ 'escaped' ];
+                               continue;
                        }
-                       $offset = $startOffset + strlen( $matches[ 0 ][ 0 ] );
+                       if ( isset( $queryPart[ 'raw' ] ) ) {
+                               $escapedQuery[] = self::fixupQueryStringPart( 
$queryPart[ 'raw' ] );
+                               continue;
+                       }
+                       wfLogWarning( 'Unknown query part:  ' . serialize( 
$queryPart ) );
                }
-               if ( $offset < strlen( $term ) ) {
-                       $query[] = self::fixupQueryStringPart( substr( $term, 
$offset ) );
-               }
-               wfProfileOut( __METHOD__ . '-find-phrase-queries-and-escape' );
+               wfProfileOut( __METHOD__ . '-escape' );
 
                // Actual text query
                if ( count( $query ) > 0 ) {
                        wfProfileIn( __METHOD__ . '-build-query' );
-                       $queryStringQueryString = self::fixupWholeQueryString( 
implode( ' ', $query ) );
+                       $queryStringQueryString = self::fixupWholeQueryString( 
implode( ' ', $escapedQuery ) );
                        $fields = self::buildFullTextSearchFields( 
$showRedirects );
                        $this->query = $this->buildSearchTextQuery( $fields, 
$queryStringQueryString );
 
@@ -302,6 +309,41 @@
                $result = $getWork->execute();
                wfProfileOut( __METHOD__ );
                return $result;
+       }
+
+       private static function replaceAllPartsOfQuery( $query, $regex, 
$callable ) {
+               $result = array();
+               foreach ( $query as $queryPart ) {
+                       if ( isset( $queryPart[ 'raw' ] ) ) {
+                               $result = array_merge( $result, 
self::replacePartsOfQuery( $queryPart[ 'raw' ], $regex, $callable ) );
+                               continue;
+                       }
+                       $result[] = $queryPart;
+               }
+               return $result;
+       }
+
+       private static function replacePartsOfQuery( $queryPart, $regex, 
$callable ) {
+               $destination = array();
+               $matches = array();
+               $offset = 0;
+               while ( preg_match( $regex, $queryPart, $matches, 
PREG_OFFSET_CAPTURE, $offset ) ) {
+                       $startOffset = $matches[ 0 ][ 1 ];
+                       if ( $startOffset > $offset ) {
+                               $destination[] = array( 'raw' => substr( 
$queryPart, $offset, $startOffset - $offset ) );
+                       }
+
+                       $callableResult = call_user_func( $callable, $matches );
+                       if ( $callableResult ) {
+                               $destination[] = $callableResult;
+                       }
+
+                       $offset = $startOffset + strlen( $matches[ 0 ][ 0 ] );
+               }
+               if ( $offset < strlen( $queryPart ) ) {
+                       $destination[] = array( 'raw' => substr( $queryPart, 
$offset ) );
+               }
+               return $destination;
        }
 
        /**
@@ -429,6 +471,11 @@
                );
        }
 
+       public static function switchSearchToExact( $term, $showRedirects ) {
+               $exact = join( ' OR ', 
CirrusSearchSearcher::buildFullTextSearchFields( $showRedirects, ".plain:$term" 
) );
+               return "($exact)";
+       }
+
        /**
         * Build fields searched by full text search.
         * @param $includeRedirects bool show redirects be included

-- 
To view, visit https://gerrit.wikimedia.org/r/94373
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I1e1a56616409e0ebcf84117287bd11087044bab5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to