Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/94373
Change subject: Term containing * match against unstemmed text ...................................................................... Term containing * match against unstemmed text Matching terms containing * against stemmed text didn't work too well because Elasticsearch didn't stem the terms so things like pi*les would match nothing while pi*le would match "pickles". This is truly backwards. Bug: 56163 Change-Id: I1e1a56616409e0ebcf84117287bd11087044bab5 --- M includes/CirrusSearchSearcher.php 1 file changed, 70 insertions(+), 23 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/73/94373/1 diff --git a/includes/CirrusSearchSearcher.php b/includes/CirrusSearchSearcher.php index ce72ae6..0461c67 100644 --- a/includes/CirrusSearchSearcher.php +++ b/includes/CirrusSearchSearcher.php @@ -170,36 +170,43 @@ ); $this->filters = $filters; wfProfileOut( __METHOD__ . '-other-filters' ); - wfProfileIn( __METHOD__ . '-find-phrase-queries-and-escape' ); - $query = array(); - $matches = array(); - $offset = 0; - while ( preg_match( '/(?<main>"([^"]+)"(?:~[0-9]+)?)(?<fuzzy>~)?/', - $term, $matches, PREG_OFFSET_CAPTURE, $offset ) ) { - $startOffset = $matches[ 0 ][ 1 ]; - if ( $startOffset > $offset ) { - $query[] = self::fixupQueryStringPart( substr( $term, $offset, $startOffset - $offset ) ); - } + wfProfileIn( __METHOD__ . '-switch-phrase-queries-to-plain' ); + $query = self::replacePartsOfQuery( $term, '/(?<main>"([^"]+)"(?:~[0-9]+)?)(?<fuzzy>~)?/', + function ( $matches ) use ( $showRedirects ) { + $main = CirrusSearchSearcher::fixupQueryStringPart( $matches[ 'main' ][ 0 ] ); + if ( !isset( $matches[ 'fuzzy' ] ) ) { + $main = CirrusSearchSearcher::switchSearchToExact( $main, $showRedirects ); + } + return array( 'escaped' => $main ); + } ); + wfProfileOut( __METHOD__ . '-find-phrase-queries' ); + wfProfileIn( __METHOD__ .
'-switch-prefix-to-plain' ); + $query = self::replaceAllPartsOfQuery( $query, '/\w*\*(?:\w*\*?)*/', + function ( $matches ) use ( $showRedirects ) { + $term = CirrusSearchSearcher::fixupQueryStringPart( $matches[ 0 ][ 0 ] ); + return array( 'escaped' => CirrusSearchSearcher::switchSearchToExact( $term, $showRedirects ) ); + } ); + wfProfileOut( __METHOD__ . '-switch-phrase-queries-to-plain' ); - $main = self::fixupQueryStringPart( $matches[ 'main' ][ 0 ] ); - if ( isset( $matches[ 'fuzzy' ] ) ) { - $query[] = $main; - } else { - $main = $main; - $exact = join( ' OR ', self::buildFullTextSearchFields( $showRedirects, ".plain:$main" ) ); - $query[] = "($exact)"; + wfProfileIn( __METHOD__ . '-escape' ); + $escapedQuery = array(); + foreach ( $query as $queryPart ) { + if ( isset( $queryPart[ 'escaped' ] ) ) { + $escapedQuery[] = $queryPart[ 'escaped' ]; + continue; } - $offset = $startOffset + strlen( $matches[ 0 ][ 0 ] ); + if ( isset( $queryPart[ 'raw' ] ) ) { + $escapedQuery[] = self::fixupQueryStringPart( $queryPart[ 'raw' ] ); + continue; + } + wfLogWarning( 'Unknown query part: ' . serialize( $queryPart ) ); } - if ( $offset < strlen( $term ) ) { - $query[] = self::fixupQueryStringPart( substr( $term, $offset ) ); - } - wfProfileOut( __METHOD__ . '-find-phrase-queries-and-escape' ); + wfProfileOut( __METHOD__ . '-escape' ); // Actual text query if ( count( $query ) > 0 ) { wfProfileIn( __METHOD__ . 
'-build-query' ); - $queryStringQueryString = self::fixupWholeQueryString( implode( ' ', $query ) ); + $queryStringQueryString = self::fixupWholeQueryString( implode( ' ', $escapedQuery ) ); $fields = self::buildFullTextSearchFields( $showRedirects ); $this->query = $this->buildSearchTextQuery( $fields, $queryStringQueryString ); @@ -302,6 +309,41 @@ $result = $getWork->execute(); wfProfileOut( __METHOD__ ); return $result; + } + + private static function replaceAllPartsOfQuery( $query, $regex, $callable ) { + $result = array(); + foreach ( $query as $queryPart ) { + if ( isset( $queryPart[ 'raw' ] ) ) { + $result = array_merge( $result, self::replacePartsOfQuery( $queryPart[ 'raw' ], $regex, $callable ) ); + continue; + } + $result[] = $queryPart; + } + return $result; + } + + private static function replacePartsOfQuery( $queryPart, $regex, $callable ) { + $destination = array(); + $matches = array(); + $offset = 0; + while ( preg_match( $regex, $queryPart, $matches, PREG_OFFSET_CAPTURE, $offset ) ) { + $startOffset = $matches[ 0 ][ 1 ]; + if ( $startOffset > $offset ) { + $destination[] = array( 'raw' => substr( $queryPart, $offset, $startOffset - $offset ) ); + } + + $callableResult = call_user_func( $callable, $matches ); + if ( $callableResult ) { + $destination[] = $callableResult; + } + + $offset = $startOffset + strlen( $matches[ 0 ][ 0 ] ); + } + if ( $offset < strlen( $queryPart ) ) { + $destination[] = array( 'raw' => substr( $queryPart, $offset ) ); + } + return $destination; } /** @@ -429,6 +471,11 @@ ); } + public static function switchSearchToExact( $term, $showRedirects ) { + $exact = join( ' OR ', CirrusSearchSearcher::buildFullTextSearchFields( $showRedirects, ".plain:$term" ) ); + return "($exact)"; + } + /** * Build fields searched by full text search. 
* @param $includeRedirects bool show redirects be included -- To view, visit https://gerrit.wikimedia.org/r/94373 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I1e1a56616409e0ebcf84117287bd11087044bab5 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits