Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/85383


Change subject: Infrastructure for quotes turning off stemmer.
......................................................................

Infrastructure for quotes turning off stemmer.

Looks like I fixed 54278 in the process!

Bug: 54020
Bug: 54278

Change-Id: Iddd03d0c75fd9474892e9bc32e1f12fc909ed29a
---
M CirrusSearchAnalysisConfigBuilder.php
M CirrusSearchMappingConfigBuilder.php
M CirrusSearchSearcher.php
3 files changed, 60 insertions(+), 43 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/83/85383/1

diff --git a/CirrusSearchAnalysisConfigBuilder.php 
b/CirrusSearchAnalysisConfigBuilder.php
index 8e2e077..0da92a6 100644
--- a/CirrusSearchAnalysisConfigBuilder.php
+++ b/CirrusSearchAnalysisConfigBuilder.php
@@ -50,6 +50,15 @@
                                'text' => array(
                                        'type' => 
$this->getDefaultTextAnalyzerType(),
                                ),
+                               'plain' => array(
+                                       // Surprisingly, the Lucene docs claim 
this works for
+                                       // Chinese, Japanese, and Thai as well.
+                                       // The difference between this and the 
'standard'
+                                       // analyzer is the lack of english stop 
words.
+                                       'type' => 'custom',
+                                       'tokenizer' => 'standard',
+                                       'filter' => array( 'standard', 
'lowercase' )
+                               ),
                                'suggest' => array(
                                        'type' => 'custom',
                                        'tokenizer' => 'standard',
@@ -99,16 +108,18 @@
                        $config[ 'filter' ][ 'lowercase' ][ 'language' ] = 
'greek';
                        break;
                case 'en':
+                       $config[ 'filter' ][ 'possessive_english' ] = array(
+                               'type' => 'stemmer',
+                               'language' => 'possessive_english',
+                       );
                        // Replace the default english analyzer with a rebuilt 
copy with asciifolding tacked on the end
                        $config[ 'analyzer' ][ 'text' ] = array(
                                'type' => 'custom',
                                'tokenizer' => 'standard',
                                'filter' => array( 'standard', 
'possessive_english', 'lowercase', 'stop', 'porter_stem', 'asciifolding' )
                        );
-                       $config[ 'filter' ][ 'possessive_english' ] = array(
-                               'type' => 'stemmer',
-                               'language' => 'possessive_english',
-                       );
+                       // Add asciifolding to the text_plain analyzer as 
well
+                       $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 
'asciifolding';
                        // Add asciifolding to the prefix queries and 
incategory filters
                        $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] = 
'asciifolding';
                        $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding';
@@ -127,8 +138,8 @@
         * @return string the analyzer type
         */
        private function getDefaultTextAnalyzerType() {
-               if ( array_key_exists( $this->language, 
$this->elasticsearchLanguages ) ) {
-                       return $this->elasticsearchLanguages[ $this->language ];
+               if ( array_key_exists( $this->language, 
$this->elasticsearchLanguageAnalyzers ) ) {
+                       return $this->elasticsearchLanguageAnalyzers[ 
$this->language ];
                } else {
                        return 'default';
                }
@@ -139,7 +150,7 @@
         * that this array is sorted alphabetically by value and sourced from
         * 
http://www.elasticsearch.org/guide/reference/index-modules/analysis/lang-analyzer/
         */
-       private $elasticsearchLanguages = array(
+       private $elasticsearchLanguageAnalyzers = array(
                'ar' => 'arabic',
                'hy' => 'armenian',
                'eu' => 'basque',
@@ -147,7 +158,6 @@
                'bg' => 'bulgarian',
                'ca' => 'catalan',
                'zh' => 'chinese',
-               // 'cjk', - we don't use this because we don't have a wiki with 
all three
                'cs' => 'czech',
                'da' => 'danish',
                'nl' => 'dutch',
@@ -170,6 +180,6 @@
                'es' => 'spanish',
                'sv' => 'swedish',
                'tr' => 'turkish',
-               'th' => 'thai'
+               'th' => 'thai',
        );
 }
diff --git a/CirrusSearchMappingConfigBuilder.php 
b/CirrusSearchMappingConfigBuilder.php
index bd9b2dd..b31186a 100644
--- a/CirrusSearchMappingConfigBuilder.php
+++ b/CirrusSearchMappingConfigBuilder.php
@@ -18,7 +18,6 @@
  * http://www.gnu.org/copyleft/gpl.html
  */
 class CirrusSearchMappingConfigBuilder {
-
        /**
         * @return array
         */
@@ -36,13 +35,13 @@
                // and is infered anyway.
                return array(
                        'properties' => array(
-                               'title' => $this->buildStringField( 'title', 
array( 'suggest', 'prefix' ), true ),
-                               'text' => $this->buildStringField( 'text', 
array( 'suggest' ), true ),
+                               'title' => $this->buildStringField( 'title', 
array( 'suggest', 'prefix' ) ),
+                               'text' => $this->buildStringField( 'text', 
array( 'suggest' ) ),
                                'category' => 
$this->buildLowercaseKeywordField(),
-                               'heading' => $this->buildStringField(),
+                               'heading' => $this->buildStringField( 'heading' 
),
                                'redirect' => array(
                                        'properties' => array(
-                                               'title' => 
$this->buildStringField( 'title', array( 'suggest' ), true )
+                                               'title' => 
$this->buildStringField( 'title', array( 'suggest' ) ),
                                        )
                                ),
                                'links' => array(
@@ -59,29 +58,29 @@
 
        /**
         * Build a string field that does standard analysis for the language.
-        * @param $name string|null Name of the field.  Required if extra is 
not false.
-        * @param $extra array|null Extra analyzers for this field beyond the 
basic string type.  If not falsy the
-        *              field will be a multi_field.
-        * @param $willHighlight bool Will this field be highlighted?  Defaults 
to false.
+        * @param $name string|null Name of the field.
+        * @param $extra array|null Extra analyzers for this field beyond the 
basic text and plain.
         * @return array definition of the field
         */
-       private function buildStringField( $name = null, $extra = null, 
$willHighlight = false ) {
-               $field = array( 'type' => 'string', 'analyzer' => 'text' );
-               if ( $willHighlight ) {
-                       $field[ 'store' ] = true;
-                       $field[ 'term_vector' ] = 'with_positions_offsets';
-               }
-               if ( !$extra ) {
-                       return $field;
-               }
+       private function buildStringField( $name, $extra = array() ) {
                $field = array(
                        'type' => 'multi_field',
                        'fields' => array(
-                               $name => $field
+                               $name => array(
+                                       'type' => 'string',
+                                       'analyzer' => 'text',
+                                       'store' => 'yes',
+                                       'term_vector' => 
'with_positions_offsets',
+                               ),
+                               'plain' => array(
+                                       'type' => 'string',
+                                       'analyzer' => 'plain',
+                                       'term_vector' => 
'with_positions_offsets',
+                               ),
                        )
                );
                foreach ( $extra as $extraname ) {
-                       $field['fields'][$extraname] = array( 'type' => 
'string', 'analyzer' => $extraname );
+                       $field[ 'fields' ][ $extraname ] = array( 'type' => 
'string', 'analyzer' => $extraname );
                }
                return $field;
        }
diff --git a/CirrusSearchSearcher.php b/CirrusSearchSearcher.php
index fd2abe8..0ffc0eb 100644
--- a/CirrusSearchSearcher.php
+++ b/CirrusSearchSearcher.php
@@ -108,7 +108,6 @@
         * @return CirrusSearchResultSet|null|SearchResultSet|Status
         */
        public function searchText( $term, $showRedirects ) {
-               global $wgCirrusSearchWeights;
                global $wgCirrusSearchPhraseRescoreBoost;
                global $wgCirrusSearchPhraseRescoreWindowSize;
                global $wgCirrusSearchPhraseSuggestMaxErrors;
@@ -148,14 +147,7 @@
                if ( trim( $term ) !== '' || $extraQueryStrings ) {
                        $fixedTerm = self::fixupQueryString( $term );
                        $queryStringQueryString = trim( implode( ' ', 
$extraQueryStrings ) . ' ' . $fixedTerm );
-                       $fields = array(
-                               'title^' . $wgCirrusSearchWeights[ 'title' ],
-                               'heading^' . $wgCirrusSearchWeights[ 'heading' 
],
-                               'text',
-                       );
-                       if ( $showRedirects ) {
-                               $fields[] = 'redirect.title^' . 
$wgCirrusSearchWeights[ 'redirect' ];
-                       }
+                       $fields = 
CirrusSearchSearcher::buildFullTextSearchFields( $showRedirects );
                        $this->query = $this->buildSearchTextQuery( $fields, 
$queryStringQueryString );
 
                        // Only do a phrase match rescore if the query doesn't 
include any phrases
@@ -351,6 +343,25 @@
        }
 
        /**
+        * Build fields searched by full text search.
+        * @param $includeRedirects bool should redirects be included
+        * @param $fieldSuffix string suffix to add to field names.  Defaults 
to ''.
+        * @return array(string) of fields to query
+        */
+       public static function buildFullTextSearchFields( $includeRedirects, 
$fieldSuffix = '' ) {
+               global $wgCirrusSearchWeights;
+               $fields = array(
+                       'title' . $fieldSuffix . '^' . $wgCirrusSearchWeights[ 
'title' ],
+                       'heading' . $fieldSuffix . '^' . 
$wgCirrusSearchWeights[ 'heading' ],
+                       'text' . $fieldSuffix,
+               );
+               if ( $includeRedirects ) {
+                       $fields[] = 'redirect.title' . $fieldSuffix . '^' . 
$wgCirrusSearchWeights[ 'redirect' ];
+               }
+               return $fields;
+       }
+
+       /**
         * Pick the index type to search bases on the list of namespaces to 
search.
         * @return mixed index type in which to search
         */
@@ -417,7 +428,6 @@
                }
                // Turn bad fuzzy searches into searches that contain a ~
                $string = preg_replace_callback( 
'/(?<leading>[^\s"])~(?<trailing>\S+)/', function ( $matches ) {
-                       wfDebugLog( 'CirrusSearch', 'checking fuzzy:' . 
$matches[0] );
                        if ( preg_match( '/0|(?:0?\.[0-9]+)|(?:1(?:\.0)?)/', 
$matches[ 'trailing' ] ) ) {
                                return $matches[ 0 ];
                        } else {
@@ -426,14 +436,12 @@
                }, $string );
                // Turn bad proximity searches into seraches that contain a ~
                $string = preg_replace_callback( '/"~(?<trailing>\S*)/', 
function ( $matches ) {
-                       wfDebugLog( 'CirrusSearch', 'checking proximity:' . 
$matches[0] );
                        if ( preg_match( '/[0-9]+/', $matches[ 'trailing' ] ) ) 
{
                                return $matches[ 0 ];
                        } else {
                                return '"\\~' . $matches[ 'trailing' ];
                        }
                }, $string );
-               wfDebugLog( 'CirrusSearch', 'Got  ' . $string );
                return $string;
        }
 
@@ -489,8 +497,8 @@
                        'fields' => array(
                                'title' => array( 'number_of_fragments' => 0 ),
                                'text' => array( 'number_of_fragments' => 1 ),
-                               'redirect.title' => array( 
'number_of_fragments' => 1 ),
-                               'heading' => array( 'number_of_fragments' => 1),
+                               'redirect.title' => array( 
'number_of_fragments' => 1, 'type' => 'plain' ),
+                               'heading' => array( 'number_of_fragments' => 1, 
'type' => 'plain' ),
                        ),
                );
        }

-- 
To view, visit https://gerrit.wikimedia.org/r/85383
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iddd03d0c75fd9474892e9bc32e1f12fc909ed29a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to