Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/85383
Change subject: Infrastructure for quotes turning off stemmer. ...................................................................... Infrastructure for quotes turning off stemmer. Looks like I fixed 54278 in the process! Bug: 54020 Bug: 54278 Change-Id: Iddd03d0c75fd9474892e9bc32e1f12fc909ed29a --- M CirrusSearchAnalysisConfigBuilder.php M CirrusSearchMappingConfigBuilder.php M CirrusSearchSearcher.php 3 files changed, 60 insertions(+), 43 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/83/85383/1 diff --git a/CirrusSearchAnalysisConfigBuilder.php b/CirrusSearchAnalysisConfigBuilder.php index 8e2e077..0da92a6 100644 --- a/CirrusSearchAnalysisConfigBuilder.php +++ b/CirrusSearchAnalysisConfigBuilder.php @@ -50,6 +50,15 @@ 'text' => array( 'type' => $this->getDefaultTextAnalyzerType(), ), + 'plain' => array( + // Surprisingly, the Lucene docs claim this works for + // Chinese, Japanese, and Thai as well. + // The difference between this and the 'standard' + // analyzer is the lack of english stop words. 
+ 'type' => 'custom', + 'tokenizer' => 'standard', + 'filter' => array( 'standard', 'lowercase' ) + ), 'suggest' => array( 'type' => 'custom', 'tokenizer' => 'standard', @@ -99,16 +108,18 @@ $config[ 'filter' ][ 'lowercase' ][ 'language' ] = 'greek'; break; case 'en': + $config[ 'filter' ][ 'possessive_english' ] = array( + 'type' => 'stemmer', + 'language' => 'possessive_english', + ); // Replace the default english analyzer with a rebuilt copy with asciifolding tacked on the end $config[ 'analyzer' ][ 'text' ] = array( 'type' => 'custom', 'tokenizer' => 'standard', 'filter' => array( 'standard', 'possessive_english', 'lowercase', 'stop', 'porter_stem', 'asciifolding' ) ); - $config[ 'filter' ][ 'possessive_english' ] = array( - 'type' => 'stemmer', - 'language' => 'possessive_english', - ); + // Add asciifolding to the text_plain analyzer as well + $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 'asciifolding'; // Add asciifolding to the prefix queries and incategory filters $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] = 'asciifolding'; $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding'; @@ -127,8 +138,8 @@ * @return string the analyzer type */ private function getDefaultTextAnalyzerType() { - if ( array_key_exists( $this->language, $this->elasticsearchLanguages ) ) { - return $this->elasticsearchLanguages[ $this->language ]; + if ( array_key_exists( $this->language, $this->elasticsearchLanguageAnalyzers ) ) { + return $this->elasticsearchLanguageAnalyzers[ $this->language ]; } else { return 'default'; } @@ -139,7 +150,7 @@ * that this array is sorted alphabetically by value and sourced from * http://www.elasticsearch.org/guide/reference/index-modules/analysis/lang-analyzer/ */ - private $elasticsearchLanguages = array( + private $elasticsearchLanguageAnalyzers = array( 'ar' => 'arabic', 'hy' => 'armenian', 'eu' => 'basque', @@ -147,7 +158,6 @@ 'bg' => 'bulgarian', 'ca' => 'catalan', 'zh' => 'chinese', - // 'cjk', - we don't use 
this because we don't have a wiki with all three 'cs' => 'czech', 'da' => 'danish', 'nl' => 'dutch', @@ -170,6 +180,6 @@ 'es' => 'spanish', 'sv' => 'swedish', 'tr' => 'turkish', - 'th' => 'thai' + 'th' => 'thai', ); } diff --git a/CirrusSearchMappingConfigBuilder.php b/CirrusSearchMappingConfigBuilder.php index bd9b2dd..b31186a 100644 --- a/CirrusSearchMappingConfigBuilder.php +++ b/CirrusSearchMappingConfigBuilder.php @@ -18,7 +18,6 @@ * http://www.gnu.org/copyleft/gpl.html */ class CirrusSearchMappingConfigBuilder { - /** * @return array */ @@ -36,13 +35,13 @@ // and is infered anyway. return array( 'properties' => array( - 'title' => $this->buildStringField( 'title', array( 'suggest', 'prefix' ), true ), - 'text' => $this->buildStringField( 'text', array( 'suggest' ), true ), + 'title' => $this->buildStringField( 'title', array( 'suggest', 'prefix' ) ), + 'text' => $this->buildStringField( 'text', array( 'suggest' ) ), 'category' => $this->buildLowercaseKeywordField(), - 'heading' => $this->buildStringField(), + 'heading' => $this->buildStringField( 'heading' ), 'redirect' => array( 'properties' => array( - 'title' => $this->buildStringField( 'title', array( 'suggest' ), true ) + 'title' => $this->buildStringField( 'title', array( 'suggest' ) ), ) ), 'links' => array( @@ -59,29 +58,29 @@ /** * Build a string field that does standard analysis for the language. - * @param $name string|null Name of the field. Required if extra is not false. - * @param $extra array|null Extra analyzers for this field beyond the basic string type. If not falsy the - * field will be a multi_field. - * @param $willHighlight bool Will this field be highlighted? Defaults to false. + * @param $name string|null Name of the field. + * @param $extra array|null Extra analyzers for this field beyond the basic text and plain. 
* @return array definition of the field */ - private function buildStringField( $name = null, $extra = null, $willHighlight = false ) { - $field = array( 'type' => 'string', 'analyzer' => 'text' ); - if ( $willHighlight ) { - $field[ 'store' ] = true; - $field[ 'term_vector' ] = 'with_positions_offsets'; - } - if ( !$extra ) { - return $field; - } + private function buildStringField( $name, $extra = array() ) { $field = array( 'type' => 'multi_field', 'fields' => array( - $name => $field + $name => array( + 'type' => 'string', + 'analyzer' => 'text', + 'store' => 'yes', + 'term_vector' => 'with_positions_offsets', + ), + 'plain' => array( + 'type' => 'string', + 'analyzer' => 'plain', + 'term_vector' => 'with_positions_offsets', + ), ) ); foreach ( $extra as $extraname ) { - $field['fields'][$extraname] = array( 'type' => 'string', 'analyzer' => $extraname ); + $field[ 'fields' ][ $extraname ] = array( 'type' => 'string', 'analyzer' => $extraname ); } return $field; } diff --git a/CirrusSearchSearcher.php b/CirrusSearchSearcher.php index fd2abe8..0ffc0eb 100644 --- a/CirrusSearchSearcher.php +++ b/CirrusSearchSearcher.php @@ -108,7 +108,6 @@ * @return CirrusSearchResultSet|null|SearchResultSet|Status */ public function searchText( $term, $showRedirects ) { - global $wgCirrusSearchWeights; global $wgCirrusSearchPhraseRescoreBoost; global $wgCirrusSearchPhraseRescoreWindowSize; global $wgCirrusSearchPhraseSuggestMaxErrors; @@ -148,14 +147,7 @@ if ( trim( $term ) !== '' || $extraQueryStrings ) { $fixedTerm = self::fixupQueryString( $term ); $queryStringQueryString = trim( implode( ' ', $extraQueryStrings ) . ' ' . $fixedTerm ); - $fields = array( - 'title^' . $wgCirrusSearchWeights[ 'title' ], - 'heading^' . $wgCirrusSearchWeights[ 'heading' ], - 'text', - ); - if ( $showRedirects ) { - $fields[] = 'redirect.title^' . 
$wgCirrusSearchWeights[ 'redirect' ]; - } + $fields = CirrusSearchSearcher::buildFullTextSearchFields( $showRedirects ); $this->query = $this->buildSearchTextQuery( $fields, $queryStringQueryString ); // Only do a phrase match rescore if the query doesn't include any phrases @@ -351,6 +343,25 @@ } /** + * Build fields searched by full text search. + * @param $includeRedirects bool should redirects be included + * @param $fieldSuffix string suffix to add to field names. Defaults to ''. + * @return array(string) of fields to query + */ + public static function buildFullTextSearchFields( $includeRedirects, $fieldSuffix = '' ) { + global $wgCirrusSearchWeights; + $fields = array( + 'title' . $fieldSuffix . '^' . $wgCirrusSearchWeights[ 'title' ], + 'heading' . $fieldSuffix . '^' . $wgCirrusSearchWeights[ 'heading' ], + 'text' . $fieldSuffix, + ); + if ( $includeRedirects ) { + $fields[] = 'redirect.title' . $fieldSuffix . '^' . $wgCirrusSearchWeights[ 'redirect' ]; + } + return $fields; + } + + /** * Pick the index type to search bases on the list of namespaces to search. * @return mixed index type in which to search */ @@ -417,7 +428,6 @@ } // Turn bad fuzzy searches into searches that contain a ~ $string = preg_replace_callback( '/(?<leading>[^\s"])~(?<trailing>\S+)/', function ( $matches ) { - wfDebugLog( 'CirrusSearch', 'checking fuzzy:' . $matches[0] ); if ( preg_match( '/0|(?:0?\.[0-9]+)|(?:1(?:\.0)?)/', $matches[ 'trailing' ] ) ) { return $matches[ 0 ]; } else { @@ -426,14 +436,12 @@ }, $string ); // Turn bad proximity searches into seraches that contain a ~ $string = preg_replace_callback( '/"~(?<trailing>\S*)/', function ( $matches ) { - wfDebugLog( 'CirrusSearch', 'checking proximity:' . $matches[0] ); if ( preg_match( '/[0-9]+/', $matches[ 'trailing' ] ) ) { return $matches[ 0 ]; } else { return '"\\~' . $matches[ 'trailing' ]; } }, $string ); - wfDebugLog( 'CirrusSearch', 'Got ' . 
$string ); return $string; } @@ -489,8 +497,8 @@ 'fields' => array( 'title' => array( 'number_of_fragments' => 0 ), 'text' => array( 'number_of_fragments' => 1 ), - 'redirect.title' => array( 'number_of_fragments' => 1 ), - 'heading' => array( 'number_of_fragments' => 1), + 'redirect.title' => array( 'number_of_fragments' => 1, 'type' => 'plain' ), + 'heading' => array( 'number_of_fragments' => 1, 'type' => 'plain' ), ), ); } -- To view, visit https://gerrit.wikimedia.org/r/85383 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iddd03d0c75fd9474892e9bc32e1f12fc909ed29a Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits