jenkins-bot has submitted this change and it was merged. Change subject: Use structured fields API to build mapping ......................................................................
Use structured fields API to build mapping Change-Id: Ibb4fc637637a8305b966a2f9702f8dcfac9dc94b Depends-On: Iad6876aae109ad84c5534619f47c72edc900d704 Bug: T89733 --- M CirrusSearch.php M autoload.php M includes/CirrusSearch.php M includes/Maintenance/MappingConfigBuilder.php A includes/Search/BooleanIndexField.php A includes/Search/CirrusIndexField.php A includes/Search/DatetimeIndexField.php A includes/Search/IntegerIndexField.php A includes/Search/KeywordIndexField.php A includes/Search/NestedIndexField.php A includes/Search/NumberIndexField.php A includes/Search/TextIndexField.php A tests/unit/IndexFieldsTest.php A tests/unit/Search/SearchFieldsTest.php 14 files changed, 648 insertions(+), 206 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve DCausse: Looks good to me, approved jenkins-bot: Verified diff --git a/CirrusSearch.php b/CirrusSearch.php index 5a041b1..2c1f573 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -997,6 +997,19 @@ ); /** + * Mapping of result types to CirrusSearch classes. + */ +$wgCirrusSearchFieldTypes = array( + SearchIndexField::INDEX_TYPE_TEXT => \CirrusSearch\Search\TextIndexField::class, + SearchIndexField::INDEX_TYPE_KEYWORD => \CirrusSearch\Search\KeywordIndexField::class, + SearchIndexField::INDEX_TYPE_INTEGER => \CirrusSearch\Search\IntegerIndexField::class, + SearchIndexField::INDEX_TYPE_NUMBER => \CirrusSearch\Search\NumberIndexField::class, + SearchIndexField::INDEX_TYPE_DATETIME => \CirrusSearch\Search\DatetimeIndexField::class, + SearchIndexField::INDEX_TYPE_BOOL => \CirrusSearch\Search\BooleanIndexField::class, + SearchIndexField::INDEX_TYPE_NESTED => \CirrusSearch\Search\NestedIndexField::class, +); + +/** * Jenkins configuration required to get all the browser tests passing cleanly. 
* * @todo re-enable the code below if/when browser tests are enabled again diff --git a/autoload.php b/autoload.php index 751f078..55fa2b6 100644 --- a/autoload.php +++ b/autoload.php @@ -94,8 +94,11 @@ 'CirrusSearch\\Sanity\\QueueingRemediator' => __DIR__ . '/includes/Sanity/QueueingRemediator.php', 'CirrusSearch\\Sanity\\Remediator' => __DIR__ . '/includes/Sanity/Remediator.php', 'CirrusSearch\\SearchConfig' => __DIR__ . '/includes/SearchConfig.php', + 'CirrusSearch\\Search\\BooleanIndexField' => __DIR__ . '/includes/Search/BooleanIndexField.php', 'CirrusSearch\\Search\\BoostTemplatesFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', + 'CirrusSearch\\Search\\CirrusIndexField' => __DIR__ . '/includes/Search/CirrusIndexField.php', 'CirrusSearch\\Search\\CustomFieldFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', + 'CirrusSearch\\Search\\DatetimeIndexField' => __DIR__ . '/includes/Search/DatetimeIndexField.php', 'CirrusSearch\\Search\\EmptyResultSet' => __DIR__ . '/includes/Search/EmptyResultSet.php', 'CirrusSearch\\Search\\Escaper' => __DIR__ . '/includes/Search/Escaper.php', 'CirrusSearch\\Search\\FancyTitleResultsType' => __DIR__ . '/includes/Search/ResultsType.php', @@ -108,12 +111,16 @@ 'CirrusSearch\\Search\\GeoRadiusFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', 'CirrusSearch\\Search\\IdResultsType' => __DIR__ . '/includes/Search/ResultsType.php', 'CirrusSearch\\Search\\IncomingLinksFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', + 'CirrusSearch\\Search\\IntegerIndexField' => __DIR__ . '/includes/Search/IntegerIndexField.php', 'CirrusSearch\\Search\\InterwikiResultsType' => __DIR__ . '/includes/Search/ResultsType.php', 'CirrusSearch\\Search\\InvalidRescoreProfileException' => __DIR__ . '/includes/Search/RescoreBuilders.php', + 'CirrusSearch\\Search\\KeywordIndexField' => __DIR__ . 
'/includes/Search/KeywordIndexField.php', 'CirrusSearch\\Search\\LangWeightFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', 'CirrusSearch\\Search\\LogMultFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', 'CirrusSearch\\Search\\LogScaleBoostFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', 'CirrusSearch\\Search\\NamespacesFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', + 'CirrusSearch\\Search\\NestedIndexField' => __DIR__ . '/includes/Search/NestedIndexField.php', + 'CirrusSearch\\Search\\NumberIndexField' => __DIR__ . '/includes/Search/NumberIndexField.php', 'CirrusSearch\\Search\\PreferRecentFunctionScoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', 'CirrusSearch\\Search\\RescoreBuilder' => __DIR__ . '/includes/Search/RescoreBuilders.php', 'CirrusSearch\\Search\\Result' => __DIR__ . '/includes/Search/Result.php', @@ -126,6 +133,7 @@ 'CirrusSearch\\Search\\SearchTextQueryBuilder' => __DIR__ . '/includes/Search/SearchTextQueryBuilders.php', 'CirrusSearch\\Search\\SearchTextQueryBuilderFactory' => __DIR__ . '/includes/Search/SearchTextQueryBuilders.php', 'CirrusSearch\\Search\\SearchTextQueryStringBuilder' => __DIR__ . '/includes/Search/SearchTextQueryBuilders.php', + 'CirrusSearch\\Search\\TextIndexField' => __DIR__ . '/includes/Search/TextIndexField.php', 'CirrusSearch\\Search\\TitleResultsType' => __DIR__ . '/includes/Search/ResultsType.php', 'CirrusSearch\\Searcher' => __DIR__ . '/includes/Searcher.php', 'CirrusSearch\\Test\\DummyConnection' => __DIR__ . 
'/tests/unit/TestUtils.php', diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php index c5cc7b1..7646cf7 100644 --- a/includes/CirrusSearch.php +++ b/includes/CirrusSearch.php @@ -751,4 +751,19 @@ } return null; } + + /** + * Create a search field definition + * @param string $name + * @param int $type + * @return SearchIndexField + */ + public function makeSearchFieldMapping( $name, $type ) { + $mappings = $this->config->get( 'CirrusSearchFieldTypes' ); + if ( !isset( $mappings[$type] ) ) { + return new NullIndexField(); + } + $klass = $mappings[$type]; + return new $klass( $name, $type, $this->config ); + } } diff --git a/includes/Maintenance/MappingConfigBuilder.php b/includes/Maintenance/MappingConfigBuilder.php index 4e7f645..ec7b9b6 100644 --- a/includes/Maintenance/MappingConfigBuilder.php +++ b/includes/Maintenance/MappingConfigBuilder.php @@ -2,9 +2,14 @@ namespace CirrusSearch\Maintenance; +use CirrusSearch\Search\CirrusIndexField; +use CirrusSearch\Search\IntegerIndexField; +use CirrusSearch\Search\KeywordIndexField; use CirrusSearch\SearchConfig; +use CirrusSearch\Search\TextIndexField; use Hooks; use MediaWiki\MediaWikiServices; +use SearchIndexField; /** * Builds elasticsearch mapping configuration arrays. @@ -25,26 +30,10 @@ * http://www.gnu.org/copyleft/gpl.html */ class MappingConfigBuilder { - // Bit field parameters for buildStringField. - const MINIMAL = 0; - const ENABLE_NORMS = 1; - const COPY_TO_SUGGEST = 2; - const SPEED_UP_HIGHLIGHTING = 4; - // Bit field parameters for buildConfig const PREFIX_START_WITH_ANY = 1; const PHRASE_SUGGEST_USE_TEXT = 2; - - /** - * Maximum number of characters allowed in keyword terms. - */ - const KEYWORD_IGNORE_ABOVE = 5000; - - /** - * Distance that lucene places between multiple values of the same field. - * Set pretty high to prevent accidental phrase queries between those values. 
- */ - const POSITION_INCREMENT_GAP = 10; + const OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER = 4; /** * Version number for the core analysis. Increment the major @@ -59,7 +48,15 @@ */ private $optimizeForExperimentalHighlighter; - private $similarity; + /** + * @var SearchConfig + */ + private $config; + + /** + * @var \CirrusSearch + */ + private $engine; /** * Constructor @@ -68,29 +65,28 @@ */ public function __construct( $optimizeForExperimentalHighlighter, SearchConfig $config = null ) { $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter; - if ( is_null ( $config ) ) { - $config = MediaWikiServices::getInstance() - ->getConfigFactory() - ->makeConfig( 'CirrusSearch' ); + if ( is_null( $config ) ) { + $config = + MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' ); } - $this->similarity = $config->get( 'CirrusSearchSimilarityProfile' ); + $this->config = $config; + $this->engine = new \CirrusSearch(); + $this->engine->setConfig( $config ); } /** - * Build the mapping config. - * @param int $flags Flags for building the configuration - * @return array the mapping config + * Get definitions for default index fields. + * These fields are always present in the index. + * @param int $flags + * @return array */ - public function buildConfig( $flags = 0 ) { - global $wgCirrusSearchAllFields, - $wgCirrusSearchWeights, - $wgCirrusSearchWikimediaExtraPlugin; + private function getDefaultFields( $flags ) { + global $wgCirrusSearchWikimediaExtraPlugin; - $suggestExtra = array( 'analyzer' => 'suggest' ); // Note never to set something as type='object' here because that isn't returned by elasticsearch // and is inferred anyway. 
$titleExtraAnalyzers = array( - $suggestExtra, + array( 'analyzer' => 'suggest' ), array( 'analyzer' => 'prefix', 'search_analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => array( 'enabled' => false ) ), array( 'analyzer' => 'prefix_asciifolding', 'search_analyzer' => 'near_match_asciifolding', 'index_options' => 'docs', 'norms' => array( 'enabled' => false ) ), array( 'analyzer' => 'near_match', 'index_options' => 'docs', 'norms' => array( 'enabled' => false ) ), @@ -104,23 +100,17 @@ 'index_options' => 'docs' ); } + $sourceExtraAnalyzers = array(); if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) && - in_array( 'build', $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) ) { + in_array( 'build', $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) ) { $sourceExtraAnalyzers[] = array( 'analyzer' => 'trigram', 'index_options' => 'docs', ); } - $textExtraAnalyzers = array(); - $textOptions = MappingConfigBuilder::ENABLE_NORMS | MappingConfigBuilder::SPEED_UP_HIGHLIGHTING; - if ( $flags & self::PHRASE_SUGGEST_USE_TEXT ) { - $textExtraAnalyzers[] = $suggestExtra; - $textOptions |= MappingConfigBuilder::COPY_TO_SUGGEST; - } - - $page = array( + $page = [ 'dynamic' => false, '_all' => array( 'enabled' => false ), 'properties' => array( @@ -128,58 +118,85 @@ 'type' => 'date', 'format' => 'dateOptionalTime', ), - 'wiki' => $this->buildKeywordField(), - 'namespace' => $this->buildLongField(), - 'namespace_text' => $this->buildKeywordField(), + 'wiki' => $this->buildKeywordField( 'wiki' )->getMapping( $this->engine ), + 'namespace' => $this->buildLongField( 'namespace' )->getMapping( $this->engine ), + 'namespace_text' => $this->buildKeywordField( 'namespace_text' ) + ->getMapping( $this->engine ), 'title' => $this->buildStringField( 'title', - MappingConfigBuilder::ENABLE_NORMS | MappingConfigBuilder::COPY_TO_SUGGEST, - $titleExtraAnalyzers ), - 'text' => array_merge_recursive( - $this->buildStringField( 'text', $textOptions, $textExtraAnalyzers ), - 
array( 'fields' => array( 'word_count' => array( - 'type' => 'token_count', - 'store' => true, - 'analyzer' => 'plain', - ) ) ) - ), - 'opening_text' => $this->buildStringField( 'opening_text', MappingConfigBuilder::ENABLE_NORMS ), - 'auxiliary_text' => $this->buildStringField( 'auxiliary_text', $textOptions ), - 'file_text' => $this->buildStringField( 'file_text', $textOptions ), - 'source_text' => $this->buildStringField( 'source_text', MappingConfigBuilder::MINIMAL, + TextIndexField::ENABLE_NORMS | TextIndexField::COPY_TO_SUGGEST, + $titleExtraAnalyzers )->setMappingFlags( $flags )->getMapping( $this->engine ), + 'text' => array_merge_recursive( $this->buildStringField( 'text', null, + ( $flags & self::PHRASE_SUGGEST_USE_TEXT ) ? [ 'analyzer' => 'suggest' ] : [ ] ) + ->setMappingFlags( $flags )->getMapping( $this->engine ), array( + 'fields' => array( + 'word_count' => array( + 'type' => 'token_count', + 'store' => true, + 'analyzer' => 'plain', + ) + ) + ) ), + 'text_bytes' => $this->buildLongField( 'text_bytes' ) + ->setFlag( SearchIndexField::FLAG_NO_INDEX ) + ->getMapping( $this->engine ), + 'source_text' => $this->buildStringField( 'source_text', 0, $sourceExtraAnalyzers - ), - 'category' => $this->buildStringField( 'category', $textOptions, array( - array( - 'analyzer' => 'lowercase_keyword', - 'norms' => array( 'enabled' => false ), - 'index_options' => 'docs', - 'ignore_above' => self::KEYWORD_IGNORE_ABOVE, - ) ) - ), - 'template' => $this->buildLowercaseKeywordField(), - 'outgoing_link' => $this->buildKeywordField(), - 'external_link' => $this->buildKeywordField(), - 'heading' => $this->buildStringField( 'heading', MappingConfigBuilder::SPEED_UP_HIGHLIGHTING ), - 'text_bytes' => $this->buildLongField( false ), + )->setMappingFlags( $flags )->getMapping( $this->engine ), 'redirect' => array( 'dynamic' => false, 'properties' => array( - 'namespace' => $this->buildLongField(), + 'namespace' => $this->buildLongField( 'namespace' ) + ->getMapping( 
$this->engine ), 'title' => $this->buildStringField( 'redirect.title', - $textOptions | MappingConfigBuilder::COPY_TO_SUGGEST, - $titleExtraAnalyzers ), + TextIndexField::ENABLE_NORMS | TextIndexField::SPEED_UP_HIGHLIGHTING | + TextIndexField::COPY_TO_SUGGEST, $titleExtraAnalyzers ) + ->setMappingFlags( $flags ) + ->getMapping( $this->engine ), ) ), - 'incoming_links' => $this->buildLongField(), - 'local_sites_with_dupe' => $this->buildLowercaseKeywordField(), + 'incoming_links' => $this->buildLongField( 'incoming_links' ) + ->getMapping( $this->engine ), + 'local_sites_with_dupe' => $this->buildKeywordField( 'local_sites_with_dupe' ) + ->setFlag( SearchIndexField::FLAG_CASEFOLD ) + ->getMapping( $this->engine ), 'suggest' => array( 'type' => 'string', 'analyzer' => 'suggest', ), - 'language' => $this->buildKeywordField(), - 'wikibase_item' => $this->buildKeywordField(), - ), - ); + // FIXME: this should be moved to Wikibase Client + 'wikibase_item' => $this->buildKeywordField( 'wikibase_item' ) + ->getMapping( $this->engine ), + ) + ]; + + return $page; + } + + /** + * Build the mapping config. + * @param int $flags Flags for building the configuration + * @return array the mapping config + */ + public function buildConfig( $flags = 0 ) { + global $wgCirrusSearchAllFields, + $wgCirrusSearchWeights; + + if ( $this->optimizeForExperimentalHighlighter ) { + $flags |= self::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER; + } + $page = $this->getDefaultFields( $flags ); + + $fields = $this->engine->getSearchIndexFields(); + + foreach ( $fields as $fieldName => $field ) { + if ( $field instanceof CirrusIndexField ) { + $field->setMappingFlags( $flags ); + } + $config = $field->getMapping( $this->engine ); + if ( $config ) { + $page['properties'][$fieldName] = $config; + } + } if ( $wgCirrusSearchAllFields[ 'build' ] ) { // Now layer all the fields into the all field once per weight. 
Querying it isn't strictly the @@ -190,7 +207,9 @@ // This field can't be used for the fvh/experimental highlighter for several reasons: // 1. It is built with copy_to and not stored. // 2. The term frequency information is all whoppy compared to the "real" source text. - $page[ 'properties' ][ 'all' ] = $this->buildStringField( 'all', MappingConfigBuilder::ENABLE_NORMS ); + $allField = $this->buildStringField( 'all', TextIndexField::ENABLE_NORMS ); + $page['properties']['all'] = + $allField->setMappingFlags( $flags )->getMapping( $this->engine ); $page = $this->setupCopyTo( $page, $wgCirrusSearchWeights, 'all' ); // Now repeat for near_match fields. The same considerations above apply except near_match @@ -199,17 +218,17 @@ 'type' => 'string', 'analyzer' => 'near_match', 'index_options' => 'freqs', - 'position_increment_gap' => self::POSITION_INCREMENT_GAP, + 'position_increment_gap' => TextIndexField::POSITION_INCREMENT_GAP, 'norms' => array( 'enabled' => false ), - 'similarity' => $this->getSimilarity( 'all_near_match' ), + 'similarity' => $allField->getSimilarity( 'all_near_match' ), 'fields' => array( 'asciifolding' => array( 'type' => 'string', 'analyzer' => 'near_match_asciifolding', 'index_options' => 'freqs', - 'position_increment_gap' => self::POSITION_INCREMENT_GAP, + 'position_increment_gap' => TextIndexField::POSITION_INCREMENT_GAP, 'norms' => array( 'enabled' => false ), - 'similarity' => $this->getSimilarity( 'all_near_match', 'asciifolding' ), + 'similarity' => $allField->getSimilarity( 'all_near_match', 'asciifolding' ), ), ), ); @@ -219,6 +238,7 @@ ); $page = $this->setupCopyTo( $page, $nearMatchFields, 'all_near_match' ); } + $config[ 'page' ] = $page; $config[ 'namespace' ] = array( @@ -230,36 +250,14 @@ 'analyzer' => 'near_match_asciifolding', 'norms' => array( 'enabled' => false ), 'index_options' => 'docs', - 'ignore_above' => self::KEYWORD_IGNORE_ABOVE, + 'ignore_above' => KeywordIndexField::KEYWORD_IGNORE_ABOVE, ), ), ); Hooks::run( 
'CirrusSearchMappingConfig', array( &$config, $this ) ); + return $config; - } - - - /** - * Get the field similarity - * @param string $field - * @param string $analyzer - * @return string - */ - private function getSimilarity( $field, $analyzer = null ) { - $fieldSimilarity = 'default'; - if ( isset( $this->similarity['fields'] ) ) { - if( isset( $this->similarity['fields'][$field] ) ) { - $fieldSimilarity = $this->similarity['fields'][$field]; - } else if ( $this->similarity['fields']['__default__'] ) { - $fieldSimilarity = $this->similarity['fields']['__default__']; - } - - if ( $analyzer != null && isset( $this->similarity['fields']["$field.$analyzer"] ) ) { - $fieldSimilarity = $this->similarity['fields']["$field.$analyzer"]; - } - } - return $fieldSimilarity; } /** @@ -296,112 +294,32 @@ * SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up highlighting. This is important for long * strings or fields with many values. * @param array $extra Extra analyzers for this field beyond the basic text and plain. - * @return array definition of the field + * @return TextIndexField definition of the field */ - public function buildStringField( $fieldName, $options, $extra = array() ) { - // multi_field is dead in 1.0 so we do this which actually looks less gnarly. 
- $field = array( - 'type' => 'string', - 'analyzer' => 'text', - 'search_analyzer' => 'text_search', - 'position_increment_gap' => self::POSITION_INCREMENT_GAP, - 'similarity' => $this->getSimilarity( $fieldName ), - 'fields' => array( - 'plain' => array( - 'type' => 'string', - 'analyzer' => 'plain', - 'search_analyzer' => 'plain_search', - 'position_increment_gap' => self::POSITION_INCREMENT_GAP, - 'similarity' => $this->getSimilarity( $fieldName, 'plain' ), - ), - ) - ); - $disableNorms = ( $options & MappingConfigBuilder::ENABLE_NORMS ) === 0; - if ( $disableNorms ) { - $disableNorms = array( 'norms' => array( 'enabled' => false ) ); - $field = array_merge( $field, $disableNorms ); - $field[ 'fields' ][ 'plain' ] = array_merge( $field[ 'fields' ][ 'plain' ], $disableNorms ); - } - if ( $options & MappingConfigBuilder::COPY_TO_SUGGEST ) { - $field[ 'copy_to' ] = array( 'suggest' ); - } - foreach ( $extra as $extraField ) { - $extraName = $extraField[ 'analyzer' ]; - $field[ 'fields' ][ $extraName ] = array_merge( array( - 'similarity' => $this->getSimilarity( $fieldName, $extraName ), - 'type' => 'string', - 'position_increment_gap' => self::POSITION_INCREMENT_GAP, - ), $extraField ); - if ( $disableNorms ) { - $field[ 'fields' ][ $extraName ] = array_merge( - $field[ 'fields' ][ $extraName ], - $disableNorms - ); - } - } - if ( $this->optimizeForExperimentalHighlighter ) { - if ( $options & MappingConfigBuilder::SPEED_UP_HIGHLIGHTING ) { - $field[ 'index_options' ] = 'offsets'; - $fieldNames = array( 'plain', 'prefix', 'prefix_asciifolding', 'near_match', 'near_match_asciifolding' ); - foreach ( $fieldNames as $fieldName ) { - if ( isset( $field[ 'fields' ][ $fieldName ] ) ) { - $field[ 'fields' ][ $fieldName ][ 'index_options' ] = 'offsets'; - } - } - } - } else { - // We use the FVH on all fields so turn on term vectors - $field[ 'term_vector' ] = 'with_positions_offsets'; - $fieldNames = array( 'plain', 'prefix', 'prefix_asciifolding', 'near_match', 
'near_match_asciifolding' ); - foreach ( $fieldNames as $fieldName ) { - if ( isset( $field[ 'fields' ][ $fieldName ] ) ) { - $field[ 'fields' ][ $fieldName ][ 'term_vector' ] = 'with_positions_offsets'; - } - } - } + public function buildStringField( $fieldName, $options = null, $extra = [] ) { + $field = + new TextIndexField( $fieldName, SearchIndexField::INDEX_TYPE_TEXT, $this->config, + $extra ); + $field->setTextOptions( $options ); return $field; } /** - * Create a string field that only lower cases and does ascii folding (if enabled) for the language. - * @return array definition of the field + * Create a long field. + * @param string $name Field name + * @return IntegerIndexField */ - public function buildLowercaseKeywordField() { - return array( - 'type' => 'string', - 'analyzer' => 'lowercase_keyword', - 'norms' => array( 'enabled' => false ), // Omit the length norm because there is only even one token - 'index_options' => 'docs', // Omit the frequency and position information because neither are useful - 'ignore_above' => self::KEYWORD_IGNORE_ABOVE, - ); - } - - /** - * Create a string field that does no analyzing whatsoever. - * @return array definition of the field - */ - public function buildKeywordField() { - return array( - 'type' => 'string', - 'analyzer' => 'keyword', - 'norms' => array( 'enabled' => false ), // Omit the length norm because there is only even one token - 'index_options' => 'docs', // Omit the frequency and position information because neither are useful - 'ignore_above' => self::KEYWORD_IGNORE_ABOVE, - ); + public function buildLongField( $name ) { + return new IntegerIndexField( $name, SearchIndexField::INDEX_TYPE_INTEGER, $this->config ); } /** * Create a long field. 
- * @param boolean $index should this be indexed - * @return array definition of the field + * @param string $name Field name + * @return KeywordIndexField */ - public function buildLongField( $index = true ) { - $config = array( - 'type' => 'long', - ); - if ( !$index ) { - $config[ 'index' ] = 'no'; - } - return $config; + public function buildKeywordField( $name ) { + return new KeywordIndexField( $name, SearchIndexField::INDEX_TYPE_KEYWORD, $this->config ); } } + diff --git a/includes/Search/BooleanIndexField.php b/includes/Search/BooleanIndexField.php new file mode 100644 index 0000000..1a01edf --- /dev/null +++ b/includes/Search/BooleanIndexField.php @@ -0,0 +1,10 @@ +<?php +namespace CirrusSearch\Search; + +/** + * Index field representing boolean value. + * @package CirrusSearch + */ +class BooleanIndexField extends CirrusIndexField { + protected $typeName = 'boolean'; +} \ No newline at end of file diff --git a/includes/Search/CirrusIndexField.php b/includes/Search/CirrusIndexField.php new file mode 100644 index 0000000..240167d --- /dev/null +++ b/includes/Search/CirrusIndexField.php @@ -0,0 +1,71 @@ +<?php +namespace CirrusSearch\Search; + +use SearchEngine; +use SearchIndexFieldDefinition; +use SearchIndexField; +use CirrusSearch\SearchConfig; + +/** + * Basic ElasticSearch index field + * @since 1.28 + */ +abstract class CirrusIndexField extends SearchIndexFieldDefinition { + + /** + * Name of the type in Elastic + * @var string + */ + protected $typeName = 'unknown'; + + /** + * @var SearchConfig + */ + protected $config; + + /** + * Specific mapping flags + * @var int + */ + protected $mappingFlags; + + /** + * CirrusIndexField constructor. 
+ * @param string $name + * @param int $type + * @param SearchConfig $config + */ + public function __construct( $name, $type, SearchConfig $config ) { + parent::__construct( $name, $type ); + $this->config = $config; + } + + /** + * Set flags for specific mapping + * @param $flags + * @return $this + */ + public function setMappingFlags( $flags ) { + $this->mappingFlags = $flags; + return $this; + } + + /** + * Get mapping for specific search engine + * @param SearchEngine $engine + * @return array + */ + public function getMapping( SearchEngine $engine ) { + if ( !( $engine instanceof \CirrusSearch ) ) { + throw new \LogicException( "Cannot map CirrusSearch fields for another engine." ); + } + + $config = [ + 'type' => $this->typeName, + ]; + if ( $this->checkFlag( SearchIndexField::FLAG_NO_INDEX ) ) { + $config['index'] = 'no'; + } + return $config; + } +} \ No newline at end of file diff --git a/includes/Search/DatetimeIndexField.php b/includes/Search/DatetimeIndexField.php new file mode 100644 index 0000000..ba2a496 --- /dev/null +++ b/includes/Search/DatetimeIndexField.php @@ -0,0 +1,18 @@ +<?php +namespace CirrusSearch\Search; + +use SearchEngine; +/** + * Index field representing datetime field. + * @package CirrusSearch + */ +class DatetimeIndexField extends CirrusIndexField { + + protected $typeName = 'date'; + + public function getMapping( SearchEngine $engine ) { + $config = parent::getMapping( $engine ); + $config['format'] = 'dateOptionalTime'; + return $config; + } +} \ No newline at end of file diff --git a/includes/Search/IntegerIndexField.php b/includes/Search/IntegerIndexField.php new file mode 100644 index 0000000..748a94f --- /dev/null +++ b/includes/Search/IntegerIndexField.php @@ -0,0 +1,10 @@ +<?php +namespace CirrusSearch\Search; + +/** + * Index field representing integer. 
+ * @package CirrusSearch + */ +class IntegerIndexField extends CirrusIndexField { + protected $typeName = 'long'; +} \ No newline at end of file diff --git a/includes/Search/KeywordIndexField.php b/includes/Search/KeywordIndexField.php new file mode 100644 index 0000000..dd672fa --- /dev/null +++ b/includes/Search/KeywordIndexField.php @@ -0,0 +1,30 @@ +<?php +namespace CirrusSearch\Search; + +/** + * Index field representing keyword. + * Keywords use special analyzer. + * @package CirrusSearch + */ +class KeywordIndexField extends CirrusIndexField { + protected $typeName = 'string'; + + /** + * Maximum number of characters allowed in keyword terms. + */ + const KEYWORD_IGNORE_ABOVE = 5000; + + public function getMapping( \SearchEngine $engine ) { + $config = parent::getMapping( $engine ); + $config['analyzer'] = + $this->checkFlag( self::FLAG_CASEFOLD ) ? 'lowercase_keyword' : 'keyword'; + $config += [ + 'norms' => [ 'enabled' => false ], + // Omit the length norm because there is only even one token + 'index_options' => 'docs', + // Omit the frequency and position information because neither are useful + 'ignore_above' => self::KEYWORD_IGNORE_ABOVE, + ]; + return $config; + } +} \ No newline at end of file diff --git a/includes/Search/NestedIndexField.php b/includes/Search/NestedIndexField.php new file mode 100644 index 0000000..21bbcdc --- /dev/null +++ b/includes/Search/NestedIndexField.php @@ -0,0 +1,26 @@ +<?php +namespace CirrusSearch\Search; + +use SearchIndexField; +use SearchEngine; + +class NestedIndexField extends CirrusIndexField { + protected $typeName = "nested"; + + /** + * Add sub-field for nested field + * @param string $name Field name + * @param SearchIndexField $subfield Field object + */ + public function addSubfield($name, SearchIndexField $subfield) { + $this->subfields[$name] = $subfield; + } + + public function getMapping( SearchEngine $engine ) { + $fields = parent::getMapping( $engine ); + foreach ( $this->subfields as $name => $sub ) { 
+ $fields['properties'][$name] = $sub->getMapping( $engine ); + } + return $fields; + } +} \ No newline at end of file diff --git a/includes/Search/NumberIndexField.php b/includes/Search/NumberIndexField.php new file mode 100644 index 0000000..f391aae --- /dev/null +++ b/includes/Search/NumberIndexField.php @@ -0,0 +1,10 @@ +<?php +namespace CirrusSearch\Search; + +/** + * Index field representing double. + * @package CirrusSearch + */ +class NumberIndexField extends CirrusIndexField { + protected $typeName = 'double'; +} \ No newline at end of file diff --git a/includes/Search/TextIndexField.php b/includes/Search/TextIndexField.php new file mode 100644 index 0000000..b84de69 --- /dev/null +++ b/includes/Search/TextIndexField.php @@ -0,0 +1,217 @@ +<?php +namespace CirrusSearch\Search; + +use CirrusSearch\Maintenance\MappingConfigBuilder; +use SearchIndexField; +use CirrusSearch\SearchConfig; +use SearchEngine; + +/** + * Index field representing keyword. + * Keywords use special analyzer. + * @package CirrusSearch + */ +class TextIndexField extends CirrusIndexField { + /** + * Distance that lucene places between multiple values of the same field. + * Set pretty high to prevent accidental phrase queries between those values. + */ + const POSITION_INCREMENT_GAP = 10; + + /* Bit field parameters for string fields. + * ENABLE_NORMS: Enable norms on the field. Good for text you search against but useless + * for fields that don't get involved in the score. + * COPY_TO_SUGGEST: Copy the contents of this field to the suggest field for "Did you mean". + * SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up highlighting. This is important for long + * strings or fields with many values. + */ + const ENABLE_NORMS = 0x1000000; + // FIXME: when exactly we want to disable norms for text fields? + const COPY_TO_SUGGEST = 0x2000000; + const SPEED_UP_HIGHLIGHTING = 0x4000000; + const STRING_FIELD_MASK = 0xFFFFFF; + + /** + * Extra definitions. 
+ * @var array + */ + protected $extra; + /** + * Similarity config + * @var array + */ + private $similarity; + /** + * Text options for this field + * @var int + */ + private $textOptions; + + /** + * Name of the type in Elastic + * @var string + */ + protected $typeName = 'string'; + + public function __construct( $name, $type, SearchConfig $config, $extra = [] ) { + parent::__construct($name, $type, $config ); + + $this->similarity = $config->get( 'CirrusSearchSimilarityProfile' ); + $this->extra = $extra; + } + + /** + * Set text options for this field if non-default + * @param $options + * @return $this + */ + public function setTextOptions( $options ) { + $this->textOptions = $options; + return $this; + } + + /** + * Get text options for this field + * @param $mappingFlags + * @return int + */ + protected function getTextOptions( $mappingFlags ) { + if ( !is_null( $this->textOptions ) ) { + return $this->textOptions; + } + $options = self::ENABLE_NORMS | self::SPEED_UP_HIGHLIGHTING; + if ( $mappingFlags & MappingConfigBuilder::PHRASE_SUGGEST_USE_TEXT && + !$this->checkFlag( SearchIndexField::FLAG_SCORING ) + ) { + // SCORING fields are not copied since this info is already in other fields + $options |= self::COPY_TO_SUGGEST; + } + if ( $this->checkFlag( SearchIndexField::FLAG_NO_HIGHLIGHT ) ) { + // Disable highlighting is asked to + $options &= ~self::SPEED_UP_HIGHLIGHTING; + } + return $options; + } + + /** + * @param SearchEngine $engine + * @return array|void + */ + public function getMapping( SearchEngine $engine ) { + if (!($engine instanceof \CirrusSearch)) { + throw new \LogicException("Cannot map CirrusSearch fields for another engine."); + } + /** + * @var \CirrusSearch $engine + */ + $this->flags = + ( $this->flags & self::STRING_FIELD_MASK ) | $this->getTextOptions( $this->mappingFlags ); + + $field = parent::getMapping( $engine ); + + if ( $this->checkFlag( self::COPY_TO_SUGGEST ) ) { + $field[ 'copy_to' ] = array( 'suggest' ); + } + + if ( 
$this->checkFlag( self::FLAG_NO_INDEX ) ) { + // no need to configure further a not-indexed field + return $field; + } + + $extra = $this->extra; + if ( $this->mappingFlags & MappingConfigBuilder::PREFIX_START_WITH_ANY ) { + $extra[] = [ + 'analyzer' => 'word_prefix', + 'search_analyzer' => 'plain_search', + 'index_options' => 'docs' + ]; + } + if ( $this->checkFlag( SearchIndexField::FLAG_CASEFOLD ) ) { + $extra[] = [ + 'analyzer' => 'lowercase_keyword', + 'norms' => [ 'enabled' => false ], + 'index_options' => 'docs', + 'ignore_above' => KeywordIndexField::KEYWORD_IGNORE_ABOVE, + ]; + } + + // multi_field is dead in 1.0 so we do this which actually looks less gnarly. + $field += array( + 'analyzer' => 'text', + 'search_analyzer' => 'text_search', + 'position_increment_gap' => self::POSITION_INCREMENT_GAP, + 'similarity' => $this->getSimilarity( $this->name ), + 'fields' => array( + 'plain' => array( + 'type' => 'string', + 'analyzer' => 'plain', + 'search_analyzer' => 'plain_search', + 'position_increment_gap' => self::POSITION_INCREMENT_GAP, + 'similarity' => $this->getSimilarity( $this->name, 'plain' ), + ), + ) + ); + $disableNorms = !$this->checkFlag( self::ENABLE_NORMS ); + if ( $disableNorms ) { + $disableNorms = array( 'norms' => array( 'enabled' => false ) ); + $field = array_merge( $field, $disableNorms ); + $field[ 'fields' ][ 'plain' ] = array_merge( $field[ 'fields' ][ 'plain' ], $disableNorms ); + } + foreach ( $extra as $extraField ) { + $extraName = $extraField[ 'analyzer' ]; + + $field[ 'fields' ][ $extraName ] = array_merge( array( + 'similarity' => $this->getSimilarity( $this->name, $extraName ), + 'type' => 'string', + 'position_increment_gap' => self::POSITION_INCREMENT_GAP, + ), $extraField ); + if ( $disableNorms ) { + $field[ 'fields' ][ $extraName ] = array_merge( + $field[ 'fields' ][ $extraName ], $disableNorms ); + } + } + if ( $this->mappingFlags & MappingConfigBuilder::OPTIMIZE_FOR_EXPERIMENTAL_HIGHLIGHTER ) { + if ( $this->checkFlag( 
self::SPEED_UP_HIGHLIGHTING ) ) { + $field[ 'index_options' ] = 'offsets'; + $fieldNames = array( 'plain', 'prefix', 'prefix_asciifolding', 'near_match', 'near_match_asciifolding' ); + foreach ( $fieldNames as $fieldName ) { + if ( isset( $field[ 'fields' ][ $fieldName ] ) ) { + $field[ 'fields' ][ $fieldName ][ 'index_options' ] = 'offsets'; + } + } + } + } else { + // We use the FVH on all fields so turn on term vectors + $field[ 'term_vector' ] = 'with_positions_offsets'; + $fieldNames = array( 'plain', 'prefix', 'prefix_asciifolding', 'near_match', 'near_match_asciifolding' ); + foreach ( $fieldNames as $fieldName ) { + if ( isset( $field[ 'fields' ][ $fieldName ] ) ) { + $field[ 'fields' ][ $fieldName ][ 'term_vector' ] = 'with_positions_offsets'; + } + } + } + return $field; + } + + /** + * Get the field similarity: the "$field.$analyzer" entry wins, then the $field entry, then '__default__', else 'default' + * @param string $field Field name, looked up as a key of $this->similarity['fields'] + * @param string|null $analyzer Optional sub-field name; null (or an empty string) skips the "$field.$analyzer" lookup + * @return string Configured similarity, or 'default' when no entry applies + */ + public function getSimilarity( $field, $analyzer = null ) { + $fieldSimilarity = 'default'; + if ( isset( $this->similarity['fields'] ) ) { + if ( isset( $this->similarity['fields'][$field] ) ) { + $fieldSimilarity = $this->similarity['fields'][$field]; + } elseif ( !empty( $this->similarity['fields']['__default__'] ) ) { // guard the read: a profile without a '__default__' entry would otherwise raise an undefined-index notice + $fieldSimilarity = $this->similarity['fields']['__default__']; + } + + if ( $analyzer != null && isset( $this->similarity['fields']["$field.$analyzer"] ) ) { + $fieldSimilarity = $this->similarity['fields']["$field.$analyzer"]; + } + } + return $fieldSimilarity; + } +} diff --git a/tests/unit/IndexFieldsTest.php b/tests/unit/IndexFieldsTest.php new file mode 100644 index 0000000..c7de4a8 --- /dev/null +++ b/tests/unit/IndexFieldsTest.php @@ -0,0 +1,52 @@ +<?php + +use MediaWiki\MediaWikiServices; + +class IndexFieldsTest extends MediaWikiTestCase { + + public function getTypes() { + return [ + [ SearchIndexField::INDEX_TYPE_TEXT, 'string', 'CirrusSearch\\Search\\TextIndexField' ], + [ SearchIndexField::INDEX_TYPE_KEYWORD, 'string', 
'CirrusSearch\\Search\\KeywordIndexField' ], + [ SearchIndexField::INDEX_TYPE_INTEGER, 'long', 'CirrusSearch\\Search\\IntegerIndexField' ], + [ SearchIndexField::INDEX_TYPE_NUMBER, 'double', 'CirrusSearch\\Search\\NumberIndexField' ], + [ SearchIndexField::INDEX_TYPE_DATETIME, 'date', 'CirrusSearch\\Search\\DatetimeIndexField' ], + [ SearchIndexField::INDEX_TYPE_NESTED, 'nested', 'CirrusSearch\\Search\\NestedIndexField' ], + [ SearchIndexField::INDEX_TYPE_BOOL, 'boolean', 'CirrusSearch\\Search\\BooleanIndexField' ], + ]; + } + + /** + * @dataProvider getTypes + * @param int $type Field type (SearchIndexField::INDEX_TYPE_* constant) + * @param string $typeName Internal type name expected under the mapping's 'type' key + * @param string $klass Class name to instantiate directly + */ + public function testFieldTypes( $type, $typeName, $klass ) { + $config = + MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' ); + $engine = new CirrusSearch(); + /** + * @var \CirrusSearch\Search\CirrusIndexField $idxField + */ + $idxField = new $klass( "test$typeName", $type, $config ); + $map = $idxField->getMapping( $engine ); + $this->assertEquals( $typeName, $map['type'] ); + $this->assertEquals( $type, $idxField->getIndexType() ); + $this->assertEquals( "test$typeName", $idxField->getName() ); + } + + /** + * @dataProvider getTypes + * @param int $type Field type (SearchIndexField::INDEX_TYPE_* constant) + * @param string $typeName Internal type name, only used here to build the field name + * @param string $klass Class name the engine is expected to return + */ + public function testFieldEngine( $type, $typeName, $klass ) { + $engine = new CirrusSearch(); + $field = $engine->makeSearchFieldMapping( "test$typeName", $type ); + $this->assertInstanceOf( $klass, $field ); + $this->assertEquals( $type, $field->getIndexType() ); + $this->assertEquals( "test$typeName", $field->getName() ); + } +} \ No newline at end of file diff --git a/tests/unit/Search/SearchFieldsTest.php b/tests/unit/Search/SearchFieldsTest.php new file mode 100644 index 0000000..37fd7af --- /dev/null +++ b/tests/unit/Search/SearchFieldsTest.php @@ -0,0 +1,44 @@ +<?php + +namespace CirrusSearch\Search; 
+ +use SearchIndexField; + +class SearchFieldsTest extends \PHPUnit_Framework_TestCase { + + public function getFields() { + return [ + [ SearchIndexField::INDEX_TYPE_TEXT, 'string' ], + [ SearchIndexField::INDEX_TYPE_KEYWORD, 'string' ], + [ SearchIndexField::INDEX_TYPE_INTEGER, 'long' ], + [ SearchIndexField::INDEX_TYPE_NUMBER, 'double' ], + [ SearchIndexField::INDEX_TYPE_DATETIME, 'date' ], + [ SearchIndexField::INDEX_TYPE_BOOL, 'boolean' ], + [ SearchIndexField::INDEX_TYPE_NESTED, 'nested' ], + ]; + } + + /** + * @dataProvider getFields + * @param int $type Generic type (SearchIndexField::INDEX_TYPE_* constant) + * @param string $elasticType Elasticsearch type expected under the mapping's 'type' key + */ + public function testFields( $type, $elasticType ) { + $engine = new \CirrusSearch(); + $field = $engine->makeSearchFieldMapping( 'testField-' . $type, $type ); + $this->assertInstanceOf( CirrusIndexField::class, $field ); + $mapping = $field->getMapping( $engine ); + $this->assertEquals( $elasticType, $mapping['type'] ); + + $field->setFlag( SearchIndexField::FLAG_NO_INDEX ); + $mapping = $field->getMapping( $engine ); + $this->assertEquals( 'no', $mapping['index'] ); + } + + public function testBadField() { // 42 is passed as a type id the engine is not expected to recognise + $engine = new \CirrusSearch(); + $field = $engine->makeSearchFieldMapping( 'testBadField', 42 ); + $this->assertInstanceOf( \NullIndexField::class, $field ); + $this->assertNull( $field->getMapping( $engine ) ); // strict: assertEquals( null, ... ) would also accept [], false, 0 or '' + } +} \ No newline at end of file -- To view, visit https://gerrit.wikimedia.org/r/288567 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ibb4fc637637a8305b966a2f9702f8dcfac9dc94b Gerrit-PatchSet: 29 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: 
EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <gleder...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits