jenkins-bot has submitted this change and it was merged. Change subject: Add support for ICU tokenization ......................................................................
Add support for ICU tokenization The icu tokenizer uses an approach based on dictionaries to break words. For Chinese: 灯笼 is properly tokenized as a single token while the standard tokenizer would emit two separate tokens. Change-Id: I930e34b24db825b21c1a7eca5bf28cc09a76c152 --- M CirrusSearch.php M includes/Maintenance/AnalysisConfigBuilder.php M includes/Maintenance/SuggesterAnalysisConfigBuilder.php M tests/unit/Maintenance/AnalysisConfigBuilderTest.php 4 files changed, 135 insertions(+), 7 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve Tjones: Looks good to me, but someone else must approve EBernhardson: Looks good to me, approved jenkins-bot: Verified diff --git a/CirrusSearch.php b/CirrusSearch.php index 258c3ca..5e60b7a 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -795,6 +795,21 @@ $wgCirrusSearchICUFoldingUnicodeSetFilter = null; /** + * Enable the ICU Tokenizer instead of the standard filter + * for plain fields. + * It may be more suited for languages that do not use spaces + * to break words. + * Requires the ICU plugin installed + * Set to: + * - default: let cirrus decides if the ICU tokenizer can be enabled according to wiki language + * - yes: force the use of ICU tokenizer + * - no: disable the ICU tokenizer even if cirrus thinks it can be enabled + * NOTE: Experimental + */ +$wgCirrusSearchUseIcuTokenizer = 'default'; + + +/** * Set the default scoring function to be used by maintenance/updateSuggesterIndex.php * @see includes/BuildDocument/SuggestScoring.php for more details about scoring functions * NOTE: if you change the scoring method you'll have to rebuild the suggester index. 
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index f489f05..831d34a 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -48,7 +48,12 @@ /** * @var boolean true if icu folding is requested and available */ - private $icuFolding; + protected $icuFolding; + + /** + * @var boolean true if the icu tokenizer is requested and available + */ + protected $icuTokenizer; /** * @var array Similarity algo (tf/idf, bm25, etc) configuration @@ -85,6 +90,7 @@ $this->config = $config; $this->icuFolding = $this->shouldActivateIcuFolding( $plugins ); + $this->icuTokenizer = $this->shouldActivateIcuTokenization(); } /** @@ -117,6 +123,27 @@ } /** + * Determine if the icu tokenizer can be enabled + * @return bool + */ + private function shouldActivateIcuTokenization() { + if ( !$this->icu ) { + // requires the icu plugin + return false; + } + $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' ); + switch( $in_config ) { + case 'yes': return true; + case 'no': return false; + case 'default': + if ( isset( $this->languagesWithIcuTokenization[$this->language] ) ) { + return $this->languagesWithIcuTokenization[$this->language]; + } + default: return false; + } + } + + /** * Build the analysis config. 
* * @return array the analysis config @@ -124,6 +151,9 @@ public function buildConfig() { $config = $this->customize( $this->defaults() ); Hooks::run( 'CirrusSearchAnalysisConfig', [ &$config ] ); + if ( $this->icuTokenizer ) { + $config = $this->enableICUTokenizer( $config ); + } if ( $this->icuFolding ) { $config = $this->enableICUFolding( $config ); } @@ -141,6 +171,22 @@ return $this->similarity['similarity']; } return null; + } + /** + * replace the standard tokenizer with icu_tokenizer + * @param mixed[] $config + * @return mixed[] update config + */ + public function enableICUTokenizer( array $config ) { + foreach( $config['analyzer'] as $name => &$value ) { + if ( isset( $value['type'] ) && $value['type'] != 'custom' ) { + continue; + } + if ( isset( $value['tokenizer'] ) && 'standard' === $value['tokenizer'] ) { + $value['tokenizer'] = 'icu_tokenizer'; + } + } + return $config; } /** @@ -866,6 +912,12 @@ private $languagesWithIcuFolding = []; /** + * @var bool[] indexed by language code, languages where ICU tokenization + * can be enabled by default + */ + private $languagesWithIcuTokenization = []; + + /** * @var array[] */ private $elasticsearchLanguageAnalyzersFromPlugins = [ diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php index 621a6bc..8d1d0a0 100644 --- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php +++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php @@ -60,6 +60,15 @@ $folding_type['unicodeSetFilter'] = $unicodeSetFilter; } } + $textTokenizer = 'standard'; + $plainTokenizer = 'whitespace'; + if ( $this->icuTokenizer ) { + $textTokenizer = 'icu_tokenizer'; + // We cannot use the icu_tokenizer for plain here + // even if icu tokenization is mostly needed for languages + // where space is not used to break words. 
We don't want + // to break some punctuation chars like ':' + } $defaults = [ 'char_filter' => [ 'word_break_helper' => [ @@ -115,7 +124,7 @@ "accentfolding", "token_limit" ], - "tokenizer" => "standard" + "tokenizer" => $textTokenizer, ], // We do not remove stop words when searching, // this leads to extremely weird behaviors while @@ -127,7 +136,7 @@ "accentfolding", "token_limit" ], - "tokenizer" => "standard" + "tokenizer" => $textTokenizer, ], "plain" => [ "type" => "custom", @@ -136,7 +145,7 @@ "token_limit", "lowercase" ], - "tokenizer" => "whitespace" + "tokenizer" => $plainTokenizer, ], "plain_search" => [ "type" => "custom", @@ -145,7 +154,7 @@ "token_limit", "lowercase" ], - "tokenizer" => "whitespace" + "tokenizer" => $plainTokenizer, ], ], ]; @@ -157,7 +166,7 @@ "accentfolding", "token_limit" ], - "tokenizer" => "standard" + "tokenizer" => $textTokenizer, ]; $defaults['analyzer']['subphrases_search'] = [ "type" => "custom", @@ -166,7 +175,7 @@ "accentfolding", "token_limit" ], - "tokenizer" => "standard" + "tokenizer" => $textTokenizer, ]; } return $defaults; diff --git a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php index 6a5d46c..f464a45 100644 --- a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php +++ b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php @@ -54,6 +54,15 @@ $this->assertFalse( $builder->isIcuFolding() ); } + /** @dataProvider provideICUTokenizer */ + public function testICUTokinizer( array $input, array $expected ) { + $config = new HashSearchConfig( ['CirrusSearchUseIcuTokenizer' => 'yes'] ); + $plugins = ['extra', 'analysis-icu']; + $builder = new AnalysisConfigBuilder( 'en', $plugins, $config ); + $result = $builder->enableICUTokenizer( $input ); + $this->assertEquals( $expected['analyzer'], $result['analyzer'] ); + } + public static function provideASCIIFoldingFilters() { return [ 'only custom is updated' => [ @@ -295,4 +304,47 @@ ], ]; } + + public static 
function provideICUTokenizer() { + return [ + 'only custom is updated' => [ + [ + 'analyzer' => [ + 'french' => [ + 'type' => 'french', + 'filter' => ['random'] + ] + ], + ], + [ + 'analyzer' => [ + 'french' => [ + 'type' => 'french', + 'filter' => ['random'] + ] + ], + ], + ], + 'only custom is updated' => [ + [ + 'analyzer' => [ + 'chinese' => [ + 'type' => 'custom', + 'tokenizer' => 'standard', + 'filter' => ['random'] + ] + ], + ], + [ + 'analyzer' => [ + 'chinese' => [ + 'type' => 'custom', + 'tokenizer' => 'icu_tokenizer', + 'filter' => ['random'] + ] + ], + ], + ], + ]; + } } -- To view, visit https://gerrit.wikimedia.org/r/313577 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I930e34b24db825b21c1a7eca5bf28cc09a76c152 Gerrit-PatchSet: 15 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <gleder...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits