[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add support for ICU tokenization

jenkins-bot (Code Review) Tue, 06 Dec 2016 16:29:41 -0800

jenkins-bot has submitted this change and it was merged.

Change subject: Add support for ICU tokenization
......................................................................



Add support for ICU tokenization

The ICU tokenizer uses an approach based on dictionaries to break
words.
For Chinese, 灯笼 is properly tokenized as a single token, while the
standard tokenizer would emit two separate tokens.

Change-Id: I930e34b24db825b21c1a7eca5bf28cc09a76c152
---
M CirrusSearch.php
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M tests/unit/Maintenance/AnalysisConfigBuilderTest.php
4 files changed, 135 insertions(+), 7 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  Tjones: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 258c3ca..5e60b7a 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -795,6 +795,21 @@
 $wgCirrusSearchICUFoldingUnicodeSetFilter = null;
 
 /**
+ * Enable the ICU Tokenizer instead of the standard tokenizer
+ * for plain fields.
+ * It may be more suited for languages that do not use spaces
+ * to break words.
+ * Requires the ICU plugin installed
+ * Set to:
+ * - default: let cirrus decide if the ICU tokenizer can be enabled according 
to wiki language
+ * - yes: force the use of ICU tokenizer
+ * - no: disable the ICU tokenizer even if cirrus thinks it can be enabled
+ * NOTE: Experimental
+ */
+$wgCirrusSearchUseIcuTokenizer = 'default';
+
+
+/**
  * Set the default scoring function to be used by 
maintenance/updateSuggesterIndex.php
  * @see includes/BuildDocument/SuggestScoring.php for more details about 
scoring functions
  * NOTE: if you change the scoring method you'll have to rebuild the suggester 
index.
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index f489f05..831d34a 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -48,7 +48,12 @@
        /**
         * @var boolean true if icu folding is requested and available
         */
-       private $icuFolding;
+       protected $icuFolding;
+
+       /**
+        * @var boolean true if the icu tokenizer is requested and available
+        */
+       protected $icuTokenizer;
 
        /**
         * @var array Similarity algo (tf/idf, bm25, etc) configuration
@@ -85,6 +90,7 @@
 
                $this->config = $config;
                $this->icuFolding = $this->shouldActivateIcuFolding( $plugins );
+               $this->icuTokenizer = $this->shouldActivateIcuTokenization();
        }
 
        /**
@@ -117,6 +123,27 @@
        }
 
        /**
+        * Determine if the icu tokenizer can be enabled
+        * @return bool
+        */
+       private function shouldActivateIcuTokenization() {
+               if ( !$this->icu ) {
+                       // requires the icu plugin
+                       return false;
+               }
+               $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer' 
);
+               switch( $in_config ) {
+               case 'yes': return true;
+               case 'no': return false;
+               case 'default':
+                       if ( isset( 
$this->languagesWithIcuTokenization[$this->language] ) ) {
+                               return 
$this->languagesWithIcuTokenization[$this->language];
+                       }
+               default: return false;
+               }
+       }
+
+       /**
         * Build the analysis config.
         *
         * @return array the analysis config
@@ -124,6 +151,9 @@
        public function buildConfig() {
                $config = $this->customize( $this->defaults() );
                Hooks::run( 'CirrusSearchAnalysisConfig', [ &$config ] );
+               if ( $this->icuTokenizer ) {
+                       $config = $this->enableICUTokenizer( $config );
+               }
                if ( $this->icuFolding ) {
                        $config = $this->enableICUFolding( $config );
                }
@@ -141,6 +171,22 @@
                        return $this->similarity['similarity'];
                }
                return null;
+       }
+       /**
+        * replace the standard tokenizer with icu_tokenizer
+        * @param mixed[] $config
+        * @return mixed[] update config
+        */
+       public function enableICUTokenizer( array $config ) {
+               foreach( $config['analyzer'] as $name => &$value ) {
+                       if ( isset( $value['type'] ) && $value['type'] != 
'custom' ) {
+                               continue;
+                       }
+                       if ( isset( $value['tokenizer'] ) && 'standard' === 
$value['tokenizer'] ) {
+                               $value['tokenizer'] = 'icu_tokenizer';
+                       }
+               }
+               return $config;
        }
 
        /**
@@ -866,6 +912,12 @@
        private $languagesWithIcuFolding = [];
 
        /**
+        * @var bool[] indexed by language code, languages where ICU 
tokenization
+        * can be enabled by default
+        */
+       private $languagesWithIcuTokenization = [];
+
+       /**
         * @var array[]
         */
        private $elasticsearchLanguageAnalyzersFromPlugins = [
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 621a6bc..8d1d0a0 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -60,6 +60,15 @@
                                $folding_type['unicodeSetFilter'] = 
$unicodeSetFilter;
                        }
                }
+               $textTokenizer = 'standard';
+               $plainTokenizer = 'whitespace';
+               if ( $this->icuTokenizer ) {
+                       $textTokenizer = 'icu_tokenizer';
+                       // We cannot use the icu_tokenizer for plain here
+                       // even if icu tokenization is mostly needed for 
languages
+                       // where space is not used to break words. We don't want
+                       // to break some punctuation chars like ':'
+               }
                $defaults = [
                        'char_filter' => [
                                'word_break_helper' => [
@@ -115,7 +124,7 @@
                                                "accentfolding",
                                                "token_limit"
                                        ],
-                                       "tokenizer" => "standard"
+                                       "tokenizer" => $textTokenizer,
                                ],
                                // We do not remove stop words when searching,
                                // this leads to extremely weird behaviors while
@@ -127,7 +136,7 @@
                                                "accentfolding",
                                                "token_limit"
                                        ],
-                                       "tokenizer" => "standard"
+                                       "tokenizer" => $textTokenizer,
                                ],
                                "plain" => [
                                        "type" => "custom",
@@ -136,7 +145,7 @@
                                                "token_limit",
                                                "lowercase"
                                        ],
-                                       "tokenizer" => "whitespace"
+                                       "tokenizer" => $plainTokenizer,
                                ],
                                "plain_search" => [
                                        "type" => "custom",
@@ -145,7 +154,7 @@
                                                "token_limit",
                                                "lowercase"
                                        ],
-                                       "tokenizer" => "whitespace"
+                                       "tokenizer" => $plainTokenizer,
                                ],
                        ],
                ];
@@ -157,7 +166,7 @@
                                        "accentfolding",
                                        "token_limit"
                                ],
-                               "tokenizer" => "standard"
+                               "tokenizer" => $textTokenizer,
                        ];
                        $defaults['analyzer']['subphrases_search'] = [
                                "type" => "custom",
@@ -166,7 +175,7 @@
                                        "accentfolding",
                                        "token_limit"
                                ],
-                               "tokenizer" => "standard"
+                               "tokenizer" => $textTokenizer,
                        ];
                }
                return $defaults;
diff --git a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php 
b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
index 6a5d46c..f464a45 100644
--- a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
+++ b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
@@ -54,6 +54,15 @@
                $this->assertFalse( $builder->isIcuFolding() );
        }
 
+       /** @dataProvider provideICUTokenizer */
+       public function testICUTokinizer( array $input, array $expected ) {
+               $config = new HashSearchConfig( ['CirrusSearchUseIcuTokenizer' 
=> 'yes'] );
+               $plugins = ['extra', 'analysis-icu'];
+               $builder = new AnalysisConfigBuilder( 'en', $plugins, $config );
+               $result = $builder->enableICUTokenizer( $input );
+               $this->assertEquals( $expected['analyzer'], $result['analyzer'] 
);
+       }
+
        public static function provideASCIIFoldingFilters() {
                return [
                        'only custom is updated' => [
@@ -295,4 +304,47 @@
                        ],
                ];
        }
+
+       public static function provideICUTokenizer() {
+               return [
+                       'only custom is updated' => [
+                               [
+                                       'analyzer' => [
+                                               'french' => [
+                                                       'type' => 'french',
+                                                       'filter' => ['random']
+                                               ]
+                                       ],
+                               ],
+                               [
+                                       'analyzer' => [
+                                               'french' => [
+                                                       'type' => 'french',
+                                                       'filter' => ['random']
+                                               ]
+                                       ],
+                               ],
+                       ],
+                       'only custom is updated' => [
+                               [
+                                       'analyzer' => [
+                                               'chinese' => [
+                                                       'type' => 'custom',
+                                                       'tokenizer' => 
'standard',
+                                                       'filter' => ['random']
+                                               ]
+                                       ],
+                               ],
+                               [
+                                       'analyzer' => [
+                                               'chinese' => [
+                                                       'type' => 'custom',
+                                                       'tokenizer' => 
'icu_tokenizer',
+                                                       'filter' => ['random']
+                                               ]
+                                       ],
+                               ],
+                       ],
+               ];
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/313577
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I930e34b24db825b21c1a7eca5bf28cc09a76c152
Gerrit-PatchSet: 15
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: Tjones <tjo...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add support for ICU tokenization

Reply via email to