Tjones has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/334728 )

Change subject: Deploy TextCat Improvements
......................................................................

Deploy TextCat Improvements

Update to TextCat 1.2.0.
Use multiple language model directories.
Add config for TextCat parameters and set at runtime.
Add TextCat tests that use parameters.
Fix misc typos, syntax, and EOL whitespace.

Bug: T149324
Change-Id: I20a82978aa7a046f885dfbdcbee93d4a13f71101
---
M CirrusSearch.php
M composer.json
M docs/settings.txt
M includes/LanguageDetector/TextCat.php
M tests/unit/LanguageDetectTest.php
5 files changed, 119 insertions(+), 28 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/28/334728/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 856db50..8ca2da4 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -957,14 +957,23 @@
 $wgCirrusSearchLanguageDetectors = [];
 
 /**
- * Directory where TextCat detector should look for language model
+ * List of directories where TextCat detector should look for language models
  */
-$wgCirrusSearchTextcatModel = false;
+$wgCirrusSearchTextcatModel = [];
+
+/**
+ * Configuration for specifying TextCat parameters.
+ * Keys are maxNgrams, maxReturnedLanguages, resultsRatio,
+ * minInputLength, maxProportion, langBoostScore, and numBoostedLangs.
+ * See vendor/wikimedia/textcat/TextCat.php
+ */
+
+$wgCirrusSearchTextcatConfig = [];
 
 /**
  * Limit the set of languages detected by Textcat.
- * Useful when some languages in the model have very bad precision, e.g.:
- * $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' );
+ * Useful when some languages in the model have too many false positives, e.g.:
+ * $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ];
  */
 
 /**
diff --git a/composer.json b/composer.json
index a285c6b..ecbb259 100644
--- a/composer.json
+++ b/composer.json
@@ -5,6 +5,6 @@
        "license"    : "GPL-2.0+",
        "minimum-stability": "dev",
        "require"    : {
-               "wikimedia/textcat": "1.1.3"
+               "wikimedia/textcat": "1.2.0"
        }
 }
diff --git a/docs/settings.txt b/docs/settings.txt
index bfc0160..ad9b374 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -131,7 +131,7 @@
 Elasticsearch plugin that should produce better snippets for search results.
 Installation instructions are here: 
https://github.com/wikimedia/search-highlighter
 If you have the highlighter installed you can switch this on and off so long
-as you don't rebuild the index while 
$wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true.  
+as you don't rebuild the index while 
$wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true.
 Setting it to true without the highlighter installed will break search.
 
 ; $wgCirrusSearchOptimizeIndexForExperimentalHighlighter
@@ -1269,9 +1269,19 @@
 ; $wgCirrusSearchTextcatModel
 
 Default:
-    $wgCirrusSearchTextcatModel = false;
+    $wgCirrusSearchTextcatModel = [];
 
-Directory where TextCat detector should look for language model.
+List of directories where TextCat detector should look for language models.
+
+; $wgCirrusSearchTextcatConfig
+
+Default:
+    $wgCirrusSearchTextcatConfig = [];
+
+Configuration for specifying TextCat parameters.
+Keys are maxNgrams, maxReturnedLanguages, resultsRatio,
+minInputLength, maxProportion, langBoostScore, and numBoostedLangs.
+See vendor/wikimedia/textcat/TextCat.php
 
 ; $wgCirrusSearchTextcatLanguages
 
@@ -1281,7 +1291,7 @@
 Limit the set of languages detected by Textcat.
 Useful when some languages in the model have very bad precision, e.g.:
 
-    $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' );
+    $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ];
 
 ; $wgCirrusSearchMasterTimeout
 
diff --git a/includes/LanguageDetector/TextCat.php 
b/includes/LanguageDetector/TextCat.php
index 94ef1c5..e36cd76 100644
--- a/includes/LanguageDetector/TextCat.php
+++ b/includes/LanguageDetector/TextCat.php
@@ -22,22 +22,54 @@
                        // Should not happen
                        return null;
                }
-               $dir = $config->getElement('CirrusSearchTextcatModel');
-               if( !$dir ) {
+               $dirs = $config->getElement('CirrusSearchTextcatModel');
+               if( !$dirs ) {
                        return null;
                }
-               if( !is_dir( $dir ) ) {
-                       LoggerFactory::getInstance( 'CirrusSearch' )->warning(
-                               "Bad directory for TextCat model: {dir}",
-                               [ "dir" => $dir ]
-                       );
+               if ( !is_array( $dirs ) ) { // backward compatibility
+                       $dirs = [ $dirs ];
+               }
+               foreach ($dirs as $dir) {
+                       if( !is_dir( $dir ) ) {
+                               LoggerFactory::getInstance( 'CirrusSearch' 
)->warning(
+                                       "Bad directory for TextCat model: 
{dir}",
+                                       [ "dir" => $dir ]
+                               );
+                       }
                }
 
-               $textcat = new \TextCat( $dir );
+               $textcat = new \TextCat( $dirs );
+
+               $textcatConfig = 
$config->getElement('CirrusSearchTextcatConfig');
+               if ( $textcatConfig ) {
+                       if ( isset( $textcatConfig['maxNgrams'] ) ) {
+                               $textcat->setMaxNgrams( intval( 
$textcatConfig['maxNgrams'] ) );
+                               }
+                       if ( isset( $textcatConfig['maxReturnedLanguages'] ) ) {
+                               $textcat->setMaxReturnedLanguages( intval( 
$textcatConfig['maxReturnedLanguages'] ) );
+                               }
+                       if ( isset( $textcatConfig['resultsRatio'] ) ) {
+                               $textcat->setResultsRatio( floatval( 
$textcatConfig['resultsRatio'] ) );
+                               }
+                       if ( isset( $textcatConfig['minInputLength'] ) ) {
+                               $textcat->setMinInputLength( intval( 
$textcatConfig['minInputLength'] ) );
+                               }
+                       if ( isset( $textcatConfig['maxProportion'] ) ) {
+                               $textcat->setMaxProportion( floatval( 
$textcatConfig['maxProportion'] ) );
+                               }
+                       if ( isset( $textcatConfig['langBoostScore'] ) ) {
+                               $textcat->setLangBoostScore( floatval( 
$textcatConfig['langBoostScore'] ) );
+                               }
+
+                       if ( isset( $textcatConfig['numBoostedLangs'] ) && 
$config->getElement( 'CirrusSearchTextcatLanguages' ) ) {
+                               $textcat->setBoostedLangs( array_slice ( 
$config->getElement( 'CirrusSearchTextcatLanguages' ),
+                                                                               
                                 0, $textcatConfig['numBoostedLangs'] ) );
+                               }
+               }
                $languages = $textcat->classify( $text, $config->getElement( 
'CirrusSearchTextcatLanguages' ) );
                if( !empty( $languages ) ) {
                        // For now, just return the best option
-                       // TODO: thing what else we could do
+                       // TODO: think what else we could do
                        reset( $languages );
                        return key( $languages );
                }
diff --git a/tests/unit/LanguageDetectTest.php 
b/tests/unit/LanguageDetectTest.php
index 3daa235..b05975d 100644
--- a/tests/unit/LanguageDetectTest.php
+++ b/tests/unit/LanguageDetectTest.php
@@ -32,17 +32,26 @@
         */
        private $cirrus;
 
+       /**
+        * Data: query, lang1, lang2
+        * lang1 is result with defaults (testTextCatDetector)
+        * lang2 is result with non-defaults (testTextCatDetectorWithParams)
+        *              see notes inline
+        */
        public function getLanguageTexts() {
                return [
                        // simple cases
-                       ["Welcome to Wikipedia, the free encyclopedia that 
anyone can edit", "en"],
-                       ["Добро пожаловать в Википедию", "ru"],
+                       ["Welcome to Wikipedia, the free encyclopedia that 
anyone can edit", "en", "en"],
+                       ["Добро пожаловать в Википедию", "ru", "uk"],   // ru 
missing, uk present
+
                        // more query-like cases
-                       ["Breaking Bad", "en"],
-                       ["Jesenwang flugplatz", "de"],
-                       ["volviendose malo", "es"],
-                       ["противоточный теплообменник", "ru"],
-                       ["שובר שורות", "he"],
+                       ["who stars in Breaking Bad?", "en", "en"],
+                       ["Jesenwang flugplatz", "de", "de"],
+                       ["volviendose malo", "es", null], // en boosted -> too 
ambiguous
+                       ["противоточный теплообменник", "ru", "uk"], // ru 
missing, uk present
+                       ["שובר שורות", "he", "he"],
+                       ["୨୪ ଅକ୍ଟୋବର", "or", null],     // or missing, no 
alternative
+                       ["th", "en", null],     // too short
                ];
        }
 
@@ -52,7 +61,8 @@
                global $wgCirrusSearchTextcatModel;
                if (empty( $wgCirrusSearchTextcatModel ) ) {
                        $tc = new \ReflectionClass('TextCat');
-                       $wgCirrusSearchTextcatModel = 
dirname($tc->getFileName())."/LM-query/";
+                       $wgCirrusSearchTextcatModel = [ dirname( 
$tc->getFileName() )."/LM-query/", 
+                                                                               
        dirname( $tc->getFileName() )."/LM/" ];
                }
        }
 
@@ -60,9 +70,39 @@
         * @dataProvider getLanguageTexts
         * @param string $text
         * @param string $language
+        * @param string $ignore
         */
-       public function testTextCatDetector($text, $language) {
-               // not really used for anything, but we need to pass it as a 
parameter
+       public function testTextCatDetector($text, $language, $ignore) {
+               $detector = new TextCat();
+               $detect = $detector->detect($this->cirrus, $text);
+               $this->assertEquals($language, $detect);
+       }
+
+       /**
+        * @dataProvider getLanguageTexts
+        * @param string $text
+        * @param string $ignore
+        * @param string $language
+        */
+       public function testTextCatDetectorWithParams($text, $ignore, 
$language) {
+               // only use one language model directory in old non-array format
+               global $wgCirrusSearchTextcatModel;
+               $tc = new \ReflectionClass('TextCat');
+               $wgCirrusSearchTextcatModel = dirname( $tc->getFileName() 
)."/LM-query/";
+               // limit languages, deliberately excluding some needed ones (ru, or)
+               global $wgCirrusSearchTextcatLanguages;
+               $wgCirrusSearchTextcatLanguages = [ 'en', 'es', 'de', 'he', 
'uk' ];
+               // reconfigure everything
+               global $wgCirrusSearchTextcatConfig;
+               $wgCirrusSearchTextcatConfig = [
+                       'maxNgrams' => 9000,
+                       'maxReturnedLanguages' => 1,
+                       'resultsRatio' => 1.06,
+                       'minInputLength' => 3,
+                       'maxProportion' => 0.8,
+                       'langBoostScore' => 0.15,
+                       'numBoostedLangs' => 1,
+               ];
                $detector = new TextCat();
                $detect = $detector->detect($this->cirrus, $text);
                $this->assertEquals($language, $detect);

-- 
To view, visit https://gerrit.wikimedia.org/r/334728
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I20a82978aa7a046f885dfbdcbee93d4a13f71101
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <tjo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to