jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/334728 )
Change subject: Deploy TextCat Improvements ...................................................................... Deploy TextCat Improvements Update to TextCat 1.2.0. Use multiple language model directories. Add config for TextCat parameters and set at runtime. Add TextCat tests that use parameters. Fix misc typos, syntax, and EOL whitespace. Bug: T149324 Change-Id: I20a82978aa7a046f885dfbdcbee93d4a13f71101 --- M CirrusSearch.php M composer.json M docs/settings.txt M includes/LanguageDetector/TextCat.php M tests/unit/LanguageDetectTest.php 5 files changed, 138 insertions(+), 38 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve jenkins-bot: Verified DCausse: Looks good to me, approved diff --git a/CirrusSearch.php b/CirrusSearch.php index 856db50..8ca2da4 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -957,14 +957,23 @@ $wgCirrusSearchLanguageDetectors = []; /** - * Directory where TextCat detector should look for language model + * List of directories where TextCat detector should look for language models */ -$wgCirrusSearchTextcatModel = false; +$wgCirrusSearchTextcatModel = []; + +/** + * Configuration for specifying TextCat parameters. + * Keys are maxNgrams, maxReturnedLanguages, resultsRatio, + * minInputLength, maxProportion, langBoostScore, and numBoostedLangs. + * See vendor/wikimedia/textcat/TextCat.php + */ + +$wgCirrusSearchTextcatConfig = []; /** * Limit the set of languages detected by Textcat. - * Useful when some languages in the model have very bad precision, e.g.: - * $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' ); + * Useful when some languages in the model have too many false positives, e.g.: + * $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ]; */ /** diff --git a/composer.json b/composer.json index a285c6b..ecbb259 100644 --- a/composer.json +++ b/composer.json @@ -5,6 +5,6 @@ "license" : "GPL-2.0+", "minimum-stability": "dev", "require" : { - "wikimedia/textcat": "1.1.3" + "wikimedia/textcat": "1.2.0" } } diff --git a/docs/settings.txt b/docs/settings.txt index bfc0160..ad9b374 100644 --- a/docs/settings.txt +++ b/docs/settings.txt @@ -131,7 +131,7 @@ Elasticsearch plugin that should produce better snippets for search results. Installation instructions are here: https://github.com/wikimedia/search-highlighter If you have the highlighter installed you can switch this on and off so long -as you don't rebuild the index while $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. +as you don't rebuild the index while $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. Setting it to true without the highlighter installed will break search. ; $wgCirrusSearchOptimizeIndexForExperimentalHighlighter @@ -1269,9 +1269,19 @@ ; $wgCirrusSearchTextcatModel Default: - $wgCirrusSearchTextcatModel = false; + $wgCirrusSearchTextcatModel = []; -Directory where TextCat detector should look for language model. +List of directories where TextCat detector should look for language models + +; $wgCirrusSearchTextcatConfig + +Default: + $wgCirrusSearchTextcatConfig = null; + +Configuration for specifying TextCat parameters. +Keys are maxNgrams, maxReturnedLanguages, resultsRatio, +minInputLength, maxProportion, langBoostScore, and numBoostedLangs. +See vendor/wikimedia/textcat/TextCat.php ; $wgCirrusSearchTextcatLanguages @@ -1281,7 +1291,7 @@ Limit the set of languages detected by Textcat. Useful when some languages in the model have very bad precision, e.g.: - $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' ); + $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ]; ; $wgCirrusSearchMasterTimeout diff --git a/includes/LanguageDetector/TextCat.php b/includes/LanguageDetector/TextCat.php index 94ef1c5..72c264d 100644 --- a/includes/LanguageDetector/TextCat.php +++ b/includes/LanguageDetector/TextCat.php @@ -22,22 +22,57 @@ // Should not happen return null; } - $dir = $config->getElement('CirrusSearchTextcatModel'); - if( !$dir ) { + $dirs = $config->getElement('CirrusSearchTextcatModel'); + if( !$dirs ) { return null; } - if( !is_dir( $dir ) ) { - LoggerFactory::getInstance( 'CirrusSearch' )->warning( - "Bad directory for TextCat model: {dir}", - [ "dir" => $dir ] - ); + if ( !is_array( $dirs ) ) { // backward compatibility + $dirs = [ $dirs ]; + } + foreach ($dirs as $dir) { + if( !is_dir( $dir ) ) { + LoggerFactory::getInstance( 'CirrusSearch' )->warning( + "Bad directory for TextCat model: {dir}", + [ "dir" => $dir ] + ); + } } - $textcat = new \TextCat( $dir ); + $textcat = new \TextCat( $dirs ); + + $textcatConfig = $config->getElement('CirrusSearchTextcatConfig'); + if ( $textcatConfig ) { + if ( isset( $textcatConfig['maxNgrams'] ) ) { + $textcat->setMaxNgrams( intval( $textcatConfig['maxNgrams'] ) ); + } + if ( isset( $textcatConfig['maxReturnedLanguages'] ) ) { + $textcat->setMaxReturnedLanguages( intval( $textcatConfig['maxReturnedLanguages'] ) ); + } + if ( isset( $textcatConfig['resultsRatio'] ) ) { + $textcat->setResultsRatio( floatval( $textcatConfig['resultsRatio'] ) ); + } + if ( isset( $textcatConfig['minInputLength'] ) ) { + $textcat->setMinInputLength( intval( $textcatConfig['minInputLength'] ) ); + } + if ( isset( $textcatConfig['maxProportion'] ) ) { + $textcat->setMaxProportion( floatval( $textcatConfig['maxProportion'] ) ); + } + if ( isset( $textcatConfig['langBoostScore'] ) ) { + $textcat->setLangBoostScore( floatval( $textcatConfig['langBoostScore'] ) ); + } + + if ( isset( $textcatConfig['numBoostedLangs'] ) && + $config->getElement( 'CirrusSearchTextcatLanguages' ) + ) { + $textcat->setBoostedLangs( array_slice ( + $config->getElement( 'CirrusSearchTextcatLanguages' ), + 0, $textcatConfig['numBoostedLangs'] ) ); + } + } $languages = $textcat->classify( $text, $config->getElement( 'CirrusSearchTextcatLanguages' ) ); if( !empty( $languages ) ) { // For now, just return the best option - // TODO: thing what else we could do + // TODO: think what else we could do reset( $languages ); return key( $languages ); } diff --git a/tests/unit/LanguageDetectTest.php b/tests/unit/LanguageDetectTest.php index 3daa235..279b51c 100644 --- a/tests/unit/LanguageDetectTest.php +++ b/tests/unit/LanguageDetectTest.php @@ -32,47 +32,93 @@ */ private $cirrus; + /** + * @var TextCat + */ + private $textcat; + + /** + * data provided is: text, lang1, lang2 + * lang1 is result with defaults (testTextCatDetector) + * lang2 is result with non-defaults (testTextCatDetectorWithParams) + * see notes inline + */ public function getLanguageTexts() { return [ // simple cases - ["Welcome to Wikipedia, the free encyclopedia that anyone can edit", "en"], - ["Добро пожаловать в Википедию", "ru"], + ["Welcome to Wikipedia, the free encyclopedia that anyone can edit", "en", "en"], + ["Добро пожаловать в Википедию", "ru", "uk"], // ru missing, uk present + // more query-like cases - ["Breaking Bad", "en"], - ["Jesenwang flugplatz", "de"], - ["volviendose malo", "es"], - ["противоточный теплообменник", "ru"], - ["שובר שורות", "he"], + ["who stars in Breaking Bad?", "en", "en"], + ["Jesenwang flugplatz", "de", "de"], + ["volviendose malo", "es", null], // en boosted -> too ambiguous + ["противоточный теплообменник", "ru", "uk"], // ru missing, uk present + ["שובר שורות", "he", "he"], + ["୨୪ ଅକ୍ଟୋବର", "or", null], // or missing, no alternative + ["th", "en", null], // too short ]; } public function setUp() { parent::setUp(); $this->cirrus = new \CirrusSearch(); - global $wgCirrusSearchTextcatModel; - if (empty( $wgCirrusSearchTextcatModel ) ) { - $tc = new \ReflectionClass('TextCat'); - $wgCirrusSearchTextcatModel = dirname($tc->getFileName())."/LM-query/"; - } + $this->textcat = new TextCat(); } /** * @dataProvider getLanguageTexts * @param string $text * @param string $language + * @param string $ignore */ - public function testTextCatDetector($text, $language) { - // not really used for anything, but we need to pass it as a parameter - $detector = new TextCat(); - $detect = $detector->detect($this->cirrus, $text); + public function testTextCatDetector($text, $language, $ignore) { + $tc = new \ReflectionClass('TextCat'); + $this->setMwGlobals( [ + 'wgCirrusSearchTextcatModel' => [ dirname( $tc->getFileName() )."/LM-query/", + dirname( $tc->getFileName() )."/LM/" ], + 'wgCirrusSearchTextcatLanguages' => null, + 'wgCirrusSearchTextcatConfig' => null, + ] ); + $detect = $this->textcat->detect($this->cirrus, $text); + $this->assertEquals($language, $detect); + } + + /** + * @dataProvider getLanguageTexts + * @param string $text + * @param string $ignore + * @param string $language + */ + public function testTextCatDetectorWithParams($text, $ignore, $language) { + $tc = new \ReflectionClass('TextCat'); + $this->setMwGlobals( [ + // only use one language model directory in old non-array format + 'wgCirrusSearchTextcatModel' => dirname( $tc->getFileName() )."/LM-query/", + 'wgCirrusSearchTextcatLanguages' => [ 'en', 'es', 'de', 'he', 'uk' ], + 'wgCirrusSearchTextcatConfig' => [ + 'maxNgrams' => 9000, + 'maxReturnedLanguages' => 1, + 'resultsRatio' => 1.06, + 'minInputLength' => 3, + 'maxProportion' => 0.8, + 'langBoostScore' => 0.15, + 'numBoostedLangs' => 1, + ], + ] ); + $detect = $this->textcat->detect($this->cirrus, $text); $this->assertEquals($language, $detect); } public function testTextCatDetectorLimited() { - global $wgCirrusSearchTextcatLanguages; - $wgCirrusSearchTextcatLanguages = ["en", "ru"]; - $detector = new TextCat(); - $detect = $detector->detect($this->cirrus, "volviendose malo"); + $tc = new \ReflectionClass('TextCat'); + $this->setMwGlobals( [ + 'wgCirrusSearchTextcatModel' => [ dirname( $tc->getFileName() )."/LM-query/", + dirname( $tc->getFileName() )."/LM/" ], + 'wgCirrusSearchTextcatLanguages' => ["en", "ru"], + 'wgCirrusSearchTextcatConfig' => null, + ] ); + $detect = $this->textcat->detect($this->cirrus, "volviendose malo"); $this->assertEquals("en", $detect); } -- To view, visit https://gerrit.wikimedia.org/r/334728 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I20a82978aa7a046f885dfbdcbee93d4a13f71101 Gerrit-PatchSet: 5 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <gleder...@wikimedia.org> Gerrit-Reviewer: Legoktm <lego...@member.fsf.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits