Tjones has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/334728 )
Change subject: Deploy TextCat Improvements ...................................................................... Deploy TextCat Improvements Update to TextCat 1.2.0. Use multiple language model directories. Add config for TextCat parameters and set at runtime. Add TextCat tests that use parameters. Fix misc typos, syntax, and EOL whitespace. Bug: T149324 Change-Id: I20a82978aa7a046f885dfbdcbee93d4a13f71101 --- M CirrusSearch.php M composer.json M docs/settings.txt M includes/LanguageDetector/TextCat.php M tests/unit/LanguageDetectTest.php 5 files changed, 119 insertions(+), 28 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/28/334728/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 856db50..8ca2da4 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -957,14 +957,23 @@ $wgCirrusSearchLanguageDetectors = []; /** - * Directory where TextCat detector should look for language model + * List of directories where TextCat detector should look for language models */ -$wgCirrusSearchTextcatModel = false; +$wgCirrusSearchTextcatModel = []; + +/** + * Configuration for specifying TextCat parameters. + * Keys are maxNgrams, maxReturnedLanguages, resultsRatio, + * minInputLength, maxProportion, langBoostScore, and numBoostedLangs. + * See vendor/wikimedia/textcat/TextCat.php + */ + +$wgCirrusSearchTextcatConfig = []; /** * Limit the set of languages detected by Textcat. 
- * Useful when some languages in the model have very bad precision, e.g.: - * $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' ); + * Useful when some languages in the model have too many false positives, e.g.: + * $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ]; */ /** diff --git a/composer.json b/composer.json index a285c6b..ecbb259 100644 --- a/composer.json +++ b/composer.json @@ -5,6 +5,6 @@ "license" : "GPL-2.0+", "minimum-stability": "dev", "require" : { - "wikimedia/textcat": "1.1.3" + "wikimedia/textcat": "1.2.0" } } diff --git a/docs/settings.txt b/docs/settings.txt index bfc0160..ad9b374 100644 --- a/docs/settings.txt +++ b/docs/settings.txt @@ -131,7 +131,7 @@ Elasticsearch plugin that should produce better snippets for search results. Installation instructions are here: https://github.com/wikimedia/search-highlighter If you have the highlighter installed you can switch this on and off so long -as you don't rebuild the index while $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. +as you don't rebuild the index while $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. Setting it to true without the highlighter installed will break search. ; $wgCirrusSearchOptimizeIndexForExperimentalHighlighter @@ -1269,9 +1269,19 @@ ; $wgCirrusSearchTextcatModel Default: - $wgCirrusSearchTextcatModel = false; + $wgCirrusSearchTextcatModel = []; -Directory where TextCat detector should look for language model. +List of directories where TextCat detector should look for language models. + +; $wgCirrusSearchTextcatConfig + +Default: + $wgCirrusSearchTextcatConfig = []; + +Configuration for specifying TextCat parameters. +Keys are maxNgrams, maxReturnedLanguages, resultsRatio, +minInputLength, maxProportion, langBoostScore, and numBoostedLangs. +See vendor/wikimedia/textcat/TextCat.php ; $wgCirrusSearchTextcatLanguages @@ -1281,7 +1291,7 @@ Limit the set of languages detected by Textcat. 
Useful when some languages in the model have very bad precision, e.g.: - $wgCirrusSearchTextcatLanguages = array( 'ar', 'it', 'de' ); + $wgCirrusSearchTextcatLanguages = [ 'ar', 'it', 'de' ]; ; $wgCirrusSearchMasterTimeout diff --git a/includes/LanguageDetector/TextCat.php b/includes/LanguageDetector/TextCat.php index 94ef1c5..e36cd76 100644 --- a/includes/LanguageDetector/TextCat.php +++ b/includes/LanguageDetector/TextCat.php @@ -22,22 +22,54 @@ // Should not happen return null; } - $dir = $config->getElement('CirrusSearchTextcatModel'); - if( !$dir ) { + $dirs = $config->getElement('CirrusSearchTextcatModel'); + if( !$dirs ) { return null; } - if( !is_dir( $dir ) ) { - LoggerFactory::getInstance( 'CirrusSearch' )->warning( - "Bad directory for TextCat model: {dir}", - [ "dir" => $dir ] - ); + if ( !is_array( $dirs ) ) { // backward compatibility + $dirs = [ $dirs ]; + } + foreach ($dirs as $dir) { + if( !is_dir( $dir ) ) { + LoggerFactory::getInstance( 'CirrusSearch' )->warning( + "Bad directory for TextCat model: {dir}", + [ "dir" => $dir ] + ); + } } - $textcat = new \TextCat( $dir ); + $textcat = new \TextCat( $dirs ); + + $textcatConfig = $config->getElement('CirrusSearchTextcatConfig'); + if ( $textcatConfig ) { + if ( isset( $textcatConfig['maxNgrams'] ) ) { + $textcat->setMaxNgrams( intval( $textcatConfig['maxNgrams'] ) ); + } + if ( isset( $textcatConfig['maxReturnedLanguages'] ) ) { + $textcat->setMaxReturnedLanguages( intval( $textcatConfig['maxReturnedLanguages'] ) ); + } + if ( isset( $textcatConfig['resultsRatio'] ) ) { + $textcat->setResultsRatio( floatval( $textcatConfig['resultsRatio'] ) ); + } + if ( isset( $textcatConfig['minInputLength'] ) ) { + $textcat->setMinInputLength( intval( $textcatConfig['minInputLength'] ) ); + } + if ( isset( $textcatConfig['maxProportion'] ) ) { + $textcat->setMaxProportion( floatval( $textcatConfig['maxProportion'] ) ); + } + if ( isset( $textcatConfig['langBoostScore'] ) ) { + $textcat->setLangBoostScore( 
floatval( $textcatConfig['langBoostScore'] ) ); + } + + if ( isset( $textcatConfig['numBoostedLangs'] ) && $config->getElement( 'CirrusSearchTextcatLanguages' ) ) { + $textcat->setBoostedLangs( array_slice ( $config->getElement( 'CirrusSearchTextcatLanguages' ), + 0, $textcatConfig['numBoostedLangs'] ) ); + } + } $languages = $textcat->classify( $text, $config->getElement( 'CirrusSearchTextcatLanguages' ) ); if( !empty( $languages ) ) { // For now, just return the best option - // TODO: thing what else we could do + // TODO: think what else we could do reset( $languages ); return key( $languages ); } diff --git a/tests/unit/LanguageDetectTest.php b/tests/unit/LanguageDetectTest.php index 3daa235..b05975d 100644 --- a/tests/unit/LanguageDetectTest.php +++ b/tests/unit/LanguageDetectTest.php @@ -32,17 +32,26 @@ */ private $cirrus; + /** + * Data; query, lang1, lang2 + * lang1 is result with defaults (testTextCatDetector) + * lang2 is result with non-defaults (testTextCatDetectorWithParams) + * see notes inline + */ public function getLanguageTexts() { return [ // simple cases - ["Welcome to Wikipedia, the free encyclopedia that anyone can edit", "en"], - ["Добро пожаловать в Википедию", "ru"], + ["Welcome to Wikipedia, the free encyclopedia that anyone can edit", "en", "en"], + ["Добро пожаловать в Википедию", "ru", "uk"], // ru missing, uk present + // more query-like cases - ["Breaking Bad", "en"], - ["Jesenwang flugplatz", "de"], - ["volviendose malo", "es"], - ["противоточный теплообменник", "ru"], - ["שובר שורות", "he"], + ["who stars in Breaking Bad?", "en", "en"], + ["Jesenwang flugplatz", "de", "de"], + ["volviendose malo", "es", null], // en boosted -> too ambiguous + ["противоточный теплообменник", "ru", "uk"], // ru missing, uk present + ["שובר שורות", "he", "he"], + ["୨୪ ଅକ୍ଟୋବର", "or", null], // or missing, no alternative + ["th", "en", null], // too short ]; } @@ -52,7 +61,8 @@ global $wgCirrusSearchTextcatModel; if (empty( $wgCirrusSearchTextcatModel ) 
) { $tc = new \ReflectionClass('TextCat'); - $wgCirrusSearchTextcatModel = dirname($tc->getFileName())."/LM-query/"; + $wgCirrusSearchTextcatModel = [ dirname( $tc->getFileName() )."/LM-query/", + dirname( $tc->getFileName() )."/LM/" ]; } } @@ -60,9 +70,39 @@ * @dataProvider getLanguageTexts * @param string $text * @param string $language + * @param string $ignore */ - public function testTextCatDetector($text, $language) { - // not really used for anything, but we need to pass it as a parameter + public function testTextCatDetector($text, $language, $ignore) { + $detector = new TextCat(); + $detect = $detector->detect($this->cirrus, $text); + $this->assertEquals($language, $detect); + } + + /** + * @dataProvider getLanguageTexts + * @param string $text + * @param string $ignore + * @param string $language + */ + public function testTextCatDetectorWithParams($text, $ignore, $language) { + // only use one language model directory in old non-array format + global $wgCirrusSearchTextcatModel; + $tc = new \ReflectionClass('TextCat'); + $wgCirrusSearchTextcatModel = dirname( $tc->getFileName() )."/LM-query/"; + // limit languages, excluding needed ones + global $wgCirrusSearchTextcatLanguages; + $wgCirrusSearchTextcatLanguages = [ 'en', 'es', 'de', 'he', 'uk' ]; + // reconfigure everything + global $wgCirrusSearchTextcatConfig; + $wgCirrusSearchTextcatConfig = [ + 'maxNgrams' => 9000, + 'maxReturnedLanguages' => 1, + 'resultsRatio' => 1.06, + 'minInputLength' => 3, + 'maxProportion' => 0.8, + 'langBoostScore' => 0.15, + 'numBoostedLangs' => 1, + ]; $detector = new TextCat(); $detect = $detector->detect($this->cirrus, $text); $this->assertEquals($language, $detect); -- To view, visit https://gerrit.wikimedia.org/r/334728 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I20a82978aa7a046f885dfbdcbee93d4a13f71101 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master 
Gerrit-Owner: Tjones <tjo...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits