jenkins-bot has submitted this change and it was merged. Change subject: Allow TextCat to use multiple language model directories ......................................................................
Allow TextCat to use multiple language model directories Allow multiple .lm directories, and add some test cases where it makes a difference in the order of results. Also some syntactic cleanup. Bug: T149316 Change-Id: I1cd40a58e335e1af1cc73effba45251f18ff8699 --- M TextCat.php M catus.php M tests/TextCatTest.php 3 files changed, 81 insertions(+), 38 deletions(-) Approvals: Smalyshev: Looks good to me, approved EBernhardson: Looks good to me, but someone else must approve jenkins-bot: Verified diff --git a/TextCat.php b/TextCat.php index c1218da..55e1ade 100644 --- a/TextCat.php +++ b/TextCat.php @@ -47,19 +47,24 @@ } /** - * @param string $dir + * @param string|array $dirs */ - public function __construct( $dir = null ) { - if ( empty( $dir ) ) { - $dir = __DIR__."/LM"; + public function __construct( $dirs = array() ) { + if ( empty( $dirs ) ) { + $dirs = array( __DIR__."/LM" ); } - $this->dir = $dir; - foreach ( new DirectoryIterator( $dir ) as $file ) { - if ( !$file->isFile() ) { - continue; - } - if ( $file->getExtension() == "lm" ) { - $this->langFiles[$file->getBasename( ".lm" )] = $file->getPathname(); + if ( !is_array( $dirs ) ) { + $dirs = array( $dirs ); + } + foreach ( $dirs as $dir ) { + foreach ( new DirectoryIterator( $dir ) as $file ) { + if ( !$file->isFile() ) { + continue; + } + if ( $file->getExtension() == "lm" && + !isset( $this->langFiles[$file->getBasename( ".lm" )] ) ) { + $this->langFiles[$file->getBasename( ".lm" )] = $file->getPathname(); + } } } } diff --git a/catus.php b/catus.php index b09c112..b42fa7c 100644 --- a/catus.php +++ b/catus.php @@ -8,7 +8,7 @@ if ( isset( $options['h'] ) ) { $help = <<<HELP -{$argv[0]} [-d Dir] [-a Int] [-f Int] [-l Text] [-t Int] [-u Float] +{$argv[0]} [-d Dir] [-c Lang] [-a Int] [-f Int] [-l Text] [-t Int] [-u Float] -a NUM the program returns the best-scoring language together with all languages which are <N times worse (set by option -u). @@ -19,9 +19,10 @@ -c LANG,LANG,... lists the candidate languages. Only languages listed will be considered for detection. - -d DIR indicates in which directory the language models are - located (files ending in .lm). Currently only a single - directory is supported. Default: ./LM . + -d DIR,DIR,... + indicates in which directory the language models are + located (files ending in .lm). Multiple directories can be + separated by a comma, and will be used in order. Default: ./LM . -f NUM Before sorting is performed the Ngrams which occur this number of times or less are removed. This can be used to speed up the program for longer inputs. For short inputs you should use @@ -41,12 +42,12 @@ } if ( !empty( $options['d'] ) ) { - $dir = $options['d']; + $dirs = explode( ",", $options['d'] ); } else { - $dir = __DIR__."/LM"; + $dirs = array( __DIR__."/LM" ); } -$cat = new TextCat( $dir ); +$cat = new TextCat( $dirs ); if ( !empty( $options['t'] ) ) { $cat->setMaxNgrams( intval( $options['t'] ) ); diff --git a/tests/TextCatTest.php b/tests/TextCatTest.php index ba16d06..348a673 100644 --- a/tests/TextCatTest.php +++ b/tests/TextCatTest.php @@ -11,15 +11,17 @@ public function setUp() { - $this->cat = new TextCat(__DIR__."/../LM"); - $this->testcat = new TextCat(__DIR__."/data/Models"); + // initialze testcat with a string, and multicats with arrays + $this->testcat = new TextCat( __DIR__."/data/Models" ); + $this->multicat1 = new TextCat( array(__DIR__."/../LM", __DIR__."/../LM-query" ) ); + $this->multicat2 = new TextCat( array(__DIR__."/../LM-query", __DIR__."/../LM" ) ); } public function testCreateLM() { - $lm = $this->cat->createLM("test", 1000); + $lm = $this->testcat->createLM( "test", 1000 ); $result = - array ( + array( '_' => 2, 't' => 2, '_t' => 1, @@ -39,20 +41,20 @@ 'test' => 1, 'test_' => 1, ); - $this->assertEquals($result, $lm); + $this->assertEquals( $result, $lm ); } public function testCreateLMLimit() { - $lm = $this->cat->createLM("test", 4); + $lm = $this->testcat->createLM( "test", 4 ); $result = - array ( + array( '_' => 2, 't' => 2, '_t' => 1, '_te' => 1, ); - $this->assertEquals($result, $lm); + $this->assertEquals( $result, $lm ); } public function getTexts() @@ -60,11 +62,11 @@ $indir = __DIR__."/data/ShortTexts"; $outdir = __DIR__."/data/Models"; $data = array(); - foreach(new DirectoryIterator($indir) as $file) { - if(!$file->isFile() || $file->getExtension() != "txt") { + foreach( new DirectoryIterator( $indir ) as $file ) { + if ( !$file->isFile() || $file->getExtension() != "txt" ) { continue; } - $data[] = array($file->getPathname(), $outdir . "/" . $file->getBasename(".txt") . ".lm"); + $data[] = array( $file->getPathname(), $outdir . "/" . $file->getBasename(".txt") . ".lm" ); } return $data; } @@ -74,12 +76,12 @@ * @param string $text * @param string $lm */ - public function testCreateFromTexts($textFile, $lmFile) + public function testCreateFromTexts( $textFile, $lmFile ) { include $lmFile; $this->assertEquals( $ngrams, - $this->cat->createLM(file_get_contents($textFile), 4000) + $this->testcat->createLM( file_get_contents( $textFile ), 4000) ); } @@ -88,16 +90,51 @@ * @param string $text * @param string $lm */ - public function testFileLines($textFile) + public function testFileLines( $textFile ) { - $lines = file($textFile); + $lines = file( $textFile ); $line = 5; do { - $testLine = trim($lines[$line]); + $testLine = trim( $lines[$line] ); $line++; - } while(empty($testLine)); - $detect = $this->testcat->classify($testLine); - reset($detect); - $this->assertEquals(basename($textFile, ".txt"), key($detect)); + } while( empty( $testLine ) ); + $detect = $this->testcat->classify( $testLine ); + reset( $detect ); + $this->assertEquals( basename( $textFile, ".txt" ), key( $detect ) ); } + + public function multiCatData() + { + return array( + array('this is english text français bisschen', + array('sco', 'en', 'fr', 'de' ), + array('fr', 'de', 'sco', 'en' ), ), + array('الاسم العلمي: Felis catu', + array('ar', 'la', 'fa', 'fr' ), + array('ar', 'fr', 'la', 'fa' ), ), + array('Кошка, или домашняя кошка A macska más néven házi macska', + array('ru', 'uk', 'hu', 'fi' ), + array('hu', 'ru', 'uk', 'fi' ), ), + array('Il gatto domestico Kucing disebut juga kucing domestik', + array('id', 'it', 'pt', 'es' ), + array('it', 'id', 'es', 'pt' ), ), + array('Domaća mačka Pisică de casă Hejma kato', + array('hr', 'ro', 'eo', 'cs' ), + array('hr', 'cs', 'ro', 'eo' ), ), + ); + } + + /** + * @dataProvider multiCatData + * @param string $testLine + * @param array $res1 + * @param array $res2 + */ + public function testMultiCat( $testLine, $res1, $res2 ) + { + $this->assertEquals( array_keys( $this->multicat1->classify( $testLine, $res1 ) ), + array_values( $res1 ) ); + $this->assertEquals( array_keys( $this->multicat2->classify( $testLine, $res2 ) ), + array_values( $res2 ) ); + } } -- To view, visit https://gerrit.wikimedia.org/r/320852 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I1cd40a58e335e1af1cc73effba45251f18ff8699 Gerrit-PatchSet: 9 Gerrit-Project: wikimedia/textcat Gerrit-Branch: master Gerrit-Owner: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits