jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/323998 )
Change subject: Allow specification of minimum length for classification ...................................................................... Allow specification of minimum length for classification Also, normalize capitalization in help text. Bug: T149318 Change-Id: Id19cedccf9a0025fac230e67997564ceae7a34c0 --- M TextCat.php M catus.php M tests/TextCatTest.php 3 files changed, 72 insertions(+), 9 deletions(-) Approvals: Smalyshev: Looks good to me, approved jenkins-bot: Verified diff --git a/TextCat.php b/TextCat.php index 55e1ade..1ede010 100644 --- a/TextCat.php +++ b/TextCat.php @@ -33,6 +33,13 @@ private $langFiles = array(); /** + * Minimum Input Length to be considered for + * classification + * @var int + */ + private $minInputLength = 0; + + /** * @param int $maxNgrams */ public function setMaxNgrams( $maxNgrams ) { @@ -44,6 +51,13 @@ */ public function setMinFreq( $minFreq ) { $this->minFreq = $minFreq; + } + + /** + * @param int $minInputLength + */ + public function setMinInputLength( $minInputLength ) { + $this->minInputLength = $minInputLength; } /** @@ -155,12 +169,19 @@ * Sorted by ascending score, with first result being the best. */ public function classify( $text, $candidates = null ) { + $results = array(); + + // strip non-word characters before checking for min length, don't assess empty strings + $wordLength = mb_strlen( preg_replace( "/[{$this->wordSeparator}]+/", "", $text ) ); + if ( $wordLength < $this->minInputLength || $wordLength == 0 ) { + return $results; + } + $inputgrams = array_keys( $this->createLM( $text, $this->maxNgrams ) ); if ( $candidates ) { // flip for more efficient lookups $candidates = array_flip( $candidates ); } - $results = array(); foreach ( $this->langFiles as $language => $langFile ) { if ( $candidates && !isset( $candidates[$language] ) ) { continue; diff --git a/catus.php b/catus.php index b42fa7c..7be34ef 100644 --- a/catus.php +++ b/catus.php @@ -4,35 +4,37 @@ */ require_once __DIR__.'/TextCat.php'; -$options = getopt( 'a:c:d:f:t:u:l:h' ); +$options = getopt( 'a:c:d:f:j:l:t:u:h' ); if ( isset( $options['h'] ) ) { $help = <<<HELP -{$argv[0]} [-d Dir] [-c Lang] [-a Int] [-f Int] [-l Text] [-t Int] [-u Float] +{$argv[0]} [-d Dir] [-c Lang] [-a Int] [-f Int] [-j Int] [-l Text] [-t Int] [-u Float] - -a NUM the program returns the best-scoring language together + -a NUM The program returns the best-scoring language together with all languages which are <N times worse (set by option -u). If the number of languages to be printed is larger than the value of this option then no language is returned, but instead a message that the input is of an unknown language is printed. Default: 10. -c LANG,LANG,... - lists the candidate languages. Only languages listed will be + Lists the candidate languages. Only languages listed will be considered for detection. -d DIR,DIR,... - indicates in which directory the language models are + Indicates in which directory the language models are located (files ending in .lm). Multiple directories can be separated by a comma, and will be used in order. Default: ./LM . -f NUM Before sorting is performed the Ngrams which occur this number of times or less are removed. This can be used to speed up the program for longer inputs. For short inputs you should use the default or -f 0. Default: 0. - -l TEXT indicates that input is given as an argument on the command line, + -j NUM Only attempt classification if the input string is at least this + many characters. Default: 0. + -l TEXT Indicates that input is given as an argument on the command line, e.g. {$argv[0]} -l "this is english text" If this option is not given, the input is stdin. - -t NUM indicates the topmost number of ngrams that should be used. + -t NUM Indicates the topmost number of ngrams that should be used. Default: 3000 - -u NUM determines how much worse result must be in order not to be + -u NUM Determines how much worse result must be in order not to be mentioned as an alternative. Typical value: 1.05 or 1.1. Default: 1.05. @@ -55,6 +57,9 @@ if ( !empty( $options['f'] ) ) { $cat->setMinFreq( intval( $options['f'] ) ); } +if ( isset( $options['j'] ) ) { + $cat->setMinInputLength( intval( $options['j'] ) ); +} $input = isset( $options['l'] ) ? $options['l'] : file_get_contents( "php://stdin" ); if ( !empty( $options['c'] ) ) { diff --git a/tests/TextCatTest.php b/tests/TextCatTest.php index 348a673..c6ce0e7 100644 --- a/tests/TextCatTest.php +++ b/tests/TextCatTest.php @@ -137,4 +137,41 @@ $this->assertEquals( array_keys( $this->multicat2->classify( $testLine, $res2 ) ), array_values( $res2 ) ); } + + public function minInputLengthData() + { + return array( + array( 'eso es español', + array( 'spanish', 'catalan', 'portuguese' ), null, ), + array( 'this is english', + array( 'english', 'german' ), null, ), + array( 'c\'est français', + array( 'french', 'portuguese', 'romanian', 'catalan' ), null, ), + // numbers and spaces get stripped, so result should be an empty array + // regardless of min input length + array( '56 8 49564 83 9', + array( 'french', 'portuguese', 'romanian', 'catalan' ), array(), ), + ); + } + + /** + * @dataProvider minInputLengthData + * @param string $testLine + * @param array $lang + * @param array $res + */ + public function testMinInputLength( $testLine, $lang, $res ) + { + if ( !isset( $res ) ) { + $res = $lang; + } + # should get results when min input len is 0 + $minLength = $this->testcat->setMinInputLength(0); + $this->assertEquals( array_keys( $this->testcat->classify( $testLine, $res ) ), + array_values( $res ) ); + # should get no results when min input len is more than the length of the string + $minLength = $this->testcat->setMinInputLength(mb_strlen($testLine) + 1); + $this->assertEquals( array_keys( $this->testcat->classify( $testLine, $res ) ), + array() ); + } } -- To view, visit https://gerrit.wikimedia.org/r/323998 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Id19cedccf9a0025fac230e67997564ceae7a34c0 Gerrit-PatchSet: 4 Gerrit-Project: wikimedia/textcat Gerrit-Branch: master Gerrit-Owner: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits