jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/323998 )

Change subject: Allow specification of minimum length for classification
......................................................................


Allow specification of minimum length for classification

Also, normalize capitalization in help text.

Bug: T149318
Change-Id: Id19cedccf9a0025fac230e67997564ceae7a34c0
---
M TextCat.php
M catus.php
M tests/TextCatTest.php
3 files changed, 72 insertions(+), 9 deletions(-)

Approvals:
  Smalyshev: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/TextCat.php b/TextCat.php
index 55e1ade..1ede010 100644
--- a/TextCat.php
+++ b/TextCat.php
@@ -33,6 +33,13 @@
        private $langFiles = array();
 
        /**
+        * Minimum Input Length to be considered for
+        * classification
+        * @var int
+        */
+       private $minInputLength = 0;
+
+       /**
         * @param int $maxNgrams
         */
        public function setMaxNgrams( $maxNgrams ) {
@@ -44,6 +51,13 @@
         */
        public function setMinFreq( $minFreq ) {
                $this->minFreq = $minFreq;
+       }
+
+       /**
+        * @param int $minInputLength
+        */
+       public function setMinInputLength( $minInputLength ) {
+               $this->minInputLength = $minInputLength;
        }
 
        /**
@@ -155,12 +169,19 @@
         *                               Sorted by ascending score, with first 
result being the best.
         */
        public function classify( $text, $candidates = null ) {
+               $results = array();
+
+               // strip non-word characters before checking for min length, 
don't assess empty strings
+               $wordLength = mb_strlen( preg_replace( 
"/[{$this->wordSeparator}]+/", "", $text ) );
+               if ( $wordLength < $this->minInputLength || $wordLength == 0 ) {
+                       return $results;
+               }
+
                $inputgrams = array_keys( $this->createLM( $text, 
$this->maxNgrams ) );
                if ( $candidates ) {
                        // flip for more efficient lookups
                        $candidates = array_flip( $candidates );
                }
-               $results = array();
                foreach ( $this->langFiles as $language => $langFile ) {
                        if ( $candidates && !isset( $candidates[$language] ) ) {
                                continue;
diff --git a/catus.php b/catus.php
index b42fa7c..7be34ef 100644
--- a/catus.php
+++ b/catus.php
@@ -4,35 +4,37 @@
  */
 require_once __DIR__.'/TextCat.php';
 
-$options = getopt( 'a:c:d:f:t:u:l:h' );
+$options = getopt( 'a:c:d:f:j:l:t:u:h' );
 
 if ( isset( $options['h'] ) ) {
        $help = <<<HELP
-{$argv[0]} [-d Dir] [-c Lang] [-a Int] [-f Int] [-l Text] [-t Int] [-u Float]
+{$argv[0]} [-d Dir] [-c Lang] [-a Int] [-f Int] [-j Int] [-l Text] [-t Int] 
[-u Float]
 
-    -a NUM  the program returns the best-scoring language together
+    -a NUM  The program returns the best-scoring language together
             with all languages which are <N times worse (set by option -u).
             If the number of languages to be printed is larger than the value
             of this option then no language is returned, but
             instead a message that the input is of an unknown language is
             printed. Default: 10.
     -c LANG,LANG,...
-            lists the candidate languages. Only languages listed will be
+            Lists the candidate languages. Only languages listed will be
             considered for detection.
     -d DIR,DIR,...
-            indicates in which directory the language models are
+            Indicates in which directory the language models are
             located (files ending in .lm). Multiple directories can be
             separated by a comma, and will be used in order.  Default: ./LM .
     -f NUM  Before sorting is performed the Ngrams which occur this number
             of times or less are removed. This can be used to speed up
             the program for longer inputs. For short inputs you should use
             the default or -f 0. Default: 0.
-    -l TEXT indicates that input is given as an argument on the command line,
+    -j NUM  Only attempt classification if the input string is at least this
+            many characters. Default: 0.
+    -l TEXT Indicates that input is given as an argument on the command line,
             e.g. {$argv[0]} -l "this is english text"
             If this option is not given, the input is stdin.
-    -t NUM  indicates the topmost number of ngrams that should be used.
+    -t NUM  Indicates the topmost number of ngrams that should be used.
             Default: 3000
-    -u NUM  determines how much worse result must be in order not to be
+    -u NUM  Determines how much worse result must be in order not to be
             mentioned as an alternative. Typical value: 1.05 or 1.1.
             Default: 1.05.
 
@@ -55,6 +57,9 @@
 if ( !empty( $options['f'] ) ) {
        $cat->setMinFreq( intval( $options['f'] ) );
 }
+if ( isset( $options['j'] ) ) {
+       $cat->setMinInputLength( intval( $options['j'] ) );
+}
 
 $input = isset( $options['l'] ) ? $options['l'] : file_get_contents( 
"php://stdin" );
 if ( !empty( $options['c'] ) ) {
diff --git a/tests/TextCatTest.php b/tests/TextCatTest.php
index 348a673..c6ce0e7 100644
--- a/tests/TextCatTest.php
+++ b/tests/TextCatTest.php
@@ -137,4 +137,41 @@
         $this->assertEquals( array_keys( $this->multicat2->classify( 
$testLine, $res2 ) ),
                                                         array_values( $res2 ) 
);
     }
+
+    public function minInputLengthData()
+    {
+        return array(
+          array( 'eso es español',
+                               array( 'spanish', 'catalan', 'portuguese' ), 
null, ),
+          array( 'this is english',
+                               array( 'english', 'german' ), null, ),
+          array( 'c\'est français',
+                               array( 'french', 'portuguese', 'romanian', 
'catalan' ), null, ),
+          // numbers and spaces get stripped, so result should be an empty 
array
+          // regardless of min input length
+          array( '56 8 49564     83 9',
+                               array( 'french', 'portuguese', 'romanian', 
'catalan' ), array(), ),
+        );
+    }
+
+    /**
+     * @dataProvider minInputLengthData
+        * @param string $testLine
+        * @param array $lang
+        * @param array $res
+     */
+    public function testMinInputLength( $testLine, $lang, $res )
+    {
+               if ( !isset( $res ) ) {
+                       $res = $lang;
+               }
+               # should get results when min input len is 0
+               $minLength = $this->testcat->setMinInputLength(0);
+               $this->assertEquals( array_keys( $this->testcat->classify( 
$testLine, $res ) ),
+                                                        array_values( $res ) );
+        # should get no results when min input len is more than the length of 
the string
+        $minLength = $this->testcat->setMinInputLength(mb_strlen($testLine) + 
1);
+        $this->assertEquals( array_keys( $this->testcat->classify( $testLine, 
$res ) ),
+                             array() );
+    }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/323998
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id19cedccf9a0025fac230e67997564ceae7a34c0
Gerrit-PatchSet: 4
Gerrit-Project: wikimedia/textcat
Gerrit-Branch: master
Gerrit-Owner: Tjones <tjo...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: Tjones <tjo...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to