EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/326139 )

Change subject: [WIP] Expose language classification of text via api
......................................................................

[WIP] Expose language classification of text via api

Change-Id: I0be27828bcdb49da913e55aea62779ec29eb1d18
---
M CirrusSearch.php
M autoload.php
A includes/Api/TextCat.php
M includes/LanguageDetector/TextCat.php
4 files changed, 99 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/39/326139/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 2782a77..30b8509 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -952,7 +952,13 @@
 $wgCirrusSearchLanguageDetectors = [];
 
 /**
- * Directory where TextCat detector should look for language model
+ * Map of <name, directory> where TextCat detector should look for language model
+ */
+$wgCirrusSearchTextcatModels = [];
+
+/**
+ * Default model name in $wgCirrusSearchTextcatModels to use. For BC purposes
+ * this can also be an exact directory.
  */
 $wgCirrusSearchTextcatModel = false;
 
@@ -1136,6 +1142,7 @@
 $wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump';
 $wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump';
 $wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump';
+$wgAPIModules['cirrus-textcat'] = 'CirrusSearch\Api\TextCat';
 
 /**
  * Configs
diff --git a/autoload.php b/autoload.php
index 33c806c..c14c1c5 100644
--- a/autoload.php
+++ b/autoload.php
@@ -11,6 +11,7 @@
        'CirrusSearch\\Api\\MappingDump' => __DIR__ . 
'/includes/Api/MappingDump.php',
        'CirrusSearch\\Api\\SettingsDump' => __DIR__ . 
'/includes/Api/SettingsDump.php',
        'CirrusSearch\\Api\\SuggestIndex' => __DIR__ . 
'/includes/Api/SuggestIndex.php',
+       'CirrusSearch\\Api\\TextCat' => __DIR__ . '/includes/Api/TextCat.php',
        'CirrusSearch\\BaseInterwikiResolver' => __DIR__ . 
'/includes/BaseInterwikiResolver.php',
        'CirrusSearch\\BaseRequestLog' => __DIR__ . 
'/includes/BaseRequestLog.php',
        'CirrusSearch\\BuildDocument\\Builder' => __DIR__ . 
'/includes/BuildDocument/Builder.php',
diff --git a/includes/Api/TextCat.php b/includes/Api/TextCat.php
new file mode 100644
index 0000000..73c33f7
--- /dev/null
+++ b/includes/Api/TextCat.php
@@ -0,0 +1,89 @@
+<?php
+
+namespace CirrusSearch\Api;
+
+use ApiBase;
+use ApiMain;
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Exposes textcat language classification via API.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class TextCat extends ApiBase {
+       public function __construct( ApiMain $mainModule, $moduleName, 
$modulePrefix = '' ) {
+               parent::__construct( $mainModule, $moduleName, $modulePrefix );
+               $this->config = \MediaWiki\MediaWikiServices::getInstance()
+                       ->getConfigFactory()
+                       ->makeConfig( 'CirrusSearch' );
+       }
+
+       public function execute() {
+               $dir = $this->config->getElement(
+                       'CirrusSearchTextcatModels',
+                       $this->getParameter( 'model' )
+               );
+
+               if ( !is_dir( $dir ) ) {
+                       throw new \ApiUsageException( '...' );
+               }
+
+               $textcat  = new \TextCat( $dir );
+               $languages = $textcat->classify(
+                       $this->getParameter( 'text' ),
+                       $this->getParameter( 'languages' )
+               );
+
+               // TODO: Report relative costs instead of absolute?
+               $this->getResult()->addValue( null, $this->getModuleName(), 
$languages );
+       }
+
+       public function getAllowedParams() {
+               return [
+                       'model' => [
+                               ApiBase::PARAM_TYPE => array_keys( 
$this->config->getElement(
+                                       'CirrusSearchTextcatModel'
+                               ) ),
+                               ApiBase::PARAM_DFLT => 'article',
+                       ],
+                       'text' => [
+                               ApiBase::PARAM_TYPE => 'text',
+                               ApiBase::PARAM_REQUIRED => true,
+                       ],
+                       'languages' => [
+                               ApiBase::PARAM_TYPE => 'string',
+                               ApiBase::PARAM_ISMULTI => true,
+                               ApiBase::PARAM_ALLOW_DUPLICATES => false,
+                       ]
+               ];
+       }
+
+       /**
+        * @deprecated since MediaWiki core 1.25
+        */
+       public function getDescription() {
+               return 'Classify text to a particular language';
+       }
+
+       /**
+        * @see ApiBase::getExamplesMessages
+        */
+       protected function getExamplesMessages() {
+               return [
+               ];
+       }
+}
diff --git a/includes/LanguageDetector/TextCat.php 
b/includes/LanguageDetector/TextCat.php
index 94ef1c5..006b669 100644
--- a/includes/LanguageDetector/TextCat.php
+++ b/includes/LanguageDetector/TextCat.php
@@ -22,7 +22,7 @@
                        // Should not happen
                        return null;
                }
-               $dir = $config->getElement('CirrusSearchTextcatModel');
+               $dir = $config->getElement('CirrusSearchTextcatModel', 'query');
                if( !$dir ) {
                        return null;
                }

-- 
To view, visit https://gerrit.wikimedia.org/r/326139
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0be27828bcdb49da913e55aea62779ec29eb1d18
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to