EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/326139 )
Change subject: [WIP] Expose language classification of text via api ...................................................................... [WIP] Expose language classification of text via api Change-Id: I0be27828bcdb49da913e55aea62779ec29eb1d18 --- M CirrusSearch.php M autoload.php A includes/Api/TextCat.php M includes/LanguageDetector/TextCat.php 4 files changed, 99 insertions(+), 2 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/39/326139/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 2782a77..30b8509 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -952,7 +952,13 @@ $wgCirrusSearchLanguageDetectors = []; /** - * Directory where TextCat detector should look for language model + * Map of <name, directory> where TextCat detector should look for language model + */ +$wgCirrusSearchTextcatModels = []; + +/** + * Default model name in $wgCirrusSearchTextcatModels to use. For BC purposes + * this can also be an exact directory. */ $wgCirrusSearchTextcatModel = false; @@ -1136,6 +1142,7 @@ $wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump'; $wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump'; $wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump'; +$wgAPIModules['cirrus-textcat'] = 'CirrusSearch\Api\TextCat'; /** * Configs diff --git a/autoload.php b/autoload.php index 33c806c..c14c1c5 100644 --- a/autoload.php +++ b/autoload.php @@ -11,6 +11,7 @@ 'CirrusSearch\\Api\\MappingDump' => __DIR__ . '/includes/Api/MappingDump.php', 'CirrusSearch\\Api\\SettingsDump' => __DIR__ . '/includes/Api/SettingsDump.php', 'CirrusSearch\\Api\\SuggestIndex' => __DIR__ . '/includes/Api/SuggestIndex.php', + 'CirrusSearch\\Api\\TextCat' => __DIR__ . '/includes/Api/TextCat.php', 'CirrusSearch\\BaseInterwikiResolver' => __DIR__ . '/includes/BaseInterwikiResolver.php', 'CirrusSearch\\BaseRequestLog' => __DIR__ . 
'/includes/BaseRequestLog.php', 'CirrusSearch\\BuildDocument\\Builder' => __DIR__ . '/includes/BuildDocument/Builder.php', diff --git a/includes/Api/TextCat.php b/includes/Api/TextCat.php new file mode 100644 index 0000000..73c33f7 --- /dev/null +++ b/includes/Api/TextCat.php @@ -0,0 +1,89 @@ +<?php + +namespace CirrusSearch\Api; + +use ApiBase; +use ApiMain; +use MediaWiki\MediaWikiServices; + +/** + * Exposes textcat language classification via API. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ +class TextCat extends ApiBase { + public function __construct( ApiMain $mainModule, $moduleName, $modulePrefix = '' ) { + parent::__construct( $mainModule, $moduleName, $modulePrefix ); + $this->config = \MediaWiki\MediaWikiServices::getInstance() + ->getConfigFactory() + ->makeConfig( 'CirrusSearch' ); + } + + public function execute() { + $dir = $this->config->getElement( + 'CirrusSearchTextcatModels', + $this->getParameter( 'model' ) + ); + + if ( !is_dir( $dir ) ) { + throw new \ApiUsageException( '...' ); + } + + $textcat = new \TextCat( $dir ); + $languages = $textcat->classify( + $this->getParameter( 'text' ), + $this->getParameter( 'languages' ) + ); + + // TODO: Report relative costs instead of absolute? 
+ $this->getResult()->addValue( null, $this->getModuleName(), $languages ); + } + + public function getAllowedParams() { + return [ + 'model' => [ + ApiBase::PARAM_TYPE => array_keys( $this->config->getElement( + 'CirrusSearchTextcatModel' + ) ), + ApiBase::PARAM_DFLT => 'article', + ], + 'text' => [ + ApiBase::PARAM_TYPE => 'text', + ApiBase::PARAM_REQUIRED => true, + ], + 'languages' => [ + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_ISMULTI => true, + ApiBase::PARAM_ALLOW_DUPLICATES => false, + ] + ]; + } + + /** + * @deprecated since MediaWiki core 1.25 + */ + public function getDescription() { + return 'Classify text to a particular language'; + } + + /** + * @see ApiBase::getExamplesMessages + */ + protected function getExamplesMessages() { + return [ + ]; + } +} diff --git a/includes/LanguageDetector/TextCat.php b/includes/LanguageDetector/TextCat.php index 94ef1c5..006b669 100644 --- a/includes/LanguageDetector/TextCat.php +++ b/includes/LanguageDetector/TextCat.php @@ -22,7 +22,7 @@ // Should not happen return null; } - $dir = $config->getElement('CirrusSearchTextcatModel'); + $dir = $config->getElement('CirrusSearchTextcatModel', 'query'); if( !$dir ) { return null; } -- To view, visit https://gerrit.wikimedia.org/r/326139 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I0be27828bcdb49da913e55aea62779ec29eb1d18 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits