Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/160811
Change subject: Adds a cirrus-analyze api ...................................................................... Adds a cirrus-analyze api The analyze api wraps http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/indices-analyze.html and lets users fiddle a bit with the chosen analyzer. It might be useful for explanation purposes but the analyze api isn't powerful enough for any real experimentation, unfortunately. Change-Id: I461c50581417bd4bb125dc3940d08d2c425f3151 --- M CirrusSearch.php A includes/Action/AbstractFormlessAction.php A includes/Action/Analyze.php R includes/Action/Dump.php A includes/Api/Analyze.php M includes/Maintenance/AnalysisConfigBuilder.php M includes/Searcher.php 7 files changed, 250 insertions(+), 27 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/11/160811/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 08a63fb..86c98e7 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -500,6 +500,7 @@ $includes = __DIR__ . "/includes/"; +$actionDir = $includes . 'Action/'; $apiDir = $includes . 'Api/'; $buildDocument = $includes . 'BuildDocument/'; $jobsDir = $includes . 'Job/'; @@ -511,6 +512,10 @@ * Classes */ $wgAutoloadClasses['CirrusSearch'] = $includes . 'CirrusSearch.php'; +$wgAutoloadClasses['CirrusSearch\Action\AbstractFormlessAction'] = $actionDir . 'AbstractFormlessAction.php'; +$wgAutoloadClasses['CirrusSearch\Action\Analyze'] = $actionDir . 'Analyze.php'; +$wgAutoloadClasses['CirrusSearch\Action\Dump'] = $actionDir . 'Dump.php'; +$wgAutoloadClasses['CirrusSearch\Api\Analyze'] = $apiDir . 'Analyze.php'; $wgAutoloadClasses['CirrusSearch\Api\ConfigDump'] = $apiDir . 'ConfigDump.php'; $wgAutoloadClasses['CirrusSearch\Api\MappingDump'] = $apiDir . 'MappingDump.php'; $wgAutoloadClasses['CirrusSearch\Api\SettingsDump'] = $apiDir . 'SettingsDump.php'; @@ -521,7 +526,6 @@ $wgAutoloadClasses['CirrusSearch\BuildDocument\ParseBuilder'] = $buildDocument . 
'Builder.php'; $wgAutoloadClasses['CirrusSearch\BuildDocument\RedirectsAndIncomingLinks'] = $buildDocument . 'RedirectsAndIncomingLinks.php'; $wgAutoloadClasses['CirrusSearch\Connection'] = $includes . 'Connection.php'; -$wgAutoloadClasses['CirrusSearch\Dump'] = $includes . 'Dump.php'; $wgAutoloadClasses['CirrusSearch\ElasticsearchIntermediary'] = $includes . 'ElasticsearchIntermediary.php'; $wgAutoloadClasses['CirrusSearch\ForceSearchIndex'] = __DIR__ . '/maintenance/forceSearchIndex.php'; $wgAutoloadClasses['CirrusSearch\Hooks'] = $includes . 'Hooks.php'; @@ -601,11 +605,13 @@ /** * Actions */ -$wgActions[ 'cirrusdump' ] = 'CirrusSearch\Dump'; +$wgActions[ 'cirrusdump' ] = 'CirrusSearch\Action\Dump'; +$wgActions[ 'cirrusanalyze' ] = 'CirrusSearch\Action\Analyze'; /** * API */ +$wgAPIModules['cirrus-analyze'] = 'CirrusSearch\Api\Analyze'; $wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump'; $wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump'; $wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump'; diff --git a/includes/Action/AbstractFormlessAction.php b/includes/Action/AbstractFormlessAction.php new file mode 100644 index 0000000..4926aa5 --- /dev/null +++ b/includes/Action/AbstractFormlessAction.php @@ -0,0 +1,48 @@ +<?php + +namespace CirrusSearch\Action; + +use \CirrusSearch\Searcher; +use \FormlessAction; + +/** + * Base class for Cirrus formless actions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ +abstract class AbstractFormlessAction extends FormlessAction { + public abstract function result(); + + public function onView() { + // Disable regular results + $this->getOutput()->disable(); + + $response = $this->getRequest()->response(); + $response->header( 'Content-type: application/json; charset=UTF-8' ); + + echo json_encode( $this->result() ); + + return null; + } + + public function requiresWrite() { + return false; + } + + public function requiresUnblock() { + return false; + } +} diff --git a/includes/Action/Analyze.php b/includes/Action/Analyze.php new file mode 100644 index 0000000..5ed42a8 --- /dev/null +++ b/includes/Action/Analyze.php @@ -0,0 +1,55 @@ +<?php + +namespace CirrusSearch\Action; + +use \CirrusSearch\Searcher; +use \FormlessAction; + +/** + * action=cirrusanalyze handler. Analyzes the page text into tokens. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + */ +class Analyze extends AbstractFormlessAction { + public function result() { + $searcher = new Searcher( 0, 0, false, $this->getUser() ); + $id = $this->getTitle()->getArticleID(); + $esSources = $searcher->get( array( $id ), true ); + if ( !$esSources->isOk() ) { + // Exception has been logged + return array(); + } + $esSources = $esSources->getValue(); + if ( !isset( $esSources[ 0 ] ) ) { + return array(); + } + $source = $esSources[ 0 ]; + if ( !isset( $source->text ) ) { + return array(); + } + $text = $source->text; + $analyzed = $searcher->analyze( $text, array( 'analyzer' => 'text' ) ); + if ( !$analyzed->isOk() ) { + // Exception has been logged + return array(); + } + return $analyzed->getValue(); + } + + public function getName() { + return 'cirrusanalyze'; + } +} diff --git a/includes/Dump.php b/includes/Action/Dump.php similarity index 73% rename from includes/Dump.php rename to includes/Action/Dump.php index 58d5b65..16df6d2 100644 --- a/includes/Dump.php +++ b/includes/Action/Dump.php @@ -1,11 +1,12 @@ <?php -namespace CirrusSearch; +namespace CirrusSearch\Action; +use \CirrusSearch\Searcher; use \FormlessAction; /** - * action=cirrusDump handler. Dumps contents of Elasticsearch indexes for the + * action=cirrusdump handler. Dumps contents of Elasticsearch indexes for the * page. * * This program is free software; you can redistribute it and/or modify @@ -23,21 +24,14 @@ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
* http://www.gnu.org/copyleft/gpl.html */ -class Dump extends FormlessAction { - public function onView() { - // Disable regular results - $this->getOutput()->disable(); - - $response = $this->getRequest()->response(); - $response->header( 'Content-type: application/json; charset=UTF-8' ); - +class Dump extends AbstractFormlessAction { + public function result() { $searcher = new Searcher( 0, 0, false, $this->getUser() ); $id = $this->getTitle()->getArticleID(); $esSources = $searcher->get( array( $id ), true ); if ( !$esSources->isOk() ) { // Exception has been logged - echo '{}'; - return null; + return array(); } $esSources = $esSources->getValue(); @@ -51,20 +45,10 @@ '_source' => $esSource->getData(), ); } - echo json_encode( $result ); - - return null; + return $result; } public function getName() { return 'cirrusdump'; - } - - public function requiresWrite() { - return false; - } - - public function requiresUnblock() { - return false; } } diff --git a/includes/Api/Analyze.php b/includes/Api/Analyze.php new file mode 100644 index 0000000..d806fb1 --- /dev/null +++ b/includes/Api/Analyze.php @@ -0,0 +1,97 @@ +<?php + + +namespace CirrusSearch\Api; +use \ApiBase; +use \CirrusSearch\Searcher; + +/** + * Analyzes a string using some parameters. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ +class Analyze extends ApiBase { + public function execute() { + $params = $this->extractRequestParams(); + $searcher = new Searcher( 0, 0, false, $this->getUser() ); + $args = array(); + if ( isset( $params[ 'analyzer' ] ) ) { + $args[ 'analyzer' ] = $params[ 'analyzer' ]; + } else { + $args[ 'tokenizer' ] = $params[ 'tokenizer' ]; + if ( isset( $params[ 'tokenfilters' ] ) ) { + $args[ 'token_filters' ] = implode( ',', $params[ 'tokenfilters' ] ); + } + if ( isset( $params[ 'charfilters' ] ) ) { + $args[ 'char_filters' ] = implode( ',', $params[ 'charfilters' ] ); + } + } + + $analyzed = $searcher->analyze( $params[ 'text' ], $args ); + if ( !$analyzed->isOk() ) { + // Exception has been logged + $this->dieStatus( $analyzed ); + } + $result = array(); + foreach ( $analyzed->getValue() as $value ) { + $result[] = $value; + } + $this->getResult()->setIndexedTagName( $result, 'tokens' ); + $this->getResult()->addValue( null, 'tokens', $result ); + } + + public function getAllowedParams() { + return array( + 'text' => array( + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_REQUIRED => true + ), + 'analyzer' => array( + ApiBase::PARAM_TYPE => array( + 'text', 'plain', 'near_match', + 'suggest', 'prefix', 'word_prefix', + 'lowercase_keyword' + ), + ), + 'tokenizer' => array( + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_DFLT => 'standard', + ), + 'tokenfilters' => array( + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_ISMULTI => true, + ), + 'charfilters' => array( + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_ISMULTI => true, + ), + ); + } + + public function getParamDescription() { + return array( + 'text' => 'Text to analyze', + 'analyzer' => 'Named analyzer (overrides tokenizer, 
tokenfilters, charfilters if set)', + 'tokenizer' => 'Tokenizer doing analysis', + 'tokenfilters' => 'Filters applied to tokens', + 'charfilters' => 'Filters applied to text before tokenizer', + ); + } + + public function getDescription() { + return 'Analyze a string using Elasticsearch.'; + } +} diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index 1d17c4f..a36fd37 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -162,7 +162,7 @@ ), ), 'char_filter' => array( - // Flattens things that are space like to spaces in the near_match style analyzersc + // Flattens things that are space like to spaces in the near_match style analyzers 'near_space_flattener' => array( 'type' => 'mapping', 'mappings' => array( diff --git a/includes/Searcher.php b/includes/Searcher.php index bd78012..fdc636b 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -781,7 +781,7 @@ 'doWork' => function() use ( $searcher, $pageIds, $sourceFiltering, $indexType, $indexBaseName ) { try { global $wgCirrusSearchClientSideSearchTimeout; - $searcher->start( "get of $indexType." . implode( ', ', $pageIds ) ); + $searcher->start( "get of " . implode( ', ', $pageIds ) ); // Shard timeout not supported on get requests so we just use the client side timeout Connection::setTimeout( $wgCirrusSearchClientSideSearchTimeout[ 'default' ] ); $pageType = Connection::getPageType( $indexBaseName, $indexType ); @@ -806,6 +806,39 @@ return $getWork->execute(); } + /** + * Send $text to elasticsearch for analysis with the analysis arguments from $args. 
+ * @param string $text text to analyze + * @param array $args analysis arguments + * @return Status status wrapping the result of the analysis + */ + public function analyze( $text, $args ) { + global $wgCirrusSearchPoolCounterKey; + + $searcher = $this; + $getWork = new PoolCounterWorkViaCallback( 'CirrusSearch-Analyze', $wgCirrusSearchPoolCounterKey, array( + 'doWork' => function() use ( $searcher, $text, $args ) { + global $wgCirrusSearchClientSideSearchTimeout; + + try { + $searcher->start( "analyzing" ); + // Shard timeout not supported on get requests so we just use the client side timeout + Connection::setTimeout( $wgCirrusSearchClientSideSearchTimeout[ 'default' ] ); + $index = Connection::getIndex( wfWikiId(), 'general' ); + return $searcher->success( $index->analyze( $text, $args ) ); + } catch ( \Elastica\Exception\ExceptionInterface $e ) { + return $searcher->failure( $e ); + } + }, + 'error' => function( $status ) { + $status = $status->getErrorsArray(); + wfLogWarning( 'Pool error performing an analyze against Elasticsearch: ' . $status[ 0 ][ 0 ] ); + return Status::newFatal( 'cirrussearch-backend-error' ); + } + ) ); + return $getWork->execute(); + } + private function extractSpecialSyntaxFromTerm( $regex, $callback ) { $suggestPrefixes = $this->suggestPrefixes; $this->term = preg_replace_callback( $regex, -- To view, visit https://gerrit.wikimedia.org/r/160811 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I461c50581417bd4bb125dc3940d08d2c425f3151 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
