DCausse has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/323550

Change subject: [WIP] Extract feature scores
......................................................................

[WIP] Extract feature scores

Still WIP: I can't get Elasticsearch to return the scores
for the individual rescore QI factors.
Individual query features seem to be properly extracted.

Change-Id: I31dda1bb9b9c914352e7e015a4e7766bfee25bc8
---
M autoload.php
A includes/ML/FeatureCollector.php
A includes/ML/MLSearcher.php
M includes/Query/FullTextSimpleMatchQueryBuilder.php
M includes/Search/RescoreBuilders.php
M includes/Search/SearchContext.php
M maintenance/runSearch.php
7 files changed, 403 insertions(+), 21 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/50/323550/1

diff --git a/autoload.php b/autoload.php
index 4f493e9..bc79369 100644
--- a/autoload.php
+++ b/autoload.php
@@ -66,6 +66,8 @@
        'CirrusSearch\\LanguageDetector\\ElasticSearch' => __DIR__ . 
'/includes/LanguageDetector/ElasticSearch.php',
        'CirrusSearch\\LanguageDetector\\HttpAccept' => __DIR__ . 
'/includes/LanguageDetector/HttpAccept.php',
        'CirrusSearch\\LanguageDetector\\TextCat' => __DIR__ . 
'/includes/LanguageDetector/TextCat.php',
+       'CirrusSearch\\ML\\FeatureCollector' => __DIR__ . 
'/includes/ML/FeatureCollector.php',
+       'CirrusSearch\\ML\\MLSearcher' => __DIR__ . 
'/includes/ML/MLSearcher.php',
        'CirrusSearch\\Maintenance\\AnalysisConfigBuilder' => __DIR__ . 
'/includes/Maintenance/AnalysisConfigBuilder.php',
        'CirrusSearch\\Maintenance\\ChunkBuilder' => __DIR__ . 
'/includes/Maintenance/ChunkBuilder.php',
        'CirrusSearch\\Maintenance\\ConfigUtils' => __DIR__ . 
'/includes/Maintenance/ConfigUtils.php',
diff --git a/includes/ML/FeatureCollector.php b/includes/ML/FeatureCollector.php
new file mode 100644
index 0000000..58a0ad1
--- /dev/null
+++ b/includes/ML/FeatureCollector.php
@@ -0,0 +1,86 @@
+<?php
+
+namespace CirrusSearch\ML;
+
+use Elastica\Query\AbstractQuery;
+
+/**
+ * Accumulates the building blocks of a search request (one filter, the main
+ * query clauses and the rescore definitions) so that each of them can later
+ * be looked at individually as a named "feature".
+ *
+ * NOTE(review): "QI"/"QD" presumably stand for query-independent and
+ * query-dependent rescores -- confirm.
+ */
+class FeatureCollector {
+       /** @var AbstractQuery|null $filter last filter given to setFilter(), null until then */
+       private $filter;
+
+       /** @var AbstractQuery[] $queries indexed by feature name */
+       private $queries = [];
+
+       /**
+        * @var array[] $qirescores function definitions indexed by feature name,
+        *  each one an array with the keys 'function_type', 'function_params',
+        *  'filter' and 'weight'
+        */
+       private $qirescores = [];
+
+       /** @var AbstractQuery[] $qdrescores indexed by feature name */
+       private $qdrescores = [];
+
+       /**
+        * Replace the collected filter.
+        *
+        * @param AbstractQuery $filter
+        */
+       public function setFilter( AbstractQuery $filter ) {
+               $this->filter = $filter;
+       }
+
+       /**
+        * Register a query clause; a previous entry with the same name is overwritten.
+        *
+        * @param string $feature
+        * @param AbstractQuery $query
+        */
+       public function addQuery( $feature, AbstractQuery $query ) {
+               $this->queries[$feature] = $query;
+       }
+
+       /**
+        * Register one rescore function definition. The feature name is generated
+        * as "$index-$functionType-$field", where $field is
+        * $functionParams['field'] when $functionParams is an array holding that
+        * key and 'UNK' in every other case.
+        *
+        * @param int|string $index first component of the generated feature name
+        * @param string $functionType stored under 'function_type'
+        * @param array|float $functionParams stored under 'function_params'
+        * @param mixed $filter stored as-is under 'filter'
+        * @param mixed $weight stored as-is under 'weight'
+        */
+       public function addQIRescore( $index, $functionType, $functionParams, 
$filter = null, $weight = null ) {
+               if ( is_array( $functionParams ) ) {
+                       $field = isset( $functionParams['field'] ) ? 
$functionParams['field'] : 'UNK';
+               } else {
+                       $field = 'UNK';
+               }
+               $this->qirescores["$index-$functionType-$field"] = [
+                       'function_type' => $functionType,
+                       'function_params' => $functionParams,
+                       'filter' => $filter,
+                       'weight' => $weight,
+               ];
+       }
+
+       /**
+        * Register a rescore query; a previous entry with the same name is overwritten.
+        *
+        * @param string $feature
+        * @param AbstractQuery $query
+        */
+       public function addQDRescore( $feature, AbstractQuery $query ) {
+               $this->qdrescores[$feature] = $query;
+       }
+
+       /**
+        * @return AbstractQuery|null null when setFilter() was never called
+        */
+       public function getFilter() {
+               return $this->filter;
+       }
+
+       /**
+        * @return AbstractQuery[] indexed by feature's name
+        */
+       public function getQueries() {
+               return $this->queries;
+       }
+
+       /**
+        * @return AbstractQuery[] indexed by feature's name
+        */
+       public function getQDRescores() {
+               return $this->qdrescores;
+       }
+
+       /**
+        * @return array[] indexed by feature's name, see addQIRescore() for the
+        *  keys of each entry
+        */
+       public function getQIRescores() {
+               return $this->qirescores;
+       }
+}
diff --git a/includes/ML/MLSearcher.php b/includes/ML/MLSearcher.php
new file mode 100644
index 0000000..1d69a4c
--- /dev/null
+++ b/includes/ML/MLSearcher.php
@@ -0,0 +1,193 @@
+<?php
+
+namespace CirrusSearch\ML;
+
+use CirrusSearch\Connection;
+use CirrusSearch\Util;
+use CirrusSearch\FullTextQueryBuilder;
+use CirrusSearch\FullTextSimpleMatchQueryBuilder;
+use CirrusSearch\SearchConfig;
+use CirrusSearch\Searcher;
+use CirrusSearch\Search\Escaper;
+use CirrusSearch\Search\Filters;
+use CirrusSearch\Search\IdResultsType;
+use CirrusSearch\Search\ResultSet;
+use CirrusSearch\Search\SearchContext;
+use CirrusSearch\Search\RescoreBuilder;
+
+use Elastica\Multi\Search as MultiSearch;
+use Elastica\Search;
+use Elastica\Query;
+use Elastica\QueryBuilder;
+use Elastica\Query\AbstractQuery;
+use Elastica\Query\FunctionScore;
+
+/**
+ * Runs a full text search, then re-queries the returned documents once per
+ * collected query/rescore feature in order to obtain the score every feature
+ * gives to every document.
+ */
+class MLSearcher {
+       /** @var SearchConfig */
+       private $config;
+
+       /** @var Connection */
+       private $connection;
+
+       /** @var string */
+       private $indexBaseName;
+
+       /**
+        * @param Connection $conn
+        * @param SearchConfig $config
+        * @param string|null $index base name of the index; when falsy the value
+        *  of SearchConfig::INDEX_BASE_NAME read from $config is used instead
+        */
+       public function __construct( Connection $conn, SearchConfig $config, $index ) {
+               $this->connection = $conn;
+               $this->config = $config;
+               $this->indexBaseName = $index ?: $config->get( SearchConfig::INDEX_BASE_NAME );
+       }
+
+       /**
+        * Search for $query and compute the per-feature scores of the top hits.
+        *
+        * @param string $query
+        * @param array $namespaces
+        * @param int $topN number of results requested from the initial search
+        * @return array[] indexed by document id, then by feature name; empty
+        *  when the initial search failed or when no filter was collected
+        * @throws \RuntimeException when the configured full text query builder
+        *  is not FullTextSimpleMatchQueryBuilder
+        */
+       public function extractFeatures( $query, array $namespaces, $topN ) {
+               $collector = new FeatureCollector();
+               $query = Util::stripQuestionMarks( $query, $this->config->get( 'CirrusSearchStripQuestionMarks' ) );
+               $builderProfile = $this->config->get( 'CirrusSearchFullTextQueryBuilderProfile' );
+               $builderSettings = $this->config->getElement( 'CirrusSearchFullTextQueryBuilderProfiles', $builderProfile );
+               // The builder is instantiated below with a FeatureCollector as extra
+               // constructor argument, so every other builder has to be rejected.
+               // NOTE(review): the fully qualified name assumes the builder is declared
+               // in the CirrusSearch\Query namespace (its file lives in includes/Query/)
+               // -- confirm.
+               $builderClass = ltrim( $builderSettings['builder_class'], '\\' );
+               if ( $builderClass !== \CirrusSearch\Query\FullTextSimpleMatchQueryBuilder::class ) {
+                       throw new \RuntimeException( "Only FullTextSimpleMatchQueryBuilder is supported" );
+               }
+
+               $searcher = new Searcher( $this->connection, 0, $topN, $this->config, $namespaces, null, $this->indexBaseName );
+               $searcher->setResultsType( new IdResultsType() );
+               $status = $searcher->searchText( $query, false );
+               if ( is_array( $status ) ) {
+                       $ids = $status;
+               } elseif ( $status->isOK() ) {
+                       $ids = $status->getValue();
+               } else {
+                       return [];
+               }
+
+               $escaper = new Escaper( $this->config->get( 'LanguageCode' ), $this->config->get( 'CirrusSearchAllowLeadingWildcard' ) );
+               // TODO: refactor how we build query features so we don't have to rebuild all of them by hand
+               $qb = new $builderSettings['builder_class'](
+                       $this->config,
+                       $escaper,
+                       [
+                               // Handle morelike keyword (greedy). This needs to be the
+                               // very first item until combining with other queries
+                               // is worked out.
+                               // The second argument must be a plain callable: the
+                               // searcher's get() method.
+                               new \CirrusSearch\Query\MoreLikeFeature( $this->config, [ $searcher, 'get' ] ),
+                               // Handle title prefix notation (greedy)
+                               new \CirrusSearch\Query\PrefixFeature(),
+                               // Handle prefer-recent keyword
+                               new \CirrusSearch\Query\PreferRecentFeature( $this->config ),
+                               // Handle local keyword
+                               new \CirrusSearch\Query\LocalFeature(),
+                               // Handle insource keyword using regex
+                               new \CirrusSearch\Query\RegexInSourceFeature( $this->config ),
+                               // Handle neartitle, nearcoord keywords, and their boosted alternates
+                               new \CirrusSearch\Query\GeoFeature(),
+                               // Handle boost-templates keyword
+                               new \CirrusSearch\Query\BoostTemplatesFeature(),
+                               // Handle hastemplate keyword
+                               new \CirrusSearch\Query\HasTemplateFeature(),
+                               // Handle linksto keyword
+                               new \CirrusSearch\Query\LinksToFeature(),
+                               // Handle incategory keyword
+                               new \CirrusSearch\Query\InCategoryFeature( $this->config ),
+                               // Handle non-regex insource keyword
+                               new \CirrusSearch\Query\SimpleInSourceFeature( $escaper ),
+                               // Handle intitle keyword
+                               new \CirrusSearch\Query\InTitleFeature( $escaper ),
+                               // inlanguage keyword
+                               new \CirrusSearch\Query\LanguageFeature(),
+                               // File types
+                               new \CirrusSearch\Query\FileTypeFeature(),
+                               // File numeric characteristics - size, resolution, etc.
+                               new \CirrusSearch\Query\FileNumericFeature(),
+                       ],
+                       $builderSettings['settings'],
+                       $collector
+               );
+
+               $searchContext = new SearchContext( $this->config, $namespaces );
+               $qb->build( $searchContext, $query, false );
+
+               if ( $collector->getFilter() === null ) {
+                       // We certainly switched to QueryString and
+                       // we can't really extract query string feature.
+                       return [];
+               }
+               $rescoreBuilder = new RescoreBuilder( $searchContext );
+               $rescoreBuilder->build( $collector );
+               return $this->extractScores( $collector, $searchContext, $namespaces, $ids );
+       }
+
+       /**
+        * Send one search per collected feature, restricted to $ids, and gather
+        * the score each of them returns for each document.
+        *
+        * @param FeatureCollector $collector
+        * @param SearchContext $context provides the syntax filters to re-apply
+        * @param array $namespaces
+        * @param array $ids ids of the documents to score
+        * @return array[] indexed by document id, then by feature name; every
+        *  entry holds 'score' (0 when the feature search did not return the
+        *  document) and 'base_boost'
+        */
+       private function extractScores( FeatureCollector $collector, SearchContext $context, $namespaces, array $ids ) {
+               $client = $this->connection->getClient();
+               $msearch = new MultiSearch( $client );
+               $indexType = $this->connection->pickIndexTypeForNamespaces( $namespaces );
+               $pageType = $this->connection->getPageType( $this->indexBaseName, $indexType );
+               $qb = new QueryBuilder();
+               $filter = $qb->query()->bool()
+                       ->addFilter( $qb->query()->ids( 'page', $ids ) );
+
+               $syntaxFilters = Filters::unify( $context->getFilters(), $context->getNotFilters() );
+               if ( $syntaxFilters !== null ) {
+                       $filter->addFilter( $syntaxFilters );
+               }
+
+               $baseline = [];
+               $addQuery = function( $name, AbstractQuery $query, $baseBoost )
+                               use ( $msearch, $filter, $pageType, $ids, $qb, &$baseline )
+               {
+                       // The original boost is reported separately as base_boost, so
+                       // neutralise it here.
+                       if ( $query->hasParam( 'boost' ) ) {
+                               $query->setParam( 'boost', 1 );
+                       }
+                       $q = new Query( $qb->query()->bool()
+                               ->addMust( $query )
+                               ->addFilter( $filter ) );
+                       $q->setSize( count( $ids ) );
+                       $q->setSource( false );
+                       $baseline[$name] = [
+                               'score' => 0,
+                               'base_boost' => $baseBoost,
+                       ];
+                       $msearch->addSearch( $pageType->createSearch( $q ), $name );
+               };
+
+               // The boost is read before $addQuery() runs because the closure
+               // overwrites it.
+               foreach ( $collector->getQueries() as $name => $query ) {
+                       $addQuery( $name, $query, $this->baseBoost( $query ) );
+               }
+
+               foreach ( $collector->getQDRescores() as $name => $query ) {
+                       $addQuery( $name, $query, $this->baseBoost( $query ) );
+               }
+
+               foreach ( $collector->getQIRescores() as $name => $data ) {
+                       $score = new FunctionScore();
+                       $score->setBoostMode( 'sum' );
+                       $score->setScoreMode( 'sum' );
+                       // Re-create the function with its recorded type; the weight is
+                       // forced to 1 and the original one is reported as base_boost.
+                       $score->addFunction( $data['function_type'], $data['function_params'], $data['filter'], 1 );
+                       $q = new Query( $filter );
+                       $q->setSize( count( $ids ) );
+                       $q->setSource( false );
+                       // query_weight 0 + score_mode total: only the rescore query
+                       // contributes to the final score.
+                       $q->setParam( 'rescore',  [
+                               'window_size' => count( $ids ) + 100,
+                               'query' => [
+                                       'rescore_query' => $score->toArray(),
+                                       'query_weight' => 0,
+                                       'rescore_query_weight' => 1,
+                                       'score_mode' => 'total',
+                               ],
+                       ]);
+                       $baseline[$name] = [
+                               'score' => 0,
+                               'base_boost' => $data['weight'],
+                       ];
+                       $msearch->addSearch( $pageType->createSearch( $q ), $name );
+               }
+               // Every document starts with a zero score for every feature.
+               $docMap = array_fill_keys( $ids, $baseline );
+               $res = $msearch->search();
+               foreach ( $res->getResultSets() as $name => $rs ) {
+                       foreach ( $rs as $r ) {
+                               $docMap[$r->getId()][$name]['score'] = $r->getScore();
+                       }
+               }
+               return $docMap;
+       }
+
+       /**
+        * Boost currently set on $query, or 1 when the query has no 'boost' param.
+        * The hasParam() guard matters because reading a missing param is an error.
+        *
+        * @param AbstractQuery $query
+        * @return mixed
+        */
+       private function baseBoost( AbstractQuery $query ) {
+               return $query->hasParam( 'boost' ) ? $query->getParam( 'boost' ) : 1;
+       }
+}
diff --git a/includes/Query/FullTextSimpleMatchQueryBuilder.php 
b/includes/Query/FullTextSimpleMatchQueryBuilder.php
index eadca32..e2f0e1d 100644
--- a/includes/Query/FullTextSimpleMatchQueryBuilder.php
+++ b/includes/Query/FullTextSimpleMatchQueryBuilder.php
@@ -5,6 +5,12 @@
 use CirrusSearch\Search\Escaper;
 use CirrusSearch\Search\SearchContext;
 use CirrusSearch\SearchConfig;
+use Elastica\Query\AbstractQuery;
+use Elastica\Query\BoolQuery;
+use Elastica\Query\MultiMatch;
+use Elastica\Query\Match;
+use Elastica\Query\DisMax;
+use Elastica\Query\QueryString;
 
 /**
  * Simple Match query builder, currently based on
@@ -50,7 +56,12 @@
         */
        private $dismaxSettings;
 
-       public function __construct( SearchConfig $config, Escaper $escaper, 
array $feature, array $settings ) {
+       /**
+        * @var FeatureCollector|null $collector
+        */
+       private $collector;
+
+       public function __construct( SearchConfig $config, Escaper $escaper, 
array $feature, array $settings, \CirrusSearch\ML\FeatureCollector $collector = 
null ) {
                parent::__construct( $config, $escaper, $feature );
                $this->fields = $settings['fields'];
                $this->phraseFields = $settings['phrase_rescore_fields'];
@@ -58,6 +69,7 @@
                $this->defaultQueryType = $settings['default_query_type'];
                $this->defaultMinShouldMatch = 
$settings['default_min_should_match'];
                $this->dismaxSettings = isset( $settings['dismax_settings'] ) ? 
$settings['dismax_settings'] : [];
+               $this->collector = $collector;
        }
 
        /**
@@ -70,7 +82,7 @@
         * @param string[] $nearMatchFields
         * @param string $queryString
         * @param string $nearMatchQuery
-        * @return \Elastica\Query\AbstractQuery
+        * @return AbstractQuery
         */
        protected function buildSearchTextQuery( SearchContext $context, array 
$fields, array $nearMatchFields, $queryString, $nearMatchQuery ) {
                if ( $context->isSyntaxUsed( 'query_string' ) || 
$this->requireAutoGeneratePhrase( $queryString ) ) {
@@ -85,12 +97,13 @@
 
                // Build one query for the full text fields and one for the 
near match fields so that
                // the near match can run unescaped.
-               $bool = new \Elastica\Query\BoolQuery();
+               $bool = new BoolQuery();
                $bool->setMinimumNumberShouldMatch( 1 );
                $bool->addShould( $queryForMostFields );
-               $nearMatch = new \Elastica\Query\MultiMatch();
+               $nearMatch = new MultiMatch();
                $nearMatch->setFields( $nearMatchFields );
                $nearMatch->setQuery( $nearMatchQuery );
+               $this->collectQuery( 'near_match', $nearMatch );
                $bool->addShould( $nearMatch );
 
                return $bool;
@@ -123,11 +136,11 @@
         * @param string[] $fields
         * @param string $queryText
         * @param int $slop
-        * @return \Elastica\Query\AbstractQuery
+        * @return AbstractQuery
         */
        protected function buildHighlightQuery( SearchContext $context, array 
$fields, $queryText, $slop ) {
                $query = parent::buildHighlightQuery( $context, $fields, 
$queryText, $slop );
-               if ( $this->usedExpQuery && $query instanceof 
\Elastica\Query\QueryString ) {
+               if ( $this->usedExpQuery && $query instanceof QueryString ) {
                        // the exp query accepts more docs (stopwords in query 
are not required)
                        /** @suppress PhanUndeclaredMethod $query is a 
QueryString */
                        $query->setDefaultOperator( 'OR' );
@@ -141,11 +154,11 @@
         * @param string[] $fields
         * @param string $queryText
         * @param int $slop
-        * @return \Elastica\Query\AbstractQuery
+        * @return AbstractQuery
         */
        protected function buildPhraseRescoreQuery( SearchContext $context, 
array $fields, $queryText, $slop ) {
                if ( $this->usedExpQuery ) {
-                       $phrase = new \Elastica\Query\MultiMatch();
+                       $phrase = new MultiMatch();
                        $phrase->setParam( 'type', 'phrase' );
                        $phrase->setParam( 'slop', $slop );
                        $fields = [];
@@ -154,6 +167,7 @@
                        }
                        $phrase->setFields( $fields );
                        $phrase->setQuery( $queryText );
+                       $this->collectQDRescore( 'phrase_rescore', $phrase );
                        return $phrase;
                } else {
                        return parent::buildPhraseRescoreQuery( $context, 
$fields, $queryText, $slop );
@@ -177,29 +191,30 @@
        /**
         * Generate an elasticsearch query by reading profile settings
         * @param string $queryString the query text
-        * @return \Elastica\Query\AbstractQuery
+        * @return AbstractQuery
         */
        private function buildExpQuery( $queryString ) {
-               $query = new \Elastica\Query\BoolQuery();
+               $query = new BoolQuery();
 
-               $all_filter = new \Elastica\Query\BoolQuery();
+               $all_filter = new BoolQuery();
                // FIXME: We can't use solely the stem field here
                // - Depending on langauges it may lack stopwords,
                // - Diacritics are sometimes (english) strangely (T141216)
                // A dedicated field used for filtering would be nice
-               $match = new \Elastica\Query\Match();
+               $match = new Match();
                $match->setField( 'all', [ "query" => $queryString ] );
                $match->setFieldOperator( 'all', 'AND' );
                $all_filter->addShould( $match );
-               $match = new \Elastica\Query\Match();
+               $match = new Match();
                $match->setField( 'all.plain', [ "query" => $queryString ] );
                $match->setFieldOperator( 'all.plain', 'AND' );
                $all_filter->addShould( $match );
                $query->addFilter( $all_filter );
+               $this->collectFilter( $all_filter );
                $dismaxQueries = [];
 
                foreach( $this->fields as $f => $settings ) {
-                       $mmatch = new \Elastica\Query\MultiMatch();
+                       $mmatch = new MultiMatch();
                        $mmatch->setQuery( $queryString );
                        $queryType = $this->defaultQueryType;
                        $minShouldMatch = $this->defaultMinShouldMatch;
@@ -235,11 +250,12 @@
                        if ( $in_dismax ) {
                                $dismaxQueries[$in_dismax][] = $mmatch;
                        } else {
+                               $this->collectQuery( $f, $mmatch );
                                $query->addShould( $mmatch );
                        }
                }
                foreach ( $dismaxQueries as $name => $queries ) {
-                       $dismax = new \Elastica\Query\DisMax();
+                       $dismax = new DisMax();
                        if ( isset ( $this->dismaxSettings[$name] ) ) {
                                $settings = $this->dismaxSettings[$name];
                                if ( isset ( $settings['tie_breaker'] ) ) {
@@ -252,10 +268,38 @@
                        foreach( $queries as $q ) {
                                $dismax->addQuery( $q );
                        }
+                       $this->collectQuery( $name, $dismax );
                        $query->addShould( $dismax );
                }
                // Removed in future lucene version 
https://issues.apache.org/jira/browse/LUCENE-7347
                $query->setParam( 'disable_coord', true );
                return $query;
        }
+
+       /**
+        * Forward the filter to the feature collector; does nothing when no
+        * collector was given to the constructor.
+        *
+        * @param AbstractQuery $filter
+        */
+       private function collectFilter( AbstractQuery $filter ) {
+               if ( $this->collector ) {
+                       $this->collector->setFilter( $filter );
+               }
+       }
+
+       /**
+        * Register a query clause with the feature collector; does nothing when
+        * no collector was given to the constructor.
+        *
+        * @param string $feature name the clause is stored under
+        * @param AbstractQuery $query
+        */
+       private function collectQuery( $feature, AbstractQuery $query ) {
+               if ( $this->collector ) {
+                       $this->collector->addQuery( $feature, $query );
+               }
+       }
+
+       /**
+        * Register a rescore query with the feature collector; does nothing when
+        * no collector was given to the constructor.
+        *
+        * @param string $feature name the rescore query is stored under
+        * @param AbstractQuery $query
+        */
+       private function collectQDRescore( $feature, AbstractQuery $query ) {
+               if ( $this->collector ) {
+                       $this->collector->addQDRescore( $feature, $query );
+               }
+       }
 }
diff --git a/includes/Search/RescoreBuilders.php 
b/includes/Search/RescoreBuilders.php
index caf2d50..a270d88 100644
--- a/includes/Search/RescoreBuilders.php
+++ b/includes/Search/RescoreBuilders.php
@@ -4,6 +4,7 @@
 
 use CirrusSearch\Query\GeoFeature;
 use CirrusSearch\Util;
+use CirrusSearch\ML\FeatureCollector;
 use Elastica\Query\FunctionScore;
 use Elastica\Query\AbstractQuery;
 use MWNamespace;
@@ -71,9 +72,10 @@
        }
 
        /**
+        * @param FeatureCollector $collector
         * @return array of rescore queries
         */
-       public function build() {
+       public function build( FeatureCollector $collector = null) {
                $rescores = [];
                foreach( $this->profile['rescore'] as $rescoreDef ) {
                        $windowSize = $this->windowSize( $rescoreDef );
@@ -82,7 +84,7 @@
                        ];
 
                        $rescore['query'] = array_intersect_key( $rescoreDef, 
array_flip( self::$rescoreMainParams ) );
-                       $rescoreQuery = $this->buildRescoreQuery( $rescoreDef );
+                       $rescoreQuery = $this->buildRescoreQuery( $rescoreDef, 
$collector );
                        if ( $rescoreQuery === null ) {
                                continue;
                        }
@@ -96,12 +98,13 @@
         * builds the 'query' attribute by reading type
         *
         * @param array $rescoreDef
+        * @param FeatureCollector $collector
         * @return FunctionScore|null the rescore query
         */
-       private function buildRescoreQuery( array $rescoreDef ) {
+       private function buildRescoreQuery( array $rescoreDef, FeatureCollector 
$collector = null ) {
                switch( $rescoreDef['type'] ) {
                case self::FUNCTION_SCORE_TYPE:
-                       $funcChain = new FunctionScoreChain( $this->context, 
$rescoreDef['function_chain'] );
+                       $funcChain = new FunctionScoreChain( $this->context, 
$rescoreDef['function_chain'], $collector );
                        return $funcChain->buildRescoreQuery();
                default: throw new InvalidRescoreProfileException( "Unsupported 
rescore query type: " . $rescoreDef['type'] );
                }
@@ -224,10 +227,10 @@
         * @param string $chainName the name of the chain (must be a valid
         *  chain in wgCirrusSearchRescoreFunctionScoreChains)
         */
-       public function __construct( SearchContext $context, $chainName ) {
+       public function __construct( SearchContext $context, $chainName, 
FeatureCollector $collector = null ) {
                $this->chainName = $chainName;
                $this->context = $context;
-               $this->functionScore = new FunctionScoreDecorator();
+               $this->functionScore = new FunctionScoreDecorator( $collector );
                $this->chain = $context->getConfig()->getElement( 
'CirrusSearchRescoreFunctionScoreChains', $chainName );
                if ( $this->chain === null ) {
                        throw new InvalidRescoreProfileException( "Unknown 
rescore function chain $chainName" );
@@ -306,6 +309,13 @@
        /** @var int */
        private $size = 0;
 
+       /** @var FeatureCollector|null */
+       private $featuresCollector;
+
+       /**
+        * @param FeatureCollector|null $collector optional collector that is
+        *  notified of every function passed to addFunction()
+        */
+       public function __construct( FeatureCollector $collector = null ) {
+               $this->featuresCollector = $collector;
+       }
+
        /**
         * @param string $functionType
         * @param array|float $functionParams
@@ -315,6 +325,11 @@
         */
        public function addFunction( $functionType, $functionParams, $filter = 
null, $weight = null ) {
                $this->size++;
+
+               // Mirror the function into the collector (when there is one), using
+               // the just-incremented counter as its index: the first function
+               // added is reported with index 1.
+               if ( $this->featuresCollector ) {
+                       $this->featuresCollector->addQIRescore( $this->size, 
$functionType, $functionParams, $filter, $weight );
+               }
+
                return parent::addFunction( $functionType, $functionParams, 
$filter, $weight );
        }
 
diff --git a/includes/Search/SearchContext.php 
b/includes/Search/SearchContext.php
index 90912c7..10f890b 100644
--- a/includes/Search/SearchContext.php
+++ b/includes/Search/SearchContext.php
@@ -395,6 +395,13 @@
        }
 
        /**
+        * Expose the content of $this->filters.
+        *
+        * @return AbstractQuery[]
+        */
+       public function getFilters() {
+               return $this->filters;
+       }
+       /**
         * @param AbstractQuery $filter Query results must not match this filter
         */
        public function addNotFilter( AbstractQuery $filter ) {
@@ -402,6 +409,13 @@
        }
 
        /**
+        * Expose the content of $this->notFilters.
+        * NOTE(review): presumably the filters registered through addNotFilter(),
+        * i.e. the ones query results must not match -- confirm.
+        *
+        * @return AbstractQuery[]
+        */
+       public function getNotFilters() {
+               return $this->notFilters;
+       }
+       /**
         * @param bool $isFuzzy is this a fuzzy query?
         */
        public function setFuzzyQuery( $isFuzzy ) {
diff --git a/maintenance/runSearch.php b/maintenance/runSearch.php
index ee40621..249f3e7 100644
--- a/maintenance/runSearch.php
+++ b/maintenance/runSearch.php
@@ -3,8 +3,10 @@
 namespace CirrusSearch\Maintenance;
 
 use CirrusSearch;
+use CirrusSearch\Connection;
 use CirrusSearch\SearchConfig;
 use CirrusSearch\Search\ResultSet;
+use MediaWiki\MediaWikiServices;
 use RequestContext;
 use SearchSuggestionSet;
 use Status;
@@ -57,6 +59,7 @@
                $this->addOption( 'decode', 'urldecode() queries before running 
them', false, false );
                $this->addOption( 'explain', 'Include lucene explanation in the 
results', false, false );
                $this->addOption( 'limit', 'Set the max number of results 
returned by query (defaults to 10)', false, true );
+               $this->addOption( 'features', 'Extract feature scores (works 
only with full_text type', false, false );
        }
 
        public function execute() {
@@ -101,6 +104,10 @@
                        $query = urldecode( $query );
                }
                $data = [ 'query' => $query ];
+               if ( $this->getOption( 'features' ) ) {
+                       $data['results'] = $this->extractFeatures( $query );
+                       return json_encode( $data );
+               }
                $status = $this->searchFor( $query );
                if ( $status->isOK() ) {
                        $value = $status->getValue();
@@ -196,6 +203,27 @@
                        exit( 1 );
                }
        }
+
+       /**
+        * Extract the feature scores of the top results of $query with MLSearcher.
+        * Only NS_MAIN is searched; the 'limit' option (10 when not set) is passed
+        * as the number of top results.
+        *
+        * @param string $query
+        * @return array[] list of entries with the keys 'doc_id' and 'features',
+        *  one per document returned by MLSearcher::extractFeatures()
+        */
+       protected function extractFeatures( $query ) {
+               $config = MediaWikiServices::getInstance()
+                               ->getConfigFactory()
+                               ->makeConfig( 'CirrusSearch' );
+               $indexBaseName = $config->get( SearchConfig::INDEX_BASE_NAME );
+               $connection = new Connection( $config );
+               // FIXME: proper support for namespaces
+               $searcher = new \CirrusSearch\ML\MLSearcher( $connection, 
$config, $indexBaseName );
+               $limit = $this->getOption( 'limit', 10 );
+               $docsAndFeatures = $searcher->extractFeatures( $query, 
[NS_MAIN], $limit );
+               // Flatten the id => features map into a list so that the document
+               // id is carried as a value in each entry.
+               $results = [];
+               foreach( $docsAndFeatures as $id => $features ) {
+                       $results[] = [
+                               'doc_id' => $id,
+                               'features' => $features
+                       ];
+               }
+               return $results;
+       }
 }
 
 $maintClass = RunSearch::class;

-- 
To view, visit https://gerrit.wikimedia.org/r/323550
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I31dda1bb9b9c914352e7e015a4e7766bfee25bc8
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <dcau...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to