jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/345585 )

Change subject: Add support for token_count_router
......................................................................


Add support for token_count_router

Move phrase detection into the plugin so that we use lucene analyzers.
Mostly needed for spaceless languages.

Bug: T152094
Change-Id: I7182531609af5b04c52b63388cece7d1981e236f
---
M CirrusSearch.php
M autoload.php
M docs/settings.txt
A includes/Elastica/MatchNone.php
A includes/Extra/Query/TokenCountRouter.php
M includes/Query/FullTextQueryStringQueryBuilder.php
M includes/Query/FullTextSimpleMatchQueryBuilder.php
M tests/jenkins/FullyFeaturedConfig.php
A tests/unit/Elastica/MatchNoneTest.php
9 files changed, 247 insertions(+), 12 deletions(-)

Approvals:
  Smalyshev: Looks good to me, but someone else must approve
  Tjones: Looks good to me, but someone else must approve
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 4bd5013..898f7e1 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -158,6 +158,11 @@
 // This allows forking on reindexing and is compatible with wikimedia-extra
 // versions 1.3.1, 1.4.2, 1.5.0, and greater:
 // $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
+//
+// Allows using lucene tokenizers to activate the phrase rescore. This avoids
+// relying on the presence of spaces (which obviously does not work for
+// spaceless languages). Available since version 5.1.2
+// $wgCirrusSearchWikimediaExtraPlugin['token_count_router'] = true;
 $wgCirrusSearchWikimediaExtraPlugin = [];
 
 // Should CirrusSearch try to support regular expressions with insource:?
diff --git a/autoload.php b/autoload.php
index 93276b4..e6b67f0 100644
--- a/autoload.php
+++ b/autoload.php
@@ -37,6 +37,7 @@
        'CirrusSearch\\DataSender' => __DIR__ . '/includes/DataSender.php',
        'CirrusSearch\\Dump' => __DIR__ . '/includes/Dump.php',
        'CirrusSearch\\ElasticaErrorHandler' => __DIR__ . 
'/includes/ElasticaErrorHandler.php',
+       'CirrusSearch\\Elastica\\MatchNone' => __DIR__ . 
'/includes/Elastica/MatchNone.php',
        'CirrusSearch\\Elastica\\MultiSearch' => __DIR__ . 
'/includes/Elastica/MultiSearch.php',
        'CirrusSearch\\Elastica\\PooledHttp' => __DIR__ . 
'/includes/Elastica/PooledHttp.php',
        'CirrusSearch\\Elastica\\PooledHttps' => __DIR__ . 
'/includes/Elastica/PooledHttps.php',
@@ -49,6 +50,7 @@
        'CirrusSearch\\ExplainPrinter' => __DIR__ . 
'/includes/ExplainPrinter.php',
        'CirrusSearch\\Extra\\Query\\IdHashMod' => __DIR__ . 
'/includes/Extra/Query/IdHashMod.php',
        'CirrusSearch\\Extra\\Query\\SourceRegex' => __DIR__ . 
'/includes/Extra/Query/SourceRegex.php',
+       'CirrusSearch\\Extra\\Query\\TokenCountRouter' => __DIR__ . 
'/includes/Extra/Query/TokenCountRouter.php',
        'CirrusSearch\\ForceSearchIndex' => __DIR__ . 
'/maintenance/forceSearchIndex.php',
        'CirrusSearch\\FullTextQueryBuilderProfiles' => __DIR__ . 
'/profiles/FullTextQueryBuilderProfiles.php',
        'CirrusSearch\\HTMLCompletionProfileSettings' => __DIR__ . 
'/includes/HTMLCompletionProfileSettings.php',
diff --git a/docs/settings.txt b/docs/settings.txt
index ef63cd7..ad9637e 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -181,6 +181,12 @@
 
     $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
 
+Allows using lucene tokenizers to activate the phrase rescore.
+This avoids relying on the presence of spaces (which obviously does not
+work for spaceless languages). Available since version 5.1.2
+
+    $wgCirrusSearchWikimediaExtraPlugin['token_count_router'] = true;
+
 ; $wgCirrusSearchEnableRegex
 
 Default:
diff --git a/includes/Elastica/MatchNone.php b/includes/Elastica/MatchNone.php
new file mode 100644
index 0000000..f0c7b6b
--- /dev/null
+++ b/includes/Elastica/MatchNone.php
@@ -0,0 +1,18 @@
+<?php
+
+namespace CirrusSearch\Elastica;
+
+/**
+ * Backport of https://github.com/ruflin/Elastica/pull/1276
+ *
+ * @link 
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-all-query.html
+ */
+class MatchNone extends \Elastica\Query\AbstractQuery {
+    /**
+     * Creates match none query.
+     */
+    public function __construct() {
+        /** @suppress PhanTypeMismatchProperty (done like that in Elastica) */
+        $this->_params = new \stdClass();
+    }
+}
diff --git a/includes/Extra/Query/TokenCountRouter.php 
b/includes/Extra/Query/TokenCountRouter.php
new file mode 100644
index 0000000..9f56156
--- /dev/null
+++ b/includes/Extra/Query/TokenCountRouter.php
@@ -0,0 +1,135 @@
+<?php
+
+namespace CirrusSearch\Extra\Query;
+
+use Elastica\Query\AbstractQuery;
+
+/**
+ * TokenCountRouter query used to trigger a particular query by counting
+ * the number of tokens in the user query.
+ *
+ * @link 
https://github.com/wikimedia/search-extra/blob/master/docs/token_count_router.md
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+class TokenCountRouter extends AbstractQuery {
+       /**
+        * @const string greater than
+        */
+       const GT = 'gt';
+
+       /**
+        * @const string greater or equal
+        */
+       const GTE = 'gte';
+
+       /**
+        * @const string equal
+        */
+       const EQ = 'eq';
+
+       /**
+        * @const string not equal
+        */
+       const NEQ = 'neq';
+
+       /**
+        * @const string less than or equal
+        */
+       const LTE = 'lte';
+
+       /**
+        * @const string less than
+        */
+       const LT = 'lt';
+
+       /**
+        * @param string $text the text to analyze
+        * @param AbstractQuery $fallbackQuery the query to run when no
+        * conditions match
+        * @param string|null $field use the analyzer of this field
+        * @param string|null $analyzer use this analyzer
+        */
+       public function __construct( $text, AbstractQuery $fallbackQuery, 
$field = null, $analyzer = null ) {
+               $this->setText( $text );
+               $this->setFallback( $fallbackQuery );
+               if ( $field ) {
+                       $this->setField( $field );
+               }
+               if ( $analyzer ) {
+                       $this->setAnalyzer( $analyzer );
+               }
+       }
+
+       /**
+        * @param string $text count tokens from this text
+        * @return self
+        */
+       public function setText( $text ) {
+               return $this->setParam( 'text', $text );
+       }
+
+       /**
+        * @param AbstractQuery $query
+        * @return self
+        */
+       public function setFallback( AbstractQuery $query ) {
+               return $this->setParam( 'fallback', $query );
+       }
+
+       /**
+        * @param string $field the field to fetch analyzer info
+        * @return self
+        */
+       public function setField( $field ) {
+               return $this->setParam( 'field', $field );
+       }
+
+       /**
+        * @param string $analyzer the analyzer to use
+        * @return self
+        */
+       public function setAnalyzer( $analyzer ) {
+               return $this->setParam( 'analyzer', $analyzer );
+       }
+
+       /**
+        * Adds a new condition
+        * The first condition that evaluates to true is applied.
+        * If none match the fallback query is applied.
+        *
+        * @param string $type the condition to apply
+        * @param int $value the value to compare
+        * @param AbstractQuery $query the query to run if the condition is
+        * true ignoring all remaining conditions
+        */
+       public function addCondition( $type, $value, AbstractQuery $query ) {
+               switch( $type ) {
+               case self::GT:
+               case self::GTE:
+               case self::EQ:
+               case self::NEQ:
+               case self::LT:
+               case self::LTE: break;
+               default: throw new \InvalidArgumentException( "$type is not 
allowed as a condition type" );
+               }
+               return $this->addParam( 'conditions', [
+                       $type => $value,
+                       'query' => $query,
+               ] );
+       }
+}
diff --git a/includes/Query/FullTextQueryStringQueryBuilder.php 
b/includes/Query/FullTextQueryStringQueryBuilder.php
index a1f0f5e..1922c3d 100644
--- a/includes/Query/FullTextQueryStringQueryBuilder.php
+++ b/includes/Query/FullTextQueryStringQueryBuilder.php
@@ -6,6 +6,7 @@
 use CirrusSearch\SearchConfig;
 use CirrusSearch\Searcher;
 use CirrusSearch\Search\SearchContext;
+use CirrusSearch\Extra\Query\TokenCountRouter;
 use MediaWiki\Logger\LoggerFactory;
 
 /**
@@ -29,6 +30,11 @@
        private $queryStringQueryString = '';
 
        /**
+        * @var bool
+        */
+       private $useTokenCountRouter;
+
+       /**
         * @param SearchConfig $config
         * @param KeywordFeature[] $features
         * @param array[] $settings currently ignored
@@ -36,6 +42,7 @@
        public function __construct( SearchConfig $config, array $features, 
array $settings = [] ) {
                $this->config = $config;
                $this->features = $features;
+               $this->useTokenCountRouter = $this->config->getElement( 
'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true;
        }
 
        /**
@@ -203,16 +210,7 @@
                        $nonAllFields = $fields;
                }
 
-               // Only do a phrase match rescore if the query doesn't include 
any quotes and has a space.
-               // Queries without spaces are either single term or have a 
phrase query generated.
-               // Queries with the quote already contain a phrase query and we 
can't build phrase queries
-               // out of phrase queries at this point.
-               if ( $this->config->get( 'CirrusSearchPhraseRescoreBoost' ) > 
0.0 &&
-                               $this->config->get( 
'CirrusSearchPhraseRescoreWindowSize' ) &&
-                               !$searchContext->isSpecialKeywordUsed() &&
-                               strpos( $this->queryStringQueryString, '"' ) 
=== false &&
-                               strpos( $this->queryStringQueryString, ' ' ) 
!== false ) {
-
+               if ( $this->isPhraseRescoreNeeded( $searchContext ) ) {
                        $rescoreFields = $fields;
                        if ( !$this->config->get( 
'CirrusSearchAllFieldsForRescore' ) ) {
                                $rescoreFields = $nonAllFields;
@@ -620,6 +618,56 @@
         * @return \Elastica\Query\AbstractQuery
         */
        protected function buildPhraseRescoreQuery( SearchContext $context, 
array $fields, $queryText, $slop ) {
-               return $this->buildQueryString( $fields, '"' . $queryText . 
'"', $slop );
+               return $this->maybeWrapWithTokenCountRouter(
+                       $queryText,
+                       $this->buildQueryString( $fields, '"' . $queryText . 
'"', $slop )
+               );
+       }
+
+       /**
+        * Determines if a phrase rescore is needed
+        * @param SearchContext $searchContext
+        * @return bool true if we can do a phrase rescore
+        */
+       protected function isPhraseRescoreNeeded( SearchContext $searchContext 
) {
+               // Only do a phrase match rescore if the query doesn't include
+               // any quotes and has a space or the token count router is
+               // active.
+               // Queries without spaces are either single term or have a
+               // phrase query generated.
+               // Queries with the quote already contain a phrase query and we
+               // can't build phrase queries out of phrase queries at this
+               // point.
+               if ( $this->config->get( 'CirrusSearchPhraseRescoreBoost' ) > 
0.0 &&
+                       $this->config->get( 
'CirrusSearchPhraseRescoreWindowSize' ) &&
+                       !$searchContext->isSpecialKeywordUsed() &&
+                       strpos( $this->queryStringQueryString, '"' ) === false 
&&
+                       ( $this->useTokenCountRouter || strpos( 
$this->queryStringQueryString, ' ' ) !== false )
+               ) {
+                       return true;
+               }
+               return false;
+       }
+
+       protected function maybeWrapWithTokenCountRouter( $queryText, 
\Elastica\Query\AbstractQuery $query ) {
+               if ( $this->useTokenCountRouter ) {
+                       $tokCount = new TokenCountRouter(
+                               // text
+                               $queryText,
+                               // fallback
+                               new \CirrusSearch\Elastica\MatchNone(),
+                               // field
+                               null,
+                               // analyzer
+                               'text_search'
+                       );
+                       $tokCount->addCondition(
+                               TokenCountRouter::GT,
+                               1,
+                               $query
+                       );
+                       return $tokCount;
+               }
+               return $query;
        }
 }
diff --git a/includes/Query/FullTextSimpleMatchQueryBuilder.php 
b/includes/Query/FullTextSimpleMatchQueryBuilder.php
index 6684ead..6788aa2 100644
--- a/includes/Query/FullTextSimpleMatchQueryBuilder.php
+++ b/includes/Query/FullTextSimpleMatchQueryBuilder.php
@@ -154,7 +154,7 @@
                        }
                        $phrase->setFields( $fields );
                        $phrase->setQuery( $queryText );
-                       return $phrase;
+                       return $this->maybeWrapWithTokenCountRouter( 
$queryText, $phrase );
                } else {
                        return parent::buildPhraseRescoreQuery( $context, 
$fields, $queryText, $slop );
                }
diff --git a/tests/jenkins/FullyFeaturedConfig.php 
b/tests/jenkins/FullyFeaturedConfig.php
index f848050..a637292 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -33,6 +33,9 @@
 $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
 $wgCirrusSearchWikimediaExtraPlugin[ 'documentVersion' ] = true;
 
+// Enable when https://gerrit.wikimedia.org/r/#/c/345174/ is available
+// $wgCirrusSearchWikimediaExtraPlugin[ 'token_count_router' ] = true;
+
 $wgCirrusSearchUseCompletionSuggester = 'yes';
 $wgCirrusSearchCompletionSuggesterUseDefaultSort = true;
 $wgCirrusSearchCompletionSuggesterSubphrases = [
diff --git a/tests/unit/Elastica/MatchNoneTest.php 
b/tests/unit/Elastica/MatchNoneTest.php
new file mode 100644
index 0000000..85ebdbd
--- /dev/null
+++ b/tests/unit/Elastica/MatchNoneTest.php
@@ -0,0 +1,18 @@
+<?php
+
+namespace CirrusSearch\Elastica;
+
+class MatchNoneTest extends \PHPUnit_Framework_TestCase {
+       public function testMatchNone() {
+               $query = new MatchNone();
+               $expectedArray = ['match_none' => new \stdClass()];
+               $this->assertEquals($expectedArray, $query->toArray());
+       }
+
+       public function testBackPorts() {
+               $this->assertFalse(
+                       class_exists( \Elastica\Query\MatchNone::class ),
+                       "MatchNone is now in elastica please remove this 
backport"
+               );
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/345585
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7182531609af5b04c52b63388cece7d1981e236f
Gerrit-PatchSet: 7
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to