jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/345585 )
Change subject: Add support for token_count_router
......................................................................
Add support for token_count_router
Move phrase detection into the plugin so that we use lucene analyzers.
Mostly needed for spaceless languages.
Bug: T152094
Change-Id: I7182531609af5b04c52b63388cece7d1981e236f
---
M CirrusSearch.php
M autoload.php
M docs/settings.txt
A includes/Elastica/MatchNone.php
A includes/Extra/Query/TokenCountRouter.php
M includes/Query/FullTextQueryStringQueryBuilder.php
M includes/Query/FullTextSimpleMatchQueryBuilder.php
M tests/jenkins/FullyFeaturedConfig.php
A tests/unit/Elastica/MatchNoneTest.php
9 files changed, 247 insertions(+), 12 deletions(-)
Approvals:
Smalyshev: Looks good to me, but someone else must approve
Tjones: Looks good to me, but someone else must approve
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
EBernhardson: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 4bd5013..898f7e1 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -158,6 +158,11 @@
// This allows forking on reindexing and is compatible with wikimedia-extra
// versions 1.3.1, 1.4.2, 1.5.0, and greater:
// $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
+//
+// Allows the use of lucene tokenizers to activate phrase rescore, so that we
+// do not rely on the presence of spaces (which obviously does not work on
spaceless
+// languages). Available since version 5.1.2.
+// $wgCirrusSearchWikimediaExtraPlugin['token_count_router'] = true;
$wgCirrusSearchWikimediaExtraPlugin = [];
// Should CirrusSearch try to support regular expressions with insource:?
diff --git a/autoload.php b/autoload.php
index 93276b4..e6b67f0 100644
--- a/autoload.php
+++ b/autoload.php
@@ -37,6 +37,7 @@
'CirrusSearch\\DataSender' => __DIR__ . '/includes/DataSender.php',
'CirrusSearch\\Dump' => __DIR__ . '/includes/Dump.php',
'CirrusSearch\\ElasticaErrorHandler' => __DIR__ .
'/includes/ElasticaErrorHandler.php',
+ 'CirrusSearch\\Elastica\\MatchNone' => __DIR__ .
'/includes/Elastica/MatchNone.php',
'CirrusSearch\\Elastica\\MultiSearch' => __DIR__ .
'/includes/Elastica/MultiSearch.php',
'CirrusSearch\\Elastica\\PooledHttp' => __DIR__ .
'/includes/Elastica/PooledHttp.php',
'CirrusSearch\\Elastica\\PooledHttps' => __DIR__ .
'/includes/Elastica/PooledHttps.php',
@@ -49,6 +50,7 @@
'CirrusSearch\\ExplainPrinter' => __DIR__ .
'/includes/ExplainPrinter.php',
'CirrusSearch\\Extra\\Query\\IdHashMod' => __DIR__ .
'/includes/Extra/Query/IdHashMod.php',
'CirrusSearch\\Extra\\Query\\SourceRegex' => __DIR__ .
'/includes/Extra/Query/SourceRegex.php',
+ 'CirrusSearch\\Extra\\Query\\TokenCountRouter' => __DIR__ .
'/includes/Extra/Query/TokenCountRouter.php',
'CirrusSearch\\ForceSearchIndex' => __DIR__ .
'/maintenance/forceSearchIndex.php',
'CirrusSearch\\FullTextQueryBuilderProfiles' => __DIR__ .
'/profiles/FullTextQueryBuilderProfiles.php',
'CirrusSearch\\HTMLCompletionProfileSettings' => __DIR__ .
'/includes/HTMLCompletionProfileSettings.php',
diff --git a/docs/settings.txt b/docs/settings.txt
index ef63cd7..ad9637e 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -181,6 +181,12 @@
$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
+Allows the use of lucene tokenizers to activate phrase rescore.
+This avoids relying on the presence of spaces (which obviously does not
+work on spaceless languages). Available since version 5.1.2.
+
+ $wgCirrusSearchWikimediaExtraPlugin['token_count_router'] = true;
+
; $wgCirrusSearchEnableRegex
Default:
diff --git a/includes/Elastica/MatchNone.php b/includes/Elastica/MatchNone.php
new file mode 100644
index 0000000..f0c7b6b
--- /dev/null
+++ b/includes/Elastica/MatchNone.php
@@ -0,0 +1,18 @@
+<?php
+
+namespace CirrusSearch\Elastica;
+
+/**
+ * Backport of https://github.com/ruflin/Elastica/pull/1276
+ *
+ * @link
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-all-query.html
+ */
+class MatchNone extends \Elastica\Query\AbstractQuery {
+ /**
+ * Creates match none query.
+ */
+ public function __construct() {
+ /** @suppress PhanTypeMismatchProperty (done like that in Elastica) */
+ $this->_params = new \stdClass();
+ }
+}
diff --git a/includes/Extra/Query/TokenCountRouter.php
b/includes/Extra/Query/TokenCountRouter.php
new file mode 100644
index 0000000..9f56156
--- /dev/null
+++ b/includes/Extra/Query/TokenCountRouter.php
@@ -0,0 +1,135 @@
+<?php
+
+namespace CirrusSearch\Extra\Query;
+
+use Elastica\Query\AbstractQuery;
+
+/**
+ * TokenCountRouter query used to trigger a particular query by counting
+ * the number of tokens in the user query.
+ *
+ * @link
https://github.com/wikimedia/search-extra/blob/master/docs/token_count_router.md
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+class TokenCountRouter extends AbstractQuery {
+ /**
+ * @const string greater than
+ */
+ const GT = 'gt';
+
+ /**
+ * @const string greater or equal
+ */
+ const GTE = 'gte';
+
+ /**
+ * @const string equal
+ */
+ const EQ = 'eq';
+
+ /**
+ * @const string not equal
+ */
+ const NEQ = 'neq';
+
+ /**
+ * @const string less than or equal
+ */
+ const LTE = 'lte';
+
+ /**
+ * @const string less than
+ */
+ const LT = 'lt';
+
+ /**
+ * @param string $text the text to analyze
+ * @param AbstractQuery $fallbackQuery the query to run when no
+ * conditions match
+ * @param string|null $field use the analyzer of this field
+ * @param string|null $analyzer use this analyzer
+ */
+ public function __construct( $text, AbstractQuery $fallbackQuery,
$field = null, $analyzer = null ) {
+ $this->setText( $text );
+ $this->setFallback( $fallbackQuery );
+ if ( $field ) {
+ $this->setField( $field );
+ }
+ if ( $analyzer ) {
+ $this->setAnalyzer( $analyzer );
+ }
+ }
+
+ /**
+ * @param string $text count tokens from this text
+ * @return self
+ */
+ public function setText( $text ) {
+ return $this->setParam( 'text', $text );
+ }
+
+ /**
+ * @param AbstractQuery $query
+ * @return self
+ */
+ public function setFallback( AbstractQuery $query ) {
+ return $this->setParam( 'fallback', $query );
+ }
+
+ /**
+ * @param string $field the field to fetch analyzer info
+ * @return self
+ */
+ public function setField( $field ) {
+ return $this->setParam( 'field', $field );
+ }
+
+ /**
+ * @param string $analyzer the analyzer to use to count tokens
+ * @return self
+ */
+ public function setAnalyzer( $analyzer ) {
+ return $this->setParam( 'analyzer', $analyzer );
+ }
+
+ /**
+ * Adds a new condition
+ * The first condition that evaluates to true is applied.
+ * If none match the fallback query is applied.
+ *
+ * @param string $type the condition to apply
+ * @param int $value the value to compare
+ * @param AbstractQuery $query the query to run if the condition is
+ * true ignoring all remaining conditions
+ */
+ public function addCondition( $type, $value, AbstractQuery $query ) {
+ switch( $type ) {
+ case self::GT:
+ case self::GTE:
+ case self::EQ:
+ case self::NEQ:
+ case self::LT:
+ case self::LTE: break;
+ default: throw new \InvalidArgumentException( "$type is not
allowed as a condition type" );
+ }
+ return $this->addParam( 'conditions', [
+ $type => $value,
+ 'query' => $query,
+ ] );
+ }
+}
diff --git a/includes/Query/FullTextQueryStringQueryBuilder.php
b/includes/Query/FullTextQueryStringQueryBuilder.php
index a1f0f5e..1922c3d 100644
--- a/includes/Query/FullTextQueryStringQueryBuilder.php
+++ b/includes/Query/FullTextQueryStringQueryBuilder.php
@@ -6,6 +6,7 @@
use CirrusSearch\SearchConfig;
use CirrusSearch\Searcher;
use CirrusSearch\Search\SearchContext;
+use CirrusSearch\Extra\Query\TokenCountRouter;
use MediaWiki\Logger\LoggerFactory;
/**
@@ -29,6 +30,11 @@
private $queryStringQueryString = '';
/**
+ * @var bool
+ */
+ private $useTokenCountRouter;
+
+ /**
* @param SearchConfig $config
* @param KeywordFeature[] $features
* @param array[] $settings currently ignored
@@ -36,6 +42,7 @@
public function __construct( SearchConfig $config, array $features,
array $settings = [] ) {
$this->config = $config;
$this->features = $features;
+ $this->useTokenCountRouter = $this->config->getElement(
'CirrusSearchWikimediaExtraPlugin', 'token_count_router' ) === true;
}
/**
@@ -203,16 +210,7 @@
$nonAllFields = $fields;
}
- // Only do a phrase match rescore if the query doesn't include
any quotes and has a space.
- // Queries without spaces are either single term or have a
phrase query generated.
- // Queries with the quote already contain a phrase query and we
can't build phrase queries
- // out of phrase queries at this point.
- if ( $this->config->get( 'CirrusSearchPhraseRescoreBoost' ) >
0.0 &&
- $this->config->get(
'CirrusSearchPhraseRescoreWindowSize' ) &&
- !$searchContext->isSpecialKeywordUsed() &&
- strpos( $this->queryStringQueryString, '"' )
=== false &&
- strpos( $this->queryStringQueryString, ' ' )
!== false ) {
-
+ if ( $this->isPhraseRescoreNeeded( $searchContext ) ) {
$rescoreFields = $fields;
if ( !$this->config->get(
'CirrusSearchAllFieldsForRescore' ) ) {
$rescoreFields = $nonAllFields;
@@ -620,6 +618,56 @@
* @return \Elastica\Query\AbstractQuery
*/
protected function buildPhraseRescoreQuery( SearchContext $context,
array $fields, $queryText, $slop ) {
- return $this->buildQueryString( $fields, '"' . $queryText .
'"', $slop );
+ return $this->maybeWrapWithTokenCountRouter(
+ $queryText,
+ $this->buildQueryString( $fields, '"' . $queryText .
'"', $slop )
+ );
+ }
+
+ /**
+ * Determines if a phrase rescore is needed
+ * @param SearchContext $searchContext
+ * @return bool true if we can do a phrase rescore
+ */
+ protected function isPhraseRescoreNeeded( SearchContext $searchContext
) {
+ // Only do a phrase match rescore if the query doesn't include
+ // any quotes and has a space or the token count router is
+ // active.
+ // Queries without spaces are either single term or have a
+ // phrase query generated.
+ // Queries with the quote already contain a phrase query and we
+ // can't build phrase queries out of phrase queries at this
+ // point.
+ if ( $this->config->get( 'CirrusSearchPhraseRescoreBoost' ) >
0.0 &&
+ $this->config->get(
'CirrusSearchPhraseRescoreWindowSize' ) &&
+ !$searchContext->isSpecialKeywordUsed() &&
+ strpos( $this->queryStringQueryString, '"' ) === false
&&
+ ( $this->useTokenCountRouter || strpos(
$this->queryStringQueryString, ' ' ) !== false )
+ ) {
+ return true;
+ }
+ return false;
+ }
+
+ protected function maybeWrapWithTokenCountRouter( $queryText,
\Elastica\Query\AbstractQuery $query ) {
+ if ( $this->useTokenCountRouter ) {
+ $tokCount = new TokenCountRouter(
+ // text
+ $queryText,
+ // fallback
+ new \CirrusSearch\Elastica\MatchNone(),
+ // field
+ null,
+ // analyzer
+ 'text_search'
+ );
+ $tokCount->addCondition(
+ TokenCountRouter::GT,
+ 1,
+ $query
+ );
+ return $tokCount;
+ }
+ return $query;
}
}
diff --git a/includes/Query/FullTextSimpleMatchQueryBuilder.php
b/includes/Query/FullTextSimpleMatchQueryBuilder.php
index 6684ead..6788aa2 100644
--- a/includes/Query/FullTextSimpleMatchQueryBuilder.php
+++ b/includes/Query/FullTextSimpleMatchQueryBuilder.php
@@ -154,7 +154,7 @@
}
$phrase->setFields( $fields );
$phrase->setQuery( $queryText );
- return $phrase;
+ return $this->maybeWrapWithTokenCountRouter(
$queryText, $phrase );
} else {
return parent::buildPhraseRescoreQuery( $context,
$fields, $queryText, $slop );
}
diff --git a/tests/jenkins/FullyFeaturedConfig.php
b/tests/jenkins/FullyFeaturedConfig.php
index f848050..a637292 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -33,6 +33,9 @@
$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
$wgCirrusSearchWikimediaExtraPlugin[ 'documentVersion' ] = true;
+// Enable when https://gerrit.wikimedia.org/r/#/c/345174/ is available
+// $wgCirrusSearchWikimediaExtraPlugin[ 'token_count_router' ] = true;
+
$wgCirrusSearchUseCompletionSuggester = 'yes';
$wgCirrusSearchCompletionSuggesterUseDefaultSort = true;
$wgCirrusSearchCompletionSuggesterSubphrases = [
diff --git a/tests/unit/Elastica/MatchNoneTest.php
b/tests/unit/Elastica/MatchNoneTest.php
new file mode 100644
index 0000000..85ebdbd
--- /dev/null
+++ b/tests/unit/Elastica/MatchNoneTest.php
@@ -0,0 +1,18 @@
+<?php
+
+namespace CirrusSearch\Elastica;
+
+class MatchNoneTest extends \PHPUnit_Framework_TestCase {
+ public function testMatchNone() {
+ $query = new MatchNone();
+ $expectedArray = ['match_none' => new \stdClass()];
+ $this->assertEquals($expectedArray, $query->toArray());
+ }
+
+ public function testBackPorts() {
+ $this->assertFalse(
+ class_exists( \Elastica\Query\MatchNone::class ),
+ "MatchNone is now in elastica please remove this
backport"
+ );
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/345585
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7182531609af5b04c52b63388cece7d1981e236f
Gerrit-PatchSet: 7
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits