jenkins-bot has submitted this change and it was merged.
Change subject: CompletionSuggester: add support for crossnamespace redirects
......................................................................
CompletionSuggester: add support for crossnamespace redirects
The suggester index is now built from content and general index. It filters
docs with a redirect from the main namespace.
Removed ':' from the list of ignored char, ignoring ':' can be misleading for
certain kind of usage:
* wikipedia shortcuts like WP:WP when WP is not an alias to Wikipedia
Note that recycling won't be possible for the next update because the analysis
config changed.
I'm not a big fan of the way I deal with LinkBatch...
Bug: T129575
Change-Id: I77779fe066409c62ee5c8b3bf936500e6924b23a
---
M includes/BuildDocument/SuggestBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M maintenance/updateSuggesterIndex.php
M tests/jenkins/FullyFeaturedConfig.php
M tests/unit/SuggestBuilderTest.php
5 files changed, 170 insertions(+), 19 deletions(-)
Approvals:
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
EBernhardson: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/BuildDocument/SuggestBuilder.php
b/includes/BuildDocument/SuggestBuilder.php
index 41f368a..c9e258a 100644
--- a/includes/BuildDocument/SuggestBuilder.php
+++ b/includes/BuildDocument/SuggestBuilder.php
@@ -2,6 +2,9 @@
namespace CirrusSearch\BuildDocument;
+use \Title;
+use \LinkBatch;
+
/**
* Build a doc ready for the titlesuggest index.
*
@@ -78,6 +81,13 @@
private $withGeo;
/**
+ * NOTE: Currently a fixed value because the completion suggester does
not support
+ * multi namespace suggestion.
+ * @var int $targetNamespace
+ */
+ private $targetNamespace = NS_MAIN;
+
+ /**
* @param SuggestScoringMethod $scoringMethod the scoring function to
use
* @param bool $withGeo
*/
@@ -88,12 +98,89 @@
}
/**
- * @param int $id the page id
- * @param array $inputDoc the page data
+ * @param array $inputDocs a batch of docs to build
* @return array a set of suggest documents
*/
- public function build( $id, $inputDoc ) {
- if( !isset( $inputDoc['title'] ) ) {
+ public function build( $inputDocs ) {
+ // Cross namespace titles
+ $crossNsTitles = array();
+ $docs = array();
+ foreach ( $inputDocs as $sourceDoc ) {
+ $inputDoc = $sourceDoc['source'];
+ $id = $sourceDoc['id'];
+ if ( !isset( $inputDoc['namespace'] ) ) {
+ // Bad doc, nothing to do here.
+ continue;
+ }
+ if( $inputDoc['namespace'] != NS_MAIN ) {
+ if ( !isset( $inputDoc['redirect'] ) ) {
+ // Bad doc, nothing to do here.
+ continue;
+ }
+
+ $score = $this->scoringMethod->score( $inputDoc
);
+ $location = $this->findPrimaryCoordinates(
$inputDoc );
+ foreach ( $inputDoc['redirect'] as $redir ) {
+ if ( !isset( $redir['namespace'] ) ||
!isset( $redir['title'] ) ) {
+ continue;
+ }
+ if ( $redir['namespace'] !=
$this->targetNamespace ) {
+ continue;
+ }
+ // Should we discount the score?
+ $score = $this->scoringMethod->score(
$inputDoc );
+ // We support only earth and the
primary/first coordinates...
+ $location =
$this->findPrimaryCoordinates( $inputDoc );
+
+ $title = Title::makeTitle(
$redir['namespace'], $redir['title'] );
+ $crossNsTitles[$redir['title']] = array(
+ 'title' => $title,
+ 'score' => $score,
+ 'location' => $location
+ );
+ }
+ } else {
+ if ( !isset( $inputDoc['title'] ) ) {
+ // Bad doc, nothing to do here.
+ continue;
+ }
+ $docs = array_merge( $docs,
$this->buildNormalSuggestions( $id, $inputDoc ) );
+ }
+ }
+
+ // Build cross ns suggestions
+ if ( !empty ( $crossNsTitles ) ) {
+ $titles = array();
+ foreach( $crossNsTitles as $text => $data ) {
+ $titles[] = $data['title'];
+ }
+ $lb = new LinkBatch( $titles );
+ $lb->setCaller( __METHOD__ );
+ $lb->execute();
+ // This is far from perfect:
+ // - we won't try to group similar redirects since we
don't know which one
+ // is the official one
+ // - we will certainly suggest multiple times the same
pages
+ // - we must not run a second pass at query time: no
redirect suggestion
+ foreach ( $crossNsTitles as $text => $data ) {
+ $suggestion = array(
+ 'text' => $text,
+ 'variants' => array()
+ );
+ $docs[] = $this->buildTitleSuggestion(
$data['title']->getArticleID(), $suggestion, $data['location'], $data['score']
);
+ }
+ }
+ return $docs;
+ }
+
+ /**
+ * Build classic suggestion
+ * @param int $id
+ * @param array $inputDoc
+ * @return array a set of suggest documents
+ */
+ private function buildNormalSuggestions( $id, $inputDoc ) {
+ if ( !isset( $inputDoc['title'] ) ) {
// Bad doc, nothing to do here.
return array();
}
@@ -118,7 +205,7 @@
*/
public function getRequiredFields() {
$fields = $this->scoringMethod->getRequiredFields();
- $fields = array_merge( $fields, array( 'title', 'redirect' ) );
+ $fields = array_merge( $fields, array( 'title', 'redirect',
'namespace' ) );
if ( $this->withGeo ) {
$fields[] = 'coordinates';
}
@@ -286,7 +373,11 @@
$redirects = array();
if ( isset( $doc['redirect'] ) ) {
foreach( $doc['redirect'] as $redir ) {
- $redirects[] = $redir['title'];
+ // Avoid suggesting/displaying non existent
titles
+ // in the target namespace
+ if( $redir['namespace'] ==
$this->targetNamespace ) {
+ $redirects[] = $redir['title'];
+ }
}
}
return $this->extractSimilars( $doc['title'], $redirects, true
);
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 7367dd5..936ae98 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -27,7 +27,7 @@
*/
class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder {
- const VERSION = "0.1";
+ const VERSION = "1.1";
/**
* Constructor
@@ -60,8 +60,12 @@
// but annoying to search for
"(C)"
// ')=>\u0020',
// '(=>\u0020',
+ // Ignoring : can be misleading
for expert users
+ // Because we will return
unrelated pages when the user
+ // search for "magic keywords"
like WP:WP which are sometimes
+ // pages in the main namespace
that redirect to other namespace
+ // ':=>\u0020',
// Others are the ones ignored
by common search engines
- ':=>\u0020',
';=>\u0020',
'\\[=>\u0020',
'\\]=>\u0020',
diff --git a/maintenance/updateSuggesterIndex.php
b/maintenance/updateSuggesterIndex.php
index 21ee718..e893368 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -281,6 +281,7 @@
$this->createIndex();
$this->indexData();
+ $this->indexData( Connection::GENERAL_INDEX_TYPE );
if ( $this->optimizeIndex ) {
$this->optimize();
}
@@ -363,6 +364,7 @@
$this->output( "Recycling index
{$this->getIndex()->getName()}\n");
$this->recycle = true;
$this->indexData();
+ $this->indexData( Connection::GENERAL_INDEX_TYPE );
// This is fragile... hopefully most of the docs will be
deleted from the old segments
// and will result in a fast operation.
// New segments should not be affected.
@@ -473,7 +475,7 @@
$this->output("ok.\n");
}
- private function indexData() {
+ private function indexData( $sourceIndexType =
Connection::CONTENT_INDEX_TYPE ) {
global $wgCirrusSearchCompletionDefaultScore;
$scoreMethodName = $this->getOption( 'scoringMethod',
$wgCirrusSearchCompletionDefaultScore );
$this->scoreMethod =
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
@@ -490,7 +492,10 @@
new Elastica\Query\MatchAll(),
new Elastica\Filter\BoolAnd( array(
new Elastica\Filter\Type(
Connection::PAGE_TYPE_NAME ),
- new Elastica\Filter\Term( array(
"namespace" => NS_MAIN ) )
+ new Elastica\Filter\BoolOr( array(
+ new Elastica\Filter\Term(
array( "namespace" => NS_MAIN ) ),
+ new Elastica\Filter\Term(
array( "redirect.namespace" => NS_MAIN ) ),
+ ) )
) )
)
);
@@ -502,7 +507,7 @@
);
// TODO: only content index for now ( we'll have to check how
it works with commons )
- $sourceIndex = $this->getConnection()->getIndex(
$this->indexBaseName, Connection::CONTENT_INDEX_TYPE );
+ $sourceIndex = $this->getConnection()->getIndex(
$this->indexBaseName, $sourceIndexType );
$result = $sourceIndex->search( $query, $scrollOptions );
$totalDocsInIndex = $result->getResponse()->getData();
$totalDocsInIndex = $totalDocsInIndex['hits']['total'];
@@ -511,7 +516,7 @@
$docsDumped = 0;
- $this->output( "Indexing $totalDocsToDump documents
($totalDocsInIndex in the index) with batchId: {$this->builder->getBatchId()}
and scoring method: $scoreMethodName\n" );
+ $this->output( "Indexing $totalDocsToDump documents from
$sourceIndexType ($totalDocsInIndex in the index) with batchId:
{$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" );
$destinationType = $this->getIndex()->getType(
Connection::TITLE_SUGGEST_TYPE_NAME );
@@ -519,13 +524,16 @@
function( $results ) use ( &$docsDumped,
$totalDocsToDump,
$destinationType ) {
$suggestDocs = array();
+ $inputDocs = array();
foreach ( $results as $result ) {
$docsDumped++;
- $suggests = $this->builder->build(
$result->getId(), $result->getSource() );
- foreach ( $suggests as $suggest ) {
- $suggestDocs[] = $suggest;
- }
+ $inputDocs[] = array(
+ 'id' => $result->getId(),
+ 'source' => $result->getSource()
+ );
}
+
+ $suggestDocs = $this->builder->build(
$inputDocs );
$this->outputProgress( $docsDumped,
$totalDocsToDump );
Util::withRetry( $this->indexRetryAttempts,
function() use ( $destinationType,
$suggestDocs ) {
@@ -533,7 +541,7 @@
}
);
}, 0, $this->indexRetryAttempts );
- $this->output( "Indexing done.\n" );
+ $this->output( "Indexing from $sourceIndexType index done.\n" );
}
public function validateAlias() {
diff --git a/tests/jenkins/FullyFeaturedConfig.php
b/tests/jenkins/FullyFeaturedConfig.php
index dd841ec..cebff3a 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -37,7 +37,7 @@
$wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] =
true;
$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
-$wgCirrusSearchUseCompletionSuggester = true;
+$wgCirrusSearchUseCompletionSuggester = 'yes';
$wgJobQueueAggregator = array(
'class' => 'JobQueueAggregatorRedis',
diff --git a/tests/unit/SuggestBuilderTest.php
b/tests/unit/SuggestBuilderTest.php
index 179793b..32c39fb 100644
--- a/tests/unit/SuggestBuilderTest.php
+++ b/tests/unit/SuggestBuilderTest.php
@@ -30,6 +30,7 @@
$redirScore = (int) ( $score *
SuggestBuilder::REDIRECT_DISCOUNT );
$doc = array(
'title' => 'Albert Einstein',
+ 'namespace' => 0,
'redirect' => array(
array( 'title' => "Albert Enstein", 'namespace'
=> 0 ),
array( 'title' => "Albert Einsten", 'namespace'
=> 0 ),
@@ -78,6 +79,7 @@
$redirScore = (int) ( $score *
SuggestBuilder::REDIRECT_DISCOUNT );
$doc = array(
'title' => 'Iraq',
+ 'namespace' => 0,
'redirect' => array(
array( 'title' => "Eraq", 'namespace' => 0 ),
array( 'title' => "Irak", 'namespace' => 0 ),
@@ -115,17 +117,63 @@
$this->assertSame( $expected, $suggestions );
}
+ public function testCrossNSRedirects() {
+ $builder = new SuggestBuilder(
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks' ) );
+ $score = 10;
+ $redirScore = (int) ( $score *
SuggestBuilder::REDIRECT_DISCOUNT );
+ $doc = array(
+ 'title' => 'Navigation',
+ 'namespace' => 12,
+ 'redirect' => array(
+ array( 'title' => 'WP:HN', 'namespace' => 0 ),
+ array( 'title' => 'WP:NAV', 'namespace' => 0 ),
+ ),
+ 'incoming_links' => $score
+ );
+
+ $expected = array(
+ array(
+ 'suggest' => array(
+ 'input' => array( 'WP:HN' ),
+ 'output' => '0:t:WP:HN', // LinkBatch
will set 0...
+ 'weight' => $score
+ ),
+ 'suggest-stop' => array(
+ 'input' => array( 'WP:HN' ),
+ 'output' => '0:t:WP:HN',
+ 'weight' => $score
+ ),
+ ),
+ array(
+ 'suggest' => array(
+ 'input' => array( 'WP:NAV' ),
+ 'output' => '0:t:WP:NAV',
+ 'weight' => $score
+ ),
+ 'suggest-stop' => array(
+ 'input' => array( 'WP:NAV' ),
+ 'output' => '0:t:WP:NAV',
+ 'weight' => $score
+ ),
+ )
+ );
+ $suggestions = $this->buildSuggestions( $builder, $doc );
+ $this->assertSame( $expected, $suggestions );
+ }
+
public function testUlm() {
$builder = new SuggestBuilder(
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks' ) );
$score = 10;
$redirScore = (int) ( $score *
SuggestBuilder::REDIRECT_DISCOUNT );
$doc = array(
'title' => 'Ulm',
+ 'namespace' => 0,
'redirect' => array(
array( 'title' => 'UN/LOCODE:DEULM',
'namespace' => 0 ),
array( 'title'=> 'Ulm, Germany', 'namespace' =>
0 ),
array( 'title' => "Ulm displaced persons camp",
'namespace' => 0 ),
array( 'title' => "Söflingen", 'namespace' => 0
),
+ array( 'title' => "Should be ignored",
'namespace' => 1 ),
),
'coordinates' => array(
array(
@@ -334,6 +382,6 @@
$dat = $x->getData();
unset( $dat['batch_id'] );
return $dat;
- }, $builder->build( 1, $doc ) );
+ }, $builder->build( array( array( 'id' => 1, 'source'
=> $doc ) ) ) );
}
}
--
To view, visit https://gerrit.wikimedia.org/r/276705
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I77779fe066409c62ee5c8b3bf936500e6924b23a
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits