jenkins-bot has submitted this change and it was merged.

Change subject: CompletionSuggester: add support for crossnamespace redirects
......................................................................


CompletionSuggester: add support for crossnamespace redirects

The suggester index is now built from content and general index.  It filters
docs with a redirect from the main namespace.
Removed ':' from the list of ignored char, ignoring ':' can be misleading for
certain kind of usage:
* wikipedia shortcuts like WP:WP when WP is not an alias to Wikipedia

Note that recycling won't be possible for the next update because the analysis
config changed.

I'm not a big fan of the way I deal with LinkBatch...

Bug: T129575
Change-Id: I77779fe066409c62ee5c8b3bf936500e6924b23a
---
M includes/BuildDocument/SuggestBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M maintenance/updateSuggesterIndex.php
M tests/jenkins/FullyFeaturedConfig.php
M tests/unit/SuggestBuilderTest.php
5 files changed, 170 insertions(+), 19 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/BuildDocument/SuggestBuilder.php 
b/includes/BuildDocument/SuggestBuilder.php
index 41f368a..c9e258a 100644
--- a/includes/BuildDocument/SuggestBuilder.php
+++ b/includes/BuildDocument/SuggestBuilder.php
@@ -2,6 +2,9 @@
 
 namespace CirrusSearch\BuildDocument;
 
+use \Title;
+use \LinkBatch;
+
 /**
  * Build a doc ready for the titlesuggest index.
  *
@@ -78,6 +81,13 @@
        private $withGeo;
 
        /**
+        * NOTE: Currently a fixed value because the completion suggester does 
not support
+        * multi namespace suggestion.
+        * @var int $targetNamespace
+        */
+       private $targetNamespace = NS_MAIN;
+
+       /**
         * @param SuggestScoringMethod $scoringMethod the scoring function to 
use
         * @param bool $withGeo
         */
@@ -88,12 +98,89 @@
        }
 
        /**
-        * @param int $id the page id
-        * @param array $inputDoc the page data
+        * @param array $inputDocs a batch of docs to build
         * @return array a set of suggest documents
         */
-       public function build( $id, $inputDoc ) {
-               if( !isset( $inputDoc['title'] ) ) {
+       public function build( $inputDocs ) {
+               // Cross namespace titles
+               $crossNsTitles = array();
+               $docs = array();
+               foreach ( $inputDocs as $sourceDoc ) {
+                       $inputDoc = $sourceDoc['source'];
+                       $id = $sourceDoc['id'];
+                       if ( !isset( $inputDoc['namespace'] ) ) {
+                               // Bad doc, nothing to do here.
+                               continue;
+                       }
+                       if( $inputDoc['namespace'] != NS_MAIN ) {
+                               if ( !isset( $inputDoc['redirect'] ) ) {
+                                       // Bad doc, nothing to do here.
+                                       continue;
+                               }
+
+                               $score = $this->scoringMethod->score( $inputDoc 
);
+                               $location = $this->findPrimaryCoordinates( 
$inputDoc );
+                               foreach ( $inputDoc['redirect'] as $redir ) {
+                                       if ( !isset( $redir['namespace'] ) || 
!isset( $redir['title'] ) ) {
+                                               continue;
+                                       }
+                                       if ( $redir['namespace'] != 
$this->targetNamespace ) {
+                                               continue;
+                                       }
+                                       // Should we discount the score?
+                                       $score = $this->scoringMethod->score( 
$inputDoc );
+                                       // We support only earth and the 
primary/first coordinates...
+                                       $location = 
$this->findPrimaryCoordinates( $inputDoc );
+
+                                       $title = Title::makeTitle( 
$redir['namespace'], $redir['title'] );
+                                       $crossNsTitles[$redir['title']] = array(
+                                               'title' => $title,
+                                               'score' => $score,
+                                               'location' => $location
+                                       );
+                               }
+                       } else {
+                               if ( !isset( $inputDoc['title'] ) ) {
+                                       // Bad doc, nothing to do here.
+                                       continue;
+                               }
+                               $docs = array_merge( $docs, 
$this->buildNormalSuggestions( $id, $inputDoc ) );
+                       }
+               }
+
+               // Build cross ns suggestions
+               if ( !empty ( $crossNsTitles ) ) {
+                       $titles = array();
+                       foreach( $crossNsTitles as $text => $data ) {
+                               $titles[] = $data['title'];
+                       }
+                       $lb = new LinkBatch( $titles );
+                       $lb->setCaller( __METHOD__ );
+                       $lb->execute();
+                       // This is far from perfect:
+                       // - we won't try to group similar redirects since we 
don't know which one
+                       //   is the official one
+                       // - we will certainly suggest multiple times the same 
pages
+                       // - we must not run a second pass at query time: no 
redirect suggestion
+                       foreach ( $crossNsTitles as $text => $data ) {
+                               $suggestion = array(
+                                       'text' => $text,
+                                       'variants' => array()
+                               );
+                               $docs[] = $this->buildTitleSuggestion( 
$data['title']->getArticleID(), $suggestion, $data['location'], $data['score'] 
);
+                       }
+               }
+               return $docs;
+       }
+
+       /**
+        * Build classic suggestion
+        * @param int $id
+        * @param array $inputDoc
+        * @return array a set of suggest documents
+        */
+       private function buildNormalSuggestions( $id, $inputDoc ) {
+               if ( !isset( $inputDoc['title'] ) ) {
                        // Bad doc, nothing to do here.
                        return array();
                }
@@ -118,7 +205,7 @@
         */
        public function getRequiredFields() {
                $fields = $this->scoringMethod->getRequiredFields();
-               $fields = array_merge( $fields, array( 'title', 'redirect' ) );
+               $fields = array_merge( $fields, array( 'title', 'redirect', 
'namespace' ) );
                if ( $this->withGeo ) {
                        $fields[] = 'coordinates';
                }
@@ -286,7 +373,11 @@
                $redirects = array();
                if ( isset( $doc['redirect'] ) ) {
                        foreach( $doc['redirect'] as $redir ) {
-                               $redirects[] = $redir['title'];
+                               // Avoid suggesting/displaying non existent 
titles
+                               // in the target namespace
+                               if( $redir['namespace'] == 
$this->targetNamespace ) {
+                                       $redirects[] = $redir['title'];
+                               }
                        }
                }
                return $this->extractSimilars( $doc['title'], $redirects, true 
);
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 7367dd5..936ae98 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -27,7 +27,7 @@
  */
 
 class SuggesterAnalysisConfigBuilder extends AnalysisConfigBuilder {
-       const VERSION = "0.1";
+       const VERSION = "1.1";
 
        /**
         * Constructor
@@ -60,8 +60,12 @@
                                                // but annoying to search for 
"(C)"
                                                // ')=>\u0020',
                                                // '(=>\u0020',
+                                               // Ignoring : can be misleading 
for expert users
+                                               // Because we will return 
unrelated pages when the user
+                                               // search for "magic keywords" 
like WP:WP which are sometimes
+                                               // pages in the main namespace 
that redirect to other namespace
+                                               // ':=>\u0020',
                                                // Others are the ones ignored 
by common search engines
-                                               ':=>\u0020',
                                                ';=>\u0020',
                                                '\\[=>\u0020',
                                                '\\]=>\u0020',
diff --git a/maintenance/updateSuggesterIndex.php 
b/maintenance/updateSuggesterIndex.php
index 21ee718..e893368 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -281,6 +281,7 @@
 
                $this->createIndex();
                $this->indexData();
+               $this->indexData( Connection::GENERAL_INDEX_TYPE );
                if ( $this->optimizeIndex ) {
                        $this->optimize();
                }
@@ -363,6 +364,7 @@
                $this->output( "Recycling index 
{$this->getIndex()->getName()}\n");
                $this->recycle = true;
                $this->indexData();
+               $this->indexData( Connection::GENERAL_INDEX_TYPE );
                // This is fragile... hopefully most of the docs will be 
deleted from the old segments
                // and will result in a fast operation.
                // New segments should not be affected.
@@ -473,7 +475,7 @@
                $this->output("ok.\n");
        }
 
-       private function indexData() {
+       private function indexData( $sourceIndexType = 
Connection::CONTENT_INDEX_TYPE ) {
                global $wgCirrusSearchCompletionDefaultScore;
                $scoreMethodName = $this->getOption( 'scoringMethod', 
$wgCirrusSearchCompletionDefaultScore );
                $this->scoreMethod = 
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
@@ -490,7 +492,10 @@
                                new Elastica\Query\MatchAll(),
                                new Elastica\Filter\BoolAnd( array(
                                        new Elastica\Filter\Type( 
Connection::PAGE_TYPE_NAME ),
-                                       new Elastica\Filter\Term( array( 
"namespace" => NS_MAIN ) )
+                                       new Elastica\Filter\BoolOr( array(
+                                               new Elastica\Filter\Term( 
array( "namespace" => NS_MAIN ) ),
+                                               new Elastica\Filter\Term( 
array( "redirect.namespace" => NS_MAIN ) ),
+                                       ) )
                                ) )
                        )
                );
@@ -502,7 +507,7 @@
                );
 
                // TODO: only content index for now ( we'll have to check how 
it works with commons )
-               $sourceIndex = $this->getConnection()->getIndex( 
$this->indexBaseName, Connection::CONTENT_INDEX_TYPE );
+               $sourceIndex = $this->getConnection()->getIndex( 
$this->indexBaseName, $sourceIndexType );
                $result = $sourceIndex->search( $query, $scrollOptions );
                $totalDocsInIndex = $result->getResponse()->getData();
                $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
@@ -511,7 +516,7 @@
 
 
                $docsDumped = 0;
-               $this->output( "Indexing $totalDocsToDump documents 
($totalDocsInIndex in the index) with batchId: {$this->builder->getBatchId()} 
and scoring method: $scoreMethodName\n" );
+               $this->output( "Indexing $totalDocsToDump documents from 
$sourceIndexType ($totalDocsInIndex in the index) with batchId: 
{$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" );
 
                $destinationType = $this->getIndex()->getType( 
Connection::TITLE_SUGGEST_TYPE_NAME );
 
@@ -519,13 +524,16 @@
                        function( $results ) use ( &$docsDumped, 
$totalDocsToDump,
                                        $destinationType ) {
                                $suggestDocs = array();
+                               $inputDocs = array();
                                foreach ( $results as $result ) {
                                        $docsDumped++;
-                                       $suggests = $this->builder->build( 
$result->getId(), $result->getSource() );
-                                       foreach ( $suggests as $suggest ) {
-                                               $suggestDocs[] = $suggest;
-                                       }
+                                       $inputDocs[] = array(
+                                               'id' => $result->getId(),
+                                               'source' => $result->getSource()
+                                       );
                                }
+
+                               $suggestDocs = $this->builder->build( 
$inputDocs );
                                $this->outputProgress( $docsDumped, 
$totalDocsToDump );
                                Util::withRetry( $this->indexRetryAttempts,
                                        function() use ( $destinationType, 
$suggestDocs ) {
@@ -533,7 +541,7 @@
                                        }
                                );
                        }, 0, $this->indexRetryAttempts );
-               $this->output( "Indexing done.\n" );
+               $this->output( "Indexing from $sourceIndexType index done.\n" );
        }
 
        public function validateAlias() {
diff --git a/tests/jenkins/FullyFeaturedConfig.php 
b/tests/jenkins/FullyFeaturedConfig.php
index dd841ec..cebff3a 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -37,7 +37,7 @@
 $wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] = 
true;
 $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
 
-$wgCirrusSearchUseCompletionSuggester = true;
+$wgCirrusSearchUseCompletionSuggester = 'yes';
 
 $wgJobQueueAggregator = array(
        'class'       => 'JobQueueAggregatorRedis',
diff --git a/tests/unit/SuggestBuilderTest.php 
b/tests/unit/SuggestBuilderTest.php
index 179793b..32c39fb 100644
--- a/tests/unit/SuggestBuilderTest.php
+++ b/tests/unit/SuggestBuilderTest.php
@@ -30,6 +30,7 @@
                $redirScore = (int) ( $score * 
SuggestBuilder::REDIRECT_DISCOUNT );
                $doc = array(
                        'title' => 'Albert Einstein',
+                       'namespace' => 0,
                        'redirect' => array(
                                array( 'title' => "Albert Enstein", 'namespace' 
=> 0 ),
                                array( 'title' => "Albert Einsten", 'namespace' 
=> 0 ),
@@ -78,6 +79,7 @@
                $redirScore = (int) ( $score * 
SuggestBuilder::REDIRECT_DISCOUNT );
                $doc = array(
                        'title' => 'Iraq',
+                       'namespace' => 0,
                        'redirect' => array(
                                array( 'title' => "Eraq", 'namespace' => 0 ),
                                array( 'title' => "Irak", 'namespace' => 0 ),
@@ -115,17 +117,63 @@
                $this->assertSame( $expected, $suggestions );
        }
 
+       public function testCrossNSRedirects() {
+               $builder = new SuggestBuilder( 
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks' ) );
+               $score = 10;
+               $redirScore = (int) ( $score * 
SuggestBuilder::REDIRECT_DISCOUNT );
+               $doc = array(
+                       'title' => 'Navigation',
+                       'namespace' => 12,
+                       'redirect' => array(
+                               array( 'title' => 'WP:HN', 'namespace' => 0 ),
+                               array( 'title' => 'WP:NAV', 'namespace' => 0 ),
+                       ),
+                       'incoming_links' => $score
+               );
+
+               $expected = array(
+                       array(
+                               'suggest' => array(
+                                       'input' => array( 'WP:HN' ),
+                                       'output' => '0:t:WP:HN', // LinkBatch 
will set 0...
+                                       'weight' => $score
+                               ),
+                               'suggest-stop' => array(
+                                       'input' => array( 'WP:HN' ),
+                                       'output' => '0:t:WP:HN',
+                                       'weight' => $score
+                               ),
+                       ),
+                       array(
+                               'suggest' => array(
+                                       'input' => array( 'WP:NAV' ),
+                                       'output' => '0:t:WP:NAV',
+                                       'weight' => $score
+                               ),
+                               'suggest-stop' => array(
+                                       'input' => array( 'WP:NAV' ),
+                                       'output' => '0:t:WP:NAV',
+                                       'weight' => $score
+                               ),
+                       )
+               );
+               $suggestions = $this->buildSuggestions( $builder, $doc );
+               $this->assertSame( $expected, $suggestions );
+       }
+
        public function testUlm() {
                $builder = new SuggestBuilder( 
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks' ) );
                $score = 10;
                $redirScore = (int) ( $score * 
SuggestBuilder::REDIRECT_DISCOUNT );
                $doc = array(
                        'title' => 'Ulm',
+                       'namespace' => 0,
                        'redirect' => array(
                                array( 'title' => 'UN/LOCODE:DEULM', 
'namespace' => 0 ),
                                array( 'title'=> 'Ulm, Germany', 'namespace' => 
0 ),
                                array( 'title' => "Ulm displaced persons camp", 
'namespace' => 0 ),
                                array( 'title' => "Söflingen", 'namespace' => 0 
),
+                               array( 'title' => "Should be ignored", 
'namespace' => 1 ),
                        ),
                        'coordinates' => array(
                                array(
@@ -334,6 +382,6 @@
                                $dat = $x->getData();
                                unset( $dat['batch_id'] );
                                return $dat;
-                       }, $builder->build( 1, $doc ) );
+                       }, $builder->build( array( array( 'id' => 1, 'source' 
=> $doc ) ) ) );
        }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/276705
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I77779fe066409c62ee5c8b3bf936500e6924b23a
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to