DCausse has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/278716

Change subject: Fix cross-namespace redirect scores
......................................................................

Fix cross-namespace redirect scores

Refactored multiple calls to indexData into a single call with a loop over the
source indices.  The previous construct led to many subtle and dangerous bugs.
SuggestBuilder holds various states that should not be changed during the update
process, i.e. if batchId is udpated then the script might delete (when
recycling) all the data indexed from content index while indexing data from
general index.

This patch fixes the following score components:
* max_docs (used to normalize some score components) should be set to the total
  number of docs being indexed and not only to #docs in the source index being 
read.
* crossnamespace suggestions should be discounted at least like normal
  redirects. (discount set to 0.05 which is slightly more aggressive than
  classic redirects)

Bug: T130353
Change-Id: I782946a5fa60ab80dd9c990d6e82748da56040ac
---
M includes/BuildDocument/SuggestBuilder.php
M maintenance/updateSuggesterIndex.php
M tests/unit/SuggestBuilderTest.php
3 files changed, 98 insertions(+), 60 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/16/278716/1

diff --git a/includes/BuildDocument/SuggestBuilder.php 
b/includes/BuildDocument/SuggestBuilder.php
index c9e258a..db9be6f 100644
--- a/includes/BuildDocument/SuggestBuilder.php
+++ b/includes/BuildDocument/SuggestBuilder.php
@@ -45,6 +45,11 @@
        const REDIRECT_DISCOUNT = 0.1;
 
        /**
+        * Discount suggestions based on crossnamespace redirects
+        */
+       const CROSSNS_DISCOUNT = 0.05;
+
+       /**
         * Redirect suggestion type
         */
        const REDIRECT_SUGGESTION = 'r';
@@ -112,7 +117,13 @@
                                // Bad doc, nothing to do here.
                                continue;
                        }
-                       if( $inputDoc['namespace'] != NS_MAIN ) {
+                       if( $inputDoc['namespace'] == NS_MAIN ) {
+                               if ( !isset( $inputDoc['title'] ) ) {
+                                       // Bad doc, nothing to do here.
+                                       continue;
+                               }
+                               $docs = array_merge( $docs, 
$this->buildNormalSuggestions( $id, $inputDoc ) );
+                       } else {
                                if ( !isset( $inputDoc['redirect'] ) ) {
                                        // Bad doc, nothing to do here.
                                        continue;
@@ -127,8 +138,9 @@
                                        if ( $redir['namespace'] != 
$this->targetNamespace ) {
                                                continue;
                                        }
-                                       // Should we discount the score?
                                        $score = $this->scoringMethod->score( 
$inputDoc );
+                                       // Discount the score of these 
suggestions.
+                                       $score = (int) ($score * 
self::CROSSNS_DISCOUNT);
                                        // We support only earth and the 
primary/first coordinates...
                                        $location = 
$this->findPrimaryCoordinates( $inputDoc );
 
@@ -139,12 +151,6 @@
                                                'location' => $location
                                        );
                                }
-                       } else {
-                               if ( !isset( $inputDoc['title'] ) ) {
-                                       // Bad doc, nothing to do here.
-                                       continue;
-                               }
-                               $docs = array_merge( $docs, 
$this->buildNormalSuggestions( $id, $inputDoc ) );
                        }
                }
 
diff --git a/maintenance/updateSuggesterIndex.php 
b/maintenance/updateSuggesterIndex.php
index 3d9c3b0..54b9025 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -216,13 +216,13 @@
                        }
                } catch ( \Elastica\Exception\Connection\HttpException $e ) {
                        $message = $e->getMessage();
-                       $this->output( "\nUnexpected Elasticsearch failure.\n" 
);
+                       $this->log( "\nUnexpected Elasticsearch failure.\n" );
                        $this->error( "Http error communicating with 
Elasticsearch:  $message.\n", 1 );
                } catch ( \Elastica\Exception\ExceptionInterface $e ) {
                        $type = get_class( $e );
                        $message = ElasticsearchIntermediary::extractMessage( 
$e );
                        $trace = $e->getTraceAsString();
-                       $this->output( "\nUnexpected Elasticsearch failure.\n" 
);
+                       $this->log( "\nUnexpected Elasticsearch failure.\n" );
                        $this->error( "Elasticsearch failed in an unexpected 
way.  This is always a bug in CirrusSearch.\n" .
                                "Error type: $type\n" .
                                "Message: $message\n" .
@@ -265,10 +265,10 @@
                        // Extra check: if stats report usages we should not 
try to fix things
                        // automatically.
                        if ( $stats['_all']['total']['suggest']['total'] == 0 ) 
{
-                               $this->output( "Deleting broken index 
{$idx->getName()}\n" );
+                               $this->log( "Deleting broken index 
{$idx->getName()}\n" );
                                $this->deleteIndex( $idx );
                        } else {
-                               $this->output( "Broken index {$idx->getName()} 
appears to be in use, please check and delete.\n" );
+                               $this->log( "Broken index {$idx->getName()} 
appears to be in use, please check and delete.\n" );
                        }
 
                }
@@ -282,7 +282,6 @@
 
                $this->createIndex();
                $this->indexData();
-               $this->indexData( Connection::GENERAL_INDEX_TYPE );
                if ( $this->optimizeIndex ) {
                        $this->optimize();
                }
@@ -291,7 +290,7 @@
                $this->validateAlias();
                $this->updateVersions();
                $this->deleteOldIndex();
-               $this->output("done.\n");
+               $this->log("Done.\n");
        }
 
        private function canRecycle() {
@@ -369,10 +368,9 @@
         * but needs to be carefully tested on bigger indices with high QPS.
         */
        private function recycle() {
-               $this->output( "Recycling index 
{$this->getIndex()->getName()}\n");
+               $this->log( "Recycling index {$this->getIndex()->getName()}\n");
                $this->recycle = true;
                $this->indexData();
-               $this->indexData( Connection::GENERAL_INDEX_TYPE );
                // This is fragile... hopefully most of the docs will be 
deleted from the old segments
                // and will result in a fast operation.
                // New segments should not be affected.
@@ -408,7 +406,7 @@
                $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
                $totalDocsToDump = $totalDocsInIndex;
 
-               $this->output( "Deleting remaining docs from previous batch 
($totalDocsInIndex).\n" );
+               $this->log( "Deleting remaining docs from previous batch 
($totalDocsInIndex).\n" );
                Util::iterateOverScroll( $this->getIndex(), 
$result->getResponse()->getScrollId(), '15m',
                        function( $results ) use ( &$docsDumped, 
$totalDocsToDump ) {
                                $ids = array();
@@ -423,7 +421,7 @@
                                        }
                                );
                        }, 0, $this->indexRetryAttempts );
-               $this->output( "Done.\n" );
+               $this->log( "Done.\n" );
                // Old docs should be deleted now we can optimize and flush
                $this->optimize();
 
@@ -438,7 +436,7 @@
 
        private function deleteOldIndex() {
                if ( $this->oldIndex && $this->oldIndex->exists() ) {
-                       $this->output("Deleting " . $this->oldIndex->getName() 
. " ... ");
+                       $this->log("Deleting " . $this->oldIndex->getName() . " 
... ");
                        // @todo Utilize $this->oldIndex->delete(...) once 
Elastica library is updated
                        // to allow passing the master_timeout
                        $this->oldIndex->request(
@@ -472,26 +470,34 @@
        }
 
        private function optimize() {
-               $this->output("Optimizing index...");
+               $this->log("Optimizing index...");
                $this->getIndex()->optimize( array( 'max_num_segments' => 1 ) );
                $this->output("ok.\n");
        }
 
        private function expungeDeletes() {
-               $this->output("Purging deleted docs...");
+               $this->log("Purging deleted docs...");
                $this->getIndex()->optimize( array( 'only_expunge_deletes' => 
true, 'flush' => false ) );
                $this->output("ok.\n");
        }
 
-       private function indexData( $sourceIndexType = 
Connection::CONTENT_INDEX_TYPE ) {
+       private function indexData() {
                global $wgCirrusSearchCompletionDefaultScore;
                $scoreMethodName = $this->getOption( 'scoringMethod', 
$wgCirrusSearchCompletionDefaultScore );
                if ( $this->scoreMethod == null ) {
                        $this->scoreMethod = 
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
                }
                if ( $this->builder == null ) {
+                       // NOTE: the builder stores a batchId value to flag
+                       // documents indexed by this builder. Make sure to
+                       // reuse the same instance when building docs otherwize
+                       // the batchId might be regenerated and can cause data
+                       // loss when recycle the index.
                        $this->builder = new SuggestBuilder( 
$this->scoreMethod, $this->withGeo );
                }
+
+               // We build the suggestions by reading CONTENT and GENERAL 
indices.
+               $sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE, 
Connection::GENERAL_INDEX_TYPE );
 
                $query = new Query();
                $query->setFields( array( '_id', '_type', '_source' ) );
@@ -512,48 +518,67 @@
                        )
                );
 
-               $scrollOptions = array(
-                       'search_type' => 'scan',
-                       'scroll' => "15m",
-                       'size' => $this->indexChunkSize
-               );
+               // Run a first query to count the number of docs.
+               // This is needed for the scoring methods that need
+               // to normalize values against wiki size.
+               $mSearch = new \Elastica\Multi\Search( $this->getClient() );
+               foreach ( $sourceIndexTypes as $sourceIndexType ) {
+                       $search = new \Elastica\Search( $this->getClient() );
+                       $search->addIndex( $this->getConnection()->getIndex( 
$this->indexBaseName, $sourceIndexType ) );
+                       $search->setOption( 
\Elastica\Search::OPTION_SEARCH_TYPE, 
\Elastica\Search::OPTION_SEARCH_TYPE_COUNT );
+                       $mSearch->addSearch( $search );
+               }
 
-               // TODO: only content index for now ( we'll have to check how 
it works with commons )
-               $sourceIndex = $this->getConnection()->getIndex( 
$this->indexBaseName, $sourceIndexType );
-               $result = $sourceIndex->search( $query, $scrollOptions );
-               $totalDocsInIndex = $result->getResponse()->getData();
-               $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
-               $this->scoreMethod->setMaxDocs( $totalDocsInIndex );
-               $totalDocsToDump = $totalDocsInIndex;
+               $mSearchRes = $mSearch->search();
+               $total = 0;
+               foreach( $mSearchRes as $res ) {
+                       $total += $res->getTotalHits();
+               }
+               $this->log( "Setting max_docs to $total\n" );
+               $this->scoreMethod->setMaxDocs( $total );
 
+               foreach( $sourceIndexTypes as $sourceIndexType ) {
+                       $scrollOptions = array(
+                               'search_type' => 'scan',
+                               'scroll' => "15m",
+                               'size' => $this->indexChunkSize
+                       );
 
-               $docsDumped = 0;
-               $this->output( "Indexing $totalDocsToDump documents from 
$sourceIndexType ($totalDocsInIndex in the index) with batchId: 
{$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" );
+                       // TODO: only content index for now ( we'll have to 
check how it works with commons )
+                       $sourceIndex = $this->getConnection()->getIndex( 
$this->indexBaseName, $sourceIndexType );
+                       $result = $sourceIndex->search( $query, $scrollOptions 
);
+                       $totalDocsInIndex = $result->getResponse()->getData();
+                       $totalDocsInIndex = $totalDocsInIndex['hits']['total'];
+                       $totalDocsToDump = $totalDocsInIndex;
 
-               $destinationType = $this->getIndex()->getType( 
Connection::TITLE_SUGGEST_TYPE_NAME );
+                       $docsDumped = 0;
+                       $this->log( "Indexing $totalDocsToDump documents from 
$sourceIndexType ($totalDocsInIndex in the index) with batchId: 
{$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" );
 
-               Util::iterateOverScroll( $sourceIndex, 
$result->getResponse()->getScrollId(), '15m',
-                       function( $results ) use ( &$docsDumped, 
$totalDocsToDump,
-                                       $destinationType ) {
-                               $suggestDocs = array();
-                               $inputDocs = array();
-                               foreach ( $results as $result ) {
-                                       $docsDumped++;
-                                       $inputDocs[] = array(
-                                               'id' => $result->getId(),
-                                               'source' => $result->getSource()
-                                       );
-                               }
+                       $destinationType = $this->getIndex()->getType( 
Connection::TITLE_SUGGEST_TYPE_NAME );
 
-                               $suggestDocs = $this->builder->build( 
$inputDocs );
-                               $this->outputProgress( $docsDumped, 
$totalDocsToDump );
-                               Util::withRetry( $this->indexRetryAttempts,
-                                       function() use ( $destinationType, 
$suggestDocs ) {
-                                               $destinationType->addDocuments( 
$suggestDocs );
+                       Util::iterateOverScroll( $sourceIndex, 
$result->getResponse()->getScrollId(), '15m',
+                               function( $results ) use ( &$docsDumped, 
$totalDocsToDump,
+                                               $destinationType ) {
+                                       $suggestDocs = array();
+                                       $inputDocs = array();
+                                       foreach ( $results as $result ) {
+                                               $docsDumped++;
+                                               $inputDocs[] = array(
+                                                       'id' => 
$result->getId(),
+                                                       'source' => 
$result->getSource()
+                                               );
                                        }
-                               );
-                       }, 0, $this->indexRetryAttempts );
-               $this->output( "Indexing from $sourceIndexType index done.\n" );
+
+                                       $suggestDocs = $this->builder->build( 
$inputDocs );
+                                       $this->outputProgress( $docsDumped, 
$totalDocsToDump );
+                                       Util::withRetry( 
$this->indexRetryAttempts,
+                                               function() use ( 
$destinationType, $suggestDocs ) {
+                                                       
$destinationType->addDocuments( $suggestDocs );
+                                               }
+                                       );
+                               }, 0, $this->indexRetryAttempts );
+                       $this->log( "Indexing from $sourceIndexType index 
done.\n" );
+               }
        }
 
        public function validateAlias() {
@@ -593,6 +618,11 @@
                if ( ( $pctDone % 2 ) == 0 ) {
                        $this->outputIndented( "\t$pctDone% done...\n" );
                }
+       }
+
+       public function log( $message, $channel = NULL ) {
+               $date = new \DateTime();
+               parent::output( $date->format('Y-m-d H:i:s') . " " . $message, 
$channel );
        }
 
        /**
@@ -660,7 +690,7 @@
        }
 
        private function enableReplicas() {
-               $this->output("Enabling replicas...\n");
+               $this->log("Enabling replicas...\n");
                $args = array(
                        'index' => array(
                                'auto_expand_replicas' => 
$this->getReplicaCount(),
@@ -684,7 +714,7 @@
        }
 
        private function waitForGreen( $timeout = 600 ) {
-               $this->output( "Waiting for the index to go green...\n" );
+               $this->log( "Waiting for the index to go green...\n" );
                // Wait for the index to go green ( default 10 min)
                if ( !$this->utils->waitForGreen( $this->getIndex()->getName(), 
$timeout ) ) {
                        $this->error( "Failed to wait for green... please check 
config and delete the {$this->getIndex()->getName()} index if it was created.", 
1 );
@@ -703,7 +733,7 @@
        }
 
        private function updateVersions() {
-               $this->outputIndented( "Updating tracking indexes..." );
+               $this->log( "Updating tracking indexes..." );
                $index = $this->getConnection()->getIndex( 'mw_cirrus_versions' 
);
                if ( !$index->exists() ) {
                        throw new \Exception("mw_cirrus_versions does not 
exist, you must index your data first");
diff --git a/tests/unit/SuggestBuilderTest.php 
b/tests/unit/SuggestBuilderTest.php
index 32c39fb..04a7742 100644
--- a/tests/unit/SuggestBuilderTest.php
+++ b/tests/unit/SuggestBuilderTest.php
@@ -131,6 +131,8 @@
                        'incoming_links' => $score
                );
 
+               $score = (int) (SuggestBuilder::CROSSNS_DISCOUNT * $score);
+
                $expected = array(
                        array(
                                'suggest' => array(

-- 
To view, visit https://gerrit.wikimedia.org/r/278716
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I782946a5fa60ab80dd9c990d6e82748da56040ac
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <dcau...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to