DCausse has uploaded a new change for review. https://gerrit.wikimedia.org/r/278716
Change subject: Fix cross-namespace redirect scores ...................................................................... Fix cross-namespace redirect scores Refactored multiple calls to indexData into a single call with a loop over the source indices. The previous construct led to many subtle and dangerous bugs. SuggestBuilder holds various states that should not be changed during the update process, i.e. if batchId is udpated then the script might delete (when recycling) all the data indexed from content index while indexing data from general index. This patch fixes the following score components: * max_docs (used to normalize some score components) should be set to the total number of docs being indexed and not only to #docs in the source index being read. * crossnamespace suggestions should be discounted at least like normal redirects. (discount set to 0.05 which is slightly more aggressive than classic redirects) Bug: T130353 Change-Id: I782946a5fa60ab80dd9c990d6e82748da56040ac --- M includes/BuildDocument/SuggestBuilder.php M maintenance/updateSuggesterIndex.php M tests/unit/SuggestBuilderTest.php 3 files changed, 98 insertions(+), 60 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/16/278716/1 diff --git a/includes/BuildDocument/SuggestBuilder.php b/includes/BuildDocument/SuggestBuilder.php index c9e258a..db9be6f 100644 --- a/includes/BuildDocument/SuggestBuilder.php +++ b/includes/BuildDocument/SuggestBuilder.php @@ -45,6 +45,11 @@ const REDIRECT_DISCOUNT = 0.1; /** + * Discount suggestions based on crossnamespace redirects + */ + const CROSSNS_DISCOUNT = 0.05; + + /** * Redirect suggestion type */ const REDIRECT_SUGGESTION = 'r'; @@ -112,7 +117,13 @@ // Bad doc, nothing to do here. continue; } - if( $inputDoc['namespace'] != NS_MAIN ) { + if( $inputDoc['namespace'] == NS_MAIN ) { + if ( !isset( $inputDoc['title'] ) ) { + // Bad doc, nothing to do here. + continue; + } + $docs = array_merge( $docs, $this->buildNormalSuggestions( $id, $inputDoc ) ); + } else { if ( !isset( $inputDoc['redirect'] ) ) { // Bad doc, nothing to do here. continue; @@ -127,8 +138,9 @@ if ( $redir['namespace'] != $this->targetNamespace ) { continue; } - // Should we discount the score? $score = $this->scoringMethod->score( $inputDoc ); + // Discount the score of these suggestions. + $score = (int) ($score * self::CROSSNS_DISCOUNT); // We support only earth and the primary/first coordinates... $location = $this->findPrimaryCoordinates( $inputDoc ); @@ -139,12 +151,6 @@ 'location' => $location ); } - } else { - if ( !isset( $inputDoc['title'] ) ) { - // Bad doc, nothing to do here. - continue; - } - $docs = array_merge( $docs, $this->buildNormalSuggestions( $id, $inputDoc ) ); } } diff --git a/maintenance/updateSuggesterIndex.php b/maintenance/updateSuggesterIndex.php index 3d9c3b0..54b9025 100644 --- a/maintenance/updateSuggesterIndex.php +++ b/maintenance/updateSuggesterIndex.php @@ -216,13 +216,13 @@ } } catch ( \Elastica\Exception\Connection\HttpException $e ) { $message = $e->getMessage(); - $this->output( "\nUnexpected Elasticsearch failure.\n" ); + $this->log( "\nUnexpected Elasticsearch failure.\n" ); $this->error( "Http error communicating with Elasticsearch: $message.\n", 1 ); } catch ( \Elastica\Exception\ExceptionInterface $e ) { $type = get_class( $e ); $message = ElasticsearchIntermediary::extractMessage( $e ); $trace = $e->getTraceAsString(); - $this->output( "\nUnexpected Elasticsearch failure.\n" ); + $this->log( "\nUnexpected Elasticsearch failure.\n" ); $this->error( "Elasticsearch failed in an unexpected way. This is always a bug in CirrusSearch.\n" . "Error type: $type\n" . "Message: $message\n" . @@ -265,10 +265,10 @@ // Extra check: if stats report usages we should not try to fix things // automatically. if ( $stats['_all']['total']['suggest']['total'] == 0 ) { - $this->output( "Deleting broken index {$idx->getName()}\n" ); + $this->log( "Deleting broken index {$idx->getName()}\n" ); $this->deleteIndex( $idx ); } else { - $this->output( "Broken index {$idx->getName()} appears to be in use, please check and delete.\n" ); + $this->log( "Broken index {$idx->getName()} appears to be in use, please check and delete.\n" ); } } @@ -282,7 +282,6 @@ $this->createIndex(); $this->indexData(); - $this->indexData( Connection::GENERAL_INDEX_TYPE ); if ( $this->optimizeIndex ) { $this->optimize(); } @@ -291,7 +290,7 @@ $this->validateAlias(); $this->updateVersions(); $this->deleteOldIndex(); - $this->output("done.\n"); + $this->log("Done.\n"); } private function canRecycle() { @@ -369,10 +368,9 @@ * but needs to be carefully tested on bigger indices with high QPS. */ private function recycle() { - $this->output( "Recycling index {$this->getIndex()->getName()}\n"); + $this->log( "Recycling index {$this->getIndex()->getName()}\n"); $this->recycle = true; $this->indexData(); - $this->indexData( Connection::GENERAL_INDEX_TYPE ); // This is fragile... hopefully most of the docs will be deleted from the old segments // and will result in a fast operation. // New segments should not be affected. @@ -408,7 +406,7 @@ $totalDocsInIndex = $totalDocsInIndex['hits']['total']; $totalDocsToDump = $totalDocsInIndex; - $this->output( "Deleting remaining docs from previous batch ($totalDocsInIndex).\n" ); + $this->log( "Deleting remaining docs from previous batch ($totalDocsInIndex).\n" ); Util::iterateOverScroll( $this->getIndex(), $result->getResponse()->getScrollId(), '15m', function( $results ) use ( &$docsDumped, $totalDocsToDump ) { $ids = array(); @@ -423,7 +421,7 @@ } ); }, 0, $this->indexRetryAttempts ); - $this->output( "Done.\n" ); + $this->log( "Done.\n" ); // Old docs should be deleted now we can optimize and flush $this->optimize(); @@ -438,7 +436,7 @@ private function deleteOldIndex() { if ( $this->oldIndex && $this->oldIndex->exists() ) { - $this->output("Deleting " . $this->oldIndex->getName() . " ... "); + $this->log("Deleting " . $this->oldIndex->getName() . " ... "); // @todo Utilize $this->oldIndex->delete(...) once Elastica library is updated // to allow passing the master_timeout $this->oldIndex->request( @@ -472,26 +470,34 @@ } private function optimize() { - $this->output("Optimizing index..."); + $this->log("Optimizing index..."); $this->getIndex()->optimize( array( 'max_num_segments' => 1 ) ); $this->output("ok.\n"); } private function expungeDeletes() { - $this->output("Purging deleted docs..."); + $this->log("Purging deleted docs..."); $this->getIndex()->optimize( array( 'only_expunge_deletes' => true, 'flush' => false ) ); $this->output("ok.\n"); } - private function indexData( $sourceIndexType = Connection::CONTENT_INDEX_TYPE ) { + private function indexData() { global $wgCirrusSearchCompletionDefaultScore; $scoreMethodName = $this->getOption( 'scoringMethod', $wgCirrusSearchCompletionDefaultScore ); if ( $this->scoreMethod == null ) { $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName ); } if ( $this->builder == null ) { + // NOTE: the builder stores a batchId value to flag + // documents indexed by this builder. Make sure to + // reuse the same instance when building docs otherwize + // the batchId might be regenerated and can cause data + // loss when recycle the index. $this->builder = new SuggestBuilder( $this->scoreMethod, $this->withGeo ); } + + // We build the suggestions by reading CONTENT and GENERAL indices. + $sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE, Connection::GENERAL_INDEX_TYPE ); $query = new Query(); $query->setFields( array( '_id', '_type', '_source' ) ); @@ -512,48 +518,67 @@ ) ); - $scrollOptions = array( - 'search_type' => 'scan', - 'scroll' => "15m", - 'size' => $this->indexChunkSize - ); + // Run a first query to count the number of docs. + // This is needed for the scoring methods that need + // to normalize values against wiki size. + $mSearch = new \Elastica\Multi\Search( $this->getClient() ); + foreach ( $sourceIndexTypes as $sourceIndexType ) { + $search = new \Elastica\Search( $this->getClient() ); + $search->addIndex( $this->getConnection()->getIndex( $this->indexBaseName, $sourceIndexType ) ); + $search->setOption( \Elastica\Search::OPTION_SEARCH_TYPE, \Elastica\Search::OPTION_SEARCH_TYPE_COUNT ); + $mSearch->addSearch( $search ); + } - // TODO: only content index for now ( we'll have to check how it works with commons ) - $sourceIndex = $this->getConnection()->getIndex( $this->indexBaseName, $sourceIndexType ); - $result = $sourceIndex->search( $query, $scrollOptions ); - $totalDocsInIndex = $result->getResponse()->getData(); - $totalDocsInIndex = $totalDocsInIndex['hits']['total']; - $this->scoreMethod->setMaxDocs( $totalDocsInIndex ); - $totalDocsToDump = $totalDocsInIndex; + $mSearchRes = $mSearch->search(); + $total = 0; + foreach( $mSearchRes as $res ) { + $total += $res->getTotalHits(); + } + $this->log( "Setting max_docs to $total\n" ); + $this->scoreMethod->setMaxDocs( $total ); + foreach( $sourceIndexTypes as $sourceIndexType ) { + $scrollOptions = array( + 'search_type' => 'scan', + 'scroll' => "15m", + 'size' => $this->indexChunkSize + ); - $docsDumped = 0; - $this->output( "Indexing $totalDocsToDump documents from $sourceIndexType ($totalDocsInIndex in the index) with batchId: {$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" ); + // TODO: only content index for now ( we'll have to check how it works with commons ) + $sourceIndex = $this->getConnection()->getIndex( $this->indexBaseName, $sourceIndexType ); + $result = $sourceIndex->search( $query, $scrollOptions ); + $totalDocsInIndex = $result->getResponse()->getData(); + $totalDocsInIndex = $totalDocsInIndex['hits']['total']; + $totalDocsToDump = $totalDocsInIndex; - $destinationType = $this->getIndex()->getType( Connection::TITLE_SUGGEST_TYPE_NAME ); + $docsDumped = 0; + $this->log( "Indexing $totalDocsToDump documents from $sourceIndexType ($totalDocsInIndex in the index) with batchId: {$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" ); - Util::iterateOverScroll( $sourceIndex, $result->getResponse()->getScrollId(), '15m', - function( $results ) use ( &$docsDumped, $totalDocsToDump, - $destinationType ) { - $suggestDocs = array(); - $inputDocs = array(); - foreach ( $results as $result ) { - $docsDumped++; - $inputDocs[] = array( - 'id' => $result->getId(), - 'source' => $result->getSource() - ); - } + $destinationType = $this->getIndex()->getType( Connection::TITLE_SUGGEST_TYPE_NAME ); - $suggestDocs = $this->builder->build( $inputDocs ); - $this->outputProgress( $docsDumped, $totalDocsToDump ); - Util::withRetry( $this->indexRetryAttempts, - function() use ( $destinationType, $suggestDocs ) { - $destinationType->addDocuments( $suggestDocs ); + Util::iterateOverScroll( $sourceIndex, $result->getResponse()->getScrollId(), '15m', + function( $results ) use ( &$docsDumped, $totalDocsToDump, + $destinationType ) { + $suggestDocs = array(); + $inputDocs = array(); + foreach ( $results as $result ) { + $docsDumped++; + $inputDocs[] = array( + 'id' => $result->getId(), + 'source' => $result->getSource() + ); } - ); - }, 0, $this->indexRetryAttempts ); - $this->output( "Indexing from $sourceIndexType index done.\n" ); + + $suggestDocs = $this->builder->build( $inputDocs ); + $this->outputProgress( $docsDumped, $totalDocsToDump ); + Util::withRetry( $this->indexRetryAttempts, + function() use ( $destinationType, $suggestDocs ) { + $destinationType->addDocuments( $suggestDocs ); + } + ); + }, 0, $this->indexRetryAttempts ); + $this->log( "Indexing from $sourceIndexType index done.\n" ); + } } public function validateAlias() { @@ -593,6 +618,11 @@ if ( ( $pctDone % 2 ) == 0 ) { $this->outputIndented( "\t$pctDone% done...\n" ); } + } + + public function log( $message, $channel = NULL ) { + $date = new \DateTime(); + parent::output( $date->format('Y-m-d H:i:s') . " " . $message, $channel ); } /** @@ -660,7 +690,7 @@ } private function enableReplicas() { - $this->output("Enabling replicas...\n"); + $this->log("Enabling replicas...\n"); $args = array( 'index' => array( 'auto_expand_replicas' => $this->getReplicaCount(), @@ -684,7 +714,7 @@ } private function waitForGreen( $timeout = 600 ) { - $this->output( "Waiting for the index to go green...\n" ); + $this->log( "Waiting for the index to go green...\n" ); // Wait for the index to go green ( default 10 min) if ( !$this->utils->waitForGreen( $this->getIndex()->getName(), $timeout ) ) { $this->error( "Failed to wait for green... please check config and delete the {$this->getIndex()->getName()} index if it was created.", 1 ); @@ -703,7 +733,7 @@ } private function updateVersions() { - $this->outputIndented( "Updating tracking indexes..." ); + $this->log( "Updating tracking indexes..." ); $index = $this->getConnection()->getIndex( 'mw_cirrus_versions' ); if ( !$index->exists() ) { throw new \Exception("mw_cirrus_versions does not exist, you must index your data first"); diff --git a/tests/unit/SuggestBuilderTest.php b/tests/unit/SuggestBuilderTest.php index 32c39fb..04a7742 100644 --- a/tests/unit/SuggestBuilderTest.php +++ b/tests/unit/SuggestBuilderTest.php @@ -131,6 +131,8 @@ 'incoming_links' => $score ); + $score = (int) (SuggestBuilder::CROSSNS_DISCOUNT * $score); + $expected = array( array( 'suggest' => array( -- To view, visit https://gerrit.wikimedia.org/r/278716 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I782946a5fa60ab80dd9c990d6e82748da56040ac Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits