jenkins-bot has submitted this change and it was merged. Change subject: Centralize document id generation ......................................................................
Centralize document id generation To support multi-project indices we have to make sure document id's don't overlap between projects. The main idea for this is to prefix all ids with the wiki name, so the document 12 becomes testwiki:12. Internal to elasticsearch id's are already treated as strings so nothing needs to change there. This patch tries to track down all the places we deal with id's and classify them as document ids or page ids. All the places page id's need to be converted to pageId's they pass through SearchConfig::makeId(). This works, but I worry that it is fragile. Enabling this and reindexing must be performed before doing any kind of multi-wiki merging. By having the indices already configured to use distinct id's the transition phase of reindexing into a single index and cutting over should be worry free, requiring only a standard time based reindex to cover updates during the initial reindexing period. Bug: T139496 Change-Id: I5f1aa56645f7ff88649e4238ca6d0af551c858c3 --- M CirrusSearch.php M includes/BuildDocument/SuggestBuilder.php M includes/CompletionSuggester.php M includes/Connection.php M includes/DataSender.php M includes/Dump.php M includes/ElasticsearchIntermediary.php M includes/Hooks.php M includes/Job/CheckerJob.php M includes/Job/DeletePages.php M includes/Job/ElasticaWrite.php M includes/Job/Job.php M includes/Job/OtherIndex.php M includes/Maintenance/ChunkBuilder.php M includes/Maintenance/Reindexer.php M includes/OtherIndexes.php M includes/Query/GeoFeature.php M includes/Query/InCategoryFeature.php M includes/Sanity/Checker.php M includes/Sanity/QueueingRemediator.php M includes/Sanity/Remediator.php M includes/Search/SearchContext.php M includes/SearchConfig.php M includes/Searcher.php M includes/Updater.php M maintenance/copySearchIndex.php M maintenance/forceSearchIndex.php M maintenance/indexNamespaces.php M maintenance/runSearch.php M maintenance/saneitize.php M maintenance/saneitizeJobs.php M 
maintenance/updateOneSearchIndexConfig.php M maintenance/updateSuggesterIndex.php M tests/unit/Query/GeoFeatureTest.php M tests/unit/SuggestBuilderTest.php 35 files changed, 443 insertions(+), 275 deletions(-) Approvals: DCausse: Looks good to me, approved jenkins-bot: Verified Objections: Cindy-the-browser-test-bot: There's a problem with this change, please improve diff --git a/CirrusSearch.php b/CirrusSearch.php index a285eec..860fca3 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -911,7 +911,9 @@ $wgCirrusSearchSanityCheck = true; /** - * The base name of indexes used on this wiki. + * The base name of indexes used on this wiki. This value must be + * unique across all wiki's sharing an elasticsearch cluster unless + * $wgCirrusSearchMultiWikiIndices is set to true. */ $wgCirrusSearchIndexBaseName = wfWikiID(); @@ -936,6 +938,15 @@ */ $wgCirrusSearchFullTextQueryBuilderProfile = 'default'; +/** + * Transitionary flag for converting between older style + * doc ids (page ids) to the newer style ids (wikiid|pageid). + * Changing this from false to true requires first turning + * this on, then performing an in-place reindex. There may + * be some duplicate/outdated results while the inplace + * reindex is running. + */ +$wgCirrusSearchPrefixIds = false; $includes = __DIR__ . "/includes/"; $apiDir = $includes . 'Api/'; diff --git a/includes/BuildDocument/SuggestBuilder.php b/includes/BuildDocument/SuggestBuilder.php index 94c7fac..97360b1 100644 --- a/includes/BuildDocument/SuggestBuilder.php +++ b/includes/BuildDocument/SuggestBuilder.php @@ -113,7 +113,7 @@ $docs = array(); foreach ( $inputDocs as $sourceDoc ) { $inputDoc = $sourceDoc['source']; - $id = $sourceDoc['id']; + $docId = $sourceDoc['id']; if ( !isset( $inputDoc['namespace'] ) ) { // Bad doc, nothing to do here. continue; @@ -123,7 +123,7 @@ // Bad doc, nothing to do here. 
continue; } - $docs = array_merge( $docs, $this->buildNormalSuggestions( $id, $inputDoc ) ); + $docs = array_merge( $docs, $this->buildNormalSuggestions( $docId, $inputDoc ) ); } else { if ( !isset( $inputDoc['redirect'] ) ) { // Bad doc, nothing to do here. @@ -181,11 +181,11 @@ /** * Build classic suggestion * - * @param int $id + * @param string $docId * @param array $inputDoc * @return \Elastica\Document[] a set of suggest documents */ - private function buildNormalSuggestions( $id, array $inputDoc ) { + private function buildNormalSuggestions( $docId, array $inputDoc ) { if ( !isset( $inputDoc['title'] ) ) { // Bad doc, nothing to do here. return array(); @@ -197,9 +197,9 @@ $location = $this->findPrimaryCoordinates( $inputDoc ); $suggestions = $this->extractTitleAndSimilarRedirects( $inputDoc ); - $docs[] = $this->buildTitleSuggestion( $id, $suggestions['group'], $location, $score ); + $docs[] = $this->buildTitleSuggestion( $docId, $suggestions['group'], $location, $score ); if ( !empty( $suggestions['candidates'] ) ) { - $docs[] = $this->buildRedirectsSuggestion( $id, $suggestions['candidates'], + $docs[] = $this->buildRedirectsSuggestion( $docId, $suggestions['candidates'], $location, $score ); } return $docs; @@ -250,20 +250,20 @@ * The output is encoded as pageId:t:Title. * NOTE: the client will be able to display Title encoded in the output when searching. 
 * - * @param int $id the page id + * @param string $docId the elasticsearch document id * @param array $title the title in 'text' and an array of similar redirects in 'variants' * @param array|null $location the geo coordinates or null if unavailable * @param int $score the weight of the suggestion * @return \Elastica\Document the suggestion document */ - private function buildTitleSuggestion( $id, array $title, array $location = null, $score ) { + private function buildTitleSuggestion( $docId, array $title, array $location = null, $score ) { $inputs = array( $this->prepareInput( $title['text'] ) ); foreach ( $title['variants'] as $variant ) { $inputs[] = $this->prepareInput( $variant ); } - $output = self::encodeTitleOutput( $id, $title['text'] ); + $output = self::encodeTitleOutput( $docId, $title['text'] ); return $this->buildSuggestion( - self::TITLE_SUGGESTION . $id, + self::TITLE_SUGGESTION . $docId, $output, $inputs, $location, @@ -279,33 +279,33 @@ * and choose the best one to display. This is because we are unable * to make this decision at index time. * - * @param int $id the page id + * @param string $docId the elasticsearch document id * @param string[] $redirects * @param array|null $location the geo coordinates or null if unavailable * @param int $score the weight of the suggestion * @return \Elastica\Document the suggestion document */ - private function buildRedirectsSuggestion( $id, array $redirects, array $location = null, $score ) { + private function buildRedirectsSuggestion( $docId, array $redirects, array $location = null, $score ) { $inputs = array(); foreach ( $redirects as $redirect ) { $inputs[] = $this->prepareInput( $redirect ); } - $output = $id . ":" . self::REDIRECT_SUGGESTION; + $output = $docId . ":" . self::REDIRECT_SUGGESTION; $score = (int) ( $score * self::REDIRECT_DISCOUNT ); - return $this->buildSuggestion( self::REDIRECT_SUGGESTION . $id, $output, $inputs, $location, $score ); + return $this->buildSuggestion( self::REDIRECT_SUGGESTION . 
$docId, $output, $inputs, $location, $score ); } /** * Builds a suggestion document. * - * @param string $id The document id + * @param string $docId The document id * @param string $output the suggestion output * @param string[] $inputs the suggestion inputs * @param array|null $location the geo coordinates or null if unavailable * @param int $score the weight of the suggestion * @return \Elastica\Document a doc ready to be indexed in the completion suggester */ - private function buildSuggestion( $id, $output, array $inputs, array $location = null, $score ) { + private function buildSuggestion( $docId, $output, array $inputs, array $location = null, $score ) { $doc = array( 'batch_id' => $this->batchId, 'suggest' => array ( @@ -334,7 +334,7 @@ 'context' => array( 'location' => $location ) ); } - return new \Elastica\Document( $id, $doc ); + return new \Elastica\Document( $docId, $doc ); } /** @@ -477,22 +477,22 @@ /** * Encode a title suggestion output * - * @param int $id pageId + * @param string $docId elasticsearch document id * @param string $title * @return string the encoded output */ - public static function encodeTitleOutput( $id, $title ) { - return $id . ':'. self::TITLE_SUGGESTION . ':' . $title; + public static function encodeTitleOutput( $docId, $title ) { + return $docId . ':'. self::TITLE_SUGGESTION . ':' . $title; } /** * Encode a redirect suggestion output * - * @param int $id pageId + * @param string $docId elasticsearch document id * @return string the encoded output */ - public static function encodeRedirectOutput( $id ) { - return $id . ':' . self::REDIRECT_SUGGESTION; + public static function encodeRedirectOutput( $docId ) { + return $docId . ':' . 
self::REDIRECT_SUGGESTION; } /** @@ -503,7 +503,7 @@ * text (optional): if TITLE_SUGGESTION the Title text * * @param string $output text value returned by a suggest query - * @return array|null mixed or null if the output is not properly encoded + * @return string[]|null array of strings, or null if the output is not properly encoded */ public static function decodeOutput( $output ) { if ( $output == null ) { @@ -519,7 +519,7 @@ switch( $parts[1] ) { case self::REDIRECT_SUGGESTION: return array( - 'id' => $parts[0], + 'docId' => $parts[0], 'type' => self::REDIRECT_SUGGESTION, ); case self::TITLE_SUGGESTION: @@ -527,7 +527,7 @@ return null; } return array( - 'id' => $parts[0], + 'docId' => $parts[0], 'type' => self::TITLE_SUGGESTION, 'text' => $parts[2] ); diff --git a/includes/CompletionSuggester.php b/includes/CompletionSuggester.php index 342553a..4a907bc 100644 --- a/includes/CompletionSuggester.php +++ b/includes/CompletionSuggester.php @@ -427,8 +427,8 @@ unset( $data['_shards'] ); $limit = $this->getHardLimit(); - $suggestions = array(); - $suggestionProfile = array(); + $suggestionsByDocId = array(); + $suggestionProfileByDocId = array(); foreach ( $data as $name => $results ) { $discount = $profiles[$name]['discount']; foreach ( $results as $suggested ) { @@ -438,50 +438,49 @@ // Ignore broken output continue; } - $pageId = $output['id']; + $docId = $output['docId']; $type = $output['type']; $score = $discount * $suggest['score']; - if ( !isset( $suggestions[$pageId] ) || - $score > $suggestions[$pageId]->getScore() + if ( !isset( $suggestionsByDocId[$docId] ) || + $score > $suggestionsByDocId[$docId]->getScore() ) { + $pageId = $this->config->makePageId( $docId ); $suggestion = new SearchSuggestion( $score, null, null, $pageId ); // If it's a title suggestion we have the text if ( $type === SuggestBuilder::TITLE_SUGGESTION ) { $suggestion->setText( $output['text'] ); } - $suggestions[$pageId] = $suggestion; - $suggestionProfile[$pageId] = $name; + 
$suggestionsByDocId[$docId] = $suggestion; + $suggestionProfileByDocId[$docId] = $name; } } } } // simply sort by existing scores - uasort( $suggestions, function ( SearchSuggestion $a, SearchSuggestion $b ) { + uasort( $suggestionsByDocId, function ( SearchSuggestion $a, SearchSuggestion $b ) { return $b->getScore() - $a->getScore(); } ); - $this->logContext['hitsTotal'] = count( $suggestions ); + $this->logContext['hitsTotal'] = count( $suggestionsByDocId ); - if ( $this->offset < $limit ) { - $suggestions = array_slice( $suggestions, $this->offset, $limit - $this->offset, true ); - } else { - $suggestions = array(); - } + $suggestionsByDocId = $this->offset < $limit + ? array_slice( $suggestionsByDocId, $this->offset, $limit - $this->offset, true ) + : array(); - $this->logContext['hitsReturned'] = count( $suggestions ); + $this->logContext['hitsReturned'] = count( $suggestionsByDocId ); $this->logContext['hitsOffset'] = $this->offset; // we must fetch redirect data for redirect suggestions - $missingText = array(); - foreach ( $suggestions as $id => $suggestion ) { + $missingTextDocIds = array(); + foreach ( $suggestionsByDocId as $docId => $suggestion ) { if ( $suggestion->getText() === null ) { - $missingText[] = $id; + $missingTextDocIds[] = $docId; } } - if ( !empty ( $missingText ) ) { + if ( !empty ( $missingTextDocIds ) ) { // Experimental. // // Second pass query to fetch redirects. @@ -497,7 +496,7 @@ $redirResponse = null; try { $redirResponse = $type->request( '_mget', 'GET', - array( 'ids' => $missingText ), + array( 'ids' => $missingTextDocIds ), array( '_source_include' => 'redirect' ) ); if ( $redirResponse->isOk() ) { $this->logContext['elasticTook2PassMs'] = intval( $redirResponse->getQueryTime() * 1000 ); @@ -508,8 +507,8 @@ } // We use the original query, we should maybe use the variant that generated this result? 
$text = Util::chooseBestRedirect( $this->term, $doc['_source']['redirect'] ); - if( !empty( $suggestions[$doc['_id']] ) ) { - $suggestions[$doc['_id']]->setText( $text ); + if( !empty( $suggestionsByDocId[$doc['_id']] ) ) { + $suggestionsByDocId[$doc['_id']]->setText( $text ); } } } else { @@ -534,7 +533,7 @@ } $finalResults = array_filter( - $suggestions, + $suggestionsByDocId, function ( SearchSuggestion $suggestion ) { // text should be not empty for suggestions return $suggestion->getText() != null; @@ -544,7 +543,7 @@ $this->logContext['hits'] = array(); $indexName = $this->connection->getIndex( $this->indexBaseName, Connection::TITLE_SUGGEST_TYPE )->getName(); $maxScore = 0; - foreach ( $finalResults as $suggestion ) { + foreach ( $finalResults as $docId => $suggestion ) { $title = $suggestion->getSuggestedTitle(); $pageId = $suggestion->getSuggestedTitleID() ?: -1; $maxScore = max( $maxScore, $suggestion->getScore() ); @@ -554,7 +553,9 @@ 'title' => $title ? (string) $title : $suggestion->getText(), 'index' => $indexName, 'pageId' => (int) $pageId, - 'profileName' => isset( $suggestionProfile[$pageId] ) ? $suggestionProfile[$pageId] : "", + 'profileName' => isset( $suggestionProfileByDocId[$docId] ) + ? $suggestionProfileByDocId[$docId] + : "", 'score' => $suggestion->getScore(), ); } diff --git a/includes/Connection.php b/includes/Connection.php index 12786f0..a80f6b4 100644 --- a/includes/Connection.php +++ b/includes/Connection.php @@ -259,6 +259,7 @@ /** * Is there more then one namespace in the provided index type? 
+ * * @param string $indexType an index type * @return false|integer false if the number of indexes is unknown, an integer if it is known */ diff --git a/includes/DataSender.php b/includes/DataSender.php index 29c6413..17911b0 100644 --- a/includes/DataSender.php +++ b/includes/DataSender.php @@ -42,6 +42,11 @@ private $indexBaseName; /** + * @var SearchConfig + */ + private $searchConfig; + + /** * @var Connection */ public function __construct( Connection $conn, SearchConfig $config ) { @@ -49,6 +54,7 @@ $this->log = LoggerFactory::getInstance( 'CirrusSearch' ); $this->failedLog = LoggerFactory::getInstance( 'CirrusSearchChangeFailed' ); $this->indexBaseName = $config->get( SearchConfig::INDEX_BASE_NAME ); + $this->searchConfig = $config; } /** @@ -186,10 +192,10 @@ $responseSet = $bulk->send(); } catch ( ResponseException $e ) { $justDocumentMissing = $this->bulkResponseExceptionIsJustDocumentMissing( $e, - function( $id ) use ( $e ) { + function( $docId ) use ( $e ) { $this->log->info( - "Updating a page that doesn't yet exist in Elasticsearch: {id}", - array( 'id' => $id ) + "Updating a page that doesn't yet exist in Elasticsearch: {docId}", + array( 'docId' => $docId ) ); } ); @@ -220,11 +226,11 @@ /** * Send delete requests to Elasticsearch. * - * @param int[] $ids ids to delete from Elasticsearch + * @param string[] $docIds elasticsearch document ids to delete * @param string|null $indexType index from which to delete. null means all. 
* @return Status */ - public function sendDeletes( $ids, $indexType = null ) { + public function sendDeletes( $docIds, $indexType = null ) { if ( $indexType === null ) { $indexes = $this->connection->getAllIndexTypes(); } else { @@ -235,7 +241,7 @@ return Status::newFatal( 'cirrussearch-indexes-frozen' ); } - $idCount = count( $ids ); + $idCount = count( $docIds ); if ( $idCount !== 0 ) { try { foreach ( $indexes as $indexType ) { @@ -244,13 +250,13 @@ 'indexType' => $indexType, 'queryType' => 'send_deletes', ) ); - $this->connection->getPageType( $this->indexBaseName, $indexType )->deleteIds( $ids ); + $this->connection->getPageType( $this->indexBaseName, $indexType )->deleteIds( $docIds ); $this->success(); } } catch ( \Elastica\Exception\ExceptionInterface $e ) { $this->failure( $e ); $this->failedLog->warning( - 'Delete for ids: ' . implode( ',', $ids ), + 'Delete for ids: ' . implode( ',', $docIds ), array( 'exception' => $e ) ); return Status::newFatal( 'cirrussearch-failed-send-deletes' ); @@ -263,7 +269,7 @@ /** * @param string $localSite The wikiId to add/remove from local_sites_with_dupe * @param string $indexName The name of the index to perform updates to - * @param array $otherActions A list of arrays each containing the id within elasticsearch ('id') and the article id within $localSite ('articleId') + * @param array $otherActions A list of arrays each containing the id within elasticsearch ('docId') and the article namespace ('ns') and DB key ('dbKey') at the within $localSite * @return Status */ public function sendOtherIndexUpdates( $localSite, $indexName, array $otherActions ) { @@ -289,7 +295,7 @@ ), 'native' ); - $script->setId( $update['id'] ); + $script->setId( $update['docId'] ); $script->setParam( '_type', 'page' ); $script->setParam( '_index', $indexName ); $bulk->addScript( $script, 'update' ); @@ -380,11 +386,11 @@ // This is generally not an error but we should // log it to see how many we get $action = $bulkResponse->getAction(); - $id 
= 'missing'; + $docId = 'missing'; if ( $action instanceof \Elastica\Bulk\Action\AbstractDocument ) { - $id = $action->getData()->getId(); + $docId = $action->getData()->getId(); } - call_user_func( $logCallback, $id ); + call_user_func( $logCallback, $docId ); } } return $justDocumentMissing; diff --git a/includes/Dump.php b/includes/Dump.php index 0b8c7e9..d1e80df 100644 --- a/includes/Dump.php +++ b/includes/Dump.php @@ -38,8 +38,9 @@ $conn = new Connection( $config ); $searcher = new Searcher( $conn, 0, 0, null, array(), $this->getUser() ); - $id = $this->getTitle()->getArticleID(); - $esSources = $searcher->get( array( $id ), true ); + /** @suppress PhanUndeclaredMethod Phan doesn't know $config is a SearchConfig */ + $docId = $config->makeId( $this->getTitle()->getArticleID() ); + $esSources = $searcher->get( array( $docId ), true ); if ( !$esSources->isOK() ) { // Exception has been logged echo '{}'; diff --git a/includes/ElasticsearchIntermediary.php b/includes/ElasticsearchIntermediary.php index 584c683..f2a19f8 100644 --- a/includes/ElasticsearchIntermediary.php +++ b/includes/ElasticsearchIntermediary.php @@ -196,8 +196,8 @@ $request['indices'][] = $context['index']; } if ( isset( $context['namespaces'] ) ) { - foreach ( $context['namespaces'] as $id ) { - $request['namespaces'][] = (int) $id; + foreach ( $context['namespaces'] as $nsId ) { + $request['namespaces'][] = (int) $nsId; } } if ( !empty( $context['langdetect' ] ) ) { diff --git a/includes/Hooks.php b/includes/Hooks.php index e245152..30b645c 100644 --- a/includes/Hooks.php +++ b/includes/Hooks.php @@ -364,7 +364,9 @@ // Note that we must use the article id provided or it'll be lost in the ether. The job can't // load it from the title because the page row has already been deleted. 
JobQueueGroup::singleton()->push( - new Job\DeletePages( $page->getTitle(), array( 'id' => $pageId ) ) + new Job\DeletePages( $page->getTitle(), array( + 'docId' => self::getConfig()->makeId( $pageId ) + ) ) ); return true; } @@ -633,7 +635,7 @@ $oldIndexType = $conn->getIndexSuffixForNamespace( $title->getNamespace() ); $job = new Job\DeletePages( $title, array( 'indexType' => $oldIndexType, - 'id' => $oldId + 'docId' => self::getConfig()->makeId( $oldId ) ) ); // Push the job after DB commit but cancel on rollback wfGetDB( DB_MASTER )->onTransactionIdle( function() use ( $job ) { @@ -708,13 +710,19 @@ } /** + * @return SearchConfig + */ + private static function getConfig() { + return MediaWikiServices::getInstance() + ->getConfigFactory() + ->makeConfig( 'CirrusSearch' ); + } + + /** * @return Connection */ private static function getConnection() { - $config = MediaWikiServices::getInstance() - ->getConfigFactory() - ->makeConfig( 'CirrusSearch' ); - return new Connection( $config ); + return new Connection( self::getConfig() ); } /** diff --git a/includes/Job/CheckerJob.php b/includes/Job/CheckerJob.php index a4895cb..b199972 100644 --- a/includes/Job/CheckerJob.php +++ b/includes/Job/CheckerJob.php @@ -5,7 +5,6 @@ use ArrayObject; use CirrusSearch\Connection; use CirrusSearch\Searcher; -use CirrusSearch\SearchConfig; use CirrusSearch\Sanity\Checker; use CirrusSearch\Sanity\QueueingRemediator; use MediaWiki\Logger\LoggerFactory; @@ -34,17 +33,17 @@ class CheckerJob extends Job { /** * Construct a new CheckerJob. 
- * @param int $fromId - * @param int $toId + * @param int $fromPageId + * @param int $toPageId * @param int $delay * @param string $profile sanitization profile to use * @param string|null $cluster * @return CheckerJob */ - public static function build( $fromId, $toId, $delay, $profile, $cluster ) { + public static function build( $fromPageId, $toPageId, $delay, $profile, $cluster ) { $job = new self( Title::makeTitle( 0, "" ), array( - 'fromId' => $fromId, - 'toId' => $toId, + 'fromPageId' => $fromPageId, + 'toPageId' => $toPageId, 'createdAt' => time(), 'retryCount' => 0, 'profile' => $profile, @@ -55,22 +54,28 @@ } /** + * @param Title $title + * @param array $params + */ + public function __construct( $title, $params ) { + // BC for jobs created before id fields were clarified to be explicitly page id's + if ( isset( $params['fromId'] ) ) { + $params['fromPageId'] = $params['fromId']; + unset( $params['fromId'] ); + } + if ( isset( $params['toId'] ) ) { + $params['toPageId'] = $params['toId']; + unset( $params['toId'] ); + } + parent::__construct( $title, $params ); + } + + /** * @return bool * @throws \MWException */ protected function doJob() { - $config = MediaWikiServices::getInstance() - ->getConfigFactory() - ->makeConfig( 'CirrusSearch' ); - return $this->runCheck( $config ); - } - - /** - * @param SearchConfig $config - * @return bool - */ - private function runCheck( SearchConfig $config ) { - $profile = $config->getElement( 'CirrusSearchSanitizationProfiles', $this->params['profile'] ); + $profile = $this->searchConfig->getElement( 'CirrusSearchSanitizationProfiles', $this->params['profile'] ); if( !$profile ) { LoggerFactory::getInstance( 'CirrusSearch' )->warning( "Cannot run CheckerJob invalid profile {profile} provided, check CirrusSearchSanityCheck config.", @@ -105,7 +110,7 @@ $startTime = time(); - $connections = $this->decideClusters( $config ); + $connections = $this->decideClusters(); $clusterNames = implode( ', ', array_keys( $connections 
) ); LoggerFactory::getInstance( 'CirrusSearch' )->debug( "Running CheckerJob on cluster $clusterNames {diff}s after insertion", @@ -115,14 +120,15 @@ ) ); - $from = $this->params['fromId']; - $to = $this->params['toId']; + $from = $this->params['fromPageId']; + $to = $this->params['toPageId']; $pageCache = new ArrayObject(); $checkers = array(); foreach( $connections as $cluster => $connection ) { - $searcher = new Searcher( $connection, 0, 0, $config, array(), null ); + $searcher = new Searcher( $connection, 0, 0, $this->searchConfig, array(), null ); $checker = new Checker( + $this->searchConfig, $connection, new QueueingRemediator( $cluster ), $searcher, diff --git a/includes/Job/DeletePages.php b/includes/Job/DeletePages.php index d794977..0ef32b7 100644 --- a/includes/Job/DeletePages.php +++ b/includes/Job/DeletePages.php @@ -2,6 +2,8 @@ namespace CirrusSearch\Job; +use MediaWiki\MediaWikiServices; + /** * Job wrapper around Updater::deletePages. If indexType parameter is * specified then only deletes from that type of index. @@ -23,6 +25,15 @@ */ class DeletePages extends Job { public function __construct( $title, $params ) { + // BC for jobs created before this explicitly handled document id's + if ( isset( $params['id'] ) ) { + // parent::__construct would set $this->searchConfig, but we need this early. Since + // it's only for BC probably ok to leave this dirtiness for now. + $config = MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 'CirrusSearch' ); + /** @suppress PhanUndeclaredMethod phan doesn't know this is a SearchConfig */ + $params['docId'] = $config->makeId( $params['id'] ); + unset( $params['id'] ); + } parent::__construct( $title, $params ); // This is one of the cheapest jobs we have. Plus I'm reasonably @@ -36,7 +47,7 @@ $updater = $this->createUpdater(); $indexType = isset( $this->params[ 'indexType' ] ) ? 
$this->params[ 'indexType' ] : null; - return $updater->deletePages( array( $this->title ), array( $this->params[ 'id' ] ), + return $updater->deletePages( array( $this->title ), array( $this->params[ 'docId' ] ), $wgCirrusSearchClientSideUpdateTimeout, $indexType ); } } diff --git a/includes/Job/ElasticaWrite.php b/includes/Job/ElasticaWrite.php index a9e3c74..8466ddf 100644 --- a/includes/Job/ElasticaWrite.php +++ b/includes/Job/ElasticaWrite.php @@ -63,10 +63,7 @@ } protected function doJob() { - $config = MediaWikiServices::getInstance() - ->getConfigFactory() - ->makeConfig( 'CirrusSearch' ); - $connections = $this->decideClusters( $config ); + $connections = $this->decideClusters(); $clusterNames = implode( ', ', array_keys( $connections ) ); LoggerFactory::getInstance( 'CirrusSearch' )->debug( "Running {method} on cluster $clusterNames {diff}s after insertion", @@ -84,7 +81,7 @@ $conn->setTimeout( $this->params['clientSideTimeout'] ); } - $sender = new DataSender( $conn, $config ); + $sender = new DataSender( $conn, $this->searchConfig ); try { $status = call_user_func_array( array( $sender, $this->params['method'] ), diff --git a/includes/Job/Job.php b/includes/Job/Job.php index 8e4d90c..c4ea432 100644 --- a/includes/Job/Job.php +++ b/includes/Job/Job.php @@ -36,6 +36,11 @@ protected $connection; /** + * @var SearchConfig + */ + protected $searchConfig; + + /** * @var bool should we retry if this job failed */ private $allowRetries = true; @@ -58,13 +63,13 @@ // data. Luckily, this is how the JobQueue implementations work. $this->removeDuplicates = true; - $config = MediaWikiServices::getInstance() + $this->searchConfig = MediaWikiServices::getInstance() ->getConfigFactory() ->makeConfig( 'CirrusSearch' ); // When the 'cluster' parameter is provided the job must only operate on // the specified cluster, take special care to ensure nested jobs get the // correct cluster set. When set to null all clusters should be written to. 
- $this->connection = Connection::getPool( $config, $params['cluster'] ); + $this->connection = Connection::getPool( $this->searchConfig, $params['cluster'] ); } public function setConnection( Connection $connection ) { @@ -145,7 +150,7 @@ if ( isset( $this->params['cluster'] ) ) { $flags[] = 'same-cluster'; } - return new Updater( $this->connection, $flags ); + return new Updater( $this->connection, $this->searchConfig, $flags ); } /** @@ -183,15 +188,14 @@ * NOTE: only suited for jobs that work on multiple clusters by * inspecting the 'cluster' job param * - * @param SearchConfig $config * @return Connection[] indexed by cluster name */ - protected function decideClusters( SearchConfig $config ) { + protected function decideClusters() { $cluster = isset ( $this->params['cluster'] ) ? $this->params['cluster'] : null; if ( $cluster === null ) { - return Connection::getWritableClusterConnections( $config ); + return Connection::getWritableClusterConnections( $this->searchConfig ); } - if ( !$config->canWriteToCluster( $cluster ) ) { + if ( !$this->searchConfig->canWriteToCluster( $cluster ) ) { // Just in case a job is present in the queue but its cluster // has been removed from the config file. LoggerFactory::getInstance( 'CirrusSearch' )->warning( @@ -204,6 +208,6 @@ // this job does not allow retries so we just need to throw an exception throw new \RuntimeException( "Received {$this->command} job for an unwritable cluster $cluster." 
); } - return array( $cluster => Connection::getPool( $config, $cluster ) ); + return array( $cluster => Connection::getPool( $this->searchConfig, $cluster ) ); } } diff --git a/includes/Job/OtherIndex.php b/includes/Job/OtherIndex.php index a922855..774f71d 100644 --- a/includes/Job/OtherIndex.php +++ b/includes/Job/OtherIndex.php @@ -61,7 +61,7 @@ if ( $this->params['cluster'] ) { $flags[] = 'same-cluster'; } - $otherIdx = new OtherIndexes( $this->connection, $flags, wfWikiID() ); + $otherIdx = new OtherIndexes( $this->connection, $this->searchConfig, $flags, wfWikiID() ); $otherIdx->updateOtherIndex( $titles ); } } diff --git a/includes/Maintenance/ChunkBuilder.php b/includes/Maintenance/ChunkBuilder.php index 3221146..d08beb5 100644 --- a/includes/Maintenance/ChunkBuilder.php +++ b/includes/Maintenance/ChunkBuilder.php @@ -29,19 +29,19 @@ * larger than that size are spat out. If specified as a number followed * by the word "total" without a space between them then that many chunks * will be spat out sized to cover the entire wiki. 
- * @param int $from - * @param int $to + * @param int $fromPageId + * @param int $toPageId */ - public function build( $self, array $options, $buildChunks, $from, $to ) { + public function build( $self, array $options, $buildChunks, $fromPageId, $toPageId ) { $fixedChunkSize = strpos( $buildChunks, 'total' ) === false; $buildChunks = intval( $buildChunks ); if ( $fixedChunkSize ) { $chunkSize = $buildChunks; } else { - $chunkSize = max( 1, ceil( ( $to - $from ) / $buildChunks ) ); + $chunkSize = max( 1, ceil( ( $toPageId - $fromPageId ) / $buildChunks ) ); } - for ( $id = $from; $id < $to; $id = $id + $chunkSize ) { - $chunkToId = min( $to, $id + $chunkSize ); + for ( $pageId = $fromPageId; $pageId < $toPageId; $pageId += $chunkSize ) { + $chunkToId = min( $toPageId, $pageId + $chunkSize ); print "php $self"; foreach ( $options as $optName => $optVal ) { if ( $optVal === null || $optVal === false || $optName === 'fromId' || @@ -51,7 +51,7 @@ } print " --$optName $optVal"; } - print " --fromId $id --toId $chunkToId\n"; + print " --fromId $pageId --toId $chunkToId\n"; } } } diff --git a/includes/Maintenance/Reindexer.php b/includes/Maintenance/Reindexer.php index ca08722..906037e 100644 --- a/includes/Maintenance/Reindexer.php +++ b/includes/Maintenance/Reindexer.php @@ -4,6 +4,7 @@ use CirrusSearch\Connection; use CirrusSearch\ElasticsearchIntermediary; +use CirrusSearch\SearchConfig; use CirrusSearch\Util; use Elastica\Document; use Elastica\Exception\Connection\HttpException; @@ -32,6 +33,11 @@ * http://www.gnu.org/copyleft/gpl.html */ class Reindexer { + /** + * @var SearchConfig + */ + private $searchConfig; + /*** "From" portion ***/ /** * @var Index @@ -96,6 +102,7 @@ private $out; /** + * @param SearchConfig $searchConfig * @param Connection $source * @param Connection $target * @param Type[] $types @@ -108,8 +115,9 @@ * @param Maintenance $out * @throws \Exception */ - public function __construct( Connection $source, Connection $target, array $types, array 
$oldTypes, $shardCount, $replicaCount, $connectionTimeout, array $mergeSettings, array $mappingConfig, Maintenance $out = null ) { + public function __construct( SearchConfig $searchConfig, Connection $source, Connection $target, array $types, array $oldTypes, $shardCount, $replicaCount, $connectionTimeout, array $mergeSettings, array $mappingConfig, Maintenance $out = null ) { // @todo: this constructor has too many arguments - refactor! + $this->searchConfig = $searchConfig; $this->oldConnection = $source; $this->connection = $target; $this->types = $types; @@ -366,11 +374,16 @@ $data['wiki'] = wfWikiId(); } + // Maybe instead the reindexer should know if we are converting from the old + // style numeric page id's to the new style prefixed id's. This probably + // works though. + $docId = $this->searchConfig->maybeMakeId( $result->getId() ); + // Note that while setting the opType to create might improve performance slightly it can cause // trouble if the scroll returns the same id twice. It can do that if the document is updated // during the scroll process. I'm unclear on if it will always do that, so you still have to // perform the date based catch up after the reindex. 
- return new Document( $result->getId(), $data ); + return new Document( $docId, $data ); } /** diff --git a/includes/OtherIndexes.php b/includes/OtherIndexes.php index 64f0370..5781598 100644 --- a/includes/OtherIndexes.php +++ b/includes/OtherIndexes.php @@ -28,13 +28,13 @@ private $localSite; /** - * Constructor * @param Connection $connection + * @param SearchConfig $config * @param array $flags * @param string $localSite */ - public function __construct( Connection $connection, array $flags, $localSite) { - parent::__construct( $connection, $flags ); + public function __construct( Connection $connection, SearchConfig $config, array $flags, $localSite) { + parent::__construct( $connection, $config, $flags ); $this->localSite = $localSite; } @@ -103,10 +103,10 @@ $query->setSize( 1 ); $findIdsMultiSearch->addSearch( $type->createSearch( $query ) ); - $findIdsClosures[] = function( $id ) use + $findIdsClosures[] = function( $docId ) use ( $otherIndex, &$updates, $title ) { $updates[$otherIndex][] = array( - 'id' => $id, + 'docId' => $docId, 'ns' => $title->getNamespace(), 'dbKey' => $title->getDBkey(), ); diff --git a/includes/Query/GeoFeature.php b/includes/Query/GeoFeature.php index 74a75d2..6a9aade 100644 --- a/includes/Query/GeoFeature.php +++ b/includes/Query/GeoFeature.php @@ -3,6 +3,7 @@ namespace CirrusSearch\Query; use CirrusSearch\Search\SearchContext; +use CirrusSearch\SearchConfig; use Elastica\Query\AbstractQuery; use GeoData\GeoData; use GeoData\Coord; @@ -59,10 +60,13 @@ } if ( substr( $key, -5 ) === 'title' ) { - list( $coord, $radius, $exclude ) = $this->parseGeoNearbyTitle( $value ); + list( $coord, $radius, $excludeDocId ) = $this->parseGeoNearbyTitle( + $context->getConfig(), + $value + ); } else { list( $coord, $radius ) = $this->parseGeoNearby( $value ); - $exclude = 0; + $excludeDocId = ''; } $filter = null; @@ -71,7 +75,7 @@ if ( substr( $key, 0, 6 ) === 'boost-' ) { $context->addGeoBoost( $coord, $radius, $negated ? 
0.1 : 1 ); } else { - $filter = self::createQuery( $coord, $radius, $exclude ); + $filter = self::createQuery( $coord, $radius, $excludeDocId ); } } @@ -83,12 +87,13 @@ * <title> * <radius>,<title> * - * @param string $text + * @param SearchConfig $config the Cirrus config object + * @param string $text user input to parse * @return array Three member array with Coordinate object, integer radius * in meters, and page id to exclude from results.. When invalid the * Coordinate returned will be null. */ - public function parseGeoNearbyTitle( $text ) { + public function parseGeoNearbyTitle( SearchConfig $config, $text ) { $title = Title::newFromText( $text ); if ( $title && $title->exists() ) { // Default radius if not provided: 5km @@ -99,24 +104,24 @@ // remaining text is a valid title to use. $pieces = explode( ',', $text, 2 ); if ( count( $pieces ) !== 2 ) { - return array( null, 0, 0 ); + return array( null, 0, '' ); } $radius = $this->parseDistance( $pieces[0] ); if ( $radius === null ) { - return array( null, 0, 0 ); + return array( null, 0, '' ); } $title = Title::newFromText( $pieces[1] ); if ( !$title || !$title->exists() ) { - return array( null, 0, 0 ); + return array( null, 0, '' ); } } $coord = GeoData::getPageCoordinates( $title ); if ( !$coord ) { - return array( null, 0, 0 ); + return array( null, 0, '' ); } - return array( $coord, $radius, $title->getArticleID() ); + return array( $coord, $radius, $config->makeId( $title->getArticleID() ) ); } /** @@ -187,10 +192,10 @@ * * @param Coord $coord * @param int $radius Search radius in meters - * @param int $idToExclude Page id to exclude, or 0 for no exclusions. + * @param string $docIdToExclude Document id to exclude, or "" for no exclusions. 
* @return AbstractQuery */ - public static function createQuery( Coord $coord, $radius, $idToExclude = 0 ) { + public static function createQuery( Coord $coord, $radius, $docIdToExclude = '' ) { $query = new \Elastica\Query\BoolQuery(); $query->addFilter( new \Elastica\Query\Term( [ 'coordinates.globe' => $coord->globe ] ) ); $query->addFilter( new \Elastica\Query\Term( [ 'coordinates.primary' => 1 ] ) ); @@ -203,8 +208,8 @@ $distanceFilter->setOptimizeBbox( 'indexed' ); $query->addFilter( $distanceFilter ); - if ( $idToExclude > 0 ) { - $query->addMustNot( new \Elastica\Query\Term( [ '_id' => $idToExclude ] ) ); + if ( $docIdToExclude !== '' ) { + $query->addMustNot( new \Elastica\Query\Term( [ '_id' => $docIdToExclude ] ) ); } $nested = new \Elastica\Query\Nested(); diff --git a/includes/Query/InCategoryFeature.php b/includes/Query/InCategoryFeature.php index 1508041..042b9ea 100644 --- a/includes/Query/InCategoryFeature.php +++ b/includes/Query/InCategoryFeature.php @@ -77,20 +77,20 @@ */ private function matchPageCategories( array $categories ) { $filter = new \Elastica\Query\BoolQuery(); - $ids = array(); + $pageIds = array(); $names = array(); foreach ( $categories as $category ) { if ( substr( $category, 0, 3 ) === 'id:' ) { - $id = substr( $category, 3 ); - if ( ctype_digit( $id ) ) { - $ids[] = $id; + $pageId = substr( $category, 3 ); + if ( ctype_digit( $pageId ) ) { + $pageIds[] = $pageId; } } else { $names[] = $category; } } - foreach ( Title::newFromIDs( $ids ) as $title ) { + foreach ( Title::newFromIDs( $pageIds ) as $title ) { $names[] = $title->getText(); } if ( !$names ) { diff --git a/includes/Sanity/Checker.php b/includes/Sanity/Checker.php index 18f018c..7bd917a 100644 --- a/includes/Sanity/Checker.php +++ b/includes/Sanity/Checker.php @@ -4,6 +4,7 @@ use ArrayObject; use CirrusSearch\Connection; +use CirrusSearch\SearchConfig; use CirrusSearch\Searcher; use MediaWiki\MediaWikiServices; use Status; @@ -30,6 +31,11 @@ */ class Checker { + /** + 
* @var SearchConfig + */ + private $searchConfig; + /** * @var Connection */ @@ -73,7 +79,8 @@ * @param bool $fastRedirectCheck fast but inconsistent redirect check * @param ArrayObject|null $pageCache cache for WikiPage loaded from db */ - public function __construct( Connection $connection, Remediator $remediator, Searcher $searcher, $logSane, $fastRedirectCheck, ArrayObject $pageCache = null ) { + public function __construct( SearchConfig $config, Connection $connection, Remediator $remediator, Searcher $searcher, $logSane, $fastRedirectCheck, ArrayObject $pageCache = null ) { + $this->searchConfig = $config; $this->connection = $connection; $this->remediator = $remediator; $this->searcher = $searcher; @@ -89,21 +96,23 @@ * @return int the number of pages updated */ public function check( array $pageIds ) { + $docIds = array_map( array( $this->searchConfig, 'makeId' ), $pageIds ); + $pagesFromDb = $this->loadPagesFromDB( $pageIds ); - $pagesFromIndex = $this->loadPagesFromIndex( $pageIds ); + $pagesFromIndex = $this->loadPagesFromIndex( $docIds ); $nbPagesFixed = 0; - foreach( $pageIds as $pageId ) { + foreach( array_combine( $pageIds, $docIds ) as $pageId => $docId ) { $fromIndex = array(); - if ( isset( $pagesFromIndex[$pageId] ) ) { - $fromIndex = $pagesFromIndex[$pageId]; + if ( isset( $pagesFromIndex[$docId] ) ) { + $fromIndex = $pagesFromIndex[$docId]; } $updated = false; if ( isset ( $pagesFromDb[$pageId] ) ) { $page = $pagesFromDb[$pageId]; - $updated = $this->checkExisitingPage( $pageId, $page, $fromIndex ); + $updated = $this->checkExisitingPage( $docId, $pageId, $page, $fromIndex ); } else { - $updated = $this->checkInexistentPage( $pageId, $fromIndex ); + $updated = $this->checkInexistentPage( $docId, $pageId, $fromIndex ); } if( $updated ) { $nbPagesFixed++; @@ -122,12 +131,13 @@ * - delete it if it's a redirect * - verify it if found in the index * + * @param string $docId * @param int $pageId * @param WikiPage $page * @param \Elastica\Result[] 
$fromIndex * @return bool true if a modification was needed */ - private function checkExisitingPage( $pageId, $page, $fromIndex ) { + private function checkExisitingPage( $docId, $pageId, $page, $fromIndex ) { $inIndex = count( $fromIndex ) > 0; if ( $this->checkIfRedirect( $page ) ) { if ( $inIndex ) { @@ -138,7 +148,7 @@ return false; } if ( $inIndex ) { - return $this->checkIndexMismatch( $pageId, $page, $fromIndex ); + return $this->checkIndexMismatch( $docId, $pageId, $page, $fromIndex ); } $this->remediator->pageNotInIndex( $page ); return true; @@ -168,17 +178,18 @@ * Check that an inexistent page is not present in the index * and delete it if found * + * @param string $docId * @param int $pageId * @param WikiPage $page * @param \Elastica\Result[] $fromIndex * @return bool true if a modification was needed */ - private function checkInexistentPage( $pageId, $fromIndex ) { + private function checkInexistentPage( $docId, $pageId, $fromIndex ) { $inIndex = count( $fromIndex ) > 0; if ( $inIndex ) { foreach( $fromIndex as $r ) { $title = Title::makeTitle( $r->namespace, $r->title ); - $this->remediator->ghostPageInIndex( $pageId, $title ); + $this->remediator->ghostPageInIndex( $docId, $title ); } return true; } @@ -192,19 +203,20 @@ * is properly indexed to the appropriate index by checking its * namespace. * + * @param string $docId * @param int $pageId * @param WikiPage $page * @param \Elastica\Result[] $fromIndex * @return bool true if a modification was needed */ - private function checkIndexMismatch( $pageId, $page, $fromIndex ) { + private function checkIndexMismatch( $docId, $pageId, $page, $fromIndex ) { $foundInsanityInIndex = false; $expectedType = $this->connection->getIndexSuffixForNamespace( $page->getTitle()->getNamespace() ); foreach ( $fromIndex as $indexInfo ) { $type = $this->connection->extractIndexSuffix( $indexInfo->getIndex() ); if ( $type !== $expectedType ) { // Got to grab the index type from the index name.... 
- $this->remediator->pageInWrongIndex( $page, $type ); + $this->remediator->pageInWrongIndex( $docId, $page, $type ); $foundInsanityInIndex = true; } } @@ -216,22 +228,22 @@ } /** - * @param int[] $ids page ids + * @param int[] $pageIds page ids * @return WikiPage[] the list of wiki pages indexed in page id */ - private function loadPagesFromDB( array $ids ) { + private function loadPagesFromDB( array $pageIds ) { // If no cache object is constructed we build a new one. // Building it in the constructor would cause memleaks because // there is no automatic pruning of old entries. If a cache // object is provided the owner of this Checker instance must take // care of the cleaning. $cache = $this->pageCache ?: new ArrayObject(); - $ids = array_diff( $ids, array_keys( $cache->getArrayCopy() ) ); - if ( empty( $ids ) ) { + $pageIds = array_diff( $pageIds, array_keys( $cache->getArrayCopy() ) ); + if ( empty( $pageIds ) ) { return $cache->getArrayCopy(); } $dbr = wfGetDB( DB_SLAVE ); - $where = 'page_id IN (' . $dbr->makeList( $ids ) . ')'; + $where = 'page_id IN (' . $dbr->makeList( $pageIds ) . 
')'; $res = $dbr->select( array( 'page' ), WikiPage::selectFields(), @@ -246,12 +258,12 @@ } /** - * @param int[] $ids page ids + * @param string[] $docIds document ids * @return \Elastica\Result[][] search results indexed by page id * @throws \Exception if an error occurred */ - private function loadPagesFromIndex( array $ids ) { - $status = $this->searcher->get( $ids, array( 'namespace', 'title' ) ); + private function loadPagesFromIndex( array $docIds ) { + $status = $this->searcher->get( $docIds, array( 'namespace', 'title' ) ); if ( !$status->isOK() ) { throw new \Exception( 'Cannot fetch ids from index' ); } diff --git a/includes/Sanity/QueueingRemediator.php b/includes/Sanity/QueueingRemediator.php index 98f9c2e..c50004f 100644 --- a/includes/Sanity/QueueingRemediator.php +++ b/includes/Sanity/QueueingRemediator.php @@ -28,6 +28,9 @@ */ class QueueingRemediator implements Remediator { + /** + * @var string|null + */ protected $cluster; /** @@ -37,6 +40,7 @@ public function __construct( $cluster ) { $this->cluster = $cluster; } + public function redirectInIndex( WikiPage $page ) { $this->pushLinksUpdateJob( $page ); } @@ -45,27 +49,29 @@ } /** + * @param string $docId * @param int $pageId * @param Title $title */ - public function ghostPageInIndex( $pageId, Title $title ) { + public function ghostPageInIndex( $docId, Title $title ) { JobQueueGroup::singleton()->push( new DeletePages( $title, array( - 'id' => $pageId, + 'docId' => $docId, 'cluster' => $this->cluster, ) ) ); } /** + * @param string $docId * @param WikiPage $page * @param string $wrongIndex */ - public function pageInWrongIndex( WikiPage $page, $wrongIndex ) { + public function pageInWrongIndex( $docId, WikiPage $page, $wrongIndex ) { JobQueueGroup::singleton()->push( new DeletePages( $page->getTitle(), array( 'indexType' => $wrongIndex, - 'id' => $page->getId(), + 'docId' => $docId, 'cluster' => $this->cluster, ) ) ); diff --git a/includes/Sanity/Remediator.php b/includes/Sanity/Remediator.php 
index 764763d..6725dac 100644 --- a/includes/Sanity/Remediator.php +++ b/includes/Sanity/Remediator.php @@ -39,17 +39,20 @@ /** * A non-existent page is in the index. Odds are good it was deleted. - * @param int $pageId id of the deleted page + * + * @param string $docId elasticsearch document id of the deleted page * @param Title $title title of the page read from the ghost */ - public function ghostPageInIndex( $pageId, Title $title ); + public function ghostPageInIndex( $docId, Title $title ); /** * An existent page is in more than one index. + * + * @param string $docId elasticsearch document id * @param WikiPage $page page in too many indexes * @param string $indexType index type that the page is in but shouldn't be in */ - public function pageInWrongIndex( WikiPage $page, $indexType ); + public function pageInWrongIndex( $docId, WikiPage $page, $indexType ); } /** @@ -60,16 +63,17 @@ public function pageNotInIndex( WikiPage $page ) {} /** - * @param int $pageId + * @param string $docId * @param Title $title */ - public function ghostPageInIndex( $pageId, Title $title ) {} + public function ghostPageInIndex( $docId, Title $title ) {} /** + * @param string $docId * @param WikiPage $page * @param string $indexType */ - public function pageInWrongIndex( WikiPage $page, $indexType ) {} + public function pageInWrongIndex( $docId, WikiPage $page, $indexType ) {} } /** @@ -97,29 +101,30 @@ } /** - * @param int $pageId + * @param string $docId * @param Title $title */ - public function ghostPageInIndex( $pageId, Title $title ) { - $this->log( $pageId, $title, 'Deleted page in index' ); - $this->next->ghostPageInIndex( $pageId, $title ); + public function ghostPageInIndex( $docId, Title $title ) { + $this->log( $docId, $title, 'Deleted page in index' ); + $this->next->ghostPageInIndex( $docId, $title ); } /** + * @param string $docId * @param WikiPage $page * @param string $indexType */ - public function pageInWrongIndex( WikiPage $page, $indexType ) { + public 
function pageInWrongIndex( $docId, WikiPage $page, $indexType ) { $this->log( $page->getId(), $page->getTitle(), "Page in wrong index: $indexType" ); - $this->next->pageInWrongIndex( $page, $indexType ); + $this->next->pageInWrongIndex( $docId, $page, $indexType ); } /** - * @param int $pageId + * @param int|string $pageOrDocId * @param Title $title * @param string $message */ - private function log( $pageId, $title, $message ) { - printf("%30s %10d %s\n", $message, $pageId, $title ); + private function log( $pageOrDocId, $title, $message ) { + printf("%30s %10d %s\n", $message, $pageOrDocId, $title ); } } diff --git a/includes/Search/SearchContext.php b/includes/Search/SearchContext.php index 51804b9..d7729b1 100644 --- a/includes/Search/SearchContext.php +++ b/includes/Search/SearchContext.php @@ -189,7 +189,7 @@ } /** - * the namespaces being requested. + * mediawiki namespace id's being requested. * NOTE: this value may change during the Searcher process. * * @return int[]|null @@ -199,7 +199,7 @@ } /** - * set the namespaces + * set the mediawiki namespace id's * * @param int[]|null $namespaces array of integer */ diff --git a/includes/SearchConfig.php b/includes/SearchConfig.php index 216039b..ca67963 100644 --- a/includes/SearchConfig.php +++ b/includes/SearchConfig.php @@ -13,6 +13,7 @@ class SearchConfig implements \Config { // Constants for referring to various config values. Helps prevent fat-fingers const INDEX_BASE_NAME = 'CirrusSearchIndexBaseName'; + const PREFIX_IDS = 'CirrusSearchPrefixIds'; /** * Override settings @@ -130,6 +131,72 @@ } /** + * @todo + * The indices have to be rebuilt with new id's and we have to know when + * generating queries if new style id's are being used, or old style. It + * could plausibly be done with the metastore index, but that seems like + * overkill because the knowledge is only necessary during transition, and + * not post-transition. 
Additionally this function would then need to know + * the name of the index being queried, and that isn't always known when + * building. + * + * @param string|int $pageId + * @return string + */ + public function makeId( $pageId ) { + $prefix = $this->get( self::PREFIX_IDS ) + ? $this->getWikiId() + : null; + + if ( $prefix === null ) { + return (string) $pageId; + } else { + return "{$prefix}|{$pageId}"; + } + } + + /** + * There are times, such as when using the Reindexer, when we aren't completely + * sure if these are old style numeric page id's, or new style prefixed id's. + * Do some magic to decide and call self::makeId() when necessary. + * + * @param string|int $pageOrDocId + * @return string + */ + public function maybeMakeId( $pageOrDocId ) { + if ( !is_string( $pageOrDocId ) || ctype_digit( $pageOrDocId ) ) { + return $this->makeId( $pageOrDocId ); + } else { + return $pageOrDocId; + } + } + + /** + * Convert an elasticsearch document id back into a mediawiki page id. + * + * @param string $docId Elasticsearch document id + * @return int Related mediawiki page id + */ + public function makePageId( $docId ) { + if ( !$this->get( self::PREFIX_IDS ) ) { + return (int)$docId; + } + + $pieces = explode( '|', $docId ); + switch( count( $pieces ) ) { + case 2: + return (int)$pieces[1]; + case 1: + // Broken doc id...assume somehow this didn't get prefixed. + // Attempt to continue on...but maybe should throw exception + // instead? 
+ return (int)$docId; + default: + throw new \Exception( "Invalid document id: $docId" ); + } + } + + /** * Get user's language * @return string User's language code */ diff --git a/includes/Searcher.php b/includes/Searcher.php index 278a217..68158d8 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -372,11 +372,12 @@ */ public function moreLikeTheseArticles( array $titles, $options = Searcher::MORE_LIKE_THESE_NONE ) { sort( $titles, SORT_STRING ); - $pageIds = array(); + $docIds = array(); $likeDocs = array(); foreach ( $titles as $title ) { - $pageIds[] = $title->getArticleID(); - $likeDocs[] = array( '_id' => $title->getArticleID() ); + $docId = $this->config->makeId( $title->getArticleID() ); + $docIds[] = $docId; + $likeDocs[] = array( '_id' => $docId ); } // If no fields has been set we return no results. @@ -414,7 +415,7 @@ // Run a first pass to extract the text field content because we want to compare it // against other fields. $text = array(); - $found = $this->get( $pageIds, array( 'text' ) ); + $found = $this->get( $docIds, array( 'text' ) ); if ( !$found->isOK() ) { return $found; } @@ -449,14 +450,14 @@ } /** - * Get the page with $id. Note that the result is a status containing _all_ pages found. + * Get the page with $docId. Note that the result is a status containing _all_ pages found. * It is possible to find more then one page if the page is in multiple indexes. 
- * @param int[] $pageIds array of page ids + * @param string[] $docIds array of document ids * @param string[]|true|false $sourceFiltering source filtering to apply * @return Status containing pages found, containing an empty array if not found, * or an error if there was an error */ - public function get( array $pageIds, $sourceFiltering ) { + public function get( array $docIds, $sourceFiltering ) { $indexType = $this->connection->pickIndexTypeForNamespaces( $this->searchContext->getNamespaces() ); @@ -466,15 +467,16 @@ $size = count ( $this->connection->getAllIndexSuffixesForNamespaces( $this->searchContext->getNamespaces() )); - $size *= count( $pageIds ); + $size *= count( $docIds ); + return Util::doPoolCounterWork( 'CirrusSearch-Search', $this->user, - function() use ( $pageIds, $sourceFiltering, $indexType, $size ) { + function() use ( $docIds, $sourceFiltering, $indexType, $size ) { try { - $this->start( "get of {indexType}.{pageIds}", array( + $this->start( "get of {indexType}.{docIds}", array( 'indexType' => $indexType, - 'pageIds' => array_map( 'intval', $pageIds ), + 'docIds' => $docIds, 'queryType' => 'get', ) ); // Shard timeout not supported on get requests so we just use the client side timeout @@ -483,7 +485,7 @@ // theorically well suited for this kind of job but they are not // supported on aliases with multiple indices (content/general) $pageType = $this->connection->getPageType( $this->indexBaseName, $indexType ); - $query = new \Elastica\Query( new \Elastica\Query\Ids( null, $pageIds ) ); + $query = new \Elastica\Query( new \Elastica\Query\Ids( null, $docIds ) ); $query->setParam( '_source', $sourceFiltering ); $query->addParam( 'stats', 'get' ); // We ignore limits provided to the searcher diff --git a/includes/Updater.php b/includes/Updater.php index 389eb7e..331375c 100644 --- a/includes/Updater.php +++ b/includes/Updater.php @@ -51,11 +51,17 @@ protected $writeToClusterName; /** + * @var SearchConfig + */ + protected $searchConfig; + + 
/** * @param Connection $conn * @param string[] $flags */ - public function __construct( Connection $conn, array $flags = array() ) { + public function __construct( Connection $conn, SearchConfig $config, array $flags = array() ) { parent::__construct( $conn, null, 0 ); + $this->searchConfig = $config; if ( in_array( 'same-cluster', $flags ) ) { $this->writeToClusterName = $this->connection->getClusterName(); } @@ -85,11 +91,11 @@ if ( count( $redirects ) === 0 ) { return true; } - $redirectIds = array(); + $redirectDocIds = array(); foreach ( $redirects as $redirect ) { - $redirectIds[] = $redirect->getId(); + $redirectDocIds[] = $this->searchConfig->makeId( $redirect->getId() ); } - return $this->deletePages( array(), $redirectIds, $wgCirrusSearchClientSideUpdateTimeout ); + return $this->deletePages( array(), $redirectDocIds, $wgCirrusSearchClientSideUpdateTimeout ); } /** @@ -232,25 +238,25 @@ } /** - * Delete pages from the elasticsearch index. $titles and $ids must point to the + * Delete pages from the elasticsearch index. $titles and $docIds must point to the * same pages and should point to them in the same order. * * @param Title[] $titles List of titles to delete. If empty then skipped other index * maintenance is skipped. - * @param integer[] $ids List of ids to delete + * @param integer[] $docIds List of elasticsearch document ids to delete * @param null|int $clientSideTimeout timeout in seconds to update pages or null to not * change the configured timeout which defaults to 300 seconds. * @param string $indexType index from which to delete * @return bool True if nothing happened or we successfully deleted, false on failure */ - public function deletePages( $titles, $ids, $clientSideTimeout = null, $indexType = null ) { + public function deletePages( $titles, $docIds, $clientSideTimeout = null, $indexType = null ) { Job\OtherIndex::queueIfRequired( $titles, $this->writeToClusterName ); $job = new Job\ElasticaWrite( $titles ? 
reset( $titles ) : Title::makeTitle( 0, "" ), array( 'clientSideTimeout' => $clientSideTimeout, 'method' => 'sendDeletes', - 'arguments' => array( $ids, $indexType ), + 'arguments' => array( $docIds, $indexType ), 'cluster' => $this->writeToClusterName, ) ); @@ -286,7 +292,7 @@ continue; } - $doc = new \Elastica\Document( $page->getId(), array( + $doc = new \Elastica\Document( $this->searchConfig->makeId( $page->getId() ), array( 'version' => $page->getLatest(), 'version_type' => 'external', 'wiki' => wfWikiID(), diff --git a/maintenance/copySearchIndex.php b/maintenance/copySearchIndex.php index e6cbcdb..c542730 100644 --- a/maintenance/copySearchIndex.php +++ b/maintenance/copySearchIndex.php @@ -85,10 +85,7 @@ if ( $sourceConnection->getClusterName() == $targetConnection->getClusterName() ) { $this->error("Target cluster should be different from current cluster.", 1); } - $config = MediaWikiServices::getInstance() - ->getConfigFactory() - ->makeConfig( 'CirrusSearch' ); - $clusterSettings = new ClusterSettings( $config, $targetConnection->getClusterName() ); + $clusterSettings = new ClusterSettings( $this->getSearchConfig(), $targetConnection->getClusterName() ); $targetIndexName = $targetConnection->getIndexName( $this->indexBaseName, $this->indexType ); $utils = new ConfigUtils( $targetConnection->getClient(), $this ); @@ -96,6 +93,7 @@ $targetIndexName ); $reindexer = new Reindexer( + $this->getSearchConfig(), $sourceConnection, $targetConnection, // Target Index diff --git a/maintenance/forceSearchIndex.php b/maintenance/forceSearchIndex.php index d30707f..a239c85 100644 --- a/maintenance/forceSearchIndex.php +++ b/maintenance/forceSearchIndex.php @@ -61,15 +61,14 @@ private $runWithIds; /** - * @var int[][] chunks of ids to reindex when --ids is used, chunk size - * is controlled by mBatchSize + * @var int[] list of page ids to reindex when --ids is used */ - private $ids; + private $pageIds; public function __construct() { parent::__construct(); - 
$this->mDescription = "Force indexing some pages. Setting --from or --to will switch from id based indexing to " + $this->mDescription = "Force indexing some pages. Setting --from or --to will switch from page id based indexing to " . "date based indexing which uses less efficient queries and follows redirects.\n\n" . "Note: All froms are _exclusive_ and all tos are _inclusive_.\n" . "Note 2: Setting fromId and toId use the efficient query so those are ok.\n" @@ -79,7 +78,7 @@ $this->addOption( 'to', 'Stop date of reindex in YYYY-mm-ddTHH:mm:ssZ. Defaults to now.', false, true ); $this->addOption( 'fromId', 'Start indexing at a specific page_id. Not useful with --deletes.', false, true ); $this->addOption( 'toId', 'Stop indexing at a specific page_id. Not useful with --deletes or --from or --to.', false, true ); - $this->addOption( 'ids', 'List of ids (comma separated) to reindex. Not allowed with deletes/from/to/fromId/toId/limit.', false, true ); + $this->addOption( 'ids', 'List of page ids (comma separated) to reindex. Not allowed with deletes/from/to/fromId/toId/limit.', false, true ); $this->addOption( 'deletes', 'If this is set then just index deletes, not updates or creates.', false ); $this->addOption( 'limit', 'Maximum number of pages to process before exiting the script. Default to unlimited.', false, true ); $this->addOption( 'buildChunks', 'Instead of running the script spit out commands that can be farmed out to ' . @@ -120,11 +119,11 @@ $this->error( "$wiki index(es) do not exist. 
Did you forget to run updateSearchIndexConfig?", 1 ); } - // We need to check ids options early otherwize hasOption may return + // We need to check ids options early otherwise hasOption may return // true even if the user did not set the option on the commandline if ( $this->hasOption( 'ids' ) ) { $this->runWithIds = true; - $this->ids = $this->buildIdBatches(); + $this->pageIds = $this->buildPageIdBatches(); } if ( !is_null( $this->getOption( 'from' ) ) || !is_null( $this->getOption( 'to' ) ) ) { @@ -195,7 +194,7 @@ } else { $size = count( $batch['titlesToDelete'] ); $updater = $this->createUpdater(); - $updater->deletePages( $batch['titlesToDelete'], $batch['idsToDelete'] ); + $updater->deletePages( $batch['titlesToDelete'], $batch['docIdsToDelete'] ); } @@ -211,7 +210,7 @@ $this->waitForQueueToDrain( $wiki ); } - private function buildIdBatches() { + private function buildPageIdBatches() { if ( $this->getOption( 'deletes' ) || $this->hasOption( 'limit' ) || $this->hasOption( 'from' ) || $this->hasOption( 'to' ) || $this->hasOption( 'fromId' ) || $this->hasOption( 'toId' ) @@ -219,15 +218,15 @@ $this->error( '--ids cannot be used with deletes/from/to/fromId/toId/limit', 1 ); } - $ids = array_map( function( $id ) { - $id = trim( $id ); - if ( !ctype_digit( $id ) ) { - $this->error( "Invalid id provided in --ids, got '$id', expected a positive integer", 1 ); + $pageIds = array_map( function( $pageId ) { + $pageId = trim( $pageId ); + if ( !ctype_digit( $pageId ) ) { + $this->error( "Invalid page id provided in --ids, got '$pageId', expected a positive integer", 1 ); } - return intval( $id ); + return intval( $pageId ); }, explode( ',', $this->getOption( 'ids' ) ) ); - return array_unique( $ids, SORT_REGULAR ); + return array_unique( $pageIds, SORT_REGULAR ); } private function buildUpdateFlags() { @@ -358,15 +357,15 @@ return new CallbackIterator( $it, function ( $batch ) { $titlesToDelete = array(); - $idsToDelete = array(); + $docIdsToDelete = array(); foreach ( 
$batch as $row ) { $titlesToDelete[] = Title::makeTitle( $row->ar_namespace, $row->ar_title ); - $idsToDelete[] = $row->ar_page_id; + $docIdsToDelete[] = $this->getSearchConfig()->makeId( $row->ar_page_id ); } return array( 'titlesToDelete' => $titlesToDelete, - 'idsToDelete' => $idsToDelete, + 'docIdsToDelete' => $docIdsToDelete, 'endingAt' => isset( $row ) ? ( new MWTimestamp( $row->ar_timestamp ) )->getTimestamp( TS_ISO_8601 ) : 'unknown', @@ -379,7 +378,7 @@ $dbr = $this->getDB( DB_SLAVE ); $it = new BatchRowIterator( $dbr, 'page', 'page_id', $this->mBatchSize ); $it->addConditions( array( - 'page_id in (' . $dbr->makeList( $this->ids, LIST_COMMA ) . ')', + 'page_id in (' . $dbr->makeList( $this->pageIds, LIST_COMMA ) . ')', ) ); $this->attachPageConditions( $dbr, $it, 'page' ); @@ -584,7 +583,7 @@ if ( $this->hasOption( 'cluster' ) ) { $flags[] = 'same-cluster'; } - return new Updater( $this->getConnection(), $flags ); + return new Updater( $this->getConnection(), $this->getSearchConfig(), $flags ); } } diff --git a/maintenance/indexNamespaces.php b/maintenance/indexNamespaces.php index d57bf87..b96eaf7 100644 --- a/maintenance/indexNamespaces.php +++ b/maintenance/indexNamespaces.php @@ -43,15 +43,15 @@ $this->output( "done\n" ); $this->outputIndented( "Indexing namespaces..." 
); - $namesById = array(); - foreach ( $wgContLang->getNamespaceIds() as $name => $id ) { + $namesByNsId = array(); + foreach ( $wgContLang->getNamespaceIds() as $name => $nsId ) { if ( $name ) { - $namesById[ $id ][] = $name; + $namesByNsId[$nsId][] = $name; } } $documents = array(); - foreach ( $namesById as $id => $names ) { - $documents[] = new Document( $id, array( + foreach ( $namesByNsId as $nsId => $names ) { + $documents[] = new Document( $nsId, array( 'name' => $names, 'wiki' => $this->getSearchConfig()->getWikiId(), ) ); diff --git a/maintenance/runSearch.php b/maintenance/runSearch.php index 47292a7..4ad7847 100644 --- a/maintenance/runSearch.php +++ b/maintenance/runSearch.php @@ -113,7 +113,7 @@ $data['rows'][] = array( // use getDocId() rather than asking the title to allow this script // to work when a production index has been imported to a test es instance - 'pageId' => $result->getDocId(), + 'docId' => $result->getDocId(), 'title' => $result->getTitle()->getPrefixedText(), 'score' => $result->getScore(), 'snippets' => array( diff --git a/maintenance/saneitize.php b/maintenance/saneitize.php index e3bc4d3..3d9368f 100644 --- a/maintenance/saneitize.php +++ b/maintenance/saneitize.php @@ -38,12 +38,12 @@ /** * @var int mediawiki page id */ - private $fromId; + private $fromPageId; /** * @var int mediawiki page id */ - private $toId; + private $toPageId; /** * @var bool true to enable fast but inconsistent redirect checks @@ -97,12 +97,12 @@ $buildChunks = $this->getOption( 'buildChunks'); if ( $buildChunks ) { $builder = new \CirrusSearch\Maintenance\ChunkBuilder(); - $builder->build( $this->mSelf, $this->mOptions, $buildChunks, $this->fromId, $this->toId ); + $builder->build( $this->mSelf, $this->mOptions, $buildChunks, $this->fromPageId, $this->toPageId ); return; } $this->buildChecker(); $updated = $this->check(); - $this->output( "Fixed $updated page(s) (" . ( $this->toId - $this->fromId ) . 
" checked)\n" ); + $this->output( "Fixed $updated page(s) (" . ( $this->toPageId - $this->fromPageId ) . " checked)\n" ); } /** @@ -110,41 +110,41 @@ */ private function check() { $updated = 0; - for ( $pageId = $this->fromId; $pageId <= $this->toId; $pageId += $this->mBatchSize ) { - $max = min( $this->toId, $pageId + $this->mBatchSize - 1 ); + for ( $pageId = $this->fromPageId; $pageId <= $this->toPageId; $pageId += $this->mBatchSize ) { + $max = min( $this->toPageId, $pageId + $this->mBatchSize - 1 ); $updated += $this->checkChunk( range( $pageId, $max ) ); } return $updated; } /** - * @param int[] $ids + * @param int[] $pageIds mediawiki page ids * @return int number of pages corrected */ - private function checkChunk( array $ids ) { - $updated = $this->checker->check( $ids ); - $this->output( sprintf( "[%20s]%10d/%d\n", wfWikiID(), end( $ids ), - $this->toId ) ); + private function checkChunk( array $pageIds ) { + $updated = $this->checker->check( $pageIds ); + $this->output( sprintf( "[%20s]%10d/%d\n", wfWikiID(), end( $pageIds ), + $this->toPageId ) ); return $updated; } private function setFromAndTo() { $dbr = $this->getDB( DB_SLAVE ); - $this->fromId = $this->getOption( 'fromId' ); - if ( $this->fromId === null ) { - $this->fromId = 0; + $this->fromPageId = $this->getOption( 'fromId' ); + if ( $this->fromPageId === null ) { + $this->fromPageId = 0; } - $this->toId = $this->getOption( 'toId' ); - if ( $this->toId === null ) { - $this->toId = $dbr->selectField( 'page', 'MAX(page_id)' ); - if ( $this->toId === false ) { - $this->toId = 0; + $this->toPageId = $this->getOption( 'toId' ); + if ( $this->toPageId === null ) { + $this->toPageId = $dbr->selectField( 'page', 'MAX(page_id)' ); + if ( $this->toPageId === false ) { + $this->toPageId = 0; } else { // Its technically possible for there to be pages in the index with ids greater // than the maximum id in the database. That isn't super likely, but we'll // check a bit ahead just in case. 
This isn't scientific or super accurate, // but its cheap. - $this->toId += 100; + $this->toPageId += 100; } } } @@ -161,6 +161,7 @@ // This searcher searches all indexes for the current wiki. $searcher = new Searcher( $this->getConnection(), 0, 0, null, array(), null ); $this->checker = new Checker( + $this->getSearchConfig(), $this->getConnection(), $remediator, $searcher, diff --git a/maintenance/saneitizeJobs.php b/maintenance/saneitizeJobs.php index 6adaab6..7ddfa03 100644 --- a/maintenance/saneitizeJobs.php +++ b/maintenance/saneitizeJobs.php @@ -6,7 +6,6 @@ use CirrusSearch\SearchConfig; use CirrusSearch\Job\CheckerJob; use CirrusSearch\Maintenance\Maintenance; -use CirrusSearch\Sanity\Checker; use CirrusSearch\Sanity\NoopRemediator; use CirrusSearch\Sanity\PrintingRemediator; use CirrusSearch\Sanity\QueueingRemediator; diff --git a/maintenance/updateOneSearchIndexConfig.php b/maintenance/updateOneSearchIndexConfig.php index 10f344a..b8509a2 100644 --- a/maintenance/updateOneSearchIndexConfig.php +++ b/maintenance/updateOneSearchIndexConfig.php @@ -426,6 +426,7 @@ $connection = $this->getConnection(); $reindexer = new Reindexer( + $this->getSearchConfig(), $connection, $connection, array( $this->getPageType() ), diff --git a/maintenance/updateSuggesterIndex.php b/maintenance/updateSuggesterIndex.php index 0e393c0..7d4474c 100644 --- a/maintenance/updateSuggesterIndex.php +++ b/maintenance/updateSuggesterIndex.php @@ -277,8 +277,8 @@ return; } $indexByName = array(); - foreach( $indices as $idx ) { - $indexByName[$idx] = $this->getConnection()->getIndex( $idx ); + foreach( $indices as $name ) { + $indexByName[$name] = $this->getConnection()->getIndex( $name ); } $status = new Status($this->getClient()); @@ -286,16 +286,16 @@ // do not try to delete indices that are used in aliases unset( $indexByName[$aliased->getName()] ); } - foreach ( $indexByName as $name => $idx ) { + foreach ( $indexByName as $name => $index ) { # double check with stats - $stats = 
$idx->getStats()->getData(); + $stats = $index->getStats()->getData(); // Extra check: if stats report usages we should not try to fix things // automatically. if ( $stats['_all']['total']['suggest']['total'] == 0 ) { - $this->log( "Deleting broken index {$idx->getName()}\n" ); - $this->deleteIndex( $idx ); + $this->log( "Deleting broken index {$index->getName()}\n" ); + $this->deleteIndex( $index ); } else { - $this->log( "Broken index {$idx->getName()} appears to be in use, please check and delete.\n" ); + $this->log( "Broken index {$index->getName()} appears to be in use, please check and delete.\n" ); } } @@ -437,15 +437,15 @@ $this->log( "Deleting remaining docs from previous batch ($totalDocsInIndex).\n" ); MWElasticUtils::iterateOverScroll( $this->getIndex(), $result->getResponse()->getScrollId(), '15m', function( $results ) use ( &$docsDumped, $totalDocsToDump ) { - $ids = array(); + $docIds = array(); foreach( $results as $result ) { $docsDumped++; - $ids[] = $result->getId(); + $docIds[] = $result->getId(); } $this->outputProgress( $docsDumped, $totalDocsToDump ); MWElasticUtils::withRetry( $this->indexRetryAttempts, - function() use ( $ids ) { - $this->getType()->deleteIds( $ids ); + function() use ( $docIds ) { + $this->getType()->deleteIds( $docIds ); } ); }, 0, $this->indexRetryAttempts ); diff --git a/tests/unit/Query/GeoFeatureTest.php b/tests/unit/Query/GeoFeatureTest.php index 95f245c..941fa48 100644 --- a/tests/unit/Query/GeoFeatureTest.php +++ b/tests/unit/Query/GeoFeatureTest.php @@ -2,6 +2,7 @@ namespace CirrusSearch\Query; +use CirrusSearch\SearchConfig; use GeoData\Coord; use LoadBalancer; use IDatabase; @@ -222,11 +223,11 @@ '7km,Washington, D.C.' 
), 'unknown page lookup' => array( - array( null, 0, 0 ), + array( null, 0, '' ), 'Unknown Title', ), 'unknown page lookup with radius' => array( - array( null, 0, 0 ), + array( null, 0, '' ), '4km, Unknown Title', ), ); @@ -270,9 +271,16 @@ MediaWikiServices::getInstance()->getLinkCache() ->addGoodLinkObj( 1234567, Title::newFromText( 'Washington, D.C.' ) ); + $config = $this->getMock( SearchConfig::class ); + $config->expects( $this->any() ) + ->method( 'makeId' ) + ->will( $this->returnCallback( function ( $id ) { + return $id; + } ) ); + // Finally run the test $feature = new GeoFeature; - $result = $feature->parseGeoNearbyTitle( $value ); + $result = $feature->parseGeoNearbyTitle( $config, $value ); if ( $result[0] instanceof Coord ) { $result[0] = array( 'lat' => $result[0]->lat, 'lon' => $result[0]->lon ); } diff --git a/tests/unit/SuggestBuilderTest.php b/tests/unit/SuggestBuilderTest.php index 0ad7ed9..3590689 100644 --- a/tests/unit/SuggestBuilderTest.php +++ b/tests/unit/SuggestBuilderTest.php @@ -346,7 +346,7 @@ return array( 'title' => array( array( - 'id' => 123, + 'docId' => '123', 'type' => SuggestBuilder::TITLE_SUGGESTION, 'text' => 'This is a title', ), @@ -354,7 +354,7 @@ ), 'redirect' => array( array( - 'id' => 123, + 'docId' => '123', 'type' => SuggestBuilder::REDIRECT_SUGGESTION, ), SuggestBuilder::encodeRedirectOutput( 123 ), -- To view, visit https://gerrit.wikimedia.org/r/300179 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I5f1aa56645f7ff88649e4238ca6d0af551c858c3 Gerrit-PatchSet: 14 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ 
MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits