[MediaWiki-commits] [Gerrit] Make reindex process less brittle - change (mediawiki...CirrusSearch)
jenkins-bot has submitted this change and it was merged. Change subject: Make reindex process less brittle .. Make reindex process less brittle 1. Catch bulk index failures and retry them as single-document index requests. This should help with really, really large pages. 2. Catch failures in those single-document requests, back off for some time, and retry. 3. Make the batch size and the number of errors to try to back off from configurable. Change-Id: I77c82dc8dcaf180f4d701d4ea277c1c45262592d --- M maintenance/updateOneSearchIndexConfig.php 1 file changed, 59 insertions(+), 5 deletions(-) Approvals: Chad: Looks good to me, approved jenkins-bot: Verified diff --git a/maintenance/updateOneSearchIndexConfig.php b/maintenance/updateOneSearchIndexConfig.php index 962a07f..0d34258 100644 --- a/maintenance/updateOneSearchIndexConfig.php +++ b/maintenance/updateOneSearchIndexConfig.php @@ -3,6 +3,7 @@ namespace CirrusSearch; use Elastica; use \Maintenance; +use \ProfileSection; /** * Update the search configuration on the search backend. @@ -43,7 +44,8 @@ // Is the index currently closed? private $closed = false; - private $reindexChunkSize = 1000; + private $reindexChunkSize; + private $reindexRetryAttempts; private $indexBaseName; private $indexIdentifier; @@ -129,6 +131,14 @@ $maintenance->addOption( 'reindexAcceptableCountDeviation', 'How much can the reindexed ' . 'copy of an index is allowed to deviate from the current copy without triggering a ' . 'reindex failure. Defaults to 5%.', false, true ); + $maintenance->addOption( 'reindexChunkSize', 'Documents per shard to reindex in a batch. ' . + 'Note when changing the number of shards that the old shard size is used, not the new ' . + 'one. If you see many errors submitting documents in bulk but the automatic retry as ' . + 'singles works then lower this number. Defaults to 100.', false, true ); + $maintenance->addOption( 'reindexRetryAttempts', 'Number of times to back off and retry ' . + 'per failure. 
Note that failures are not common but if Elasticsearch is in the process ' . + 'of moving a shard this can time out. This will retry the attempt after some backoff ' . + 'rather than failing the whole reindex process. Defaults to 5.', false, true ); $maintenance->addOption( 'baseName', 'What basename to use for all indexes, ' . 'defaults to wiki id', false, true ); } @@ -152,6 +162,8 @@ $this->reindexProcesses = $this->getOption( 'reindexProcesses', wfIsWindows() ? 1 : 10 ); $this->reindexAcceptableCountDeviation = $this->parsePotentialPercent( $this->getOption( 'reindexAcceptableCountDeviation', '5%' ) ); + $this->reindexChunkSize = $this->getOption( 'reindexChunkSize', 100 ); + $this->reindexRetryAttempts = $this->getOption( 'reindexRetryAttempts', 5 ); $this->langCode = $wgLanguageCode; $this->aggressiveSplitting = $wgCirrusSearchUseAggressiveSplitting; $this->prefixSearchStartsWithAny = $wgCirrusSearchPrefixSearchStartsWithAnyWord; @@ -630,10 +642,7 @@ $result->next(); } wfProfileOut( __METHOD__ . '::packageDocs' ); - wfProfileIn( __METHOD__ . '::sendDocs' ); - $updateResult = $this->getPageType()->addDocuments( $documents ); - wfDebugLog( 'CirrusSearch', 'Update completed in ' . $updateResult->getEngineTime() . ' (engine) millis' ); - wfProfileOut( __METHOD__ . '::sendDocs' ); + $this->sendDocumentsWithRetry( $messagePrefix, $documents ); $completed += $result->count(); $rate = round( $completed / ( microtime( true ) - $operationStartTime ) ); $this->output( $this->indent . $messagePrefix . @@ -646,6 +655,51 @@ } } + private function sendDocumentsWithRetry( $messagePrefix, $documents ) { + $profiler = new ProfileSection( __METHOD__ ); + + $errors = 0; + while ( true ) { + if ( $errors < $this->reindexRetryAttempts ) { + try { + $this->sendDocuments( $messagePrefix, $documents ); + return; + } catch ( \Elastica\Exception\ExceptionInterface $e ) { +
[MediaWiki-commits] [Gerrit] Make reindex process less brittle - change (mediawiki...CirrusSearch)
Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/113371 Change subject: Make reindex process less brittle .. Make reindex process less brittle 1. Catch bulk index failures and retry them as single-document index requests. This should help with really, really large pages. 2. Catch failures in those single-document requests, back off for some time, and retry. 3. Make the batch size and the number of errors to try to back off from configurable. Change-Id: I77c82dc8dcaf180f4d701d4ea277c1c45262592d --- M maintenance/updateOneSearchIndexConfig.php 1 file changed, 59 insertions(+), 5 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/71/113371/1 diff --git a/maintenance/updateOneSearchIndexConfig.php b/maintenance/updateOneSearchIndexConfig.php index 962a07f..0d34258 100644 --- a/maintenance/updateOneSearchIndexConfig.php +++ b/maintenance/updateOneSearchIndexConfig.php @@ -3,6 +3,7 @@ namespace CirrusSearch; use Elastica; use \Maintenance; +use \ProfileSection; /** * Update the search configuration on the search backend. @@ -43,7 +44,8 @@ // Is the index currently closed? private $closed = false; - private $reindexChunkSize = 1000; + private $reindexChunkSize; + private $reindexRetryAttempts; private $indexBaseName; private $indexIdentifier; @@ -129,6 +131,14 @@ $maintenance->addOption( 'reindexAcceptableCountDeviation', 'How much can the reindexed ' . 'copy of an index is allowed to deviate from the current copy without triggering a ' . 'reindex failure. Defaults to 5%.', false, true ); + $maintenance->addOption( 'reindexChunkSize', 'Documents per shard to reindex in a batch. ' . + 'Note when changing the number of shards that the old shard size is used, not the new ' . + 'one. If you see many errors submitting documents in bulk but the automatic retry as ' . + 'singles works then lower this number. 
Defaults to 100.', false, true ); + $maintenance->addOption( 'reindexRetryAttempts', 'Number of times to back off and retry ' . + 'per failure. Note that failures are not common but if Elasticsearch is in the process ' . + 'of moving a shard this can time out. This will retry the attempt after some backoff ' . + 'rather than failing the whole reindex process. Defaults to 5.', false, true ); $maintenance->addOption( 'baseName', 'What basename to use for all indexes, ' . 'defaults to wiki id', false, true ); } @@ -152,6 +162,8 @@ $this->reindexProcesses = $this->getOption( 'reindexProcesses', wfIsWindows() ? 1 : 10 ); $this->reindexAcceptableCountDeviation = $this->parsePotentialPercent( $this->getOption( 'reindexAcceptableCountDeviation', '5%' ) ); + $this->reindexChunkSize = $this->getOption( 'reindexChunkSize', 100 ); + $this->reindexRetryAttempts = $this->getOption( 'reindexRetryAttempts', 5 ); $this->langCode = $wgLanguageCode; $this->aggressiveSplitting = $wgCirrusSearchUseAggressiveSplitting; $this->prefixSearchStartsWithAny = $wgCirrusSearchPrefixSearchStartsWithAnyWord; @@ -630,10 +642,7 @@ $result->next(); } wfProfileOut( __METHOD__ . '::packageDocs' ); - wfProfileIn( __METHOD__ . '::sendDocs' ); - $updateResult = $this->getPageType()->addDocuments( $documents ); - wfDebugLog( 'CirrusSearch', 'Update completed in ' . $updateResult->getEngineTime() . ' (engine) millis' ); - wfProfileOut( __METHOD__ . '::sendDocs' ); + $this->sendDocumentsWithRetry( $messagePrefix, $documents ); $completed += $result->count(); $rate = round( $completed / ( microtime( true ) - $operationStartTime ) ); $this->output( $this->indent . $messagePrefix . @@ -646,6 +655,51 @@ } } + private function sendDocumentsWithRetry( $messagePrefix, $documents ) { + $profiler = new ProfileSection( __METHOD__ ); + + $errors = 0; + while ( true ) { + if ( $errors < $this->reindexRetryAttempts ) { + try { + $this->sendDocuments( $messagePrefix, $documents ); + return; +