[MediaWiki-commits] [Gerrit] Make reindex process less brittle - change (mediawiki...CirrusSearch)

2014-02-14 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: Make reindex process less brittle
..


Make reindex process less brittle

1.  Catch bulk index failures and retry the failed documents as single
index requests.  This should help with very large pages.
2.  Catch failures in those single index requests, back off for some time,
and retry.
3.  Make the batch size and the number of failures to back off from and
retry configurable.  (A sketch of this retry pattern follows the diff
below.)

Change-Id: I77c82dc8dcaf180f4d701d4ea277c1c45262592d
---
M maintenance/updateOneSearchIndexConfig.php
1 file changed, 59 insertions(+), 5 deletions(-)

Approvals:
  Chad: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/maintenance/updateOneSearchIndexConfig.php b/maintenance/updateOneSearchIndexConfig.php
index 962a07f..0d34258 100644
--- a/maintenance/updateOneSearchIndexConfig.php
+++ b/maintenance/updateOneSearchIndexConfig.php
@@ -3,6 +3,7 @@
 namespace CirrusSearch;
 use Elastica;
 use \Maintenance;
+use \ProfileSection;
 
 /**
  * Update the search configuration on the search backend.
@@ -43,7 +44,8 @@
 	// Is the index currently closed?
 	private $closed = false;
 
-	private $reindexChunkSize = 1000;
+	private $reindexChunkSize;
+	private $reindexRetryAttempts;
 
 	private $indexBaseName;
 	private $indexIdentifier;
@@ -129,6 +131,14 @@
 		$maintenance->addOption( 'reindexAcceptableCountDeviation', 'How much can the reindexed ' .
 			'copy of an index is allowed to deviate from the current copy without triggering a ' .
 			'reindex failure.  Defaults to 5%.', false, true );
+		$maintenance->addOption( 'reindexChunkSize', 'Documents per shard to reindex in a batch.   ' .
+			'Note when changing the number of shards that the old shard size is used, not the new ' .
+			'one.  If you see many errors submitting documents in bulk but the automatic retry as ' .
+			'singles works then lower this number.  Defaults to 100.', false, true );
+		$maintenance->addOption( 'reindexRetryAttempts', 'Number of times to back off and retry ' .
+			'per failure.  Note that failures are not common but if Elasticsearch is in the process ' .
+			'of moving a shard this can time out.  This will retry the attempt after some backoff ' .
+			'rather than failing the whole reindex process.  Defaults to 5.', false, true );
 		$maintenance->addOption( 'baseName', 'What basename to use for all indexes, ' .
 			'defaults to wiki id', false, true );
 	}
@@ -152,6 +162,8 @@
 		$this->reindexProcesses = $this->getOption( 'reindexProcesses', wfIsWindows() ? 1 : 10 );
 		$this->reindexAcceptableCountDeviation = $this->parsePotentialPercent(
 			$this->getOption( 'reindexAcceptableCountDeviation', '5%' ) );
+		$this->reindexChunkSize = $this->getOption( 'reindexChunkSize', 100 );
+		$this->reindexRetryAttempts = $this->getOption( 'reindexRetryAttempts', 5 );
 		$this->langCode = $wgLanguageCode;
 		$this->aggressiveSplitting = $wgCirrusSearchUseAggressiveSplitting;
 		$this->prefixSearchStartsWithAny = $wgCirrusSearchPrefixSearchStartsWithAnyWord;
@@ -630,10 +642,7 @@
 				$result->next();
 			}
 			wfProfileOut( __METHOD__ . '::packageDocs' );
-			wfProfileIn( __METHOD__ . '::sendDocs' );
-			$updateResult = $this->getPageType()->addDocuments( $documents );
-			wfDebugLog( 'CirrusSearch', 'Update completed in ' . $updateResult->getEngineTime() . ' (engine) millis' );
-			wfProfileOut( __METHOD__ . '::sendDocs' );
+			$this->sendDocumentsWithRetry( $messagePrefix, $documents );
 			$completed += $result->count();
 			$rate = round( $completed / ( microtime( true ) - $operationStartTime ) );
 			$this->output( $this->indent . $messagePrefix .
@@ -646,6 +655,51 @@
 		}
 	}
 
+	private function sendDocumentsWithRetry( $messagePrefix, $documents ) {
+		$profiler = new ProfileSection( __METHOD__ );
+
+		$errors = 0;
+		while ( true ) {
+			if ( $errors < $this->reindexRetryAttempts ) {
+				try {
+					$this->sendDocuments( $messagePrefix, $documents );
+					return;
+				} catch ( \Elastica\Exception\ExceptionInterface $e ) {
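
The archived diff cuts off inside the new catch block.  As a rough guide to
where it is headed, here is a minimal sketch of the retry pattern the commit
summary describes, assuming a bulk-then-singles fallback with a capped
exponential backoff; the function shape, helper names, and timings are
illustrative, not the literal committed code.

	// Sketch only: bulk failures fall back to single-document indexing, and
	// single-document failures back off and retry a configurable number of
	// times before giving up.  $pageType is assumed to be an \Elastica\Type.
	function sendDocumentsWithRetrySketch( $pageType, array $documents, $retryAttempts = 5 ) {
		try {
			// First try the whole batch as one bulk request.
			$pageType->addDocuments( $documents );
			return;
		} catch ( \Elastica\Exception\ExceptionInterface $e ) {
			// Bulk failed, often because of one oversized page.  Fall back
			// to indexing the documents one at a time.
		}
		foreach ( $documents as $document ) {
			$errors = 0;
			while ( true ) {
				try {
					$pageType->addDocument( $document );
					break;
				} catch ( \Elastica\Exception\ExceptionInterface $e ) {
					$errors += 1;
					if ( $errors >= $retryAttempts ) {
						// Out of retries; fail the reindex loudly.
						throw $e;
					}
					// Back off before retrying, e.g. while Elasticsearch is
					// busy moving a shard.  Exponential delay, capped at a minute.
					sleep( min( 60, pow( 2, $errors ) ) );
				}
			}
		}
	}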
[MediaWiki-commits] [Gerrit] Make reindex process less brittle - change (mediawiki...CirrusSearch)

2014-02-14 Thread Manybubbles (Code Review)
Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/113371

Change subject: Make reindex process less brittle
..

  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/71/113371/1

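For reference, the new options are passed to the maintenance script like any
other.  A hedged invocation example: the --reindexAndRemoveOk and
--indexIdentifier flags are assumed from the script's usual reindex workflow
and are not part of this change.

	php maintenance/updateOneSearchIndexConfig.php --reindexAndRemoveOk \
		--indexIdentifier now --reindexChunkSize 50 --reindexRetryAttempts 10

Note that reindexChunkSize is per source shard: with five shards and the
default of 100, each batch carries roughly 500 documents, so lowering this
option is the lever to pull when bulk requests keep failing and falling back
to singles.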