Aaron Schulz has uploaded a new change for review. https://gerrit.wikimedia.org/r/161749
Change subject: Slave lag check tweaks to JobRunner
......................................................................

Slave lag check tweaks to JobRunner

* Do not block forever, but wait up to 10 seconds.
  Likewise, check the lag times in memcached on startup.
  This at least lets runners avoid lagged wikis but still
  work on others.
* Made a few small related documentation and code cleanups.

Change-Id: Ic1339bab54cba6b6cbea7d97a80ff87c7c5c87af
---
M includes/GlobalFunctions.php
M includes/db/LoadBalancer.php
M includes/db/LoadMonitor.php
M includes/jobqueue/JobRunner.php
4 files changed, 27 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core refs/changes/49/161749/1

diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php
index 490df24..3306acd 100644
--- a/includes/GlobalFunctions.php
+++ b/includes/GlobalFunctions.php
@@ -3761,11 +3761,18 @@
  * @param float|null $ifWritesSince Only wait if writes were done since this UNIX timestamp
  * @param string|bool $wiki Wiki identifier accepted by wfGetLB
  * @param string|bool $cluster Cluster name accepted by LBFactory. Default: false.
+ * @param int|null $timeout Max wait time. Default: 1 day (cli), ~10 seconds (web)
  * @return bool Success (able to connect and no timeouts reached)
  */
-function wfWaitForSlaves( $ifWritesSince = false, $wiki = false, $cluster = false ) {
+function wfWaitForSlaves(
+	$ifWritesSince = null, $wiki = false, $cluster = false, $timeout = null
+) {
 	// B/C: first argument used to be "max seconds of lag"; ignore such values
-	$ifWritesSince = ( $ifWritesSince > 1e9 ) ? $ifWritesSince : false;
+	$ifWritesSince = ( $ifWritesSince > 1e9 ) ? $ifWritesSince : null;
+
+	if ( $timeout === null ) {
+		$timeout = ( PHP_SAPI === 'cli' ) ? 86400 : 10;
+	}

 	if ( $cluster !== false ) {
 		$lb = wfGetLBFactory()->getExternalLB( $cluster );
@@ -3787,7 +3794,7 @@
 		// The DBMS may not support getMasterPos() or the whole
 		// load balancer might be fake (e.g. $wgAllDBsAreLocalhost).
 		if ( $pos !== false ) {
-			return $lb->waitForAll( $pos, PHP_SAPI === 'cli' ? 86400 : null );
+			return $lb->waitForAll( $pos, $timeout );
 		}
 	}

diff --git a/includes/db/LoadBalancer.php b/includes/db/LoadBalancer.php
index e517a02..f79fde0 100644
--- a/includes/db/LoadBalancer.php
+++ b/includes/db/LoadBalancer.php
@@ -1128,7 +1128,7 @@
 	 * Results are cached for a short time in memcached, and indefinitely in the process cache
 	 *
 	 * @param string|bool $wiki
-	 * @return array
+	 * @return array Map of (server index => seconds)
 	 */
 	function getLagTimes( $wiki = false ) {
 		# Try process cache
diff --git a/includes/db/LoadMonitor.php b/includes/db/LoadMonitor.php
index 7281485..b694a6f 100644
--- a/includes/db/LoadMonitor.php
+++ b/includes/db/LoadMonitor.php
@@ -48,7 +48,7 @@
 	 * @param array $serverIndexes
 	 * @param string $wiki
 	 *
-	 * @return array
+	 * @return array Map of (server index => seconds)
 	 */
 	public function getLagTimes( $serverIndexes, $wiki );
 }
diff --git a/includes/jobqueue/JobRunner.php b/includes/jobqueue/JobRunner.php
index 8ccceda..a256c43 100644
--- a/includes/jobqueue/JobRunner.php
+++ b/includes/jobqueue/JobRunner.php
@@ -76,6 +76,14 @@
 			$this->runJobsLog( "Executed $count periodic queue task(s)." );
 		}

+		// Bail out if there is too much DB lag
+		// @note: getLagTimes() has better caching than getMaxLag()
+		$maxLag = max( wfGetLBFactory()->getMainLB( wfWikiID() )->getLagTimes() );
+		if ( $maxLag >= 5 ) {
+			$response['reached'] = 'slave-lag-limit';
+			return $response;
+		}
+
 		// Flush any pending DB writes for sanity
 		wfGetLBFactory()->commitMasterChanges();

@@ -172,10 +180,15 @@
 					break;
 				}

-				// Don't let any of the main DB slaves get backed up
+				// Don't let any of the main DB slaves get backed up.
+				// This only waits for so long before exiting and letting
+				// other wikis in the farm (on different masters) get a chance.
 				$timePassed = microtime( true ) - $lastTime;
 				if ( $timePassed >= 5 || $timePassed < 0 ) {
-					wfWaitForSlaves( $lastTime );
+					if ( !wfWaitForSlaves( $lastTime, wfWikiID(), false, 5 ) ) {
+						$response['reached'] = 'slave-lag-limit';
+						break;
+					}
 					$lastTime = microtime( true );
 				}
 				// Don't let any queue slaves/backups fall behind

--
To view, visit https://gerrit.wikimedia.org/r/161749
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic1339bab54cba6b6cbea7d97a80ff87c7c5c87af
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Aaron Schulz <asch...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits