Aaron Schulz has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/161749

Change subject: Slave lag check tweaks to JobRunner
......................................................................

Slave lag check tweaks to JobRunner

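* Do not block forever waiting for lagged slaves; wait only for a
  bounded time (5 seconds per check in the job loop; the default for
  wfWaitForSlaves() is now ~10 seconds for web requests). Likewise,
  check the lag times cached in memcached on startup and bail out if
  the lag is too high. This at least lets runners avoid lagged wikis
  but still work on others.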
* Make a few small related documentation and code cleanups.

Change-Id: Ic1339bab54cba6b6cbea7d97a80ff87c7c5c87af
---
M includes/GlobalFunctions.php
M includes/db/LoadBalancer.php
M includes/db/LoadMonitor.php
M includes/jobqueue/JobRunner.php
4 files changed, 27 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/49/161749/1

diff --git a/includes/GlobalFunctions.php b/includes/GlobalFunctions.php
index 490df24..3306acd 100644
--- a/includes/GlobalFunctions.php
+++ b/includes/GlobalFunctions.php
@@ -3761,11 +3761,18 @@
  * @param float|null $ifWritesSince Only wait if writes were done since this 
UNIX timestamp
  * @param string|bool $wiki Wiki identifier accepted by wfGetLB
  * @param string|bool $cluster Cluster name accepted by LBFactory. Default: 
false.
+ * @param int|null $timeout Max wait time. Default: 1 day (cli), ~10 seconds 
(web)
  * @return bool Success (able to connect and no timeouts reached)
  */
-function wfWaitForSlaves( $ifWritesSince = false, $wiki = false, $cluster = 
false ) {
+function wfWaitForSlaves(
+       $ifWritesSince = null, $wiki = false, $cluster = false, $timeout = null
+) {
        // B/C: first argument used to be "max seconds of lag"; ignore such 
values
-       $ifWritesSince = ( $ifWritesSince > 1e9 ) ? $ifWritesSince : false;
+       $ifWritesSince = ( $ifWritesSince > 1e9 ) ? $ifWritesSince : null;
+
+       if ( $timeout === null ) {
+               $timeout = ( PHP_SAPI === 'cli' ) ? 86400 : 10;
+       }
 
        if ( $cluster !== false ) {
                $lb = wfGetLBFactory()->getExternalLB( $cluster );
@@ -3787,7 +3794,7 @@
                // The DBMS may not support getMasterPos() or the whole
                // load balancer might be fake (e.g. $wgAllDBsAreLocalhost).
                if ( $pos !== false ) {
-                       return $lb->waitForAll( $pos, PHP_SAPI === 'cli' ? 
86400 : null );
+                       return $lb->waitForAll( $pos, $timeout );
                }
        }
 
diff --git a/includes/db/LoadBalancer.php b/includes/db/LoadBalancer.php
index e517a02..f79fde0 100644
--- a/includes/db/LoadBalancer.php
+++ b/includes/db/LoadBalancer.php
@@ -1128,7 +1128,7 @@
         * Results are cached for a short time in memcached, and indefinitely 
in the process cache
         *
         * @param string|bool $wiki
-        * @return array
+        * @return array Map of (server index => seconds)
         */
        function getLagTimes( $wiki = false ) {
                # Try process cache
diff --git a/includes/db/LoadMonitor.php b/includes/db/LoadMonitor.php
index 7281485..b694a6f 100644
--- a/includes/db/LoadMonitor.php
+++ b/includes/db/LoadMonitor.php
@@ -48,7 +48,7 @@
         * @param array $serverIndexes
         * @param string $wiki
         *
-        * @return array
+        * @return array Map of (server index => seconds)
         */
        public function getLagTimes( $serverIndexes, $wiki );
 }
diff --git a/includes/jobqueue/JobRunner.php b/includes/jobqueue/JobRunner.php
index 8ccceda..a256c43 100644
--- a/includes/jobqueue/JobRunner.php
+++ b/includes/jobqueue/JobRunner.php
@@ -76,6 +76,14 @@
                        $this->runJobsLog( "Executed $count periodic queue 
task(s)." );
                }
 
+               // Bail out if there is too much DB lag
+               // @note: getLagTimes() has better caching than getMaxLag()
+               $maxLag = max( wfGetLBFactory()->getMainLB( wfWikiID() 
)->getLagTimes() );
+               if ( $maxLag >= 5 ) {
+                       $response['reached'] = 'slave-lag-limit';
+                       return $response;
+               }
+
                // Flush any pending DB writes for sanity
                wfGetLBFactory()->commitMasterChanges();
 
@@ -172,10 +180,15 @@
                                        break;
                                }
 
-                               // Don't let any of the main DB slaves get 
backed up
+                               // Don't let any of the main DB slaves get 
backed up.
+                               // This only waits for so long before exiting 
and letting
+                               // other wikis in the farm (on different 
masters) get a chance.
                                $timePassed = microtime( true ) - $lastTime;
                                if ( $timePassed >= 5 || $timePassed < 0 ) {
-                                       wfWaitForSlaves( $lastTime );
+                                       if ( !wfWaitForSlaves( $lastTime, 
wfWikiID(), false, 5 ) ) {
+                                               $response['reached'] = 
'slave-lag-limit';
+                                               break;
+                                       }
                                        $lastTime = microtime( true );
                                }
                                // Don't let any queue slaves/backups fall 
behind

-- 
To view, visit https://gerrit.wikimedia.org/r/161749
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic1339bab54cba6b6cbea7d97a80ff87c7c5c87af
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Aaron Schulz <asch...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to