EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/325605

Change subject: Add job queue option for initImageData maintenance script
......................................................................

Add job queue option for initImageData maintenance script

Trying to run this script in the cluster fatals out due to memory
problems somewhat regularly. The --start option helps to restart
it where it fell down, but when trying to run against hundreds of
wikis that is a one-off solution that makes ensuring everything is
actually visited a pain.

To try to isolate errors, add an option to push the parsing into the
job queue. There is still the possibility of missing pages, but job queue
retries should take care of us for the most part. The script attempts to
keep load down on the databases by making sure no more than a specified
number of jobs are queued/processing at a given time.

Bug: T152155
Change-Id: I3a4e3a415b2f03de0bb36ac0515241e950130fde
---
M extension.json
A includes/Job/InitImageDataJob.php
M maintenance/initImageData.php
3 files changed, 79 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/PageImages 
refs/changes/05/325605/1

diff --git a/extension.json b/extension.json
index 4e74d66..f78c44a 100644
--- a/extension.json
+++ b/extension.json
@@ -18,7 +18,8 @@
                "ApiQueryPageImages": "includes/ApiQueryPageImages.php",
                "PageImages": "includes/PageImages.php",
                "PageImages\\Hooks\\LinksUpdateHookHandler": 
"includes/LinksUpdateHookHandler.php",
-               "PageImages\\Hooks\\ParserFileProcessingHookHandlers": 
"includes/ParserFileProcessingHookHandlers.php"
+               "PageImages\\Hooks\\ParserFileProcessingHookHandlers": 
"includes/ParserFileProcessingHookHandlers.php",
+               "PageImages\\Job\\InitImageDataJob": 
"includes/Job/InitImageDataJob.php"
        },
        "Hooks": {
                "ParserMakeImageParams": 
"PageImages\\Hooks\\ParserFileProcessingHookHandlers::onParserMakeImageParams",
@@ -29,6 +30,9 @@
                "AfterParserFetchFileAndTitle": 
"PageImages\\Hooks\\ParserFileProcessingHookHandlers::onAfterParserFetchFileAndTitle",
                "SpecialMobileEditWatchlist::images": 
"PageImages::onSpecialMobileEditWatchlist_images"
        },
+       "JobClasses": {
+               "InitImageDataJob": "PageImages\\Job\\InitImageDataJob"
+       },
        "config": {
                "PageImagesScores": {
                        "value": {
diff --git a/includes/Job/InitImageDataJob.php 
b/includes/Job/InitImageDataJob.php
new file mode 100644
index 0000000..3debeae
--- /dev/null
+++ b/includes/Job/InitImageDataJob.php
@@ -0,0 +1,30 @@
+<?php
+
+namespace PageImages\Job;
+
+use Job;
+use MediaWiki\MediaWikiServices;
+use MWExceptionHandler;
+use RefreshLinks;
+use Title;
+
+class InitImageDataJob extends Job {
+       public function __construct( Title $title, array $params ) {
+               parent::__construct( 'InitImageDataJob', $title, $params );
+       }
+
+       public function run() {
+               $lbFactory = 
MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+
+               foreach ( $this->params['page_ids'] as $id ) {
+                       try {
+                               RefreshLinks::fixLinksFromArticle( $id );
+                       } catch (\Exception $e) {
+                               // There are some broken pages out there that 
just don't parse.
+                               // Log it and keep on trucking.
+                               MWExceptionHandler::logException( $e );
+                       }
+                       $lbFactory->waitForReplication();
+               }
+       }
+}
diff --git a/maintenance/initImageData.php b/maintenance/initImageData.php
index 9f2af9e..3596a84 100644
--- a/maintenance/initImageData.php
+++ b/maintenance/initImageData.php
@@ -7,6 +7,7 @@
 require_once( "$IP/maintenance/Maintenance.php" );
 
 use MediaWiki\MediaWikiServices;
+use PageImages\Job\InitImageDataJob;
 
 /**
  * @license WTFPL 2.0
@@ -21,19 +22,26 @@
                $this->addOption( 'earlier-than',
                        'Run only on pages earlier than this timestamp', false, 
true );
                $this->addOption( 'start', 'Starting page ID', false, true );
+               $this->addOption( 'queue-pressure', 'Maximum number of jobs to 
enqueue at a time. If not provided or 0 will be run in-process.', false, true );
+               $this->addOption( 'quiet', "Don't report on job queue pressure" 
);
                $this->setBatchSize( 100 );
        }
 
        public function execute() {
                global $wgPageImagesNamespaces;
 
-               $id = $this->getOption( 'start', 0 );
-               $lbFactory = 
MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+               $lastId = $this->getOption( 'start', 0 );
+               $isQuiet = $this->getOption( 'quiet', false );
+               $queue = null;
+               $maxPressure = $this->getOption( 'queue-pressure', 0 );
+               if ( $maxPressure > 0 ) {
+                       $queue = JobQueueGroup::singleton();
+               }
 
                do {
-                       $tables = array( 'page', 'imagelinks' );
-                       $conds = array(
-                               'page_id > ' . (int) $id,
+                       $tables = [ 'page', 'imagelinks' ];
+                       $conds = array( 
+                               'page_id > ' . (int) $lastId,
                                'il_from IS NOT NULL',
                                'page_is_redirect' => 0,
                        );
@@ -57,15 +65,42 @@
                                [ 'LIMIT' => $this->mBatchSize, 'ORDER_BY' => 
'page_id', 'GROUP BY' => 'page_id' ],
                                $joinConds
                        );
+                       $pageIds = []; // reset per batch; the loop below appends to $pageIds
                        foreach ( $res as $row ) {
-                               $id = $row->page_id;
-                               RefreshLinks::fixLinksFromArticle( $id );
-                               $lbFactory->waitForReplication();
+                               $pageIds[] = $row->page_id;
                        }
-                       $this->output( "$id\n" );
+                       $job = new InitImageDataJob( Title::newMainPage(), [ 
'page_ids' => $pageIds ] );
+                       if ( $queue === null ) {
+                               $job->run();
+                       } else {
+                               $queue->push( $job );
+                               $this->waitForMaxPressure( $queue, 
$maxPressure, $isQuiet );
+                       }
+                       $lastId = end( $pageIds );
+                       $this->output( "$lastId\n" );
                } while ( $res->numRows() );
                $this->output( "done\n" );
        }
+
+       /**
+        * @param JobQueueGroup $queue The job queue to fetch pressure from
+        * @param int $maxPressure The maximum number of queued + active
+        *  jobs that can exist when returning
+        * @param bool $isQuiet When false report on job queue pressure every 
10s
+        */
+       private function waitForMaxPressure( JobQueueGroup $queue, 
$maxPressure, $isQuiet ) {
+               $group = $queue->get( 'InitImageDataJob' );
+               $i = 0;
+               do {
+                       sleep( 1 );
+                       $queued = $group->getSize();
+                       $running = $group->getAcquiredCount();
+                       if ( !$isQuiet && ++$i % 10 === 0 ) {
+                               $now = date( 'Y-m-d H:i:s T');
+                               $this->output( "[$now] Queued: $queued Running: 
$running Max: $maxPressure\n" );
+                       }
+               } while ( $queued + $running >= $maxPressure );
+       }
 }
 
 $maintClass = 'InitImageData';

-- 
To view, visit https://gerrit.wikimedia.org/r/325605
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3a4e3a415b2f03de0bb36ac0515241e950130fde
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/PageImages
Gerrit-Branch: wmf/1.29.0-wmf.4
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to