ArielGlenn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/355080 )

Change subject: generate metadata dumps as first pass, rev text as second pass 
dumps
......................................................................

generate metadata dumps as first pass, rev text as second pass dumps

[WIP] Stubs work; only some parts have been tested; one part is incomplete.

This cannot be merged until a change to the regular dump maintenance scripts
is merged in core.

Bug: T164262
Change-Id: I787a26ff6004a875b71ef38905904b7c489f22d4
---
M includes/Dump/Exporter.php
A maintenance/backupPrefetch.inc
M maintenance/dumpBackup.php
A maintenance/dumpTextPass.php
4 files changed, 390 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Flow 
refs/changes/80/355080/1

diff --git a/includes/Dump/Exporter.php b/includes/Dump/Exporter.php
index 948b07f..4069485 100644
--- a/includes/Dump/Exporter.php
+++ b/includes/Dump/Exporter.php
@@ -413,11 +413,21 @@
                        }
                }
 
-               $output = Xml::element(
-                       'revision',
-                       $attribs,
-                       $revision->getContent( $format )
-               ) . "\n";
+               if ( $this->text == WikiExporter::STUB ) {
+                       // no text, only the metadata, for two-pass dumps
+                       $output = Xml::element(
+                               'revision',
+                               $attribs
+                       ) . "\n";
+               }
+               else {
+                       // we dump the text
+                       $output = Xml::element(
+                               'revision',
+                               $attribs,
+                               $revision->getContent( $format )
+                       ) . "\n";
+               }
                $this->sink->write( $output );
        }
 
diff --git a/maintenance/backupPrefetch.inc b/maintenance/backupPrefetch.inc
new file mode 100644
index 0000000..5e9a01d
--- /dev/null
+++ b/maintenance/backupPrefetch.inc
@@ -0,0 +1,79 @@
+<?php
+
+/**
+ * Prefetch reader for Flow dumps: walks a previous dump file board by board
+ * and revision by revision so that revision text can be reused instead of
+ * being fetched again.
+ *
+ * Relies on members inherited from BaseDump (atEnd, lastRev, infiles, reader,
+ * skipTo(), nodeContents(), nextText(), debug(), close()); their exact
+ * semantics are defined in core and are not visible in this file.
+ */
+class FlowBaseDump extends BaseDump {
+       /** @var bool Set once no further <revision> could be found by nextRev() */
+       protected $atBoardEnd = false;
+
+       /** @var int|string Hex id of the board last reached; 0 until one is read */
+       protected $lastBoard = 0;
+
+       /**
+        * Attempts to fetch the text of a particular board revision
+        * from the dump stream. May return null if the board is
+        * unavailable.
+        *
+        * Board ids are compared as strings with strcmp(): they are hex
+        * strings, and PHP's < / > operators compare numeric-looking strings
+        * (e.g. all digits, or digits with a single "e") as numbers, and an
+        * int against a string by numeric conversion, which gives wrong
+        * ordering for hex ids.
+        * NOTE(review): this assumes ids in the dump are equal-length
+        * lowercase hex in ascending order - confirm against the stub writer.
+        * NOTE(review): $rev is still forced through intval(); confirm that
+        * Flow revision ids in the dump really are integers.
+        *
+        * @param string $board Hex ID of board to read
+        * @param int $rev ID number of revision to read
+        * @return string|null
+        */
+       function prefetch( $board, $rev ) {
+               $rev = intval( $rev );
+               $board = (string)$board;
+
+               while ( strcmp( (string)$this->lastBoard, $board ) < 0 && !$this->atEnd ) {
+                       $this->debug( "BaseDump::prefetch at board $this->lastBoard, looking for $board" );
+                       $this->nextBoard();
+               }
+               if ( strcmp( (string)$this->lastBoard, $board ) > 0 || $this->atEnd ) {
+                       $this->debug( "BaseDump::prefetch already past board $board "
+                               . "looking for rev $rev  [$this->lastBoard, $this->lastRev]" );
+
+                       return null;
+               }
+               while ( $this->lastRev < $rev && !$this->atEnd && !$this->atBoardEnd ) {
+                       $this->debug( "BaseDump::prefetch at board $this->lastBoard, rev $this->lastRev, "
+                               . "looking for $board, $rev" );
+                       $this->nextRev();
+               }
+               if ( $this->lastRev == $rev && !$this->atEnd ) {
+                       $this->debug( "BaseDump::prefetch hit on $board, $rev [$this->lastBoard, $this->lastRev]" );
+
+                       return $this->nextText();
+               }
+
+               $this->debug( "BaseDump::prefetch already past rev $rev on board $board "
+                       . "[$this->lastBoard, $this->lastRev]" );
+
+               return null;
+       }
+
+       /**
+        * Advances to the next <board> element and records its id; when the
+        * current input has no more boards, closes it and opens the next
+        * queued input file, if any.
+        *
+        * @access private
+        */
+       function nextBoard() {
+               if ( $this->skipTo( 'board', 'mediawiki' ) ) {
+                       if ( $this->skipTo( 'id' ) ) {
+                               // Fully qualified: this file has no namespace or use statement
+                               $boardId = \Flow\Model\UUID::create( $this->nodeContents() );
+                               $this->lastBoard = $boardId->getHex();
+                               $this->lastRev = 0;
+                               $this->atBoardEnd = false;
+                       }
+               } else {
+                       $this->close();
+                       if ( count( $this->infiles ) ) {
+                               $infile = array_shift( $this->infiles );
+                               $this->reader->open( $infile );
+                               $this->atEnd = false;
+                       }
+               }
+       }
+
+       /**
+        * Advances to the next <revision> element and records its id, or
+        * flags the end of the board when none is found.
+        *
+        * @access private
+        */
+       function nextRev() {
+               if ( $this->skipTo( 'revision' ) ) {
+                       if ( $this->skipTo( 'id' ) ) {
+                               $this->lastRev = intval( $this->nodeContents() );
+                       }
+               } else {
+                       $this->atBoardEnd = true;
+               }
+       }
+
+}
diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php
index a17ed63..92112bb 100644
--- a/maintenance/dumpBackup.php
+++ b/maintenance/dumpBackup.php
@@ -32,6 +32,7 @@
                $this->addOption( 'pagelist', 'Dump only pages of which the 
title is included in the file', false, true );
 
                $this->addOption( 'start', 'Start from page_id n', false, true 
);
+               $this->addOption( 'stub', 'Don\'t perform old_text lookups; for 
2-pass dump' );
                $this->addOption( 'end', 'Stop before page_id n (exclusive)', 
false, true );
                $this->addOption( 'skip-header', 'Don\'t output the <mediawiki> 
header' );
                $this->addOption( 'skip-footer', 'Don\'t output the 
</mediawiki> footer' );
@@ -53,10 +54,12 @@
 
                $this->processOptions();
 
+                $textMode = $this->hasOption( 'stub' ) ? WikiExporter::STUB : 
WikiExporter::TEXT;
+
                if ( $this->hasOption( 'full' ) ) {
-                       $this->dump( WikiExporter::FULL );
+                       $this->dump( WikiExporter::FULL, $textMode );
                } elseif ( $this->hasOption( 'current' ) ) {
-                       $this->dump( WikiExporter::CURRENT );
+                       $this->dump( WikiExporter::CURRENT, $textMode );
                } else {
                        $this->error( 'No valid action specified.', 1 );
                }
@@ -64,7 +67,7 @@
 
        /**
         * @param int $history WikiExporter::FULL or WikiExporter::CURRENT
-        * @param int $text Unused, but exists for compat with parent
+        * @param int $text WikiExporter::STUB or WikiExporter::TEXT
         */
        public function dump( $history, $text = WikiExporter::TEXT ) {
                # Notice messages will foul up your XML output even if they're
@@ -74,7 +77,7 @@
                }
 
                $db = Container::get( 'db.factory' )->getDB( DB_SLAVE );
-               $exporter = new Exporter( $db, $history, Exporter::STREAM, 
Exporter::TEXT );
+               $exporter = new Exporter( $db, $history, Exporter::STREAM, 
$text );
                $exporter->setOutputSink( $this->sink );
 
                if ( !$this->skipHeader ) {
diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php
new file mode 100644
index 0000000..577ed68
--- /dev/null
+++ b/maintenance/dumpTextPass.php
@@ -0,0 +1,289 @@
+<?php
+use Flow\Container;
+use Flow\Dump\Exporter;
+
+use Wikimedia\Rdbms\IMaintainableDatabase;
+
+// Locate core's maintenance directory: honour MW_INSTALL_PATH when it is set,
+// otherwise assume this file lives three levels below the MediaWiki root.
+if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
+       $maintPath = getenv( 'MW_INSTALL_PATH' ) . '/maintenance';
+} else {
+       $maintPath = __DIR__ . '/../../../maintenance';
+}
+require_once $maintPath . '/Maintenance.php';
+require_once $maintPath . '/backup.inc';
+require_once $maintPath . '/backupTextPass.inc';
+
+class FlowTextPassDumper extends TextPassDumper {
+       protected $firstBoardWritten = false;
+       protected $lastBoardWritten = false;
+
+       /**
+        * Registers the script description and command-line options, opens
+        * stderr for progress output and, when an argument list is supplied,
+        * loads and processes it immediately.
+        *
+        * @param array|null $args For backward compatibility
+        */
+       function __construct( $args = null ) {
+               parent::__construct();
+
+               $this->addDescription( <<<TEXT
+This script postprocesses Flow XML dumps from dumpBackup.php to add
+revision text which was stubbed out (using --stub).
+
+XML input is accepted on stdin.
+XML output is sent to stdout; progress reports are sent to stderr.
+TEXT
+               );
+               $this->stderr = fopen( "php://stderr", "wt" );
+
+               $this->addOption( 'stub', 'To load a compressed stub dump instead of stdin. ' .
+                       'Specify as --stub=<type>:<file>.', false, true );
+               // Help text fixed: "savepressure" -> "save pressure"
+               $this->addOption( 'prefetch', 'Use a prior dump file as a text source, to save pressure on the ' .
+                       'database. (Requires the XMLReader extension). Specify as --prefetch=<type>:<file>',
+                       false, true );
+               $this->addOption( 'quiet', 'Don\'t dump status reports to stderr.' );
+               $this->addOption( 'current', 'Base ETA on number of boards in database instead of all revisions' );
+               $this->addOption( 'buffersize', 'Buffer size in bytes to use for reading the stub. ' .
+                       '(Default: 512KB, Minimum: 4KB)', false, true );
+
+               if ( $args ) {
+                       $this->loadWithArgv( $args );
+                       $this->processOptions();
+               }
+       }
+
+
+        /**
+         * Processes the options handled by the parent class, then, when
+         * --prefetch is given, loads the Flow prefetch reader and points it
+         * at the requested prior dump file.
+         */
+        function processOptions() {
+                parent::processOptions();
+
+                if ( $this->hasOption( 'prefetch' ) ) {
+                        // backupPrefetch.inc is added next to this script in
+                        // Flow's maintenance directory. The previous path used
+                        // the file-scope $maintPath (undefined inside a method)
+                        // plus a single-quoted '$IP/...' literal, which PHP
+                        // does not interpolate, so it could never resolve.
+                        require_once __DIR__ . '/backupPrefetch.inc';
+                        $url = $this->processFileOpt( $this->getOption( 'prefetch' ) );
+                        $this->prefetch = new FlowBaseDump( $url );
+                }
+        }
+
+       /**
+        * Runs the text pass: reads the stub XML from $this->input and
+        * writes the completed dump to the configured sink.
+        *
+        * @param int $history Not used by this method; kept for signature compatibility
+        * @param int $text Not used by this method; kept for signature compatibility
+        * @throws MWException If the stub input cannot be opened
+        */
+       function dump( $history, $text = WikiExporter::TEXT ) {
+               // Notice messages will foul up your XML output even if they're
+               // relatively harmless.
+               if ( ini_get( 'display_errors' ) ) {
+                       ini_set( 'display_errors', 'stderr' );
+               }
+
+               // We try to get an initial database connection so that the first
+               // call to getFlowText does not fail on its first attempt. If
+               // obtaining a good DB connection fails here it is not a serious
+               // issue, as getFlowText retries upon failure and can start
+               // without a working DB connection.
+               try {
+                       $this->rotateDb();
+               } catch ( Exception $e ) {
+                       // We do not even count this as failure. Just let eventual
+                       // watchdogs know.
+                       $this->progress( "Getting initial DB connection failed (" .
+                               $e->getMessage() . ")" );
+               }
+
+               $this->egress = $this->sink;
+
+               $input = fopen( $this->input, "rt" );
+               if ( $input === false ) {
+                       // Fail loudly instead of handing an invalid handle to the XML reader
+                       throw new MWException( "Unable to open stub input " . $this->input );
+               }
+
+               try {
+                       $this->readDump( $input );
+               } finally {
+                       // Guarded in case the reader already closed the handle itself
+                       if ( is_resource( $input ) ) {
+                               fclose( $input );
+                       }
+               }
+       }
+
+       /**
+        * Resets the per-board state and lets the parent class parse the
+        * stub XML stream.
+        *
+        * @throws MWException Failure to parse XML input
+        * @param resource $input Open stream handle, as returned by fopen() in dump()
+        * @return bool
+        */
+       function readDump( $input ) {
+               // Must start as an empty string: characterData() appends the id
+               // text with ".=", so an integer 0 here would prefix the first
+               // board id with a stray "0".
+               $this->thisBoard = '';
+               return parent::readDump( $input );
+       }
+
+       /**
+        * Tries to get the revision text for a revision id (UUID).
+        *
+        * Upon errors, retries (up to $this->maxFailures tries each call).
+        * If still no good revision text could be found even after this retrying, "" is returned.
+        * If no good revision text could be returned for
+        * $this->maxConsecutiveFailedTextRetrievals consecutive calls to getFlowText, MWException
+        * is thrown.
+        *
+        * NOTE(review): loading the revision object for $id is not implemented
+        * yet (see FIXME below); until it is, this method always throws.
+        *
+        * @param string $id The revision id to get the text for
+        *
+        * @throws MWException
+        * @return string The revision text for $id, or ""
+        */
+       function getFlowText( $id ) {
+               $prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
+               $text = false; // The candidate for a good text. false if no proper value.
+               $failures = 0; // The number of times this invocation of getFlowText already failed.
+               // Defined up front so the check after the catch block never reads
+               // an undefined variable when an exception fires early.
+               $tryIsPrefetch = false;
+
+               // The number of times getFlowText failed without yielding a good text in between.
+               static $consecutiveFailedTextRetrievals = 0;
+
+               $this->fetchCount++;
+
+               // To be able to simply return on success without worrying about bookkeeping,
+               // we assume this fetch works (possibly after some retries). Nevertheless, we keep
+               // the old value so we can restore it if problems occur (see after the while loop).
+               $oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
+               $consecutiveFailedTextRetrievals = 0;
+
+               // FIXME how do we set this?? we want to get an AbstractRevision object based on the UUID. So?
+               $revision = null;
+               if ( $revision === null ) {
+                       // Calling a method on null is a fatal error that the
+                       // catch ( Exception ) below cannot intercept; raise the
+                       // documented exception type with a useful message instead.
+                       throw new MWException( "Cannot load Flow revision " . $id .
+                               ": revision lookup is not implemented" );
+               }
+               $format = $revision->getContentFormat();
+
+               while ( $failures < $this->maxFailures ) {
+
+                       // As soon as we found a good text for the $id, we will return immediately.
+                       // Hence, if we make it past the try catch block, we know that we did not
+                       // find a good text.
+
+                       try {
+                               // Utterly untested, FIXME
+                               // Trying to get prefetch, if it has not been tried before
+                               if ( $text === false && isset( $this->prefetch ) && $prefetchNotTried ) {
+                                       $prefetchNotTried = false;
+                                       $tryIsPrefetch = true;
+                                       // Fully qualified: this file has no use statement for UUID
+                                       $boardId = \Flow\Model\UUID::create( $this->thisBoard );
+                                       $text = $this->prefetch->prefetch( $boardId->getHex(),
+                                               intval( $this->thisRev ) );
+
+                                       if ( $text === null ) {
+                                               $text = false;
+                                       }
+                               }
+
+                               if ( $text === false ) {
+                                       // Fallback to asking the database
+                                       $tryIsPrefetch = false;
+
+                                       $text = $revision->getContent( $format );
+                                       if ( $text !== false ) {
+                                               return $text;
+                                       }
+                               }
+
+                               if ( $text === false ) {
+                                       throw new MWException( "Generic error while obtaining text for id " . $id );
+                               }
+
+                               if ( $tryIsPrefetch ) {
+                                       $this->prefetchCount++;
+                               }
+                               return $text;
+                       } catch ( Exception $e ) {
+                               $msg = "getting/checking text " . $id . " failed (" . $e->getMessage() . ")";
+                               if ( $failures + 1 < $this->maxFailures ) {
+                                       $msg .= " (Will retry " . ( $this->maxFailures - $failures - 1 ) . " more times)";
+                               }
+                               $this->progress( $msg );
+                       }
+
+                       // Something went wrong; we did not get a text that was plausible :(
+                       $failures++;
+
+                       // A failure in a prefetch hit does not warrant resetting db connection etc.
+                       if ( !$tryIsPrefetch ) {
+                               // After backing off for some time, we try to reboot the whole process as
+                               // much as possible to not carry over failures from one part to the other
+                               // parts
+                               sleep( $this->failureTimeout );
+                               try {
+                                       $this->rotateDb();
+                               } catch ( Exception $e ) {
+                                       $this->progress( "Rebooting getFlowText infrastructure failed (" . $e->getMessage() . ")" .
+                                               " Trying to continue anyways" );
+                               }
+                       }
+               }
+
+               // Retrieving a good text for $id failed (at least) maxFailures times.
+               // We abort for this $id.
+
+               // Restoring the consecutive failures, and maybe aborting, if the dump
+               // is too broken.
+               $consecutiveFailedTextRetrievals = $oldConsecutiveFailedTextRetrievals + 1;
+               if ( $consecutiveFailedTextRetrievals > $this->maxConsecutiveFailedTextRetrievals ) {
+                       throw new MWException( "Graceful storage failure" );
+               }
+
+               return "";
+       }
+
+        /**
+         * Hands the buffered XML to the sink as a page opening.
+         */
+        function writeOpenBoard() {
+                // Reuses the sink's page-level hook (ugly, but it keeps
+                // Flow-specific methods out of DumpOutput in core).
+                $pendingXml = $this->buffer;
+                $this->sink->writeOpenPage( null, $pendingXml );
+        }
+
+        /**
+         * Hands the buffered XML to the sink as a page closing.
+         */
+        function writeCloseBoard() {
+                // horrible but avoids adding Flow-specific methods to DumpOutput in core
+                // Method name written in camelCase to match writeOpenPage above;
+                // PHP resolves method names case-insensitively, so behaviour is unchanged.
+                $this->sink->writeClosePage( $this->buffer );
+        }
+
+       /**
+        * XML start-tag handler. <revision> and <board> are handled here;
+        * every other element is passed to the parent handler.
+        *
+        * @param resource $parser XML parser invoking the handler
+        * @param string $name Element name
+        * @param array $attribs Element attributes
+        */
+       function startElement( $parser, $name, $attribs ) {
+
+               if ( $name == 'revision' ) {
+                       $this->clearOpenElement( null );
+                       $this->lastName = $name;
+                       $this->state = $name;
+                       // writeOpenBoard() declares no parameters, so these arguments are ignored
+                       $this->writeOpenBoard( null, $this->buffer );
+                       $this->buffer = "";
+                       // NOTE(review): a <revision> without an id attribute never
+                       // sets openElement here - confirm that is intended.
+                        if ( isset( $attribs['id'] ) ) {
+                                $id = $attribs['id'];
+                                $text = $this->getFlowText( $id );
+                                $this->openElement = [ $name, $attribs ];
+                                if ( strlen( $text ) > 0 ) {
+                                       # FIXME this needs conversion in the routine or after
+                                       // Feed the fetched text through the character-data handler
+                                        $this->characterData( $parser, $text );
+                                }
+                        }
+               } elseif ( $name == 'board' ) {
+                       $this->clearOpenElement( null );
+                       $this->lastName = $name;
+                       $this->state = $name;
+                       // On the first board, pass everything buffered so far to the sink as the stream opening
+                       if ( $this->atStart ) {
+                                $this->sink->writeOpenStream( $this->buffer );
+                               $this->buffer = "";
+                               $this->atStart = false;
+                       }
+                        $this->openElement = [ $name, $attribs ];
+               } else {
+                        parent::startElement( $parser, $name, $attribs );
+                }
+       }
+
+       /**
+        * XML end-tag handler. Closes a <board>, records it as the first/last
+        * board written, flushes the buffer to the sink and resets the
+        * per-board state; every other element goes to the parent handler.
+        *
+        * @param resource $parser XML parser invoking the handler
+        * @param string $name Element name
+        */
+       function endElement( $parser, $name ) {
+               if ( $name == 'board' ) {
+                       if ( $this->openElement ) {
+                               $this->clearOpenElement( "" );
+                       } else {
+                               $this->buffer .= "</$name>";
+                       }
+
+                       $boardId = trim( $this->thisBoard );
+                       if ( !$this->firstBoardWritten ) {
+                               $this->firstBoardWritten = $boardId;
+                       }
+                       $this->lastBoardWritten = $boardId;
+
+                       $this->writeCloseBoard();
+                       $this->buffer = "";
+                       $this->thisPage = "";
+                       // characterData() appends to thisBoard with ".="; without this
+                       // reset each board id would be glued onto the previous one.
+                       $this->thisBoard = "";
+               } else {
+                       parent::endElement( $parser, $name );
+               }
+       }
+
+       /**
+        * XML character-data handler. Text inside a board's <id> element is
+        * accumulated into thisBoard and copied, escaped, to the output
+        * buffer; all other text is passed to the parent handler.
+        *
+        * @param resource $parser XML parser invoking the handler
+        * @param string $data Chunk of character data
+        */
+       function characterData( $parser, $data ) {
+               if ( $this->lastName == "id" ) {
+                       if ( $this->state == "board" ) {
+                               $this->clearOpenElement( null );
+                               // Appended rather than assigned: the id is built up across calls
+                               $this->thisBoard .= $data;
+                                $this->buffer .= htmlspecialchars( $data );
+                                return;
+                       }
+                }
+                parent::characterData( $parser, $data );
+       }
+}
+
+// Name the maintenance class for this script, then include the file named by RUN_MAINTENANCE_IF_MAIN.
+$maintClass = 'FlowTextPassDumper';
+require_once RUN_MAINTENANCE_IF_MAIN;

-- 
To view, visit https://gerrit.wikimedia.org/r/355080
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I787a26ff6004a875b71ef38905904b7c489f22d4
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Flow
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to