ArielGlenn has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/355080 )
Change subject: generate metadata dumps as first pass, rev text as second pass
dumps
......................................................................
generate metadata dumps as first pass, rev text as second pass dumps
[WIP] Stubs work; only some parts have been tested; one part is incomplete.
This cannot be merged until a change to the regular dump maintenance scripts
is merged in core.
Bug: T164262
Change-Id: I787a26ff6004a875b71ef38905904b7c489f22d4
---
M includes/Dump/Exporter.php
A maintenance/backupPrefetch.inc
M maintenance/dumpBackup.php
A maintenance/dumpTextPass.php
4 files changed, 390 insertions(+), 9 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Flow
refs/changes/80/355080/1
diff --git a/includes/Dump/Exporter.php b/includes/Dump/Exporter.php
index 948b07f..4069485 100644
--- a/includes/Dump/Exporter.php
+++ b/includes/Dump/Exporter.php
@@ -413,11 +413,21 @@
}
}
- $output = Xml::element(
- 'revision',
- $attribs,
- $revision->getContent( $format )
- ) . "\n";
+ if ( $this->text == WikiExporter::STUB ) {
+ // no text, only the metadata, for two-pass dumps
+ $output = Xml::element(
+ 'revision',
+ $attribs
+ ) . "\n";
+ }
+ else {
+ // we dump the text
+ $output = Xml::element(
+ 'revision',
+ $attribs,
+ $revision->getContent( $format )
+ ) . "\n";
+ }
$this->sink->write( $output );
}
diff --git a/maintenance/backupPrefetch.inc b/maintenance/backupPrefetch.inc
new file mode 100644
index 0000000..5e9a01d
--- /dev/null
+++ b/maintenance/backupPrefetch.inc
@@ -0,0 +1,79 @@
+<?php
+
+class FlowBaseDump extends BaseDump {
+ protected $atBoardEnd = false;
+ protected $lastBoard = 0;
+
+ /**
+ * Attempts to fetch the text of a particular board revision
+ * from the dump stream. May return null if the board is
+ * unavailable.
+ *
+ * @param string $board Hex-encoded UUID of the board to read
+ * @param int $rev ID number of revision to read
+ * @return string|null
+ */
+ function prefetch( $board, $rev ) {
+ $rev = intval( $rev );
+ while ( $this->lastBoard < $board && !$this->atEnd ) {
+ $this->debug( "BaseDump::prefetch at board
$this->lastBoard, looking for $board" );
+ $this->nextBoard();
+ }
+ if ( $this->lastBoard > $board || $this->atEnd ) {
+ $this->debug( "BaseDump::prefetch already past board
$board "
+ . "looking for rev $rev [$this->lastBoard,
$this->lastRev]" );
+
+ return null;
+ }
+ while ( $this->lastRev < $rev && !$this->atEnd &&
!$this->atBoardEnd ) {
+ $this->debug( "BaseDump::prefetch at board
$this->lastBoard, rev $this->lastRev, "
+ . "looking for $board, $rev" );
+ $this->nextRev();
+ }
+ if ( $this->lastRev == $rev && !$this->atEnd ) {
+ $this->debug( "BaseDump::prefetch hit on $board, $rev
[$this->lastBoard, $this->lastRev]" );
+
+ return $this->nextText();
+ } else {
+ $this->debug( "BaseDump::prefetch already past rev $rev
on board $board "
+ . "[$this->lastBoard, $this->lastRev]" );
+
+ return null;
+ }
+ }
+
+ /**
+ * @access private
+ */
+ function nextBoard() {
+ if ( $this->skipTo( 'board', 'mediawiki' ) ) {
+ if ( $this->skipTo( 'id' ) ) {
+ $boardId = UUID::create( $this->nodeContents()
);
+ $this->lastBoard = $boardId->getHex();
+ $this->lastRev = 0;
+ $this->atBoardEnd = false;
+ }
+ } else {
+ $this->close();
+ if ( count( $this->infiles ) ) {
+ $infile = array_shift( $this->infiles );
+ $this->reader->open( $infile );
+ $this->atEnd = false;
+ }
+ }
+ }
+
+ /**
+ * @access private
+ */
+ function nextRev() {
+ if ( $this->skipTo( 'revision' ) ) {
+ if ( $this->skipTo( 'id' ) ) {
+ $this->lastRev = intval( $this->nodeContents()
);
+ }
+ } else {
+ $this->atBoardEnd = true;
+ }
+ }
+
+}
diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php
index a17ed63..92112bb 100644
--- a/maintenance/dumpBackup.php
+++ b/maintenance/dumpBackup.php
@@ -32,6 +32,7 @@
$this->addOption( 'pagelist', 'Dump only pages of which the
title is included in the file', false, true );
$this->addOption( 'start', 'Start from page_id n', false, true
);
+ $this->addOption( 'stub', 'Don\'t perform old_text lookups; for
2-pass dump' );
$this->addOption( 'end', 'Stop before page_id n (exclusive)',
false, true );
$this->addOption( 'skip-header', 'Don\'t output the <mediawiki>
header' );
$this->addOption( 'skip-footer', 'Don\'t output the
</mediawiki> footer' );
@@ -53,10 +54,12 @@
$this->processOptions();
+ $textMode = $this->hasOption( 'stub' ) ? WikiExporter::STUB :
WikiExporter::TEXT;
+
if ( $this->hasOption( 'full' ) ) {
- $this->dump( WikiExporter::FULL );
+ $this->dump( WikiExporter::FULL, $textMode );
} elseif ( $this->hasOption( 'current' ) ) {
- $this->dump( WikiExporter::CURRENT );
+ $this->dump( WikiExporter::CURRENT, $textMode );
} else {
$this->error( 'No valid action specified.', 1 );
}
@@ -64,7 +67,7 @@
/**
* @param int $history WikiExporter::FULL or WikiExporter::CURRENT
- * @param int $text Unused, but exists for compat with parent
+ * @param int $text WikiExporter::STUB or WikiExporter::TEXT
*/
public function dump( $history, $text = WikiExporter::TEXT ) {
# Notice messages will foul up your XML output even if they're
@@ -74,7 +77,7 @@
}
$db = Container::get( 'db.factory' )->getDB( DB_SLAVE );
- $exporter = new Exporter( $db, $history, Exporter::STREAM,
Exporter::TEXT );
+ $exporter = new Exporter( $db, $history, Exporter::STREAM,
$text );
$exporter->setOutputSink( $this->sink );
if ( !$this->skipHeader ) {
diff --git a/maintenance/dumpTextPass.php b/maintenance/dumpTextPass.php
new file mode 100644
index 0000000..577ed68
--- /dev/null
+++ b/maintenance/dumpTextPass.php
@@ -0,0 +1,289 @@
+<?php
+use Flow\Container;
+use Flow\Dump\Exporter;
+
+use Wikimedia\Rdbms\IMaintainableDatabase;
+
+$maintPath = ( getenv( 'MW_INSTALL_PATH' ) !== false
+ ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance'
+ : dirname( __FILE__ ) . '/../../../maintenance' );
+require_once $maintPath . '/Maintenance.php';
+require_once $maintPath . '/backup.inc';
+require_once $maintPath . '/backupTextPass.inc';
+
+class FlowTextPassDumper extends TextPassDumper {
+ protected $firstBoardWritten = false;
+ protected $lastBoardWritten = false;
+
+ /**
+ * @param array|null $args For backward compatibility
+ */
+ function __construct( $args = null ) {
+ parent::__construct();
+
+ $this->addDescription( <<<TEXT
+This script postprocesses Flow XML dumps from dumpBackup.php to add
+revision text which was stubbed out (using --stub).
+
+XML input is accepted on stdin.
+XML output is sent to stdout; progress reports are sent to stderr.
+TEXT
+ );
+ $this->stderr = fopen( "php://stderr", "wt" );
+
+ $this->addOption( 'stub', 'To load a compressed stub dump
instead of stdin. ' .
+ 'Specify as --stub=<type>:<file>.', false, true );
+ $this->addOption( 'prefetch', 'Use a prior dump file as a text
source, to savepressure on the ' .
+ 'database. (Requires the XMLReader extension). Specify
as --prefetch=<type>:<file>',
+ false, true );
+ $this->addOption( 'quiet', 'Don\'t dump status reports to
stderr.' );
+ $this->addOption( 'current', 'Base ETA on number of boards in
database instead of all revisions' );
+ $this->addOption( 'buffersize', 'Buffer size in bytes to use
for reading the stub. ' .
+ '(Default: 512KB, Minimum: 4KB)', false, true );
+
+ if ( $args ) {
+ $this->loadWithArgv( $args );
+ $this->processOptions();
+ }
+ }
+
+
+ function processOptions() {
+ parent::processOptions();
+
+ if ( $this->hasOption( 'prefetch' ) ) {
+ require_once $maintPath .
'$IP/extensions/Flow/maintenance/backupPrefetch.inc';
+ $url = $this->processFileOpt( $this->getOption(
'prefetch' ) );
+ $this->prefetch = new FlowBaseDump( $url );
+ }
+
+ }
+
+ function dump( $history, $text = WikiExporter::TEXT ) {
+ // Notice messages will foul up your XML output even if they're
+ // relatively harmless.
+ if ( ini_get( 'display_errors' ) ) {
+ ini_set( 'display_errors', 'stderr' );
+ }
+
+ // Open a database connection up front so that the first call to
+ // getFlowText in this run does not fail on its first attempt.
+ // If no good DB connection can be obtained here, that is not a
+ // serious problem: getFlowText retries upon failure and can start
+ // without a working DB connection.
+ try {
+ $this->rotateDb();
+ } catch ( Exception $e ) {
+ // We do not even count this as failure. Just let
eventual
+ // watchdogs know.
+ $this->progress( "Getting initial DB connection failed
(" .
+ $e->getMessage() . ")" );
+ }
+
+ $this->egress = $this->sink;
+ $input = fopen( $this->input, "rt" );
+ $this->readDump( $input );
+
+ }
+
+ /**
+ * @throws MWException Failure to parse XML input
+ * @param resource $input Open stream handle to read the XML input from
+ * @return bool
+ */
+ function readDump( $input ) {
+ $this->thisBoard = 0;
+ return parent::readDump( $input );
+ }
+
+ /**
+ * Tries to get the revision text for a revision id (UUID).
+ *
+ * Upon errors, retries (Up to $this->maxFailures tries each call).
+ * If still no good revision text could be found even after this
retrying, "" is returned.
+ * If no good revision text could be returned for
+ * $this->maxConsecutiveFailedTextRetrievals consecutive calls to
getFlowText, MWException
+ * is thrown.
+ *
+ * @param string $id The revision id to get the text for
+ *
+ * @throws MWException
+ * @return string The revision text for $id, or ""
+ */
+ function getFlowText( $id ) {
+ global $wgContentHandlerUseDB;
+ global $wgContLang;
+
+ $prefetchNotTried = true; // Whether or not we already tried to
get the text via prefetch.
+ $text = false; // The candidate for a good text. false if no
proper value.
+ $failures = 0; // The number of times, this invocation of
getFlowText already failed.
+
+ // The number of times getFlowText failed without yielding a
good text in between.
+ static $consecutiveFailedTextRetrievals = 0;
+
+ $this->fetchCount++;
+
+ // To be able to simply return on success without any book keeping,
+ // we assume this fetch works (possibly after some retries). Nevertheless, we keep
+ // the old value, so we can restore it if problems occur (see after the while loop).
+ $oldConsecutiveFailedTextRetrievals =
$consecutiveFailedTextRetrievals;
+ $consecutiveFailedTextRetrievals = 0;
+
+ // FIXME how do we set this?? we want to get an
AbstractRevision object based on the UUID. So?
+ $revision = null;
+ $format = $revision->getContentFormat();
+
+ while ( $failures < $this->maxFailures ) {
+
+ // As soon as we found a good text for the $id, we will
return immediately.
+ // Hence, if we make it past the try catch block, we
know that we did not
+ // find a good text.
+
+ try {
+ // Utterly untested, FIXME
+ // Trying to get prefetch, if it has not been
tried before
+ if ( $text === false && isset( $this->prefetch
) && $prefetchNotTried ) {
+ $prefetchNotTried = false;
+ $tryIsPrefetch = true;
+ $boardId = UUID::create(
$this->thisBoard );
+ $text = $this->prefetch->prefetch(
$boardId->getHex(),
+ intval( $this->thisRev ) );
+
+ if ( $text === null ) {
+ $text = false;
+ }
+ }
+
+ if ( $text === false ) {
+ // Fallback to asking the database
+ $tryIsPrefetch = false;
+
+ $text = $revision->getContent( $format
);
+ if ( $text !== false ) {
+ return $text;
+ }
+ }
+
+ if ( $text === false ) {
+ throw new MWException( "Generic error
while obtaining text for id " . $id );
+ }
+
+ if ( $tryIsPrefetch ) {
+ $this->prefetchCount++;
+ }
+ return $text;
+ } catch ( Exception $e ) {
+ $msg = "getting/checking text " . $id . "
failed (" . $e->getMessage() . ")";
+ if ( $failures + 1 < $this->maxFailures ) {
+ $msg .= " (Will retry " . (
$this->maxFailures - $failures - 1 ) . " more times)";
+ }
+ $this->progress( $msg );
+ }
+
+ // Something went wrong; we did not get a text that was
plausible :(
+ $failures++;
+
+ // A failure in a prefetch hit does not warrant
resetting db connection etc.
+ if ( !$tryIsPrefetch ) {
+ // After backing off for some time, we try to
reboot the whole process as
+ // much as possible to not carry over failures
from one part to the other
+ // parts
+ sleep( $this->failureTimeout );
+ try {
+ $this->rotateDb();
+ } catch ( Exception $e ) {
+ $this->progress( "Rebooting getFlowText
infrastructure failed (" . $e->getMessage() . ")" .
+ " Trying to continue anyways" );
+ }
+ }
+ }
+
+ // Retrieving a good text for $id failed (at least) maxFailures
times.
+ // We abort for this $id.
+
+ // Restoring the consecutive failures, and maybe aborting, if
the dump
+ // is too broken.
+ $consecutiveFailedTextRetrievals =
$oldConsecutiveFailedTextRetrievals + 1;
+ if ( $consecutiveFailedTextRetrievals >
$this->maxConsecutiveFailedTextRetrievals ) {
+ throw new MWException( "Graceful storage failure" );
+ }
+
+ return "";
+ }
+
+ function writeOpenBoard() {
+ // horrible but avoids adding Flow-specific methods to
DumpOutput in core
+ $this->sink->writeOpenPage( null, $this->buffer );
+ }
+
+ function writeCloseBoard() {
+ // horrible but avoids adding Flow-specific methods to
DumpOutput in core
+ $this->sink->writeclosePage( $this->buffer );
+ }
+
+ function startElement( $parser, $name, $attribs ) {
+
+ if ( $name == 'revision' ) {
+ $this->clearOpenElement( null );
+ $this->lastName = $name;
+ $this->state = $name;
+ $this->writeOpenBoard( null, $this->buffer );
+ $this->buffer = "";
+ if ( isset( $attribs['id'] ) ) {
+ $id = $attribs['id'];
+ $text = $this->getFlowText( $id );
+ $this->openElement = [ $name, $attribs ];
+ if ( strlen( $text ) > 0 ) {
+ # FIXME this needs conversion in the
routine or after
+ $this->characterData( $parser, $text );
+ }
+ }
+ } elseif ( $name == 'board' ) {
+ $this->clearOpenElement( null );
+ $this->lastName = $name;
+ $this->state = $name;
+ if ( $this->atStart ) {
+ $this->sink->writeOpenStream( $this->buffer );
+ $this->buffer = "";
+ $this->atStart = false;
+ }
+ $this->openElement = [ $name, $attribs ];
+ } else {
+ parent::startElement( $parser, $name, $attribs );
+ }
+ }
+
+ function endElement( $parser, $name ) {
+ if ( $name == 'board' ) {
+ if ( $this->openElement ) {
+ $this->clearOpenElement( "" );
+ } else {
+ $this->buffer .= "</$name>";
+ }
+ if ( !$this->firstBoardWritten ) {
+ $this->firstBoardWritten = trim(
$this->thisBoard );
+ }
+ $this->lastBoardWritten = trim( $this->thisBoard );
+ $this->writeCloseBoard( $this->buffer );
+ $this->buffer = "";
+ $this->thisPage = "";
+ } else {
+ parent::endElement( $parser, $name );
+ }
+ }
+
+ function characterData( $parser, $data ) {
+ if ( $this->lastName == "id" ) {
+ if ( $this->state == "board" ) {
+ $this->clearOpenElement( null );
+ $this->thisBoard .= $data;
+ $this->buffer .= htmlspecialchars( $data );
+ return;
+ }
+ }
+ parent::characterData( $parser, $data );
+ }
+}
+
+$maintClass = 'FlowTextPassDumper';
+require_once RUN_MAINTENANCE_IF_MAIN;
--
To view, visit https://gerrit.wikimedia.org/r/355080
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I787a26ff6004a875b71ef38905904b7c489f22d4
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Flow
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits