jenkins-bot has submitted this change and it was merged.

Change subject: Dump Flow data
......................................................................


Dump Flow data

A big chunk of this patch is a refactoring of the search updaters.
I wanted to reuse the code that already iterates over all headers
& topics, but it was unwieldy. I split that into separate classes
that are now Iterator objects.

Other changes:
* Fixed Flow\Search\Connection, which could no longer call
parent::__construct since that no longer exists
* Flow\Container couldn’t be autoloaded from maintenance
script's constructor (autoload not yet initialized, I assume)
* FlowFixWorkflowLastUpdateTimestamp.php was failing because
getBinary now returns a UUIDBlob instead of the plain binary string

Bug: T89398
Change-Id: I52bc7c0ce7813a78f9006ca4b7d931a905726c05
---
M autoload.php
M container.php
A includes/Dump/Exporter.php
A includes/Search/Iterators/AbstractIterator.php
A includes/Search/Iterators/HeaderIterator.php
A includes/Search/Iterators/TopicIterator.php
D includes/Search/TopicUpdater.php
D includes/Search/Updater.php
A includes/Search/Updaters/AbstractUpdater.php
R includes/Search/Updaters/HeaderUpdater.php
A includes/Search/Updaters/TopicUpdater.php
M maintenance/FlowFixWorkflowLastUpdateTimestamp.php
M maintenance/FlowForceSearchIndex.php
A maintenance/dumpBackup.php
14 files changed, 1,133 insertions(+), 482 deletions(-)

Approvals:
  Mattflaschen: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/autoload.php b/autoload.php
index 91d368e..5dfa95f 100644
--- a/autoload.php
+++ b/autoload.php
@@ -114,6 +114,7 @@
        'Flow\\Data\\Utils\\SortRevisionsByRevisionId' => __DIR__ . 
'/includes/Data/Utils/SortRevisionsByRevisionId.php',
        'Flow\\Data\\Utils\\UserMerger' => __DIR__ . 
'/includes/Data/Utils/UserMerger.php',
        'Flow\\DbFactory' => __DIR__ . '/includes/DbFactory.php',
+       'Flow\\Dump\\Exporter' => __DIR__ . '/includes/Dump/Exporter.php',
        'Flow\\Exception\\CatchableFatalErrorException' => __DIR__ . 
'/includes/Exception/CatchableFatalErrorException.php',
        'Flow\\Exception\\CrossWikiException' => __DIR__ . 
'/includes/Exception/ExceptionHandling.php',
        'Flow\\Exception\\DataModelException' => __DIR__ . 
'/includes/Exception/ExceptionHandling.php',
@@ -279,12 +280,15 @@
        'Flow\\Repository\\UserName\\UserNameQuery' => __DIR__ . 
'/includes/Repository/UserName/UserNameQuery.php',
        'Flow\\RevisionActionPermissions' => __DIR__ . 
'/includes/RevisionActionPermissions.php',
        'Flow\\Search\\Connection' => __DIR__ . 
'/includes/Search/Connection.php',
-       'Flow\\Search\\HeaderUpdater' => __DIR__ . 
'/includes/Search/HeaderUpdater.php',
+       'Flow\\Search\\Iterators\\AbstractIterator' => __DIR__ . 
'/includes/Search/Iterators/AbstractIterator.php',
+       'Flow\\Search\\Iterators\\HeaderIterator' => __DIR__ . 
'/includes/Search/Iterators/HeaderIterator.php',
+       'Flow\\Search\\Iterators\\TopicIterator' => __DIR__ . 
'/includes/Search/Iterators/TopicIterator.php',
        'Flow\\Search\\Maintenance\\MappingConfigBuilder' => __DIR__ . 
'/includes/Search/maintenance/MappingConfigBuilder.php',
        'Flow\\Search\\SearchEngine' => __DIR__ . 
'/includes/Search/SearchEngine.php',
        'Flow\\Search\\Searcher' => __DIR__ . '/includes/Search/Searcher.php',
-       'Flow\\Search\\TopicUpdater' => __DIR__ . 
'/includes/Search/TopicUpdater.php',
-       'Flow\\Search\\Updater' => __DIR__ . '/includes/Search/Updater.php',
+       'Flow\\Search\\Updaters\\AbstractUpdater' => __DIR__ . 
'/includes/Search/Updaters/AbstractUpdater.php',
+       'Flow\\Search\\Updaters\\HeaderUpdater' => __DIR__ . 
'/includes/Search/Updaters/HeaderUpdater.php',
+       'Flow\\Search\\Updaters\\TopicUpdater' => __DIR__ . 
'/includes/Search/Updaters/TopicUpdater.php',
        'Flow\\SpamFilter\\AbuseFilter' => __DIR__ . 
'/includes/SpamFilter/AbuseFilter.php',
        'Flow\\SpamFilter\\ConfirmEdit' => __DIR__ . 
'/includes/SpamFilter/ConfirmEdit.php',
        'Flow\\SpamFilter\\ContentLengthFilter' => __DIR__ . 
'/includes/SpamFilter/ContentLengthFilter.php',
diff --git a/container.php b/container.php
index 31dabde..6eb11b0 100644
--- a/container.php
+++ b/container.php
@@ -1032,12 +1032,18 @@
        global $wgFlowSearchServers, $wgFlowSearchConnectionAttempts;
        return new Flow\Search\Connection( $wgFlowSearchServers, 
$wgFlowSearchConnectionAttempts );
 };
+$c['search.index.iterators.header'] = function( $c ) {
+       return new \Flow\Search\Iterators\HeaderIterator( $c['db.factory'] );
+};
+$c['search.index.iterators.topic'] = function( $c ) {
+       return new \Flow\Search\Iterators\TopicIterator( $c['db.factory'], 
$c['loader.root_post'] );
+};
 $c['search.index.updaters'] = function( $c ) {
        // permissions for anon user
        $anonPermissions = new Flow\RevisionActionPermissions( 
$c['flow_actions'], new User );
        return array(
-               'topic' => new \Flow\Search\TopicUpdater( $c['db.factory'], 
$anonPermissions,  $c['loader.root_post'] ),
-               'header' => new \Flow\Search\HeaderUpdater( $c['db.factory'], 
$anonPermissions )
+               'topic' => new \Flow\Search\Updaters\TopicUpdater( 
$c['search.index.iterators.topic'], $anonPermissions, $c['loader.root_post'] ),
+               'header' => new \Flow\Search\Updaters\HeaderUpdater( 
$c['search.index.iterators.header'], $anonPermissions )
        );
 };
 
diff --git a/includes/Dump/Exporter.php b/includes/Dump/Exporter.php
new file mode 100644
index 0000000..4002475
--- /dev/null
+++ b/includes/Dump/Exporter.php
@@ -0,0 +1,409 @@
+<?php
+
+namespace Flow\Dump;
+
+use BatchRowIterator;
+use DatabaseBase;
+use Exception;
+use Flow\Collection\PostSummaryCollection;
+use Flow\Container;
+use Flow\Data\ManagerGroup;
+use Flow\Model\AbstractRevision;
+use Flow\Model\Header;
+use Flow\Model\PostRevision;
+use Flow\Model\PostSummary;
+use Flow\Model\UUID;
+use Flow\Model\Workflow;
+use Flow\RevisionActionPermissions;
+use Flow\Search\Iterators\AbstractIterator;
+use Flow\Search\Iterators\HeaderIterator;
+use Flow\Search\Iterators\TopicIterator;
+use ReflectionProperty;
+use TimestampException;
+use User;
+use WikiExporter;
+use Xml;
+
+class Exporter extends WikiExporter {
+       /**
+        * Map of [db column name => xml attribute name]
+        *
+        * @var array
+        */
+       public static $map = array(
+               'rev_id' => 'id',
+               'rev_user_id' => 'userid',
+               'rev_user_ip' => 'userip',
+               'rev_user_wiki' => 'userwiki',
+               'rev_parent_id' => 'parentid',
+               'rev_change_type' => 'changetype',
+               'rev_type' => 'type',
+               'rev_type_id' => 'typeid',
+               'rev_content' => 'content',
+               'rev_content_url' => 'contenturl',
+               'rev_flags' => 'flags',
+               'rev_mod_state' => 'modstate',
+               'rev_mod_user_id' => 'moduserid',
+               'rev_mod_user_ip' => 'moduserip',
+               'rev_mod_user_wiki' => 'moduserwiki',
+               'rev_mod_timestamp' => 'modtimestamp',
+               'rev_mod_reason' => 'modreason',
+               'rev_last_edit_id' => 'lasteditid',
+               'rev_edit_user_id' => 'edituserid',
+               'rev_edit_user_ip' => 'edituserip',
+               'rev_edit_user_wiki' => 'edituserwiki',
+               'rev_content_length' => 'contentlength',
+               'rev_previous_content_length' => 'previouscontentlength',
+
+               'tree_parent_id' => 'treeparentid',
+               'tree_rev_descendant_id' => 'treedescendantid',
+               'tree_rev_id' => 'treerevid',
+               'tree_orig_user_id' => 'treeoriguserid',
+               'tree_orig_user_ip' => 'treeoriguserip',
+               'tree_orig_user_wiki' => 'treeoriguserwiki',
+       );
+
+       /**
+        * @var ReflectionProperty $prevRevisionProperty Previous revision property
+        */
+       protected $prevRevisionProperty;
+
+       /**
+        * @var ReflectionProperty $changeTypeProperty Change type property
+        */
+       protected $changeTypeProperty;
+
+       /**
+        * Builds the exporter and prepares reflection access to two
+        * AbstractRevision properties (prevRevision & changeType), which
+        * formatRevisions() overwrites on revisions that follow a gap left by
+        * revisions that may not be dumped.
+        *
+        * All arguments are handed to WikiExporter::__construct() unchanged.
+        *
+        * @param mixed $db
+        * @param mixed $history One of the WikiExporter history constants
+        * @param mixed $buffer
+        * @param mixed $text
+        */
+       public function __construct( $db, $history = WikiExporter::CURRENT,
+               $buffer = WikiExporter::BUFFER, $text = WikiExporter::TEXT ) {
+
+               parent::__construct( $db, $history, $buffer, $text );
+
+               // both properties are written to from outside the revision object,
+               // hence setAccessible()
+               $this->prevRevisionProperty = new ReflectionProperty( 'Flow\Model\AbstractRevision', 'prevRevision' );
+               $this->prevRevisionProperty->setAccessible( true );
+
+               $this->changeTypeProperty = new ReflectionProperty( 'Flow\Model\AbstractRevision', 'changeType' );
+               $this->changeTypeProperty->setAccessible( true );
+       }
+
+       /**
+        * @return string Version written to the `version` attribute of the
+        *  root element by openStream()
+        */
+       public static function schemaVersion() {
+               $version = '1';
+
+               return $version;
+       }
+
+       /**
+        * Writes the opening <mediawiki> root element of the dump to the sink.
+        */
+       public function openStream() {
+               global $wgLanguageCode;
+
+               // @todo: update after creating schema - the root element should then
+               // also carry xmlns (http://www.mediawiki.org/xml/export-$version/),
+               // xmlns:xsi (http://www.w3.org/2001/XMLSchema-instance) and
+               // xsi:schemaLocation (namespace + export-$version.xsd) attributes
+               $rootAttributes = array(
+                       'version' => static::schemaVersion(),
+                       'xml:lang' => $wgLanguageCode
+               );
+
+               $this->sink->write( Xml::openElement( 'mediawiki', $rootAttributes ) . "\n" );
+       }
+
+       /**
+        * Builds a batched iterator (300 rows per batch) over the `discussion`
+        * workflows of the current wiki, optionally limited to certain pages
+        * and/or a range of page ids.
+        *
+        * @param string[]|null $pages Array of DB-prefixed page titles
+        * @param int|null $startId page_id to start from (inclusive)
+        * @param int|null $endId page_id to end (exclusive)
+        * @return BatchRowIterator
+        * @throws Exception When one of $pages can not be turned into a title
+        */
+       public function getWorkflowIterator( array $pages = null, $startId = null, $endId = null ) {
+               /** @var DatabaseBase $dbr */
+               $dbr = Container::get( 'db.factory' )->getDB( DB_SLAVE );
+
+               $iterator = new BatchRowIterator( $dbr, 'flow_workflow', 'workflow_id', 300 );
+               $iterator->setFetchColumns( array( '*' ) );
+               $iterator->addConditions( array( 'workflow_wiki' => wfWikiId() ) );
+               $iterator->addConditions( array( 'workflow_type' => 'discussion' ) );
+
+               if ( $pages ) {
+                       $pageConds = array();
+                       foreach ( $pages as $page ) {
+                               $title = \Title::newFromDBkey( $page );
+                               if ( !$title ) {
+                                       // newFromDBkey() gives null for input it can't parse;
+                                       // fail with a clear message instead of a fatal
+                                       // "call to a member function on null" below
+                                       throw new Exception( "Invalid page title: $page" );
+                               }
+
+                               // (namespace AND title) per page ...
+                               $pageConds[] = $dbr->makeList(
+                                       array(
+                                               'workflow_namespace' => $title->getNamespace(),
+                                               'workflow_title_text' => $title->getDBkey()
+                                       ),
+                                       LIST_AND
+                               );
+                       }
+
+                       // ... and any one of the pages may match
+                       $iterator->addConditions( array( $dbr->makeList( $pageConds, LIST_OR ) ) );
+               }
+               if ( $startId ) {
+                       $iterator->addConditions( array( 'workflow_page_id >= ' . $dbr->addQuotes( $startId ) ) );
+               }
+               if ( $endId ) {
+                       $iterator->addConditions( array( 'workflow_page_id < ' . $dbr->addQuotes( $endId ) ) );
+               }
+
+               return $iterator;
+       }
+
+       /**
+        * Dumps every workflow the given iterator yields: for each workflow row,
+        * the header & topic iterators are pointed at that workflow's page and
+        * at the requested revision range, then handed to formatWorkflow().
+        *
+        * @param BatchRowIterator $workflowIterator
+        * @param UUID|null $revStartId
+        * @param UUID|null $revEndId
+        * @throws Exception
+        * @throws TimestampException
+        * @throws \Flow\Exception\InvalidInputException
+        */
+       public function dump( BatchRowIterator $workflowIterator, $revStartId = null, $revEndId = null ) {
+               foreach ( $workflowIterator as $batch ) {
+                       foreach ( $batch as $workflowRow ) {
+                               $workflow = Workflow::fromStorageRow( (array) $workflowRow );
+
+                               /** @var AbstractIterator[] $iterators */
+                               $iterators = array(
+                                       Container::get( 'search.index.iterators.header' ),
+                                       Container::get( 'search.index.iterators.topic' ),
+                               );
+                               foreach ( $iterators as $iterator ) {
+                                       $iterator->setPage( $workflowRow->workflow_page_id );
+                                       $iterator->setFrom( $revStartId );
+                                       $iterator->setTo( $revEndId );
+                               }
+
+                               list( $headerIterator, $topicIterator ) = $iterators;
+                               $this->formatWorkflow( $workflow, $headerIterator, $topicIterator );
+                       }
+               }
+       }
+
+       /**
+        * Writes one <board> element holding the workflow's header(s) followed
+        * by its topics. Deleted workflows produce no output at all.
+        *
+        * @param Workflow $workflow
+        * @param HeaderIterator $headerIterator
+        * @param TopicIterator $topicIterator
+        */
+       protected function formatWorkflow( Workflow $workflow, HeaderIterator $headerIterator, TopicIterator $topicIterator ) {
+               if ( $workflow->isDeleted() ) {
+                       return;
+               }
+
+               $boardAttribs = array(
+                       'id' => $workflow->getId()->getAlphadecimal(),
+                       'title' => $workflow->getOwnerTitle()->getPrefixedDBkey(),
+               );
+               $this->sink->write( Xml::openElement( 'board', $boardAttribs ) . "\n" );
+
+               /** @var Header $header */
+               foreach ( $headerIterator as $header ) {
+                       $this->formatHeader( $header );
+               }
+               /** @var PostRevision $topic */
+               foreach ( $topicIterator as $topic ) {
+                       $this->formatTopic( $topic );
+               }
+
+               $this->sink->write( Xml::closeElement( 'board' ) . "\n" );
+       }
+
+       /**
+        * Writes a <topic> element: the topic's post tree, followed by the
+        * topic summary when one can be loaded. Nothing is written when anon
+        * users may not view the revision.
+        *
+        * @param PostRevision $revision
+        */
+       protected function formatTopic( PostRevision $revision ) {
+               if ( !$this->isAllowed( $revision ) ) {
+                       return;
+               }
+
+               $topicAttribs = array(
+                       'id' => $revision->getCollectionId()->getAlphadecimal(),
+               );
+               $this->sink->write( Xml::openElement( 'topic', $topicAttribs ) . "\n" );
+
+               $this->formatPost( $revision );
+
+               // the summary lives in its own collection, keyed by the topic's id
+               $summaries = PostSummaryCollection::newFromId( $revision->getCollectionId() );
+               try {
+                       /** @var PostSummary $latestSummary */
+                       $latestSummary = $summaries->getLastRevision();
+                       $this->formatSummary( $latestSummary );
+               } catch ( \Exception $e ) {
+                       // no summary - that's ok!
+               }
+
+               $this->sink->write( Xml::closeElement( 'topic' ) . "\n" );
+       }
+
+       /**
+        * Writes a <description> element containing the header's revisions.
+        * Nothing is written when anon users may not view the revision.
+        *
+        * @param Header $revision
+        */
+       protected function formatHeader( Header $revision ) {
+               if ( !$this->isAllowed( $revision ) ) {
+                       return;
+               }
+
+               $descriptionAttribs = array(
+                       'id' => $revision->getCollectionId()->getAlphadecimal()
+               );
+               $this->sink->write( Xml::openElement( 'description', $descriptionAttribs ) . "\n" );
+
+               $this->formatRevisions( $revision );
+
+               $this->sink->write( Xml::closeElement( 'description' ) . "\n" );
+       }
+
+       /**
+        * Writes a <post> element with the post's revisions and, recursively,
+        * its child posts wrapped in a <children> element. Nothing is written
+        * when anon users may not view the revision.
+        *
+        * @param PostRevision $revision
+        */
+       protected function formatPost( PostRevision $revision ) {
+               if ( !$this->isAllowed( $revision ) ) {
+                       return;
+               }
+
+               $postAttribs = array(
+                       'id' => $revision->getCollectionId()->getAlphadecimal()
+               );
+               $this->sink->write( Xml::openElement( 'post', $postAttribs ) . "\n" );
+
+               $this->formatRevisions( $revision );
+
+               $children = $revision->getChildren();
+               if ( $children ) {
+                       $this->sink->write( Xml::openElement( 'children' ) . "\n" );
+
+                       foreach ( $children as $reply ) {
+                               $this->formatPost( $reply );
+                       }
+
+                       $this->sink->write( Xml::closeElement( 'children' ) . "\n" );
+               }
+
+               $this->sink->write( Xml::closeElement( 'post' ) . "\n" );
+       }
+
+       /**
+        * Writes a <summary> element containing the summary's revisions.
+        * Nothing is written when anon users may not view the revision.
+        *
+        * @param PostSummary $revision
+        */
+       protected function formatSummary( PostSummary $revision ) {
+               if ( !$this->isAllowed( $revision ) ) {
+                       return;
+               }
+
+               $summaryAttribs = array(
+                       'id' => $revision->getCollectionId()->getAlphadecimal()
+               );
+               $this->sink->write( Xml::openElement( 'summary', $summaryAttribs ) . "\n" );
+
+               $this->formatRevisions( $revision );
+
+               $this->sink->write( Xml::closeElement( 'summary' ) . "\n" );
+       }
+
+       /**
+        * Writes a <revisions> element for the collection $revision belongs to.
+        *
+        * - WikiExporter::FULL: every revision of the collection that anon users
+        *   may view is written, in the reverse of the order in which
+        *   getAllRevisions() returns them. Revisions that may not be viewed are
+        *   left out and the revision tree is patched up around them.
+        * - WikiExporter::CURRENT: a single synthetic revision is written (see
+        *   inline comment).
+        *
+        * For any other history setting, the element stays empty.
+        *
+        * @param AbstractRevision $revision
+        */
+       protected function formatRevisions( AbstractRevision $revision ) {
+               $this->sink->write( Xml::openElement( 'revisions' ) . "\n" );
+
+               $collection = $revision->getCollection();
+               if ( $this->history === WikiExporter::FULL ) {
+                       /** @var AbstractRevision[] $history */
+                       $history = array_reverse( $collection->getAllRevisions() );
+
+                       // parent id of the first revision in a run of skipped revisions;
+                       // null while there is no gap to bridge
+                       $pendingParentId = null;
+
+                       foreach ( $history as $item ) {
+                               if ( !$this->isAllowed( $item ) ) {
+                                       // revision can't be dumped: remember its parent id so
+                                       // it can be re-applied to the next revision that can
+                                       // be dumped, so we don't have gaps
+                                       if ( $pendingParentId === null ) {
+                                               $pendingParentId = $item->getPrevRevisionId();
+                                       }
+                                       continue;
+                               }
+
+                               if ( $pendingParentId !== null ) {
+                                       // override parent id: this gets rid of gaps caused by
+                                       // moderated items, where the revision tree would
+                                       // otherwise be incorrect
+                                       $this->prevRevisionProperty->setValue( $item, $pendingParentId );
+
+                                       // A pending parent id means there was a gap, and the
+                                       // original hide-topic/delete-topic/suppress-topic was
+                                       // removed. Since that is used for listeners in
+                                       // FlowActions.php, restore-topic is replaced with
+                                       // edit-title (and restore-post with edit-post), which
+                                       // makes this a null edit. We don't do null edits in
+                                       // the normal application flow, but this provides a
+                                       // way to replace restore.
+                                       $originalChangeType = $item->getChangeType();
+                                       if ( $originalChangeType === 'restore-topic' ) {
+                                               $this->changeTypeProperty->setValue( $item, 'edit-title' );
+                                       } elseif ( $originalChangeType === 'restore-post' ) {
+                                               $this->changeTypeProperty->setValue( $item, 'edit-post' );
+                                       }
+
+                                       $pendingParentId = null;
+                               }
+
+                               $this->formatRevision( $item );
+                       }
+               } elseif ( $this->history === WikiExporter::CURRENT ) {
+                       $firstRevision = $collection->getFirstRevision();
+
+                       // storing only the last revision won't work (it'll reference
+                       // non-existing parents): we'll construct a bogus revision with
+                       // most of the original metadata, but with the current content &
+                       // id (= timestamp)
+                       $mixedRow = $firstRevision->toStorageRow( $firstRevision );
+                       $currentRow = $revision->toStorageRow( $revision );
+                       $mixedRow['rev_id'] = $currentRow['rev_id'];
+                       $mixedRow['rev_content'] = $currentRow['rev_content'];
+                       $mixedRow['rev_flags'] = $currentRow['rev_flags'];
+                       if ( isset( $mixedRow['tree_rev_id'] ) ) {
+                               // PostRevision-only: tree_rev_id must match rev_id
+                               $mixedRow['tree_rev_id'] = $mixedRow['rev_id'];
+                       }
+
+                       // clear buffered cache, to make sure it doesn't serve the existing
+                       // (already loaded) revision when trying to turn our bogus mixed
+                       // data into a revision
+                       /** @var ManagerGroup $storage */
+                       $storage = Container::get( 'storage' );
+                       $storage->clear();
+
+                       $this->formatRevision( $revision->fromStorageRow( $mixedRow ) );
+               }
+
+               $this->sink->write( Xml::closeElement( 'revisions' ) . "\n" );
+       }
+
+       /**
+        * Writes a single <revision> element: the revision's storage columns
+        * (renamed according to static::$map) become attributes, the revision
+        * content becomes the node text. Nothing is written when anon users may
+        * not view the revision.
+        *
+        * @param AbstractRevision $revision
+        */
+       protected function formatRevision( AbstractRevision $revision ) {
+               if ( !$this->isAllowed( $revision ) ) {
+                       return;
+               }
+
+               $row = $revision->toStorageRow( $revision );
+
+               // [column => attribute name], limited to columns the row really has
+               $attributeNames = array_intersect_key( static::$map, $row );
+               // the row's values for exactly those columns, in the same order as
+               // $attributeNames (array_merge fixes the order, array_intersect_key
+               // drops whatever column isn't mapped)
+               $columnValues = array_intersect_key( array_merge( $attributeNames, $row ), $attributeNames );
+               // [attribute name => value]
+               $attribs = array_combine( $attributeNames, $columnValues );
+
+               // references to external store etc. are useless; we'll include the
+               // real content as node text
+               unset( $attribs['content'], $attribs['contenturl'] );
+               $format = $revision->getContentFormat();
+               $attribs['flags'] = 'utf-8,' . $format;
+
+               $this->sink->write(
+                       Xml::element( 'revision', $attribs, $revision->getContent( $format ) ) . "\n"
+               );
+       }
+
+       /**
+        * Checks the `view` permission on a revision for the user with id 0
+        * (i.e. an anon user); only revisions that pass are dumped.
+        *
+        * @param AbstractRevision $revision
+        * @return bool
+        */
+       protected function isAllowed( AbstractRevision $revision ) {
+               $anon = User::newFromId( 0 );
+               $anonPermissions = new RevisionActionPermissions( Container::get( 'flow_actions' ), $anon );
+
+               return $anonPermissions->isAllowed( $revision, 'view' );
+       }
+}
diff --git a/includes/Search/Iterators/AbstractIterator.php 
b/includes/Search/Iterators/AbstractIterator.php
new file mode 100644
index 0000000..1e19c30
--- /dev/null
+++ b/includes/Search/Iterators/AbstractIterator.php
@@ -0,0 +1,190 @@
+<?php
+
+namespace Flow\Search\Iterators;
+
+use DatabaseBase;
+use Flow\Container;
+use Flow\Data\ManagerGroup;
+use Flow\DbFactory;
+use Flow\Exception\InvalidDataException;
+use Flow\Model\AbstractRevision;
+use Flow\Model\UUID;
+use Iterator;
+use ResultWrapper;
+use stdClass;
+
+abstract class AbstractIterator implements Iterator {
+       /**
+        * @var DatabaseBase
+        */
+       protected $dbr;
+
+       /**
+        * @var array
+        */
+       protected $conditions = array();
+
+       /**
+        * @var ResultWrapper|null
+        */
+       protected $results;
+
+       /**
+        * Depending on where we are in the iteration, this can be null (object
+        * constructed but not yet being iterated over), AbstractRevision (being
+        * iterated) or false (end of iteration, no more revisions)
+        *
+        * @var AbstractRevision|null|false
+        */
+       protected $current;
+
+       /**
+        * Depending on where we are in the iteration, this can be integer 
(object
+        * being iterated over) or null (iteration not yet started, or 
completed)
+        *
+        * @var int|null
+        */
+       protected $key;
+
+       /**
+        * Grabs a DB_SLAVE connection and restricts the iteration to workflows
+        * of the current wiki.
+        *
+        * @param DbFactory $dbFactory
+        */
+       public function __construct( DbFactory $dbFactory ) {
+               $this->conditions = array( 'workflow_wiki' => wfWikiId() );
+               $this->dbr = $dbFactory->getDB( DB_SLAVE );
+       }
+
+       /**
+        * Runs the query that selects the rows to iterate over, taking
+        * $this->conditions into account.
+        *
+        * Called lazily from next(), whenever there is no result set (yet, or
+        * anymore because a setter or rewind() discarded it). Every row of the
+        * result is passed to transform(), which expects it to have at least
+        * the properties `rev_id` & `rev_type`.
+        *
+        * @return bool|ResultWrapper
+        */
+       abstract protected function query();
+
+       /**
+        * Limits the iteration to the given page id(s); null lifts the limit.
+        * Any result set fetched so far is discarded.
+        *
+        * @param array|int|null $pageId
+        */
+       public function setPage( $pageId = null ) {
+               unset( $this->conditions['workflow_page_id'] );
+               if ( $pageId !== null ) {
+                       $this->conditions['workflow_page_id'] = $pageId;
+               }
+
+               // conditions changed: force a new query on next()
+               $this->results = null;
+       }
+
+       /**
+        * Limits the iteration to the given namespace; null lifts the limit.
+        * Any result set fetched so far is discarded.
+        *
+        * @param int|null $namespace
+        */
+       public function setNamespace( $namespace = null ) {
+               unset( $this->conditions['workflow_namespace'] );
+               if ( $namespace !== null ) {
+                       $this->conditions['workflow_namespace'] = $namespace;
+               }
+
+               // conditions changed: force a new query on next()
+               $this->results = null;
+       }
+
+       /**
+        * Define where to start iterating (inclusive); null removes the lower
+        * bound. Any result set fetched so far is discarded.
+        *
+        * The condition is kept at index 0 of $this->conditions.
+        *
+        * @param UUID|null $revId
+        */
+       public function setFrom( UUID $revId = null ) {
+               unset( $this->conditions[0] );
+               if ( $revId !== null ) {
+                       $lowerBound = $this->dbr->addQuotes( $revId->getBinary() );
+                       $this->conditions[0] = 'rev_id >= ' . $lowerBound;
+               }
+
+               // conditions changed: force a new query on next()
+               $this->results = null;
+       }
+
+       /**
+        * Define where to stop iterating (exclusive); null removes the upper
+        * bound. Any result set fetched so far is discarded.
+        *
+        * The condition is kept at index 1 of $this->conditions.
+        *
+        * @param UUID|null $revId
+        */
+       public function setTo( UUID $revId = null ) {
+               unset( $this->conditions[1] );
+               if ( $revId !== null ) {
+                       $upperBound = $this->dbr->addQuotes( $revId->getBinary() );
+                       $this->conditions[1] = 'rev_id < ' . $upperBound;
+               }
+
+               // conditions changed: force a new query on next()
+               $this->results = null;
+       }
+
+       /**
+        * @return AbstractRevision|null|false The most recently fetched revision
+        *  object; false once next() has run out of rows, null when iteration
+        *  has not started yet
+        */
+       public function current() {
+               return $this->current;
+       }
+
+       /**
+        * @return int|null 0-indexed position of the current revision within
+        *  this iteration; null once next() has run out of rows
+        */
+       public function key() {
+               return $this->key;
+       }
+
+       /**
+        * Restarts the iteration: drops the current result set (so the query is
+        * run again) and moves to the first row.
+        */
+       public function rewind() {
+               $this->current = null;
+               $this->key = -1; // self::next() will turn this into 0
+               $this->results = null;
+
+               $this->next();
+       }
+
+       /**
+        * @return bool True when the iterator is in a valid state, i.e. when
+        *  the current element is neither null nor false
+        */
+       public function valid() {
+               return (bool) $this->current;
+       }
+
+       /**
+        * Advances to the next row of the result set (running the query first
+        * if there is no result set) and turns that row into a revision.
+        */
+       public function next() {
+               if ( $this->results === null ) {
+                       $this->results = $this->query();
+               }
+
+               $row = $this->results->fetchObject();
+               if ( $row === false ) {
+                       // end of iteration reached
+                       $this->current = false;
+                       $this->key = null;
+                       return;
+               }
+
+               $this->current = $this->transform( $row );
+               $this->key++;
+       }
+
+       /**
+        * Turns a DB row into the revision object it refers to.
+        *
+        * $row is one of the results of static::query() and must have at least
+        * the properties `rev_id` & `rev_type`: `rev_type` selects the storage
+        * to load from, `rev_id` the revision to load.
+        *
+        * This does DB/cache requests for every single row. Ideally, those
+        * would be bundled, but these iterators are only meant to be run in
+        * maintenance scripts, so it doesn't really matter that much ;)
+        *
+        * @param stdClass $row
+        * @return AbstractRevision
+        */
+       protected function transform( stdClass $row ) {
+               $revisionId = UUID::create( $row->rev_id );
+
+               /** @var ManagerGroup $managerGroup */
+               $managerGroup = Container::get( 'storage' );
+
+               // prevent memory from being filled up
+               $managerGroup->clear();
+
+               $revisionStorage = $managerGroup->getStorage( $row->rev_type );
+
+               return $revisionStorage->get( $revisionId );
+       }
+}
diff --git a/includes/Search/Iterators/HeaderIterator.php 
b/includes/Search/Iterators/HeaderIterator.php
new file mode 100644
index 0000000..7e755f2
--- /dev/null
+++ b/includes/Search/Iterators/HeaderIterator.php
@@ -0,0 +1,28 @@
+<?php
+
+namespace Flow\Search\Iterators;
+
+class HeaderIterator extends AbstractIterator {
+       /**
+        * Selects the current (=most recent, =max) revision id per header,
+        * for the workflows matching $this->conditions.
+        *
+        * {@inheritDoc}
+        */
+       protected function query() {
+               $tables = array( 'flow_revision', 'flow_workflow' );
+               $fields = array( 'rev_id' => 'MAX(rev_id)', 'rev_type' );
+               $options = array(
+                       'ORDER BY' => 'rev_id ASC',
+                       'GROUP BY' => 'rev_type_id',
+               );
+               // header revisions are tied to their workflow through rev_type_id
+               $joinConds = array(
+                       'flow_workflow' => array(
+                               'INNER JOIN',
+                               array( 'workflow_id = rev_type_id' , 'rev_type' => 'header' )
+                       ),
+               );
+
+               return $this->dbr->select(
+                       $tables,
+                       $fields,
+                       $this->conditions,
+                       __METHOD__,
+                       $options,
+                       $joinConds
+               );
+       }
+}
diff --git a/includes/Search/Iterators/TopicIterator.php 
b/includes/Search/Iterators/TopicIterator.php
new file mode 100644
index 0000000..f30d3d7
--- /dev/null
+++ b/includes/Search/Iterators/TopicIterator.php
@@ -0,0 +1,112 @@
+<?php
+
+namespace Flow\Search\Iterators;
+
+use Flow\DbFactory;
+use Flow\Exception\InvalidDataException;
+use Flow\Model\PostRevision;
+use Flow\Model\UUID;
+use Flow\Repository\RootPostLoader;
+use stdClass;
+
+class TopicIterator extends AbstractIterator {
+       /**
+        * @var PostRevision
+        */
+       protected $previous;
+
+       /**
+        * @var RootPostLoader
+        */
+       protected $rootPostLoader;
+
+       /**
+        * @param DbFactory $dbFactory
+        * @param RootPostLoader $rootPostLoader
+        */
+       public function __construct( DbFactory $dbFactory, RootPostLoader $rootPostLoader ) {
+               parent::__construct( $dbFactory );
+               $this->rootPostLoader = $rootPostLoader;
+       }
+
+       /**
+        * Define where to start iterating (inclusive)
+        *
+        * We'll be querying the workflow table instead of the revisions table,
+        * so the boundary has to be expressed against a workflow column: the
+        * given id is turned into a boundary on workflow_last_update_timestamp.
+        * A topic workflow is updated with a workflow_last_update_timestamp for
+        * every change made in the topic. Our UUIDs are sequential & time-based,
+        * so we can just query for workflows with a timestamp no lower than the
+        * timestamp derived from the starting UUID. The comparison must use
+        * that derived timestamp, not the UUID's binary form.
+        *
+        * @param UUID|null $revId Lower boundary, or null to remove it
+        */
+       public function setFrom( UUID $revId = null ) {
+               $this->results = null;
+
+               unset( $this->conditions[0] );
+               if ( $revId !== null ) {
+                       $this->conditions[0] = 'workflow_last_update_timestamp >= ' . $this->dbr->addQuotes( $revId->getTimestamp() );
+               }
+       }
+
+       /**
+        * Define where to stop iterating (exclusive)
+        *
+        * We'll be querying the workflow table instead of the revisions table,
+        * so the boundary has to be expressed against a workflow column: the
+        * given id is turned into a boundary on workflow_last_update_timestamp.
+        * A topic workflow is updated with a workflow_last_update_timestamp for
+        * every change made in the topic. Our UUIDs are sequential & time-based,
+        * so we can just query for workflows with a timestamp lower than the
+        * timestamp derived from the end UUID. The comparison must use that
+        * derived timestamp, not the UUID's binary form.
+        *
+        * @param UUID|null $revId Upper boundary, or null to remove it
+        */
+       public function setTo( UUID $revId = null ) {
+               $this->results = null;
+
+               unset( $this->conditions[1] );
+               if ( $revId !== null ) {
+                       $this->conditions[1] = 'workflow_last_update_timestamp < ' . $this->dbr->addQuotes( $revId->getTimestamp() );
+               }
+       }
+
+       /**
+        * Instead of querying for revisions (which is what we actually need), we'll
+        * just query the workflow table, which will save us some complicated joins.
+        * The workflow_id for a topic title (aka root post) is the same as its
+        * collection id, so we can pass that to the root post loader and *poof*, we
+        * have our revisions!
+        *
+        * {@inheritDoc}
+        */
+       protected function query() {
+               return $this->dbr->select(
+                       array( 'flow_workflow' ),
+                       // for root post (topic title), workflow_id is the same as its rev_type_id
+                       array( 'workflow_id', 'workflow_last_update_timestamp' ),
+                       array(
+                               'workflow_type' => 'topic'
+                       ) + $this->conditions,
+                       __METHOD__,
+                       array(
+                               'ORDER BY' => 'workflow_last_update_timestamp ASC',
+                       )
+               );
+       }
+
+       /**
+        * {@inheritDoc}
+        */
+       protected function transform( stdClass $row ) {
+               $root = UUID::create( $row->workflow_id );
+
+               // we need to fetch all data via rootloader because we'll want children
+               // to be populated
+               return $this->rootPostLoader->get( $root );
+       }
+}
diff --git a/includes/Search/TopicUpdater.php b/includes/Search/TopicUpdater.php
deleted file mode 100644
index 992ee9b..0000000
--- a/includes/Search/TopicUpdater.php
+++ /dev/null
@@ -1,211 +0,0 @@
-<?php
-
-namespace Flow\Search;
-
-use Flow\Collection\PostSummaryCollection;
-use Flow\DbFactory;
-use Flow\Model\PostRevision;
-use Flow\Model\PostSummary;
-use Flow\Model\UUID;
-use Flow\Repository\RootPostLoader;
-use Flow\RevisionActionPermissions;
-use ResultWrapper;
-use Sanitizer;
-
-class TopicUpdater extends Updater {
-       /**
-        * @var RootPostLoader
-        */
-       protected $rootPostLoader;
-
-       /**
-        * @param DbFactory $dbFactory
-        * @param RevisionActionPermissions $permissions
-        * @param RootPostLoader $rootPostLoader
-        */
-       public function __construct( DbFactory $dbFactory, 
RevisionActionPermissions $permissions, RootPostLoader $rootPostLoader ) {
-               parent::__construct( $dbFactory, $permissions );
-               $this->rootPostLoader = $rootPostLoader;
-       }
-
-       /**
-        * {@inheritDoc}
-        */
-       public function getTypeName() {
-               return Connection::TOPIC_TYPE_NAME;
-       }
-
-       /**
-        * We'll be querying the workflow table instead of the revisions table.
-        * Because it's possible to request only a couple of revisions (in 
between
-        * certain ids), we'll need to override the parent buildQueryConditions
-        * method to also work on the workflow table.
-        * A topic workflow is updated with a workflow_last_update_timestamp for
-        * every change made in the topic. Our UUIDs are sequential & 
time-based,
-        * so we can just query for workflows with a timestamp higher than the
-        * timestamp derived from the starting UUID and lower than the end UUID.
-        *
-        * {@inheritDoc}
-        */
-       public function buildQueryConditions( UUID $fromId = null, UUID $toId = 
null, $namespace = null ) {
-               $dbr = $this->dbFactory->getDB( DB_SLAVE );
-
-               $conditions = array();
-
-               // only find entries in a given range
-               if ( $fromId !== null ) {
-                       $conditions[] = 'workflow_last_update_timestamp >= ' . 
$dbr->addQuotes( $fromId->getTimestamp() );
-               }
-               if ( $toId !== null ) {
-                       $conditions[] = 'workflow_last_update_timestamp <= ' . 
$dbr->addQuotes( $toId->getTimestamp() );
-               }
-
-               // find only within requested wiki/namespace
-               $conditions['workflow_wiki'] = wfWikiId();
-               if ( $namespace !== null ) {
-                       $conditions['workflow_namespace'] = $namespace;
-               }
-
-               return $conditions;
-       }
-
-       /**
-        * Instead of querying for revisions (which is what we actually need), 
we'll
-        * just query the workflow table, which will save us some complicated 
joins.
-        * The workflow_id for a topic title (aka root post) is the same as its
-        * revision is, so we can pass that to the root post loader and *poof*, 
we
-        * have our revisions!
-        *
-        * {@inheritDoc}
-        */
-       public function getRevisions( array $conditions = array(), array 
$options = array() ) {
-               $workflows = $this->getWorkflows( $conditions, $options );
-               return $this->getRoots( $workflows );
-       }
-
-       /**
-        * {@inheritDoc}
-        */
-       public function buildDocument( /* PostRevision */ $revision ) {
-               /** @var PostRevision $revision */
-
-               // get timestamp from the most recent revision
-               $updateTimestamp = 
$revision->getCollection()->getWorkflow()->getLastUpdatedObj();
-               // timestamp for initial topic post
-               $creationTimestamp = 
$revision->getCollectionId()->getTimestampObj();
-
-               // get content from all child posts in a [post id => [data]] 
array
-               $revisions = $this->getRevisionsData( $revision );
-
-               // find summary for this topic & add it as revision
-               $summaryCollection = PostSummaryCollection::newFromId( 
$revision->getCollectionId() );
-               try {
-                       /** @var PostSummary $summaryRevision */
-                       $summaryRevision = 
$summaryCollection->getLastRevision();
-                       $data = current( $this->getRevisionsData( 
$summaryRevision ) );
-                       if ( $data !== false ) {
-                               $revisions[] = $data;
-                       }
-               } catch ( \Exception $e ) {
-                       // no summary - that's ok!
-               }
-
-               // get board title associated with this revision
-               $title = 
$revision->getCollection()->getWorkflow()->getOwnerTitle();
-
-               $doc = new \Elastica\Document(
-                       $revision->getCollectionId()->getAlphadecimal(),
-                       array(
-                               'namespace' => $title->getNamespace(),
-                               'namespace_text' => 
$title->getPageLanguage()->getFormattedNsText( $title->getNamespace() ),
-                               'pageid' => $title->getArticleID(),
-                               'title' => $title->getText(),
-                               'timestamp' => 
$creationTimestamp->getTimestamp( TS_ISO_8601 ),
-                               'update_timestamp' => 
$updateTimestamp->getTimestamp( TS_ISO_8601 ),
-                               'revisions' => $revisions,
-                       )
-               );
-
-               return $doc;
-       }
-
-       /**
-        * @param array $conditions
-        * @param array $options
-        * @return bool|ResultWrapper
-        */
-       public function getWorkflows( array $conditions = array(), array 
$options = array() ) {
-               $dbr = $this->dbFactory->getDB( DB_SLAVE );
-
-               return $dbr->select(
-                       array( 'flow_workflow' ),
-                       // for root post (topic title), workflow_id is the same 
as its rev_type_id
-                       array( 'workflow_id', 'workflow_last_update_timestamp' 
),
-                       array(
-                               'workflow_type' => 'topic'
-                       ) + $conditions,
-                       __METHOD__,
-                       array(
-                               'ORDER BY' => 'workflow_last_update_timestamp 
ASC',
-                       ) + $options
-               );
-       }
-
-       /**
-        * @param ResultWrapper $workflows
-        * @return PostRevision[]
-        */
-       public function getRoots( ResultWrapper $workflows ) {
-               $roots = array();
-               foreach ( $workflows as $row ) {
-                       $roots[$row->workflow_id] = UUID::create( 
$row->workflow_id );
-               }
-
-               // we need to fetch all data via rootloader because we'll want 
children
-               // to be populated
-               return $this->rootPostLoader->getMulti( $roots );
-       }
-
-       /**
-        * Recursively get the data for all children. This will add the 
revision's
-        * content to the results array, with the post ID as key.
-        *
-        * @param PostRevision|PostSummary $revision
-        * @return array
-        */
-       public function getRevisionsData( /* PostRevision|PostSummary */ 
$revision ) {
-               // store type of revision so we can also search for very 
specific types
-               // (e.g. titles only)
-               // possible values will be:
-               // * title
-               // * post
-               // * post-summary
-               $type = $revision->getRevisionType();
-               if ( method_exists( $revision, 'isTopicTitle' ) && 
$revision->isTopicTitle() ) {
-                       $type = 'title';
-               }
-
-               $data = array();
-
-               if ( $this->permissions->isAllowed( $revision, 'view' ) ) {
-                       $data[] = array(
-                               'id' => 
$revision->getCollectionId()->getAlphadecimal(),
-                               'text' => trim( Sanitizer::stripAllTags( 
$revision->getContentInHtml() ) ),
-                               'source_text' => 
$revision->getContentInWikitext(),
-                               'moderation_state' => 
$revision->getModerationState(),
-                               'timestamp' => 
$revision->getCollectionId()->getTimestamp( TS_ISO_8601 ),
-                               'update_timestamp' => 
$revision->getRevisionId()->getTimestamp( TS_ISO_8601 ),
-                               'type' => $type,
-                       );
-               }
-
-               if ( $revision instanceof PostRevision ) {
-                       // get data from all child posts too
-                       foreach ( $revision->getChildren() as $child ) {
-                               $data = array_merge( $data, 
$this->getRevisionsData( $child ) );
-                       }
-               }
-
-               return $data;
-       }
-}
diff --git a/includes/Search/Updater.php b/includes/Search/Updater.php
deleted file mode 100644
index 06d2235..0000000
--- a/includes/Search/Updater.php
+++ /dev/null
@@ -1,149 +0,0 @@
-<?php
-
-namespace Flow\Search;
-
-use Flow\Container;
-use Flow\DbFactory;
-use Flow\Exception\FlowException;
-use Flow\Model\AbstractRevision;
-use Flow\Model\UUID;
-use Flow\RevisionActionPermissions;
-use MWExceptionHandler;
-
-abstract class Updater {
-       /**
-        * @var DbFactory
-        */
-       protected $dbFactory;
-
-       /**
-        * @var RevisionActionPermissions
-        */
-       protected $permissions;
-
-       /**
-        * @var Connection
-        */
-       protected $connection;
-
-       /**
-        * @param DbFactory $dbFactory
-        * @param RevisionActionPermissions $permissions
-        */
-       public function __construct( DbFactory $dbFactory, 
RevisionActionPermissions $permissions ) {
-               $this->dbFactory = $dbFactory;
-               $this->permissions = $permissions;
-               $this->connection = Container::get( 'search.connection' );
-       }
-
-       /**
-        * @return string One of the Connection::*_TYPE_NAME constants
-        */
-       abstract public function getTypeName();
-
-       /**
-        * @param array $conditions
-        * @param array $options
-        * @return AbstractRevision[]
-        */
-       abstract public function getRevisions( array $conditions = array(), 
array $options = array() );
-
-       /**
-        * @param AbstractRevision $revision
-        * @return \Elastica\Document
-        */
-       abstract public function buildDocument( /* AbstractRevision */ 
$revision );
-
-       /**
-        * @param UUID|null $fromId
-        * @param UUID|null $toId
-        * @param int|null $namespace
-        * @return array
-        */
-       public function buildQueryConditions( UUID $fromId = null, UUID $toId = 
null, $namespace = null ) {
-               $dbr = $this->dbFactory->getDB( DB_SLAVE );
-
-               $conditions = array();
-
-               // only find entries in a given range
-               if ( $fromId !== null ) {
-                       $conditions[] = 'rev_id >= ' . $dbr->addQuotes( 
$fromId->getBinary() );
-               }
-               if ( $toId !== null ) {
-                       $conditions[] = 'rev_id <= ' . $dbr->addQuotes( 
$toId->getBinary() );
-               }
-
-               // find only within requested wiki/namespace
-               $conditions['workflow_wiki'] = wfWikiId();
-               if ( $namespace !== null ) {
-                       $conditions['workflow_namespace'] = $namespace;
-               }
-
-               return $conditions;
-       }
-
-       /**
-        * @param AbstractRevision[] $revisions
-        * @return \Elastica\Document[]
-        */
-       protected function buildDocumentsForRevisions( array $revisions ) {
-               $documents = array();
-               foreach ( $revisions as $revision ) {
-                       try {
-                               $documents[] = $this->buildDocument( $revision 
);
-                       } catch ( FlowException $e ) {
-                               // just ignore revisions that fail to build 
document...
-                               wfWarn( __METHOD__ . ': Failed to build 
document for ' . $revision->getRevisionId()->getAlphadecimal() . ': ' . 
$e->getMessage());
-                               MWExceptionHandler::logException( $e );
-                       }
-               }
-
-               return $documents;
-       }
-
-       /**
-        * @param AbstractRevision[] $revisions
-        * @param string|null $shardTimeout Timeout in Elasticsearch time 
format (1m, 15s, ...)
-        * @param int|null $clientSideTimeout
-        * @return int
-        */
-       public function updateRevisions( array $revisions, $shardTimeout = 
null, $clientSideTimeout = null ) {
-               if ( $clientSideTimeout !== null ) {
-                       $this->connection->setTimeout( $clientSideTimeout );
-               }
-
-               $documents = $this->buildDocumentsForRevisions( $revisions );
-               $this->sendDocuments( $documents, $shardTimeout );
-
-               return count( $documents );
-       }
-
-       /**
-        * @param \Elastica\Document[] $documents
-        * @param string|null $shardTimeout Timeout in Elasticsearch time 
format (1m, 15s, ...)
-        */
-       protected function sendDocuments( array $documents, $shardTimeout = 
null ) {
-               if ( count( $documents ) === 0 ) {
-                       return;
-               }
-
-               try {
-                       // addDocuments (notice plural) is the bulk api
-                       $bulk = new \Elastica\Bulk( 
$this->connection->getClient() );
-                       if ( $shardTimeout !== null ) {
-                               $bulk->setShardTimeout( $shardTimeout );
-                       }
-
-                       $index = $this->connection->getFlowIndex( wfWikiId() );
-                       $type = $index->getType( $this->getTypeName() );
-                       $bulk->setType( $type );
-                       $bulk->addDocuments( $documents );
-                       $bulk->send();
-               } catch ( \Exception $e ) {
-                       $documentIds = array_map( function( $doc ) {
-                               return $doc->getId();
-                       }, $documents );
-                       wfWarn( __METHOD__ . ': Failed updating documents (' . 
implode( ',', $documentIds ) . '): ' . $e->getMessage() );
-               }
-       }
-}
diff --git a/includes/Search/Updaters/AbstractUpdater.php 
b/includes/Search/Updaters/AbstractUpdater.php
new file mode 100644
index 0000000..52dd7e9
--- /dev/null
+++ b/includes/Search/Updaters/AbstractUpdater.php
@@ -0,0 +1,116 @@
+<?php
+
+namespace Flow\Search\Updaters;
+
+use Flow\Container;
+use Flow\Exception\FlowException;
+use Flow\Model\AbstractRevision;
+use Flow\RevisionActionPermissions;
+use Flow\Search\Connection;
+use Flow\Search\Iterators\AbstractIterator;
+use MWExceptionHandler;
+
+abstract class AbstractUpdater {
+       /** @var AbstractIterator Yields the revisions to build documents for */
+       public $iterator;
+
+       /** @var RevisionActionPermissions */
+       protected $permissions;
+
+       /** @var Connection */
+       protected $connection;
+
+       /**
+        * @param AbstractIterator $iterator
+        * @param RevisionActionPermissions $permissions
+        */
+       public function __construct( AbstractIterator $iterator, RevisionActionPermissions $permissions ) {
+               $this->iterator = $iterator;
+               $this->permissions = $permissions;
+               $this->connection = Container::get( 'search.connection' );
+       }
+
+       /**
+        * @return string One of the Connection::*_TYPE_NAME constants
+        */
+       abstract public function getTypeName();
+
+       /**
+        * @param AbstractRevision $revision
+        * @return \Elastica\Document
+        */
+       abstract public function buildDocument( AbstractRevision $revision );
+
+       /**
+        * Builds a document for every revision the iterator yields & sends
+        * them to the search index in batches of at most $batchSize documents.
+        *
+        * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...)
+        * @param int|null $clientSideTimeout
+        * @param int $batchSize Maximum amount of documents per bulk request
+        * @return int Amount of documents built (revisions that failed to build are skipped)
+        */
+       public function updateRevisions( $shardTimeout = null, $clientSideTimeout = null, $batchSize = 50 ) {
+               if ( $clientSideTimeout !== null ) {
+                       $this->connection->setTimeout( $clientSideTimeout );
+               }
+
+               $documents = array();
+               $count = 0;
+               foreach ( $this->iterator as $revision ) {
+                       try {
+                               $documents[] = $this->buildDocument( $revision );
+                               $count++;
+                       } catch ( FlowException $e ) {
+                               // just ignore revisions that fail to build document...
+                               wfWarn( __METHOD__ . ': Failed to build document for ' . $revision->getRevisionId()->getAlphadecimal() . ': ' . $e->getMessage() );
+                               MWExceptionHandler::logException( $e );
+                       }
+
+                       // send as soon as a batch is full; >= (rather than >) keeps a
+                       // batch from holding one document more than $batchSize
+                       if ( count( $documents ) >= $batchSize ) {
+                               $this->sendDocuments( $documents, $shardTimeout );
+                               $documents = array();
+                       }
+               }
+
+               if ( $documents ) {
+                       // send remaining documents
+                       $this->sendDocuments( $documents, $shardTimeout );
+               }
+
+               return $count;
+       }
+
+       /**
+        * Sends the documents in 1 bulk request; a failure is reported via wfWarn(), not thrown.
+        *
+        * @param \Elastica\Document[] $documents
+        * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...)
+        */
+       protected function sendDocuments( array $documents, $shardTimeout = null ) {
+               if ( count( $documents ) === 0 ) {
+                       return;
+               }
+
+               try {
+                       // addDocuments (notice plural) is the bulk api
+                       $bulk = new \Elastica\Bulk( $this->connection->getClient() );
+                       if ( $shardTimeout !== null ) {
+                               $bulk->setShardTimeout( $shardTimeout );
+                       }
+
+                       $index = $this->connection->getFlowIndex( wfWikiId() );
+                       $type = $index->getType( $this->getTypeName() );
+                       $bulk->setType( $type );
+                       $bulk->addDocuments( $documents );
+                       $bulk->send();
+               } catch ( \Exception $e ) {
+                       $documentIds = array_map( function( $doc ) {
+                               return $doc->getId();
+                       }, $documents );
+                       wfWarn( __METHOD__ . ': Failed updating documents (' . implode( ',', $documentIds ) . '): ' . $e->getMessage() );
+               }
+       }
+}
diff --git a/includes/Search/HeaderUpdater.php 
b/includes/Search/Updaters/HeaderUpdater.php
similarity index 63%
rename from includes/Search/HeaderUpdater.php
rename to includes/Search/Updaters/HeaderUpdater.php
index 2a61a95..5f8c02a 100644
--- a/includes/Search/HeaderUpdater.php
+++ b/includes/Search/Updaters/HeaderUpdater.php
@@ -1,14 +1,13 @@
 <?php
 
-namespace Flow\Search;
+namespace Flow\Search\Updaters;
 
-use Flow\Container;
-use Flow\Data\ManagerGroup;
+use Flow\Model\AbstractRevision;
 use Flow\Model\Header;
-use Flow\Model\UUID;
+use Flow\Search\Connection;
 use Sanitizer;
 
-class HeaderUpdater extends Updater {
+class HeaderUpdater extends AbstractUpdater {
        /**
         * {@inheritDoc}
         */
@@ -19,41 +18,7 @@
        /**
         * {@inheritDoc}
         */
-       public function getRevisions( array $conditions = array(), array 
$options = array() ) {
-               $dbr = $this->dbFactory->getDB( DB_SLAVE );
-
-               // get the current (=most recent, =max) revision id for all 
headers
-               $rows = $dbr->select(
-                       array( 'flow_revision', 'flow_workflow' ),
-                       array( 'rev_id' => 'MAX(rev_id)' ),
-                       $conditions,
-                       __METHOD__,
-                       array(
-                               'ORDER BY' => 'rev_id ASC',
-                               'GROUP BY' => 'rev_type_id',
-                       ) + $options,
-                       array(
-                               'flow_workflow' => array(
-                                       'INNER JOIN',
-                                       array( 'workflow_id = rev_type_id' , 
'rev_type' => 'header' )
-                               ),
-                       )
-               );
-
-               $uuids = array();
-               foreach ( $rows as $row ) {
-                       $uuids[] = UUID::create( $row->rev_id );
-               }
-
-               /** @var ManagerGroup $storage */
-               $storage = Container::get( 'storage' );
-               return $storage->getStorage( 'Header' )->getMulti( $uuids );
-       }
-
-       /**
-        * {@inheritDoc}
-        */
-       public function buildDocument( /* Header */ $revision ) {
+       public function buildDocument( AbstractRevision /* Header */ $revision 
) {
                /** @var Header $revision */
 
                // get article title associated with this revision
diff --git a/includes/Search/Updaters/TopicUpdater.php 
b/includes/Search/Updaters/TopicUpdater.php
new file mode 100644
index 0000000..9bbb96e
--- /dev/null
+++ b/includes/Search/Updaters/TopicUpdater.php
@@ -0,0 +1,126 @@
+<?php
+
+namespace Flow\Search\Updaters;
+
+use Flow\Collection\PostSummaryCollection;
+use Flow\Model\AbstractRevision;
+use Flow\Model\PostRevision;
+use Flow\Model\PostSummary;
+use Flow\Repository\RootPostLoader;
+use Flow\RevisionActionPermissions;
+use Flow\Search\Connection;
+use Flow\Search\Iterators\AbstractIterator;
+use Sanitizer;
+
+class TopicUpdater extends AbstractUpdater {
+       /**
+        * @var RootPostLoader
+        */
+       protected $rootPostLoader;
+
+       /**
+        * @param AbstractIterator $iterator
+        * @param RevisionActionPermissions $permissions
+        * @param RootPostLoader $rootPostLoader
+        */
+       public function __construct( AbstractIterator $iterator, RevisionActionPermissions $permissions, RootPostLoader $rootPostLoader ) {
+               parent::__construct( $iterator, $permissions );
+               $this->rootPostLoader = $rootPostLoader;
+       }
+
+       /**
+        * {@inheritDoc}
+        */
+       public function getTypeName() {
+               return Connection::TOPIC_TYPE_NAME;
+       }
+
+       /**
+        * {@inheritDoc}
+        */
+       public function buildDocument( AbstractRevision /* PostRevision */ $revision ) {
+               /** @var PostRevision $revision */
+
+               // last update of the topic, as recorded on its workflow
+               $lastUpdated = $revision->getCollection()->getWorkflow()->getLastUpdatedObj();
+               // timestamp of the initial topic post, derived from the collection id
+               $created = $revision->getCollectionId()->getTimestampObj();
+
+               // flat list with the data of this post & all of its descendants
+               $revisionsData = $this->getRevisionsData( $revision );
+
+               // append the topic summary's data, if there is a summary
+               $summaries = PostSummaryCollection::newFromId( $revision->getCollectionId() );
+               try {
+                       /** @var PostSummary $summary */
+                       $summary = $summaries->getLastRevision();
+                       $summaryData = current( $this->getRevisionsData( $summary ) );
+                       if ( $summaryData !== false ) {
+                               $revisionsData[] = $summaryData;
+                       }
+               } catch ( \Exception $e ) {
+                       // no summary - that's ok!
+               }
+
+               // title of the board this topic belongs to
+               $boardTitle = $revision->getCollection()->getWorkflow()->getOwnerTitle();
+
+               $documentId = $revision->getCollectionId()->getAlphadecimal();
+               $fields = array(
+                       'namespace' => $boardTitle->getNamespace(),
+                       'namespace_text' => $boardTitle->getPageLanguage()->getFormattedNsText( $boardTitle->getNamespace() ),
+                       'pageid' => $boardTitle->getArticleID(),
+                       'title' => $boardTitle->getText(),
+                       'timestamp' => $created->getTimestamp( TS_ISO_8601 ),
+                       'update_timestamp' => $lastUpdated->getTimestamp( TS_ISO_8601 ),
+                       'revisions' => $revisionsData,
+               );
+
+               return new \Elastica\Document( $documentId, $fields );
+       }
+
+       /**
+        * Recursively collect the data of a revision & (for posts) of all of
+        * its children. The result is a flat list holding one entry per
+        * revision that is allowed to be viewed.
+        *
+        * @param PostRevision|PostSummary $revision
+        * @return array
+        */
+       public function getRevisionsData( /* PostRevision|PostSummary */ $revision ) {
+               // the type is stored so searches can target specific kinds of
+               // content (e.g. titles only); it'll be one of:
+               // * title
+               // * post
+               // * post-summary
+               $type = $revision->getRevisionType();
+               $isTitle = method_exists( $revision, 'isTopicTitle' ) && $revision->isTopicTitle();
+               if ( $isTitle ) {
+                       $type = 'title';
+               }
+
+               $collected = array();
+               if ( $this->permissions->isAllowed( $revision, 'view' ) ) {
+                       $collected[] = array(
+                               'id' => $revision->getCollectionId()->getAlphadecimal(),
+                               'text' => trim( Sanitizer::stripAllTags( $revision->getContentInHtml() ) ),
+                               'source_text' => $revision->getContentInWikitext(),
+                               'moderation_state' => $revision->getModerationState(),
+                               'timestamp' => $revision->getCollectionId()->getTimestamp( TS_ISO_8601 ),
+                               'update_timestamp' => $revision->getRevisionId()->getTimestamp( TS_ISO_8601 ),
+                               'type' => $type,
+                       );
+               }
+
+               if ( !( $revision instanceof PostRevision ) ) {
+                       return $collected;
+               }
+
+               // posts have children: append their data as well
+               foreach ( $revision->getChildren() as $child ) {
+                       $collected = array_merge( $collected, $this->getRevisionsData( $child ) );
+               }
+
+               return $collected;
+       }
+}
diff --git a/maintenance/FlowFixWorkflowLastUpdateTimestamp.php 
b/maintenance/FlowFixWorkflowLastUpdateTimestamp.php
index 3acd028..638fa1e 100644
--- a/maintenance/FlowFixWorkflowLastUpdateTimestamp.php
+++ b/maintenance/FlowFixWorkflowLastUpdateTimestamp.php
@@ -190,7 +190,7 @@
                /** @var Workflow[] $workflows */
                $workflows = $this->storage->getMulti( 'Workflow', $uuids );
                foreach ( $workflows as $workflow ) {
-                       $timestamp = 
$timestamps[$workflow->getId()->getBinary()];
+                       $timestamp = 
$timestamps[$workflow->getId()->getBinary()->__toString()];
                        $workflow->updateLastUpdated( UUID::getComparisonUUID( 
$timestamp ) );
                }
 
diff --git a/maintenance/FlowForceSearchIndex.php 
b/maintenance/FlowForceSearchIndex.php
index 7d2f6d0..2987727 100644
--- a/maintenance/FlowForceSearchIndex.php
+++ b/maintenance/FlowForceSearchIndex.php
@@ -1,10 +1,9 @@
 <?php
 
 use Flow\Container;
-use Flow\Model\AbstractRevision;
 use Flow\Model\UUID;
 use Flow\Search\Connection;
-use Flow\Search\Updater;
+use Flow\Search\Updaters\AbstractUpdater;
 
 require_once ( getenv( 'MW_INSTALL_PATH' ) !== false
        ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php'
@@ -33,100 +32,44 @@
 
                $this->addOption( 'fromId', 'Start indexing at a specific 
revision id (inclusive).', false, true );
                $this->addOption( 'toId', 'Stop indexing at a specific revision 
(inclusive).', false, true );
-               $this->addOption( 'limit', 'Maximum number of revisions to 
process before exiting the script. Default to unlimited.', false, true );
                $this->addOption( 'namespace', 'Only index revisions in this 
given namespace', false, true );
-
-               $this->connection = Container::get( 'search.connection' );
        }
 
        public function execute() {
                global $wgFlowSearchMaintenanceTimeout;
 
+               $this->connection = Container::get( 'search.connection' );
+
                // Set the timeout for maintenance actions
                $this->connection->setTimeout( $wgFlowSearchMaintenanceTimeout 
);
 
-               /** @var Updater[] $updaters */
+               /** @var AbstractUpdater[] $updaters */
                $updaters = Container::get( 'search.index.updaters' );
                foreach ( $updaters as $updaterType => $updater ) {
                        $fromId = $this->getOption( 'fromId', null );
                        $fromId = $fromId ? UUID::create( $fromId ) : null;
                        $toId = $this->getOption( 'toId', null );
                        $toId = $toId ? UUID::create( $toId ) : null;
+                       if ( $toId !== null ) {
+                               // AbstractIterator::setTo() is exclusive, but we 
want inclusive,
+                               // so just feed setTo() the next possible UUID 
(UUID + 1)
+                               // We need some base conversion & bcadd because 
the number may
+                               // be too large to be an int.
+                               $decimal = wfBaseConvert( 
$toId->getAlphadecimal(), 36, 10 );
+                               $new = bcadd( $decimal, 1, 0 );
+                               $alnum = wfBaseConvert( $new, 10, 36 );
+                               $toId = UUID::create( $alnum );
+                       }
                        $namespace = $this->getOption( 'namespace', null );
-                       $numRevisionsToIndex = $this->getOption( 'limit', null 
);
                        $total = 0;
 
-                       while ( true ) {
-                               // if a limit was provided, we should make sure 
to not fetch
-                               // more revisions than asked for
-                               $options = array( 'LIMIT' => $this->mBatchSize 
);
-                               if ( $numRevisionsToIndex ) {
-                                       $options['LIMIT'] = min( 
$numRevisionsToIndex, $this->mBatchSize );
+                       $updater->iterator->setNamespace( $namespace );
+                       $updater->iterator->setFrom( $fromId );
+                       $updater->iterator->setTo( $toId );
 
-                                       // since we do this in batches, we'll 
subtract the size of
-                                       // each batch until 
$numRevisionsToIndex is reached
-                                       $numRevisionsToIndex -= 
$this->mBatchSize;
-                                       if ( $options['LIMIT'] <= 0 ) {
-                                               break;
-                                       }
-                               }
-
-                               $conditions = $updater->buildQueryConditions( 
$fromId, $toId, $namespace );
-                               $revisions = $updater->getRevisions( 
$conditions, $options );
-
-                               // stop if we're all out of revisions
-                               if ( !$revisions ) {
-                                       break;
-                               }
-
-                               $total += $updater->updateRevisions( 
$revisions, null, null );
-                               $this->output( "Indexed $total $updaterType 
document(s)\n" );
-
-                               // prepare for next batch, starting at the next 
id
-                               // prevFromId will default to around unix epoch 
- there can be
-                               // no data before that
-                               $prevFromId = $fromId ?: 
UUID::getComparisonUUID( '1' );
-                               $fromId = $this->getNextFromId( $revisions );
-
-                               // make sure we don't get stuck in an infinite 
loop
-                               $diff = $prevFromId->getTimestampObj()->diff( 
$fromId->getTimestampObj() );
-                               // invert will be 1 if the diff is a negative 
time period from
-                               // $prevFromId to $fromId, which means that the 
new $timestamp is
-                               // more recent than our current $result
-                               if ( $diff->invert ) {
-                                       $this->error(
-                                               'Got stuck in an infinite 
loop.' . "\n" .
-                                               'workflow_last_update_timestamp 
is likely incorrect ' .
-                                               'for some workflows.' . "\n" .
-                                               'Run 
maintenance/FlowFixWorkflowLastUpdateTimestamp.php ' .
-                                               'to automatically fix those.', 
1 );
-                               }
-
-                               // prevent memory from being filled up
-                               Container::get( 'storage' )->clear();
-                       }
+                       $total += $updater->updateRevisions( null, null, 
$this->mBatchSize );
+                       $this->output( "Indexed $total $updaterType 
document(s)\n" );
                }
-       }
-
-       /**
-        * @param AbstractRevision[] $revisions
-        * @return UUID
-        */
-       protected function getNextFromId( array $revisions ) {
-               /** @var AbstractRevision $last */
-               $last = end( $revisions );
-
-               if ( $last instanceof \Flow\Model\Header ) {
-                       $timestamp = $last->getRevisionId()->getTimestampObj();
-               } else {
-                       $timestamp = 
$last->getCollection()->getWorkflow()->getLastUpdatedObj();
-               }
-
-               // $timestamp is the timestamp of the last revision we fetched. 
fromId
-               // is inclusive, and we don't want to include what we already 
have here,
-               // so we'll advance 1 more and call that the next fromId
-               $timestamp = (int) $timestamp->getTimestamp( TS_UNIX );
-               return UUID::getComparisonUUID( $timestamp + 1 );
        }
 }
 
diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php
new file mode 100644
index 0000000..ecd61fb
--- /dev/null
+++ b/maintenance/dumpBackup.php
@@ -0,0 +1,112 @@
+<?php
+
+use Flow\Container;
+use Flow\Dump\Exporter;
+use Flow\Model\UUID;
+
+// Working directory at startup; the --pagelist handling further down chdir()s
+// back to it before reading the list file.
+$originalDir = getcwd();
+
+// NOTE(review): not read anywhere in this script - presumably consumed by
+// commandLine.inc to learn which options expect a value; confirm.
+$optionsWithArgs = array( 'pagelist', 'start', 'end', 'revstart', 'revend' );
+
+// Find the maintenance directory: below MW_INSTALL_PATH when that is set,
+// otherwise three directories up from this script.
+if ( getenv( 'MW_INSTALL_PATH' ) !== false ) {
+       $maintPath = getenv( 'MW_INSTALL_PATH' ) . '/maintenance';
+} else {
+       $maintPath = __DIR__ . '/../../../maintenance';
+}
+require_once "$maintPath/commandLine.inc";
+require_once "$maintPath/backup.inc";
+
+/**
+ * BackupDumper subclass whose dump() streams Flow workflows through
+ * Flow\Dump\Exporter into the dumper's output sink.
+ */
+class FlowBackupDumper extends BackupDumper {
+       /**
+        * Writes the selected workflows to the output sink, optionally wrapped
+        * in the stream header & footer.
+        *
+        * @param mixed $history History mode handed to the Exporter (this script
+        *  passes WikiExporter::FULL or WikiExporter::CURRENT)
+        * @param mixed $text Text mode handed to the Exporter
+        */
+       function dump( $history, $text = Exporter::TEXT ) {
+               # Notice messages will foul up your XML output even if they're
+               # relatively harmless.
+               if ( ini_get( 'display_errors' ) ) {
+                       ini_set( 'display_errors', 'stderr' );
+               }
+
+               $db = Container::get( 'db.factory' )->getDB( DB_SLAVE );
+               // pass the caller's $text through; it used to be accepted but then
+               // ignored in favour of a hardcoded Exporter::TEXT
+               $exporter = new Exporter( $db, $history, Exporter::STREAM, $text );
+               $wrapper = new DumpOutput( $this->sink, $this );
+               $exporter->setOutputSink( $wrapper );
+
+               if ( !$this->skipHeader ) {
+                       $exporter->openStream();
+               }
+
+               $workflowIterator = $exporter->getWorkflowIterator( $this->pages, $this->startId, $this->endId );
+
+               // revision boundaries are only applied when set to a non-empty value
+               $revStartId = $this->revStartId ? UUID::create( $this->revStartId ) : null;
+               $revEndId = $this->revEndId ? UUID::create( $this->revEndId ) : null;
+               $exporter->dump( $workflowIterator, $revStartId, $revEndId );
+
+               if ( !$this->skipFooter ) {
+                       $exporter->closeStream();
+               }
+
+               $this->report( true );
+       }
+}
+
+$dumper = new FlowBackupDumper( $argv );
+
+if ( isset( $options['pagelist'] ) ) {
+       // read the list file from $originalDir (the working directory captured
+       // when the script started), then switch back to the current one
+       $olddir = getcwd();
+       chdir( $originalDir );
+       $pages = file( $options['pagelist'] );
+       chdir( $olddir );
+       if ( $pages === false ) {
+               echo "Unable to open file {$options['pagelist']}\n";
+               die( 1 );
+       }
+       $pages = array_map( 'trim', $pages );
+       // drop empty lines; a closure is used because create_function() is
+       // deprecated since PHP 7.2 and no longer exists in PHP 8
+       $dumper->pages = array_filter( $pages, function ( $x ) {
+               return $x !== "";
+       } );
+}
+
+// Copy the range options that were given onto the dumper as integers; the
+// dumper's properties are left untouched for options that are absent.
+// NOTE(review): revStartId/revEndId are later handed to UUID::create() in
+// FlowBackupDumper::dump() - confirm intval() is appropriate for those ids.
+$rangeOptions = array(
+       'start' => 'startId',
+       'end' => 'endId',
+       'revstart' => 'revStartId',
+       'revend' => 'revEndId',
+);
+foreach ( $rangeOptions as $option => $property ) {
+       if ( isset( $options[$option] ) ) {
+               $dumper->$property = intval( $options[$option] );
+       }
+}
+
+// the skip flags are plain switches: present means true
+$dumper->skipHeader = isset( $options['skip-header'] );
+$dumper->skipFooter = isset( $options['skip-footer'] );
+
+// Run the requested action; without --full or --current, only print usage
+// information (via the dumper's progress output) and dump nothing.
+if ( isset( $options['full'] ) ) {
+       $dumper->dump( WikiExporter::FULL );
+} elseif ( isset( $options['current'] ) ) {
+       $dumper->dump( WikiExporter::CURRENT );
+} else {
+       $dumper->progress( <<<ENDS
+This script dumps the Flow discussion database into an
+XML interchange wrapper format for export.
+
+It can either export only the current revision, or full history.
+
+Although --full will export all public revisions, non-public revisions
+are removed, and the remaining revisions are renormalized to accommodate this.
+It is recommended that you keep database backups as well.
+
+XML output is sent to stdout; progress reports are sent to stderr.
+
+Usage: php dumpBackup.php <action> [<options>]
+Actions:
+  --full      Dump all revisions of every description/post/summary.
+  --current   Dump only the latest revision of every description/post/summary.
+  --pagelist=<file>
+              Where <file> is a list of page titles to be dumped
+Options:
+  --start=n   Start from page_id or log_id n
+  --end=n     Stop before page_id or log_id n (exclusive)
+  --revstart=n  Start from rev_id n
+  --revend=n    Stop before rev_id n (exclusive)
+  --skip-header Don't output the <mediawiki> header
+  --skip-footer Don't output the </mediawiki> footer
+
+ENDS
+       );
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/242569
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I52bc7c0ce7813a78f9006ca4b7d931a905726c05
Gerrit-PatchSet: 22
Gerrit-Project: mediawiki/extensions/Flow
Gerrit-Branch: master
Gerrit-Owner: Matthias Mullie <mmul...@wikimedia.org>
Gerrit-Reviewer: Mattflaschen <mflasc...@wikimedia.org>
Gerrit-Reviewer: Matthias Mullie <mmul...@wikimedia.org>
Gerrit-Reviewer: Sbisson <sbis...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to