jenkins-bot has submitted this change and it was merged. Change subject: Dump Flow data ......................................................................
Dump Flow data A big chunk of this patch is a refactoring of the search updaters. I wanted to reuse the code that already iterates over all headers & topics, but it was unwieldy. I split that into separate classes that are now Iterator objects. Other changes: * Fixed Flow\Search\Connection, which could no longer call parent::__construct since that no longer exists * Flow\Container couldn’t be autoloaded from maintenance script's constructor (autoload not yet initialized, I assume) * FlowFixWorkflowLastUpdateTimestamp.php was failing because getBinary now returns a UUIDBlob instead of the plain binary string Bug: T89398 Change-Id: I52bc7c0ce7813a78f9006ca4b7d931a905726c05 --- M autoload.php M container.php A includes/Dump/Exporter.php A includes/Search/Iterators/AbstractIterator.php A includes/Search/Iterators/HeaderIterator.php A includes/Search/Iterators/TopicIterator.php D includes/Search/TopicUpdater.php D includes/Search/Updater.php A includes/Search/Updaters/AbstractUpdater.php R includes/Search/Updaters/HeaderUpdater.php A includes/Search/Updaters/TopicUpdater.php M maintenance/FlowFixWorkflowLastUpdateTimestamp.php M maintenance/FlowForceSearchIndex.php A maintenance/dumpBackup.php 14 files changed, 1,133 insertions(+), 482 deletions(-) Approvals: Mattflaschen: Looks good to me, approved jenkins-bot: Verified diff --git a/autoload.php b/autoload.php index 91d368e..5dfa95f 100644 --- a/autoload.php +++ b/autoload.php @@ -114,6 +114,7 @@ 'Flow\\Data\\Utils\\SortRevisionsByRevisionId' => __DIR__ . '/includes/Data/Utils/SortRevisionsByRevisionId.php', 'Flow\\Data\\Utils\\UserMerger' => __DIR__ . '/includes/Data/Utils/UserMerger.php', 'Flow\\DbFactory' => __DIR__ . '/includes/DbFactory.php', + 'Flow\\Dump\\Exporter' => __DIR__ . '/includes/Dump/Exporter.php', 'Flow\\Exception\\CatchableFatalErrorException' => __DIR__ . '/includes/Exception/CatchableFatalErrorException.php', 'Flow\\Exception\\CrossWikiException' => __DIR__ . 
'/includes/Exception/ExceptionHandling.php', 'Flow\\Exception\\DataModelException' => __DIR__ . '/includes/Exception/ExceptionHandling.php', @@ -279,12 +280,15 @@ 'Flow\\Repository\\UserName\\UserNameQuery' => __DIR__ . '/includes/Repository/UserName/UserNameQuery.php', 'Flow\\RevisionActionPermissions' => __DIR__ . '/includes/RevisionActionPermissions.php', 'Flow\\Search\\Connection' => __DIR__ . '/includes/Search/Connection.php', - 'Flow\\Search\\HeaderUpdater' => __DIR__ . '/includes/Search/HeaderUpdater.php', + 'Flow\\Search\\Iterators\\AbstractIterator' => __DIR__ . '/includes/Search/Iterators/AbstractIterator.php', + 'Flow\\Search\\Iterators\\HeaderIterator' => __DIR__ . '/includes/Search/Iterators/HeaderIterator.php', + 'Flow\\Search\\Iterators\\TopicIterator' => __DIR__ . '/includes/Search/Iterators/TopicIterator.php', 'Flow\\Search\\Maintenance\\MappingConfigBuilder' => __DIR__ . '/includes/Search/maintenance/MappingConfigBuilder.php', 'Flow\\Search\\SearchEngine' => __DIR__ . '/includes/Search/SearchEngine.php', 'Flow\\Search\\Searcher' => __DIR__ . '/includes/Search/Searcher.php', - 'Flow\\Search\\TopicUpdater' => __DIR__ . '/includes/Search/TopicUpdater.php', - 'Flow\\Search\\Updater' => __DIR__ . '/includes/Search/Updater.php', + 'Flow\\Search\\Updaters\\AbstractUpdater' => __DIR__ . '/includes/Search/Updaters/AbstractUpdater.php', + 'Flow\\Search\\Updaters\\HeaderUpdater' => __DIR__ . '/includes/Search/Updaters/HeaderUpdater.php', + 'Flow\\Search\\Updaters\\TopicUpdater' => __DIR__ . '/includes/Search/Updaters/TopicUpdater.php', 'Flow\\SpamFilter\\AbuseFilter' => __DIR__ . '/includes/SpamFilter/AbuseFilter.php', 'Flow\\SpamFilter\\ConfirmEdit' => __DIR__ . '/includes/SpamFilter/ConfirmEdit.php', 'Flow\\SpamFilter\\ContentLengthFilter' => __DIR__ . 
'/includes/SpamFilter/ContentLengthFilter.php', diff --git a/container.php b/container.php index 31dabde..6eb11b0 100644 --- a/container.php +++ b/container.php @@ -1032,12 +1032,18 @@ global $wgFlowSearchServers, $wgFlowSearchConnectionAttempts; return new Flow\Search\Connection( $wgFlowSearchServers, $wgFlowSearchConnectionAttempts ); }; +$c['search.index.iterators.header'] = function( $c ) { + return new \Flow\Search\Iterators\HeaderIterator( $c['db.factory'] ); +}; +$c['search.index.iterators.topic'] = function( $c ) { + return new \Flow\Search\Iterators\TopicIterator( $c['db.factory'], $c['loader.root_post'] ); +}; $c['search.index.updaters'] = function( $c ) { // permissions for anon user $anonPermissions = new Flow\RevisionActionPermissions( $c['flow_actions'], new User ); return array( - 'topic' => new \Flow\Search\TopicUpdater( $c['db.factory'], $anonPermissions, $c['loader.root_post'] ), - 'header' => new \Flow\Search\HeaderUpdater( $c['db.factory'], $anonPermissions ) + 'topic' => new \Flow\Search\Updaters\TopicUpdater( $c['search.index.iterators.topic'], $anonPermissions, $c['loader.root_post'] ), + 'header' => new \Flow\Search\Updaters\HeaderUpdater( $c['search.index.iterators.header'], $anonPermissions ) ); }; diff --git a/includes/Dump/Exporter.php b/includes/Dump/Exporter.php new file mode 100644 index 0000000..4002475 --- /dev/null +++ b/includes/Dump/Exporter.php @@ -0,0 +1,409 @@ +<?php + +namespace Flow\Dump; + +use BatchRowIterator; +use DatabaseBase; +use Exception; +use Flow\Collection\PostSummaryCollection; +use Flow\Container; +use Flow\Data\ManagerGroup; +use Flow\Model\AbstractRevision; +use Flow\Model\Header; +use Flow\Model\PostRevision; +use Flow\Model\PostSummary; +use Flow\Model\UUID; +use Flow\Model\Workflow; +use Flow\RevisionActionPermissions; +use Flow\Search\Iterators\AbstractIterator; +use Flow\Search\Iterators\HeaderIterator; +use Flow\Search\Iterators\TopicIterator; +use ReflectionProperty; +use TimestampException; +use User; 
+use WikiExporter; +use Xml; + +class Exporter extends WikiExporter { + /** + * Map of [db column name => xml attribute name] + * + * @var array + */ + public static $map = array( + 'rev_id' => 'id', + 'rev_user_id' => 'userid', + 'rev_user_ip' => 'userip', + 'rev_user_wiki' => 'userwiki', + 'rev_parent_id' => 'parentid', + 'rev_change_type' => 'changetype', + 'rev_type' => 'type', + 'rev_type_id' => 'typeid', + 'rev_content' => 'content', + 'rev_content_url' => 'contenturl', + 'rev_flags' => 'flags', + 'rev_mod_state' => 'modstate', + 'rev_mod_user_id' => 'moduserid', + 'rev_mod_user_ip' => 'moduserip', + 'rev_mod_user_wiki' => 'moduserwiki', + 'rev_mod_timestamp' => 'modtimestamp', + 'rev_mod_reason' => 'modreason', + 'rev_last_edit_id' => 'lasteditid', + 'rev_edit_user_id' => 'edituserid', + 'rev_edit_user_ip' => 'edituserip', + 'rev_edit_user_wiki' => 'edituserwiki', + 'rev_content_length' => 'contentlength', + 'rev_previous_content_length' => 'previouscontentlength', + + 'tree_parent_id' => 'treeparentid', + 'tree_rev_descendant_id' => 'treedescendantid', + 'tree_rev_id' => 'treerevid', + 'tree_orig_user_id' => 'treeoriguserid', + 'tree_orig_user_ip' => 'treeoriguserip', + 'tree_orig_user_wiki' => 'treeoriguserwiki', + ); + + /** + @var ReflectionProperty $prevRevisionProperty Previous revision property + */ + protected $prevRevisionProperty; + + /** + @var ReflectionProperty $changeTypeProperty Change type property + */ + protected $changeTypeProperty; + + /** + * {@inheritDoc} + */ + function __construct( $db, $history = WikiExporter::CURRENT, + $buffer = WikiExporter::BUFFER, $text = WikiExporter::TEXT ) { + + parent::__construct( $db, $history, $buffer, $text ); + $this->prevRevisionProperty = new ReflectionProperty( 'Flow\Model\AbstractRevision', 'prevRevision' ); + $this->prevRevisionProperty->setAccessible( true ); + + $this->changeTypeProperty = new ReflectionProperty( 'Flow\Model\AbstractRevision', 'changeType' ); + 
$this->changeTypeProperty->setAccessible( true ); + } + + public static function schemaVersion() { + return '1'; + } + + public function openStream() { + global $wgLanguageCode; + $version = static::schemaVersion(); + + $output = Xml::openElement( + 'mediawiki', + array( + // @todo: update after creating schema +// 'xmlns' => "http://www.mediawiki.org/xml/export-$version/", +// 'xmlns:xsi' => "http://www.w3.org/2001/XMLSchema-instance", +// 'xsi:schemaLocation' => "http://www.mediawiki.org/xml/export-$version/ http://www.mediawiki.org/xml/export-$version.xsd", + 'version' => $version, + 'xml:lang' => $wgLanguageCode + ) + ) . "\n"; + $this->sink->write( $output ); + } + + /** + * @param string[]|null $pages Array of DB-prefixed page titles + * @param int|null $startId page_id to start from (inclusive) + * @param int|null $endId page_id to end (exclusive) + * @return BatchRowIterator + */ + public function getWorkflowIterator( array $pages = null, $startId = null, $endId = null ) { + /** @var DatabaseBase $dbr */ + $dbr = Container::get( 'db.factory' )->getDB( DB_SLAVE ); + + $iterator = new BatchRowIterator( $dbr, 'flow_workflow', 'workflow_id', 300 ); + $iterator->setFetchColumns( array( '*' ) ); + $iterator->addConditions( array( 'workflow_wiki' => wfWikiId() ) ); + $iterator->addConditions( array( 'workflow_type' => 'discussion' ) ); + + if ( $pages ) { + $pageConds = array(); + foreach ( $pages as $page ) { + $title = \Title::newFromDBkey( $page ); + $pageConds[] = $dbr->makeList( + array( + 'workflow_namespace' => $title->getNamespace(), + 'workflow_title_text' => $title->getDBkey() + ), + LIST_AND + ); + } + + $iterator->addConditions( array( $dbr->makeList( $pageConds, LIST_OR ) ) ); + } + if ( $startId ) { + $iterator->addConditions( array( 'workflow_page_id >= ' . $dbr->addQuotes( $startId ) ) ); + } + if ( $endId ) { + $iterator->addConditions( array( 'workflow_page_id < ' . 
$dbr->addQuotes( $endId ) ) ); + } + + return $iterator; + } + + /** + * @param BatchRowIterator $workflowIterator + * @param UUID|null $revStartId + * @param UUID|null $revEndId + * @throws Exception + * @throws TimestampException + * @throws \Flow\Exception\InvalidInputException + */ + public function dump( BatchRowIterator $workflowIterator, $revStartId = null, $revEndId = null ) { + foreach ( $workflowIterator as $rows ) { + foreach ( $rows as $row ) { + $workflow = Workflow::fromStorageRow( (array) $row ); + + $headerIterator = Container::get( 'search.index.iterators.header' ); + $topicIterator = Container::get( 'search.index.iterators.topic' ); + /** @var AbstractIterator $iterator */ + foreach ( array( $headerIterator, $topicIterator ) as $iterator ) { + $iterator->setPage( $row->workflow_page_id ); + $iterator->setFrom( $revStartId ); + $iterator->setTo( $revEndId ); + } + + $this->formatWorkflow( $workflow, $headerIterator, $topicIterator ); + } + } + } + + protected function formatWorkflow( Workflow $workflow, HeaderIterator $headerIterator, TopicIterator $topicIterator ) { + if ( $workflow->isDeleted() ) { + return; + } + + $output = Xml::openElement( 'board', array( + 'id' => $workflow->getId()->getAlphadecimal(), + 'title' => $workflow->getOwnerTitle()->getPrefixedDBkey(), + ) ) . "\n"; + $this->sink->write( $output ); + + foreach ( $headerIterator as $revision ) { + /** @var Header $revision */ + $this->formatHeader( $revision ); + } + foreach ( $topicIterator as $revision ) { + /** @var PostRevision $revision */ + $this->formatTopic( $revision ); + } + + $output = Xml::closeElement( 'board' ) . "\n"; + $this->sink->write( $output ); + } + + protected function formatTopic( PostRevision $revision ) { + if ( !$this->isAllowed( $revision ) ) { + return; + } + + $output = Xml::openElement( 'topic', array( + 'id' => $revision->getCollectionId()->getAlphadecimal(), + ) ) . 
"\n"; + $this->sink->write( $output ); + + $this->formatPost( $revision ); + + // find summary for this topic & add it as revision + $summaryCollection = PostSummaryCollection::newFromId( $revision->getCollectionId() ); + try { + /** @var PostSummary $summary */ + $summary = $summaryCollection->getLastRevision(); + $this->formatSummary( $summary ); + } catch ( \Exception $e ) { + // no summary - that's ok! + } + + $output = Xml::closeElement( 'topic' ) . "\n"; + $this->sink->write( $output ); + } + + protected function formatHeader( Header $revision ) { + if ( !$this->isAllowed( $revision ) ) { + return; + } + + $output = Xml::openElement( 'description', array( + 'id' => $revision->getCollectionId()->getAlphadecimal() + ) ) . "\n"; + $this->sink->write( $output ); + + $this->formatRevisions( $revision ); + + $output = Xml::closeElement( 'description' ) . "\n"; + $this->sink->write( $output ); + } + + protected function formatPost( PostRevision $revision ) { + if ( !$this->isAllowed( $revision ) ) { + return; + } + + $output = Xml::openElement( 'post', array( + 'id' => $revision->getCollectionId()->getAlphadecimal() + ) ) . "\n"; + $this->sink->write( $output ); + + $this->formatRevisions( $revision ); + + if ( $revision->getChildren() ) { + $output = Xml::openElement( 'children' ) . "\n"; + $this->sink->write( $output ); + + foreach ( $revision->getChildren() as $child ) { + $this->formatPost( $child ); + } + + $output = Xml::closeElement( 'children' ) . "\n"; + $this->sink->write( $output ); + } + + $output = Xml::closeElement( 'post' ) . "\n"; + $this->sink->write( $output ); + } + + protected function formatSummary( PostSummary $revision ) { + if ( !$this->isAllowed( $revision ) ) { + return; + } + + $output = Xml::openElement( 'summary', array( + 'id' => $revision->getCollectionId()->getAlphadecimal() + ) ) . "\n"; + $this->sink->write( $output ); + + $this->formatRevisions( $revision ); + + $output = Xml::closeElement( 'summary' ) . 
"\n"; + $this->sink->write( $output ); + } + + protected function formatRevisions( AbstractRevision $revision ) { + $output = Xml::openElement( 'revisions' ) . "\n"; + $this->sink->write( $output ); + + $collection = $revision->getCollection(); + if ( $this->history === WikiExporter::FULL ) { + /** @var AbstractRevision[] $revisions */ + $revisions = array_reverse( $collection->getAllRevisions() ); + $prevId = null; + + foreach ( $revisions as $revision ) { + if ( $this->isAllowed( $revision ) ) { + if ( $prevId !== null ) { + // override parent id: this is used to get rid of gaps + // that are caused by moderated items, where the + // revision tree would be incorrect + $this->prevRevisionProperty->setValue( $revision, $prevId ); + + // Since $prevId is set, we know + // there was a gap, and the original + // hide-topic/delete-topic/suppress-topic + // was removed. Since that is used for + // listeners in FlowActions.php, we replace + // restore-topic with edit-title and make a + // null edit (we don't do null edits in the + // normal application flow, but this + // provides a way to replace restore). 
+ $oldChangeType = $revision->getChangeType(); + + if ( $oldChangeType === 'restore-topic' ) { + $this->changeTypeProperty->setValue( $revision, 'edit-title' ); + } + + if ( $oldChangeType === 'restore-post' ) { + $this->changeTypeProperty->setValue( $revision, 'edit-post' ); + } + + $prevId = null; + } + $this->formatRevision( $revision ); + } elseif ( $prevId === null ) { + // if revision can't be dumped, store its parent id so we + // can re-apply it to the next one that can be displayed, so + // we don't have gaps + $prevId = $revision->getPrevRevisionId(); + } + } + } elseif ( $this->history === WikiExporter::CURRENT ) { + $first = $collection->getFirstRevision(); + + // storing only last revision won't work (it'll reference non-existing + // parents): we'll construct a bogus revision with most of the original + // metadata, but with the current content & id (= timestamp) + $first = $first->toStorageRow( $first ); + $last = $revision->toStorageRow( $revision ); + $first['rev_id'] = $last['rev_id']; + $first['rev_content'] = $last['rev_content']; + $first['rev_flags'] = $last['rev_flags']; + if ( isset( $first['tree_rev_id'] ) ) { + // PostRevision-only: tree_rev_id must match rev_id + $first['tree_rev_id'] = $first['rev_id']; + } + + // clear buffered cache, to make sure it doesn't serve the existing (already + // loaded) revision when trying to turn our bogus mixed data into a revision + /** @var ManagerGroup $storage */ + $storage = Container::get( 'storage' ); + $storage->clear(); + + $mix = $revision->fromStorageRow( $first ); + + $this->formatRevision( $mix ); + } + + $output = Xml::closeElement( 'revisions' ) . 
"\n"; + $this->sink->write( $output ); + } + + protected function formatRevision( AbstractRevision $revision ) { + if ( !$this->isAllowed( $revision ) ) { + return; + } + + $attribs = $revision->toStorageRow( $revision ); + + // make sure there are no leftover key columns (unknown to $attribs) + $keys = array_intersect_key(static::$map, $attribs ); + // now make sure $values columns are in the same order as $keys are + // (array_merge) and there are no leftover columns (array_intersect_key) + $values = array_intersect_key( array_merge( $keys, $attribs ), $keys ); + // combine them + $attribs = array_combine( $keys, $values ); + + // references to external store etc. are useless; we'll include the real + // content as node text + unset($attribs['content'], $attribs['contenturl']); + $format = $revision->getContentFormat(); + $attribs['flags'] = 'utf-8,' . $format; + + $output = Xml::element( + 'revision', + $attribs, + $revision->getContent( $format ) + ) . "\n"; + $this->sink->write( $output ); + } + + /** + * Test if anon users are allowed to view a particular revision. 
+ * + * @param AbstractRevision $revision + * @return bool + */ + protected function isAllowed( AbstractRevision $revision ) { + $user = User::newFromId( 0 ); + $actions = Container::get( 'flow_actions' ); + $permissions = new RevisionActionPermissions( $actions, $user ); + + return $permissions->isAllowed( $revision, 'view' ); + } +} diff --git a/includes/Search/Iterators/AbstractIterator.php b/includes/Search/Iterators/AbstractIterator.php new file mode 100644 index 0000000..1e19c30 --- /dev/null +++ b/includes/Search/Iterators/AbstractIterator.php @@ -0,0 +1,190 @@ +<?php + +namespace Flow\Search\Iterators; + +use DatabaseBase; +use Flow\Container; +use Flow\Data\ManagerGroup; +use Flow\DbFactory; +use Flow\Exception\InvalidDataException; +use Flow\Model\AbstractRevision; +use Flow\Model\UUID; +use Iterator; +use ResultWrapper; +use stdClass; + +abstract class AbstractIterator implements Iterator { + /** + * @var DatabaseBase + */ + protected $dbr; + + /** + * @var array + */ + protected $conditions = array(); + + /** + * @var ResultWrapper|null + */ + protected $results; + + /** + * Depending on where we are in the iteration, this can be null (object + * constructed but not yet being iterated over), AbstractRevision (being + * iterated) or false (end of iteration, no more revisions) + * + * @var AbstractRevision|null|false + */ + protected $current; + + /** + * Depending on where we are in the iteration, this can be integer (object + * being iterated over) or null (iteration not yet started, or completed) + * + * @var int|null + */ + protected $key; + + /** + * @param DbFactory $dbFactory + */ + public function __construct( DbFactory $dbFactory ) { + $this->dbr = $dbFactory->getDB( DB_SLAVE ); + $this->conditions = array( 'workflow_wiki' => wfWikiId() ); + } + + /** + * @return bool|ResultWrapper + */ + abstract protected function query(); + + /** + * @param array|int|null $pageId + */ + public function setPage( $pageId = null ) { + $this->results = null; + + 
unset( $this->conditions['workflow_page_id'] ); + if ( $pageId !== null ) { + $this->conditions['workflow_page_id'] = $pageId; + } + } + + /** + * @param int|null $namespace + */ + public function setNamespace( $namespace = null ) { + $this->results = null; + + unset( $this->conditions['workflow_namespace'] ); + if ( $namespace !== null ) { + $this->conditions['workflow_namespace'] = $namespace; + } + } + + /** + * Define where to start iterating (inclusive) + * + * @param UUID|null $revId + */ + public function setFrom( UUID $revId = null ) { + $this->results = null; + + unset( $this->conditions[0] ); + if ( $revId !== null ) { + $this->conditions[0] = 'rev_id >= ' . $this->dbr->addQuotes( $revId->getBinary() ); + } + } + + /** + * Define where to stop iterating (exclusive) + * + * @param UUID|null $revId + */ + public function setTo( UUID $revId = null ) { + $this->results = null; + + unset( $this->conditions[1] ); + if ( $revId !== null ) { + $this->conditions[1] = 'rev_id < ' . $this->dbr->addQuotes( $revId->getBinary() ); + } + } + + /** + * @return AbstractRevision|null The most recently fetched revision object + */ + public function current() { + return $this->current; + } + + /** + * @return integer 0-indexed count of the page number fetched + */ + public function key() { + return $this->key; + } + + /** + * Reset the iterator to the beginning of the table. + */ + public function rewind() { + $this->results = null; + $this->key = -1; // self::next() will turn this into 0 + $this->current = null; + $this->next(); + } + + /** + * @return bool True when the iterator is in a valid state + */ + public function valid() { + return (bool) $this->current; + } + + /** + * Fetch the next set of rows from the database. 
+ */ + public function next() { + if ( $this->results === null ) { + $this->results = $this->query(); + } + + $current = $this->results->fetchObject(); + if ( $current !== false ) { + $this->current = $this->transform( $current ); + $this->key++; + } else { + // end of iteration reached + $this->current = false; + $this->key = null; + } + } + + /** + * Transforms the DB row into a revision object. + * + * $row will be one of the results of static::query(). In this method, $row + * is expected to have at least properties `rev_id` & `rev_type`, which will + * be used to fetch this specific row's data from storage. + * + * This will need to do some DB/cache requests. Ideally, those would be + * bundled instead of being done on a per-row record. These iterators + * are only meant to be run in maintenance scripts, however, so it + * doesn't really matter that much ;) + * + * @param stdClass $row + * @return AbstractRevision + */ + protected function transform( stdClass $row ) { + $uuid = UUID::create( $row->rev_id ); + + /** @var ManagerGroup $storage */ + $storage = Container::get( 'storage' ); + + // prevent memory from being filled up + $storage->clear(); + + return $storage->getStorage( $row->rev_type )->get( $uuid ); + } +} diff --git a/includes/Search/Iterators/HeaderIterator.php b/includes/Search/Iterators/HeaderIterator.php new file mode 100644 index 0000000..7e755f2 --- /dev/null +++ b/includes/Search/Iterators/HeaderIterator.php @@ -0,0 +1,28 @@ +<?php + +namespace Flow\Search\Iterators; + +class HeaderIterator extends AbstractIterator { + /** + * {@inheritDoc} + */ + protected function query() { + // get the current (=most recent, =max) revision id for all headers + return $this->dbr->select( + array( 'flow_revision', 'flow_workflow' ), + array( 'rev_id' => 'MAX(rev_id)', 'rev_type' ), + $this->conditions, + __METHOD__, + array( + 'ORDER BY' => 'rev_id ASC', + 'GROUP BY' => 'rev_type_id', + ), + array( + 'flow_workflow' => array( + 'INNER JOIN', + array( 
'workflow_id = rev_type_id' , 'rev_type' => 'header' ) + ), + ) + ); + } +} diff --git a/includes/Search/Iterators/TopicIterator.php b/includes/Search/Iterators/TopicIterator.php new file mode 100644 index 0000000..f30d3d7 --- /dev/null +++ b/includes/Search/Iterators/TopicIterator.php @@ -0,0 +1,112 @@ +<?php + +namespace Flow\Search\Iterators; + +use Flow\DbFactory; +use Flow\Exception\InvalidDataException; +use Flow\Model\PostRevision; +use Flow\Model\UUID; +use Flow\Repository\RootPostLoader; +use stdClass; + +class TopicIterator extends AbstractIterator { + /** + * @var PostRevision + */ + protected $previous; + + /** + * @var RootPostLoader + */ + protected $rootPostLoader; + + /** + * @param DbFactory $dbFactory + * @param RootPostLoader $rootPostLoader + */ + public function __construct( DbFactory $dbFactory, RootPostLoader $rootPostLoader ) { + parent::__construct( $dbFactory ); + $this->rootPostLoader = $rootPostLoader; + } + + /** + * Define where to start iterating (inclusive) + * + * We'll be querying the workflow table instead of the revisions table. + * Because it's possible to request only a couple of revisions (in between + * certain ids), we'll need to override the parent buildQueryConditions + * method to also work on the workflow table. + * A topic workflow is updated with a workflow_last_update_timestamp for + * every change made in the topic. Our UUIDs are sequential & time-based, + * so we can just query for workflows with a timestamp higher than the + * timestamp derived from the starting UUID and lower than the end UUID. + * + * @param UUID|null $revId + */ + public function setFrom( UUID $revId = null ) { + $this->results = null; + + unset( $this->conditions[0] ); + if ( $revId !== null ) { + $this->conditions[0] = 'workflow_last_update_timestamp >= ' . $this->dbr->addQuotes( $revId->getBinary() ); + } + } + + /** + * Define where to stop iterating (exclusive) + * + * We'll be querying the workflow table instead of the revisions table. 
+ * Because it's possible to request only a couple of revisions (in between + * certain ids), we'll need to override the parent buildQueryConditions + * method to also work on the workflow table. + * A topic workflow is updated with a workflow_last_update_timestamp for + * every change made in the topic. Our UUIDs are sequential & time-based, + * so we can just query for workflows with a timestamp higher than the + * timestamp derived from the starting UUID and lower than the end UUID. + * + * @param UUID|null $revId + */ + public function setTo( UUID $revId = null ) { + $this->results = null; + + unset( $this->conditions[1] ); + if ( $revId !== null ) { + $this->conditions[1] = 'workflow_last_update_timestamp < ' . $this->dbr->addQuotes( $revId->getBinary() ); + } + } + + /** + * Instead of querying for revisions (which is what we actually need), we'll + * just query the workflow table, which will save us some complicated joins. + * The workflow_id for a topic title (aka root post) is the same as its + * collection id, so we can pass that to the root post loader and *poof*, we + * have our revisions! 
+ * + * {@inheritDoc} + */ + protected function query() { + return $this->dbr->select( + array( 'flow_workflow' ), + // for root post (topic title), workflow_id is the same as its rev_type_id + array( 'workflow_id', 'workflow_last_update_timestamp' ), + array( + 'workflow_type' => 'topic' + ) + $this->conditions, + __METHOD__, + array( + 'ORDER BY' => 'workflow_last_update_timestamp ASC', + ) + ); + } + + /** + * {@inheritDoc} + */ + protected function transform( stdClass $row ) { + $root = UUID::create( $row->workflow_id ); + + // we need to fetch all data via rootloader because we'll want children + // to be populated + return $this->rootPostLoader->get( $root ); + } +} diff --git a/includes/Search/TopicUpdater.php b/includes/Search/TopicUpdater.php deleted file mode 100644 index 992ee9b..0000000 --- a/includes/Search/TopicUpdater.php +++ /dev/null @@ -1,211 +0,0 @@ -<?php - -namespace Flow\Search; - -use Flow\Collection\PostSummaryCollection; -use Flow\DbFactory; -use Flow\Model\PostRevision; -use Flow\Model\PostSummary; -use Flow\Model\UUID; -use Flow\Repository\RootPostLoader; -use Flow\RevisionActionPermissions; -use ResultWrapper; -use Sanitizer; - -class TopicUpdater extends Updater { - /** - * @var RootPostLoader - */ - protected $rootPostLoader; - - /** - * @param DbFactory $dbFactory - * @param RevisionActionPermissions $permissions - * @param RootPostLoader $rootPostLoader - */ - public function __construct( DbFactory $dbFactory, RevisionActionPermissions $permissions, RootPostLoader $rootPostLoader ) { - parent::__construct( $dbFactory, $permissions ); - $this->rootPostLoader = $rootPostLoader; - } - - /** - * {@inheritDoc} - */ - public function getTypeName() { - return Connection::TOPIC_TYPE_NAME; - } - - /** - * We'll be querying the workflow table instead of the revisions table.
- * Because it's possible to request only a couple of revisions (in between - * certain ids), we'll need to override the parent buildQueryConditions - * method to also work on the workflow table. - * A topic workflow is updated with a workflow_last_update_timestamp for - * every change made in the topic. Our UUIDs are sequential & time-based, - * so we can just query for workflows with a timestamp higher than the - * timestamp derived from the starting UUID and lower than the end UUID. - * - * {@inheritDoc} - */ - public function buildQueryConditions( UUID $fromId = null, UUID $toId = null, $namespace = null ) { - $dbr = $this->dbFactory->getDB( DB_SLAVE ); - - $conditions = array(); - - // only find entries in a given range - if ( $fromId !== null ) { - $conditions[] = 'workflow_last_update_timestamp >= ' . $dbr->addQuotes( $fromId->getTimestamp() ); - } - if ( $toId !== null ) { - $conditions[] = 'workflow_last_update_timestamp <= ' . $dbr->addQuotes( $toId->getTimestamp() ); - } - - // find only within requested wiki/namespace - $conditions['workflow_wiki'] = wfWikiId(); - if ( $namespace !== null ) { - $conditions['workflow_namespace'] = $namespace; - } - - return $conditions; - } - - /** - * Instead of querying for revisions (which is what we actually need), we'll - * just query the workflow table, which will save us some complicated joins. - * The workflow_id for a topic title (aka root post) is the same as its - * revision is, so we can pass that to the root post loader and *poof*, we - * have our revisions! 
- * - * {@inheritDoc} - */ - public function getRevisions( array $conditions = array(), array $options = array() ) { - $workflows = $this->getWorkflows( $conditions, $options ); - return $this->getRoots( $workflows ); - } - - /** - * {@inheritDoc} - */ - public function buildDocument( /* PostRevision */ $revision ) { - /** @var PostRevision $revision */ - - // get timestamp from the most recent revision - $updateTimestamp = $revision->getCollection()->getWorkflow()->getLastUpdatedObj(); - // timestamp for initial topic post - $creationTimestamp = $revision->getCollectionId()->getTimestampObj(); - - // get content from all child posts in a [post id => [data]] array - $revisions = $this->getRevisionsData( $revision ); - - // find summary for this topic & add it as revision - $summaryCollection = PostSummaryCollection::newFromId( $revision->getCollectionId() ); - try { - /** @var PostSummary $summaryRevision */ - $summaryRevision = $summaryCollection->getLastRevision(); - $data = current( $this->getRevisionsData( $summaryRevision ) ); - if ( $data !== false ) { - $revisions[] = $data; - } - } catch ( \Exception $e ) { - // no summary - that's ok! 
- } - - // get board title associated with this revision - $title = $revision->getCollection()->getWorkflow()->getOwnerTitle(); - - $doc = new \Elastica\Document( - $revision->getCollectionId()->getAlphadecimal(), - array( - 'namespace' => $title->getNamespace(), - 'namespace_text' => $title->getPageLanguage()->getFormattedNsText( $title->getNamespace() ), - 'pageid' => $title->getArticleID(), - 'title' => $title->getText(), - 'timestamp' => $creationTimestamp->getTimestamp( TS_ISO_8601 ), - 'update_timestamp' => $updateTimestamp->getTimestamp( TS_ISO_8601 ), - 'revisions' => $revisions, - ) - ); - - return $doc; - } - - /** - * @param array $conditions - * @param array $options - * @return bool|ResultWrapper - */ - public function getWorkflows( array $conditions = array(), array $options = array() ) { - $dbr = $this->dbFactory->getDB( DB_SLAVE ); - - return $dbr->select( - array( 'flow_workflow' ), - // for root post (topic title), workflow_id is the same as its rev_type_id - array( 'workflow_id', 'workflow_last_update_timestamp' ), - array( - 'workflow_type' => 'topic' - ) + $conditions, - __METHOD__, - array( - 'ORDER BY' => 'workflow_last_update_timestamp ASC', - ) + $options - ); - } - - /** - * @param ResultWrapper $workflows - * @return PostRevision[] - */ - public function getRoots( ResultWrapper $workflows ) { - $roots = array(); - foreach ( $workflows as $row ) { - $roots[$row->workflow_id] = UUID::create( $row->workflow_id ); - } - - // we need to fetch all data via rootloader because we'll want children - // to be populated - return $this->rootPostLoader->getMulti( $roots ); - } - - /** - * Recursively get the data for all children. This will add the revision's - * content to the results array, with the post ID as key. - * - * @param PostRevision|PostSummary $revision - * @return array - */ - public function getRevisionsData( /* PostRevision|PostSummary */ $revision ) { - // store type of revision so we can also search for very specific types - // (e.g. 
titles only) - // possible values will be: - // * title - // * post - // * post-summary - $type = $revision->getRevisionType(); - if ( method_exists( $revision, 'isTopicTitle' ) && $revision->isTopicTitle() ) { - $type = 'title'; - } - - $data = array(); - - if ( $this->permissions->isAllowed( $revision, 'view' ) ) { - $data[] = array( - 'id' => $revision->getCollectionId()->getAlphadecimal(), - 'text' => trim( Sanitizer::stripAllTags( $revision->getContentInHtml() ) ), - 'source_text' => $revision->getContentInWikitext(), - 'moderation_state' => $revision->getModerationState(), - 'timestamp' => $revision->getCollectionId()->getTimestamp( TS_ISO_8601 ), - 'update_timestamp' => $revision->getRevisionId()->getTimestamp( TS_ISO_8601 ), - 'type' => $type, - ); - } - - if ( $revision instanceof PostRevision ) { - // get data from all child posts too - foreach ( $revision->getChildren() as $child ) { - $data = array_merge( $data, $this->getRevisionsData( $child ) ); - } - } - - return $data; - } -} diff --git a/includes/Search/Updater.php b/includes/Search/Updater.php deleted file mode 100644 index 06d2235..0000000 --- a/includes/Search/Updater.php +++ /dev/null @@ -1,149 +0,0 @@ -<?php - -namespace Flow\Search; - -use Flow\Container; -use Flow\DbFactory; -use Flow\Exception\FlowException; -use Flow\Model\AbstractRevision; -use Flow\Model\UUID; -use Flow\RevisionActionPermissions; -use MWExceptionHandler; - -abstract class Updater { - /** - * @var DbFactory - */ - protected $dbFactory; - - /** - * @var RevisionActionPermissions - */ - protected $permissions; - - /** - * @var Connection - */ - protected $connection; - - /** - * @param DbFactory $dbFactory - * @param RevisionActionPermissions $permissions - */ - public function __construct( DbFactory $dbFactory, RevisionActionPermissions $permissions ) { - $this->dbFactory = $dbFactory; - $this->permissions = $permissions; - $this->connection = Container::get( 'search.connection' ); - } - - /** - * @return string One of 
the Connection::*_TYPE_NAME constants - */ - abstract public function getTypeName(); - - /** - * @param array $conditions - * @param array $options - * @return AbstractRevision[] - */ - abstract public function getRevisions( array $conditions = array(), array $options = array() ); - - /** - * @param AbstractRevision $revision - * @return \Elastica\Document - */ - abstract public function buildDocument( /* AbstractRevision */ $revision ); - - /** - * @param UUID|null $fromId - * @param UUID|null $toId - * @param int|null $namespace - * @return array - */ - public function buildQueryConditions( UUID $fromId = null, UUID $toId = null, $namespace = null ) { - $dbr = $this->dbFactory->getDB( DB_SLAVE ); - - $conditions = array(); - - // only find entries in a given range - if ( $fromId !== null ) { - $conditions[] = 'rev_id >= ' . $dbr->addQuotes( $fromId->getBinary() ); - } - if ( $toId !== null ) { - $conditions[] = 'rev_id <= ' . $dbr->addQuotes( $toId->getBinary() ); - } - - // find only within requested wiki/namespace - $conditions['workflow_wiki'] = wfWikiId(); - if ( $namespace !== null ) { - $conditions['workflow_namespace'] = $namespace; - } - - return $conditions; - } - - /** - * @param AbstractRevision[] $revisions - * @return \Elastica\Document[] - */ - protected function buildDocumentsForRevisions( array $revisions ) { - $documents = array(); - foreach ( $revisions as $revision ) { - try { - $documents[] = $this->buildDocument( $revision ); - } catch ( FlowException $e ) { - // just ignore revisions that fail to build document... - wfWarn( __METHOD__ . ': Failed to build document for ' . $revision->getRevisionId()->getAlphadecimal() . ': ' . $e->getMessage()); - MWExceptionHandler::logException( $e ); - } - } - - return $documents; - } - - /** - * @param AbstractRevision[] $revisions - * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...) 
- * @param int|null $clientSideTimeout - * @return int - */ - public function updateRevisions( array $revisions, $shardTimeout = null, $clientSideTimeout = null ) { - if ( $clientSideTimeout !== null ) { - $this->connection->setTimeout( $clientSideTimeout ); - } - - $documents = $this->buildDocumentsForRevisions( $revisions ); - $this->sendDocuments( $documents, $shardTimeout ); - - return count( $documents ); - } - - /** - * @param \Elastica\Document[] $documents - * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...) - */ - protected function sendDocuments( array $documents, $shardTimeout = null ) { - if ( count( $documents ) === 0 ) { - return; - } - - try { - // addDocuments (notice plural) is the bulk api - $bulk = new \Elastica\Bulk( $this->connection->getClient() ); - if ( $shardTimeout !== null ) { - $bulk->setShardTimeout( $shardTimeout ); - } - - $index = $this->connection->getFlowIndex( wfWikiId() ); - $type = $index->getType( $this->getTypeName() ); - $bulk->setType( $type ); - $bulk->addDocuments( $documents ); - $bulk->send(); - } catch ( \Exception $e ) { - $documentIds = array_map( function( $doc ) { - return $doc->getId(); - }, $documents ); - wfWarn( __METHOD__ . ': Failed updating documents (' . implode( ',', $documentIds ) . '): ' . 
$e->getMessage() ); - } - } -} diff --git a/includes/Search/Updaters/AbstractUpdater.php b/includes/Search/Updaters/AbstractUpdater.php new file mode 100644 index 0000000..52dd7e9 --- /dev/null +++ b/includes/Search/Updaters/AbstractUpdater.php @@ -0,0 +1,116 @@ +<?php + +namespace Flow\Search\Updaters; + +use Flow\Container; +use Flow\Exception\FlowException; +use Flow\Model\AbstractRevision; +use Flow\RevisionActionPermissions; +use Flow\Search\Connection; +use Flow\Search\Iterators\AbstractIterator; +use MWExceptionHandler; + +abstract class AbstractUpdater { + /** + * @var AbstractIterator + */ + public $iterator; + + /** + * @var RevisionActionPermissions + */ + protected $permissions; + + /** + * @var Connection + */ + protected $connection; + + /** + * @param AbstractIterator $iterator + * @param RevisionActionPermissions $permissions + */ + public function __construct( AbstractIterator $iterator, RevisionActionPermissions $permissions ) { + $this->iterator = $iterator; + $this->permissions = $permissions; + $this->connection = Container::get( 'search.connection' ); + } + + /** + * @return string One of the Connection::*_TYPE_NAME constants + */ + abstract public function getTypeName(); + + /** + * @param AbstractRevision $revision + * @return \Elastica\Document + */ + abstract public function buildDocument( AbstractRevision $revision ); + + /** + * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...) + * @param int|null $clientSideTimeout + * @param int $batchSize + * @return int + */ + public function updateRevisions( $shardTimeout = null, $clientSideTimeout = null, $batchSize = 50 ) { + if ( $clientSideTimeout !== null ) { + $this->connection->setTimeout( $clientSideTimeout ); + } + + $documents = array(); + $count = 0; + foreach ( $this->iterator as $revision ) { + try { + $documents[] = $this->buildDocument( $revision ); + $count++; + } catch ( FlowException $e ) { + // just ignore revisions that fail to build document... 
+ wfWarn( __METHOD__ . ': Failed to build document for ' . $revision->getRevisionId()->getAlphadecimal() . ': ' . $e->getMessage()); + MWExceptionHandler::logException( $e ); + } + + // send documents in small batches + if ( count( $documents ) > $batchSize ) { + $this->sendDocuments( $documents, $shardTimeout ); + $documents = array(); + } + } + + if ( $documents ) { + // send remaining documents + $this->sendDocuments( $documents, $shardTimeout ); + } + + return $count; + } + + /** + * @param \Elastica\Document[] $documents + * @param string|null $shardTimeout Timeout in Elasticsearch time format (1m, 15s, ...) + */ + protected function sendDocuments( array $documents, $shardTimeout = null ) { + if ( count( $documents ) === 0 ) { + return; + } + + try { + // addDocuments (notice plural) is the bulk api + $bulk = new \Elastica\Bulk( $this->connection->getClient() ); + if ( $shardTimeout !== null ) { + $bulk->setShardTimeout( $shardTimeout ); + } + + $index = $this->connection->getFlowIndex( wfWikiId() ); + $type = $index->getType( $this->getTypeName() ); + $bulk->setType( $type ); + $bulk->addDocuments( $documents ); + $bulk->send(); + } catch ( \Exception $e ) { + $documentIds = array_map( function( $doc ) { + return $doc->getId(); + }, $documents ); + wfWarn( __METHOD__ . ': Failed updating documents (' . implode( ',', $documentIds ) . '): ' . 
$e->getMessage() ); + } + } +} diff --git a/includes/Search/HeaderUpdater.php b/includes/Search/Updaters/HeaderUpdater.php similarity index 63% rename from includes/Search/HeaderUpdater.php rename to includes/Search/Updaters/HeaderUpdater.php index 2a61a95..5f8c02a 100644 --- a/includes/Search/HeaderUpdater.php +++ b/includes/Search/Updaters/HeaderUpdater.php @@ -1,14 +1,13 @@ <?php -namespace Flow\Search; +namespace Flow\Search\Updaters; -use Flow\Container; -use Flow\Data\ManagerGroup; +use Flow\Model\AbstractRevision; use Flow\Model\Header; -use Flow\Model\UUID; +use Flow\Search\Connection; use Sanitizer; -class HeaderUpdater extends Updater { +class HeaderUpdater extends AbstractUpdater { /** * {@inheritDoc} */ @@ -19,41 +18,7 @@ /** * {@inheritDoc} */ - public function getRevisions( array $conditions = array(), array $options = array() ) { - $dbr = $this->dbFactory->getDB( DB_SLAVE ); - - // get the current (=most recent, =max) revision id for all headers - $rows = $dbr->select( - array( 'flow_revision', 'flow_workflow' ), - array( 'rev_id' => 'MAX(rev_id)' ), - $conditions, - __METHOD__, - array( - 'ORDER BY' => 'rev_id ASC', - 'GROUP BY' => 'rev_type_id', - ) + $options, - array( - 'flow_workflow' => array( - 'INNER JOIN', - array( 'workflow_id = rev_type_id' , 'rev_type' => 'header' ) - ), - ) - ); - - $uuids = array(); - foreach ( $rows as $row ) { - $uuids[] = UUID::create( $row->rev_id ); - } - - /** @var ManagerGroup $storage */ - $storage = Container::get( 'storage' ); - return $storage->getStorage( 'Header' )->getMulti( $uuids ); - } - - /** - * {@inheritDoc} - */ - public function buildDocument( /* Header */ $revision ) { + public function buildDocument( AbstractRevision /* Header */ $revision ) { /** @var Header $revision */ // get article title associated with this revision diff --git a/includes/Search/Updaters/TopicUpdater.php b/includes/Search/Updaters/TopicUpdater.php new file mode 100644 index 0000000..9bbb96e --- /dev/null +++ 
b/includes/Search/Updaters/TopicUpdater.php @@ -0,0 +1,126 @@ +<?php + +namespace Flow\Search\Updaters; + +use Flow\Collection\PostSummaryCollection; +use Flow\Model\AbstractRevision; +use Flow\Model\PostRevision; +use Flow\Model\PostSummary; +use Flow\Repository\RootPostLoader; +use Flow\RevisionActionPermissions; +use Flow\Search\Connection; +use Flow\Search\Iterators\AbstractIterator; +use Sanitizer; + +class TopicUpdater extends AbstractUpdater { + /** + * @var RootPostLoader + */ + protected $rootPostLoader; + + /** + * @param AbstractIterator $iterator + * @param RevisionActionPermissions $permissions + * @param RootPostLoader $rootPostLoader + */ + public function __construct( AbstractIterator $iterator, RevisionActionPermissions $permissions, RootPostLoader $rootPostLoader ) { + parent::__construct( $iterator, $permissions ); + $this->rootPostLoader = $rootPostLoader; + } + + /** + * {@inheritDoc} + */ + public function getTypeName() { + return Connection::TOPIC_TYPE_NAME; + } + + /** + * {@inheritDoc} + */ + public function buildDocument( AbstractRevision /* PostRevision */ $revision ) { + /** @var PostRevision $revision */ + + // get timestamp from the most recent revision + $updateTimestamp = $revision->getCollection()->getWorkflow()->getLastUpdatedObj(); + // timestamp for initial topic post + $creationTimestamp = $revision->getCollectionId()->getTimestampObj(); + + // get content from all child posts in a [post id => [data]] array + $revisions = $this->getRevisionsData( $revision ); + + // find summary for this topic & add it as revision + $summaryCollection = PostSummaryCollection::newFromId( $revision->getCollectionId() ); + try { + /** @var PostSummary $summaryRevision */ + $summaryRevision = $summaryCollection->getLastRevision(); + $data = current( $this->getRevisionsData( $summaryRevision ) ); + if ( $data !== false ) { + $revisions[] = $data; + } + } catch ( \Exception $e ) { + // no summary - that's ok! 
+ } + + // get board title associated with this revision + $title = $revision->getCollection()->getWorkflow()->getOwnerTitle(); + + $doc = new \Elastica\Document( + $revision->getCollectionId()->getAlphadecimal(), + array( + 'namespace' => $title->getNamespace(), + 'namespace_text' => $title->getPageLanguage()->getFormattedNsText( $title->getNamespace() ), + 'pageid' => $title->getArticleID(), + 'title' => $title->getText(), + 'timestamp' => $creationTimestamp->getTimestamp( TS_ISO_8601 ), + 'update_timestamp' => $updateTimestamp->getTimestamp( TS_ISO_8601 ), + 'revisions' => $revisions, + ) + ); + + return $doc; + } + + /** + * Recursively get the data for all children. This will add the revision's + * content to the results array, with the post ID as key. + * + * @param PostRevision|PostSummary $revision + * @return array + */ + public function getRevisionsData( /* PostRevision|PostSummary */ $revision ) { + // store type of revision so we can also search for very specific types + // (e.g. 
titles only) + // possible values will be: + // * title + // * post + // * post-summary + $type = $revision->getRevisionType(); + if ( method_exists( $revision, 'isTopicTitle' ) && $revision->isTopicTitle() ) { + $type = 'title'; + } + + $data = array(); + + if ( $this->permissions->isAllowed( $revision, 'view' ) ) { + $data[] = array( + 'id' => $revision->getCollectionId()->getAlphadecimal(), + 'text' => trim( Sanitizer::stripAllTags( $revision->getContentInHtml() ) ), + 'source_text' => $revision->getContentInWikitext(), + 'moderation_state' => $revision->getModerationState(), + 'timestamp' => $revision->getCollectionId()->getTimestamp( TS_ISO_8601 ), + 'update_timestamp' => $revision->getRevisionId()->getTimestamp( TS_ISO_8601 ), + 'type' => $type, + ); + } + + if ( $revision instanceof PostRevision ) { + // get data from all child posts too + foreach ( $revision->getChildren() as $child ) { + $data = array_merge( $data, $this->getRevisionsData( $child ) ); + } + } + + return $data; + } +} diff --git a/maintenance/FlowFixWorkflowLastUpdateTimestamp.php b/maintenance/FlowFixWorkflowLastUpdateTimestamp.php index 3acd028..638fa1e 100644 --- a/maintenance/FlowFixWorkflowLastUpdateTimestamp.php +++ b/maintenance/FlowFixWorkflowLastUpdateTimestamp.php @@ -190,7 +190,7 @@ /** @var Workflow[] $workflows */ $workflows = $this->storage->getMulti( 'Workflow', $uuids ); foreach ( $workflows as $workflow ) { - $timestamp = $timestamps[$workflow->getId()->getBinary()]; + $timestamp = $timestamps[$workflow->getId()->getBinary()->__toString()]; $workflow->updateLastUpdated( UUID::getComparisonUUID( $timestamp ) ); } diff --git a/maintenance/FlowForceSearchIndex.php b/maintenance/FlowForceSearchIndex.php index 7d2f6d0..2987727 100644 --- a/maintenance/FlowForceSearchIndex.php +++ b/maintenance/FlowForceSearchIndex.php @@ -1,10 +1,9 @@ <?php use Flow\Container; -use Flow\Model\AbstractRevision; use Flow\Model\UUID; use Flow\Search\Connection; -use Flow\Search\Updater; +use 
Flow\Search\Updaters\AbstractUpdater; require_once ( getenv( 'MW_INSTALL_PATH' ) !== false ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php' @@ -33,100 +32,44 @@ $this->addOption( 'fromId', 'Start indexing at a specific revision id (inclusive).', false, true ); $this->addOption( 'toId', 'Stop indexing at a specific revision (inclusive).', false, true ); - $this->addOption( 'limit', 'Maximum number of revisions to process before exiting the script. Default to unlimited.', false, true ); $this->addOption( 'namespace', 'Only index revisions in this given namespace', false, true ); - - $this->connection = Container::get( 'search.connection' ); } public function execute() { global $wgFlowSearchMaintenanceTimeout; + $this->connection = Container::get( 'search.connection' ); + // Set the timeout for maintenance actions $this->connection->setTimeout( $wgFlowSearchMaintenanceTimeout ); - /** @var Updater[] $updaters */ + /** @var AbstractUpdater[] $updaters */ $updaters = Container::get( 'search.index.updaters' ); foreach ( $updaters as $updaterType => $updater ) { $fromId = $this->getOption( 'fromId', null ); $fromId = $fromId ? UUID::create( $fromId ) : null; $toId = $this->getOption( 'toId', null ); $toId = $toId ? UUID::create( $toId ) : null; + if ( $toId !== null ) { + // AbstractIterator::toId is exclusive, but we want inclusive, + // so just feed toId() the next possible UUID (UUID + 1) + // We need some base conversion & bcadd because the number may + // be too large to be an int. 
+ $decimal = wfBaseConvert( $toId->getAlphadecimal(), 36, 10 ); + $new = bcadd( $decimal, 1, 0 ); + $alnum = wfBaseConvert( $new, 10, 36 ); + $toId = UUID::create( $alnum ); + } $namespace = $this->getOption( 'namespace', null ); - $numRevisionsToIndex = $this->getOption( 'limit', null ); $total = 0; - while ( true ) { - // if a limit was provided, we should make sure to not fetch - // more revisions than asked for - $options = array( 'LIMIT' => $this->mBatchSize ); - if ( $numRevisionsToIndex ) { - $options['LIMIT'] = min( $numRevisionsToIndex, $this->mBatchSize ); + $updater->iterator->setNamespace( $namespace ); + $updater->iterator->setFrom( $fromId ); + $updater->iterator->setTo( $toId ); - // since we do this in batches, we'll subtract the size of - // each batch until $numRevisionsToIndex is reached - $numRevisionsToIndex -= $this->mBatchSize; - if ( $options['LIMIT'] <= 0 ) { - break; - } - } - - $conditions = $updater->buildQueryConditions( $fromId, $toId, $namespace ); - $revisions = $updater->getRevisions( $conditions, $options ); - - // stop if we're all out of revisions - if ( !$revisions ) { - break; - } - - $total += $updater->updateRevisions( $revisions, null, null ); - $this->output( "Indexed $total $updaterType document(s)\n" ); - - // prepare for next batch, starting at the next id - // prevFromId will default to around unix epoch - there can be - // no data before that - $prevFromId = $fromId ?: UUID::getComparisonUUID( '1' ); - $fromId = $this->getNextFromId( $revisions ); - - // make sure we don't get stuck in an infinite loop - $diff = $prevFromId->getTimestampObj()->diff( $fromId->getTimestampObj() ); - // invert will be 1 if the diff is a negative time period from - // $prevFromId to $fromId, which means that the new $timestamp is - // more recent than our current $result - if ( $diff->invert ) { - $this->error( - 'Got stuck in an infinite loop.' . "\n" . - 'workflow_last_update_timestamp is likely incorrect ' . - 'for some workflows.' . 
"\n" . - 'Run maintenance/FlowFixWorkflowLastUpdateTimestamp.php ' . - 'to automatically fix those.', 1 ); - } - - // prevent memory from being filled up - Container::get( 'storage' )->clear(); - } + $total += $updater->updateRevisions( null, null, $this->mBatchSize ); + $this->output( "Indexed $total $updaterType document(s)\n" ); } - } - - /** - * @param AbstractRevision[] $revisions - * @return UUID - */ - protected function getNextFromId( array $revisions ) { - /** @var AbstractRevision $last */ - $last = end( $revisions ); - - if ( $last instanceof \Flow\Model\Header ) { - $timestamp = $last->getRevisionId()->getTimestampObj(); - } else { - $timestamp = $last->getCollection()->getWorkflow()->getLastUpdatedObj(); - } - - // $timestamp is the timestamp of the last revision we fetched. fromId - // is inclusive, and we don't want to include what we already have here, - // so we'll advance 1 more and call that the next fromId - $timestamp = (int) $timestamp->getTimestamp( TS_UNIX ); - return UUID::getComparisonUUID( $timestamp + 1 ); } } diff --git a/maintenance/dumpBackup.php b/maintenance/dumpBackup.php new file mode 100644 index 0000000..ecd61fb --- /dev/null +++ b/maintenance/dumpBackup.php @@ -0,0 +1,112 @@ +<?php + +use Flow\Container; +use Flow\Dump\Exporter; +use Flow\Model\UUID; + +$originalDir = getcwd(); + +$optionsWithArgs = array( 'pagelist', 'start', 'end', 'revstart', 'revend' ); + +$maintPath = ( getenv( 'MW_INSTALL_PATH' ) !== false + ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance' + : dirname( __FILE__ ) . '/../../../maintenance' ); +require_once $maintPath . '/commandLine.inc'; +require_once $maintPath . '/backup.inc'; + +class FlowBackupDumper extends BackupDumper { + function dump( $history, $text = Exporter::TEXT ) { + # Notice messages will foul up your XML output even if they're + # relatively harmless. 
+ if ( ini_get( 'display_errors' ) ) { + ini_set( 'display_errors', 'stderr' ); + } + + $db = Container::get( 'db.factory' )->getDB( DB_SLAVE ); + $exporter = new Exporter( $db, $history, Exporter::STREAM, Exporter::TEXT ); + $wrapper = new DumpOutput( $this->sink, $this ); + $exporter->setOutputSink( $wrapper ); + + if ( !$this->skipHeader ) { + $exporter->openStream(); + } + + $workflowIterator = $exporter->getWorkflowIterator( $this->pages, $this->startId, $this->endId ); + + $revStartId = $this->revStartId ? UUID::create( $this->revStartId ) : null; + $revEndId = $this->revEndId ? UUID::create( $this->revEndId ) : null; + $exporter->dump( $workflowIterator, $revStartId, $revEndId ); + + if ( !$this->skipFooter ) { + $exporter->closeStream(); + } + + $this->report( true ); + } +} + +$dumper = new FlowBackupDumper( $argv ); + +if ( isset( $options['pagelist'] ) ) { + $olddir = getcwd(); + chdir( $originalDir ); + $pages = file( $options['pagelist'] ); + chdir( $olddir ); + if ( $pages === false ) { + echo "Unable to open file {$options['pagelist']}\n"; + die( 1 ); + } + $pages = array_map( 'trim', $pages ); + $dumper->pages = array_filter( $pages, create_function( '$x', 'return $x !== "";' ) ); +} + +if ( isset( $options['start'] ) ) { + $dumper->startId = intval( $options['start'] ); +} +if ( isset( $options['end'] ) ) { + $dumper->endId = intval( $options['end'] ); +} + +if ( isset( $options['revstart'] ) ) { + $dumper->revStartId = intval( $options['revstart'] ); +} +if ( isset( $options['revend'] ) ) { + $dumper->revEndId = intval( $options['revend'] ); +} +$dumper->skipHeader = isset( $options['skip-header'] ); +$dumper->skipFooter = isset( $options['skip-footer'] ); + +if ( isset( $options['full'] ) ) { + $dumper->dump( WikiExporter::FULL ); +} elseif ( isset( $options['current'] ) ) { + $dumper->dump( WikiExporter::CURRENT ); +} else { + $dumper->progress( <<<ENDS +This script dumps the Flow discussion database into an +XML interchange wrapper format for 
export. + +It can either import only the current revision, or full history. + +Although the --full will export all public revisions, non-public revisions +are removed, and the remaining revisions are renormalized to accomodate this. +It is recommended that you keep database backups as well. + +XML output is sent to stdout; progress reports are sent to stderr. + +Usage: php dumpBackup.php <action> [<options>] +Actions: + --full Dump all revisions of every description/post/summary. + --current Dump only the latest revision of every description/post/summary. + --pagelist=<file> + Where <file> is a list of page titles to be dumped +Options: + --start=n Start from page_id or log_id n + --end=n Stop before page_id or log_id n (exclusive) + --revstart=n Start from rev_id n + --revend=n Stop before rev_id n (exclusive) + --skip-header Don't output the <mediawiki> header + --skip-footer Don't output the </mediawiki> footer + +ENDS + ); +} -- To view, visit https://gerrit.wikimedia.org/r/242569 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I52bc7c0ce7813a78f9006ca4b7d931a905726c05 Gerrit-PatchSet: 22 Gerrit-Project: mediawiki/extensions/Flow Gerrit-Branch: master Gerrit-Owner: Matthias Mullie <mmul...@wikimedia.org> Gerrit-Reviewer: Mattflaschen <mflasc...@wikimedia.org> Gerrit-Reviewer: Matthias Mullie <mmul...@wikimedia.org> Gerrit-Reviewer: Sbisson <sbis...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits