jenkins-bot has submitted this change and it was merged.
Change subject: Script to remove topics before a certain date
......................................................................
Script to remove topics before a certain date
Meanwhile also fixed up PostRevisionTopicHistoryIndex, where
the "post has been removed" case is not properly handled: it
attempts to find the root, based on data that may already have
been removed.
Bug: T119509
Change-Id: I593aac084939ef7317ac91ad932da2c23d463ad7
---
M includes/Data/Index/PostRevisionTopicHistoryIndex.php
M includes/Repository/TreeRepository.php
A maintenance/FlowRemoveOldTopics.php
3 files changed, 319 insertions(+), 3 deletions(-)
Approvals:
Catrope: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/Data/Index/PostRevisionTopicHistoryIndex.php b/includes/Data/Index/PostRevisionTopicHistoryIndex.php
index df16ff7..8ba6e1d 100644
--- a/includes/Data/Index/PostRevisionTopicHistoryIndex.php
+++ b/includes/Data/Index/PostRevisionTopicHistoryIndex.php
@@ -2,11 +2,12 @@
namespace Flow\Data\Index;
+use Flow\Collection\PostCollection;
use Flow\Data\BufferedCache;
use Flow\Data\ObjectMapper;
use Flow\Data\Storage\PostRevisionTopicHistoryStorage;
+use Flow\Exception\DataModelException;
use Flow\Model\PostRevision;
-use Flow\Model\PostSummary;
use Flow\Model\UUID;
use MWException;
@@ -66,9 +67,25 @@
*
* @param PostRevision $post
* @return UUID Topic ID
+ * @throws DataModelException
*/
protected function findTopicId( PostRevision $post ) {
- return $post->getRootPost()->getPostId();
+ try {
+ $root = $post->getCollection()->getRoot();
+ } catch ( DataModelException $e ) {
+ // in some cases, we may fail to find root post from the current
+ // object (e.g. data has already been removed)
+ // try to find it via parent, in that case
+ $parentId = $post->getReplyToId();
+ if ( $parentId === null ) {
+ throw new DataModelException( 'Unable to locate
root for post ' . $post->getCollectionId() );
+ }
+
+ $parent = PostCollection::newFromId( $parentId );
+ $root = $parent->getRoot();
+ }
+
+ return $root->getId();
}
protected function backingStoreFindMulti( array $queries ) {
diff --git a/includes/Repository/TreeRepository.php b/includes/Repository/TreeRepository.php
index 8d43cda..925eaf8 100644
--- a/includes/Repository/TreeRepository.php
+++ b/includes/Repository/TreeRepository.php
@@ -202,6 +202,35 @@
}
}
+	/**
+	 * Deletes the rows whose tree_descendant_id matches the given node and,
+	 * when the delete reports success, drops that node's cached subtree,
+	 * parent and root path entries.
+	 *
+	 * @param UUID $descendant Node to remove from the tree repo
+	 * @return bool Result of the database delete
+	 */
+	public function delete( UUID $descendant ) {
+		$conditions = array( 'tree_descendant_id' => $descendant->getBinary() );
+		$deleted = $this->dbFactory->getDB( DB_MASTER )
+			->delete( $this->tableName, $conditions, __METHOD__ );
+
+		if ( !$deleted ) {
+			return $deleted;
+		}
+
+		// the cache is only purged once the delete went through
+		foreach ( array( 'subtree', 'parent', 'rootpath' ) as $type ) {
+			$this->cache->delete( $this->cacheKey( $type, $descendant ) );
+		}
+
+		return $deleted;
+	}
+
public function findParent( UUID $descendant ) {
$map = $this->fetchParentMap( array( $descendant ) );
return isset( $map[$descendant->getAlphadecimal()] ) ?
$map[$descendant->getAlphadecimal()] : null;
@@ -376,7 +405,7 @@
throw new DataModelException( 'No root exists in the
identityMap', 'process-data' );
}
- return $identityMap[$root];
+ return $identityMap[$root->getAlphadecimal()];
}
public function fetchFullTree( UUID $nodeId ) {
diff --git a/maintenance/FlowRemoveOldTopics.php b/maintenance/FlowRemoveOldTopics.php
new file mode 100644
index 0000000..a2371f0
--- /dev/null
+++ b/maintenance/FlowRemoveOldTopics.php
@@ -0,0 +1,270 @@
+<?php
+
+use Flow\Container;
+use Flow\Data\ManagerGroup;
+use Flow\Data\Utils\RawSql;
+use Flow\DbFactory;
+use Flow\Model\AbstractRevision;
+use Flow\Model\Header;
+use Flow\Model\PostRevision;
+use Flow\Model\UUID;
+use Flow\Model\Workflow;
+use Flow\Repository\TreeRepository;
+
+// Maintenance.php is loaded from MW_INSTALL_PATH when that environment
+// variable is set, otherwise from three directories above this script
+require_once ( getenv( 'MW_INSTALL_PATH' ) !== false
+	? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php'
+	: __DIR__ . '/../../../maintenance/Maintenance.php' );
+
+/**
+ * @ingroup Maintenance
+ */
+class FlowRemoveOldTopics extends Maintenance {
+ /**
+ * @var ManagerGroup
+ */
+ protected $storage;
+
+ /**
+ * @var TreeRepository
+ */
+ protected $treeRepo;
+
+ /**
+ * @var DbFactory
+ */
+ protected $dbFactory;
+
+	/**
+	 * Registers the script description, the --date option and the batch size.
+	 */
+	public function __construct() {
+		parent::__construct();
+
+		$this->mDescription = "Deletes old topics";
+
+		// --date is required and takes a value (3rd and 4th argument)
+		$this->addOption(
+			'date',
+			'Date cutoff (in any format understood by wfTimestamp), topics older than this date will be deleted.',
+			true,
+			true
+		);
+
+		$this->setBatchSize( 10 );
+	}
+
+	/**
+	 * Converts the --date option to a TS_MW timestamp, resolves the Flow
+	 * services from the container and removes headers and topic workflows
+	 * older than that timestamp.
+	 */
+	public function execute() {
+		// validate the cutoff before anything else: wfTimestamp() returns
+		// false for input it cannot parse, and this script deletes data
+		$timestamp = wfTimestamp( TS_MW, $this->getOption( 'date' ) );
+		if ( $timestamp === false ) {
+			$this->error( 'Invalid date: ' . $this->getOption( 'date' ), 1 );
+		}
+
+		$this->storage = Container::get( 'storage' );
+		$this->treeRepo = Container::get( 'repository.tree' );
+		$this->dbFactory = Container::get( 'db.factory' );
+
+		$this->removeHeader( $timestamp );
+		$this->removeWorkflows( $timestamp );
+	}
+
+	/**
+	 * Removes headers whose entire revision history predates the cutoff.
+	 *
+	 * Works in batches of first revisions (those without a parent) whose id
+	 * is older than the cutoff. Headers that also have a revision at or after
+	 * the cutoff are left alone, so no history is broken.
+	 *
+	 * @param string $timestamp Timestamp in TS_MW format
+	 */
+	protected function removeHeader( $timestamp ) {
+		$dbr = $this->dbFactory->getDB( DB_SLAVE );
+
+		// we don't store a timestamp with revisions - the id also holds date
+		// info, so that's what we should compare against
+		$endId = UUID::getComparisonUUID( $timestamp );
+
+		// start from around unix epoch - there can be no Flow data before that
+		$startId = UUID::getComparisonUUID( '1' );
+		do {
+			/** @var Header[] $firstRevisions */
+			$firstRevisions = $this->storage->find(
+				'Header',
+				array(
+					'rev_user_wiki' => wfWikiId(),
+					'rev_type' => 'header',
+					new RawSql( 'rev_id > ' . $dbr->addQuotes( $startId->getBinary() ) ),
+					new RawSql( 'rev_id < ' . $dbr->addQuotes( $endId->getBinary() ) ),
+					// only fetch original post at this point: we still need to
+					// narrow down the results
+					'rev_parent_id' => null,
+				),
+				array( 'limit' => $this->mBatchSize )
+			);
+
+			if ( empty( $firstRevisions ) ) {
+				break;
+			}
+
+			// prepare for next batch, which will start at this
+			/** @var UUID $startId */
+			$startId = end( $firstRevisions )->getRevisionId();
+
+			// we've now found all first revisions prior to a certain date, but we
+			// don't want to remove those that have revisions after that date cutoff
+			// (we don't want to break history)
+			// let's see if any has revisions more recent than timestamp
+			$conds = array();
+			$uuids = array();
+			foreach ( $firstRevisions as $revision ) {
+				// keep track of UUIDs we may want to delete
+				$uuids[$revision->getCollectionId()->getAlphadecimal()] = $revision->getCollectionId();
+
+				$conds[] = array(
+					'rev_user_wiki' => wfWikiId(),
+					'rev_type' => 'header',
+					new RawSql( 'rev_id >= ' . $dbr->addQuotes( $endId->getBinary() ) ),
+					'rev_type_id' => $revision->getCollectionId()->getBinary(),
+				);
+			}
+
+			/** @var Header[][] $recent */
+			$recent = $this->storage->findMulti( 'Header', $conds, array( 'limit' => 1 ) );
+
+			// now exclude collection ids where there's a revision that is more
+			// recent than the timestamp cutoff
+			// (separate variable names: the batch must not be overwritten here,
+			// the code below and the loop itself still rely on it)
+			foreach ( $recent as $recentRevisions ) {
+				foreach ( $recentRevisions as $recentRevision ) {
+					unset( $uuids[$recentRevision->getCollectionId()->getAlphadecimal()] );
+				}
+			}
+
+			// by now, there may be nothing left to remove, so move on to the
+			// next batch...
+			if ( empty( $uuids ) ) {
+				continue;
+			}
+
+			// fetch every revision of the headers that are left
+			$revisions = $this->storage->find(
+				'Header',
+				array(
+					'rev_user_wiki' => wfWikiId(),
+					'rev_type' => 'header',
+					'rev_type_id' => UUID::convertUUIDs( $uuids ),
+				)
+			);
+
+			$this->output( 'Removing ' . count( $revisions ) . ' header revisions from ' . count( $uuids ) . ' headers (up to ' . $startId->getTimestamp() . ")\n" );
+
+			foreach ( $revisions as $revision ) {
+				$this->removeReferences( $revision );
+			}
+
+			$this->multiRemove( $revisions );
+
+			$this->dbFactory->waitForSlaves();
+			// the loop only ends through the break on an empty batch above
+		} while ( true );
+	}
+
+	/**
+	 * Removes topic workflows that were last updated before the cutoff,
+	 * together with their summaries, posts and topic list entries.
+	 *
+	 * Every batch is removed inside one transaction on the master; if a
+	 * removal throws, that transaction is rolled back and the exception is
+	 * rethrown.
+	 *
+	 * @param string $timestamp Timestamp in TS_MW format
+	 * @throws \Flow\Exception\FlowException
+	 */
+	protected function removeWorkflows( $timestamp ) {
+		$dbr = $this->dbFactory->getDB( DB_SLAVE );
+
+		// start from around unix epoch - there can be no Flow data before that
+		$startId = UUID::getComparisonUUID( '1' );
+		do {
+			$workflows = $this->storage->find(
+				'Workflow',
+				array(
+					new RawSql( 'workflow_id > ' . $dbr->addQuotes( $startId->getBinary() ) ),
+					'workflow_wiki' => wfWikiId(),
+					'workflow_type' => 'topic',
+					new RawSql( 'workflow_last_update_timestamp < ' . $dbr->addQuotes( $timestamp ) ),
+				),
+				array( 'limit' => $this->mBatchSize )
+			);
+
+			if ( empty( $workflows ) ) {
+				break;
+			}
+
+			// prepare for next batch
+			/** @var UUID $startId */
+			$startId = end( $workflows )->getId();
+
+			$dbw = $this->dbFactory->getDB( DB_MASTER );
+			$dbw->begin();
+			try {
+				foreach ( $workflows as $workflow ) {
+					$this->removeSummary( $workflow );
+					$this->removePosts( $workflow );
+					$this->removeTopicList( $workflow );
+				}
+
+				$this->output( 'Removing ' . count( $workflows ) . ' topic workflows (up to ' . $startId->getTimestamp() . ")\n" );
+				$this->multiRemove( $workflows );
+				$dbw->commit();
+			} catch ( Exception $e ) {
+				// don't leave a partially removed batch in an open transaction
+				$dbw->rollback();
+				throw $e;
+			}
+
+			$this->dbFactory->waitForSlaves();
+		} while ( !empty( $workflows ) );
+	}
+
+	/**
+	 * Removes the TopicListEntry objects whose topic_id is the given workflow.
+	 *
+	 * @param Workflow $workflow
+	 */
+	protected function removeTopicList( Workflow $workflow ) {
+		$entries = $this->storage->find( 'TopicListEntry', array( 'topic_id' => $workflow->getId() ) );
+		if ( $entries ) {
+			$this->output( 'Removing ' . count( $entries ) . " topiclist entries.\n" );
+			$this->multiRemove( $entries );
+		}
+	}
+
+	/**
+	 * Removes all PostSummary revisions of the given workflow, after removing
+	 * the references recorded for them.
+	 *
+	 * @param Workflow $workflow
+	 */
+	protected function removeSummary( Workflow $workflow ) {
+		$revisions = $this->storage->find( 'PostSummary', array( 'rev_type_id' => $workflow->getId() ) );
+		if ( $revisions ) {
+			foreach ( $revisions as $revision ) {
+				$this->removeReferences( $revision );
+			}
+
+			$this->output( 'Removing ' . count( $revisions ) . " summary revisions from 1 topic.\n" );
+			$this->multiRemove( $revisions );
+		}
+	}
+
+	/**
+	 * Removes every post revision in the given topic's subtree, the references
+	 * recorded for them, and the matching tree repository entries.
+	 *
+	 * @param Workflow $workflow
+	 */
+	protected function removePosts( Workflow $workflow ) {
+		// fetch all children (posts) from a topic
+		$subtree = $this->treeRepo->fetchSubtreeIdentityMap( $workflow->getId() );
+
+		// reverse-sort all nodes: that way we'll never delete a parent before
+		// having already deleted a child (which will always be more recent)
+		krsort( $subtree );
+
+		$conds = array();
+		foreach ( $subtree as $id => $data ) {
+			$conds[] = array( 'rev_type_id' => UUID::create( $id ) );
+		}
+
+		$posts = $this->storage->findMulti( 'PostRevision', $conds );
+		$count = 0;
+		foreach ( $posts as $revisions ) {
+			/** @var PostRevision[] $revisions */
+			foreach ( $revisions as $revision ) {
+				$this->removeReferences( $revision );
+			}
+
+			$count += count( $revisions );
+			$this->multiRemove( $revisions );
+
+			// each group was looked up by a single rev_type_id, so all of its
+			// revisions belong to the same post: one tree delete per post
+			$first = reset( $revisions );
+			if ( $first ) {
+				$this->treeRepo->delete( $first->getCollectionId() );
+			}
+		}
+		$this->output( 'Removing ' . $count . ' post revisions from ' . count( $posts ) . " posts.\n" );
+	}
+
+	/**
+	 * Removes the WikiReference and URLReference objects stored for this wiki
+	 * that match the revision's type and collection id.
+	 *
+	 * @param AbstractRevision $revision
+	 */
+	protected function removeReferences( AbstractRevision $revision ) {
+		// storage class => tail of the progress message
+		$referenceTypes = array(
+			'WikiReference' => " wiki references from 1 revision.\n",
+			'URLReference' => " url references from 1 revision.\n",
+		);
+
+		foreach ( $referenceTypes as $class => $messageTail ) {
+			$found = $this->storage->find( $class, array(
+				'ref_src_wiki' => wfWikiId(),
+				'ref_src_object_type' => $revision->getRevisionType(),
+				'ref_src_object_id' => $revision->getCollectionId(),
+			) );
+			if ( !$found ) {
+				continue;
+			}
+
+			$this->output( 'Removing ' . count( $found ) . $messageTail );
+			$this->multiRemove( $found );
+		}
+	}
+
+	/**
+	 * Passes the given objects to the storage ManagerGroup for removal.
+	 *
+	 * @param array $objects
+	 */
+	protected function multiRemove( array $objects ) {
+		$storage = $this->storage;
+		$storage->multiRemove( $objects );
+	}
+}
+
+// name of the maintenance class defined in this file
+$maintClass = 'FlowRemoveOldTopics';
+require_once RUN_MAINTENANCE_IF_MAIN;
--
To view, visit https://gerrit.wikimedia.org/r/256449
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I593aac084939ef7317ac91ad932da2c23d463ad7
Gerrit-PatchSet: 9
Gerrit-Project: mediawiki/extensions/Flow
Gerrit-Branch: master
Gerrit-Owner: Matthias Mullie <[email protected]>
Gerrit-Reviewer: Catrope <[email protected]>
Gerrit-Reviewer: Mattflaschen <[email protected]>
Gerrit-Reviewer: Matthias Mullie <[email protected]>
Gerrit-Reviewer: Sbisson <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits