jenkins-bot has submitted this change and it was merged.

Change subject: Script to remove topics before a certain date
......................................................................


Script to remove topics before a certain date

Meanwhile also fixed up PostRevisionTopicHistoryIndex, where
the "post has been removed" case is not properly handled: it
attempts to find the root, based on data that may already have
been removed.

Bug: T119509
Change-Id: I593aac084939ef7317ac91ad932da2c23d463ad7
---
M includes/Data/Index/PostRevisionTopicHistoryIndex.php
M includes/Repository/TreeRepository.php
A maintenance/FlowRemoveOldTopics.php
3 files changed, 319 insertions(+), 3 deletions(-)

Approvals:
  Catrope: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/Data/Index/PostRevisionTopicHistoryIndex.php b/includes/Data/Index/PostRevisionTopicHistoryIndex.php
index df16ff7..8ba6e1d 100644
--- a/includes/Data/Index/PostRevisionTopicHistoryIndex.php
+++ b/includes/Data/Index/PostRevisionTopicHistoryIndex.php
@@ -2,11 +2,12 @@
 
 namespace Flow\Data\Index;
 
+use Flow\Collection\PostCollection;
 use Flow\Data\BufferedCache;
 use Flow\Data\ObjectMapper;
 use Flow\Data\Storage\PostRevisionTopicHistoryStorage;
+use Flow\Exception\DataModelException;
 use Flow\Model\PostRevision;
-use Flow\Model\PostSummary;
 use Flow\Model\UUID;
 use MWException;
 
@@ -66,9 +67,25 @@
         *
         * @param PostRevision $post
         * @return UUID Topic ID
+        * @throws DataModelException
         */
        protected function findTopicId( PostRevision $post ) {
-               return $post->getRootPost()->getPostId();
+               try {
+                       $root = $post->getCollection()->getRoot();
+               } catch ( DataModelException $e ) {
+                       // in some cases, we may fail to find root post from the current
+                       // object (e.g. data has already been removed)
+                       // try to find it via parent, in that case
+                       $parentId = $post->getReplyToId();
+                       if ( $parentId === null ) {
+                               throw new DataModelException( 'Unable to locate root for post ' . $post->getCollectionId() );
+                       }
+
+                       $parent = PostCollection::newFromId( $parentId );
+                       $root = $parent->getRoot();
+               }
+
+               return $root->getId();
        }
 
        protected function backingStoreFindMulti( array $queries ) {
diff --git a/includes/Repository/TreeRepository.php b/includes/Repository/TreeRepository.php
index 8d43cda..925eaf8 100644
--- a/includes/Repository/TreeRepository.php
+++ b/includes/Repository/TreeRepository.php
@@ -202,6 +202,35 @@
                }
        }
 
+       /**
+        * Deletes a descendant from the tree repo.
+        *
+        * @param UUID $descendant
+        * @return bool
+        */
+       public function delete( UUID $descendant ) {
+               $dbw = $this->dbFactory->getDB( DB_MASTER );
+               $res = $dbw->delete(
+                       $this->tableName,
+                       array(
+                               'tree_descendant_id' => $descendant->getBinary(),
+                       ),
+                       __METHOD__
+               );
+
+               if ( $res ) {
+                       $subtreeKey = $this->cacheKey( 'subtree', $descendant );
+                       $parentKey = $this->cacheKey( 'parent', $descendant );
+                       $pathKey = $this->cacheKey( 'rootpath', $descendant );
+
+                       $this->cache->delete( $subtreeKey );
+                       $this->cache->delete( $parentKey );
+                       $this->cache->delete( $pathKey );
+               }
+
+               return $res;
+       }
+
        public function findParent( UUID $descendant ) {
                $map = $this->fetchParentMap( array( $descendant ) );
                return isset( $map[$descendant->getAlphadecimal()] ) ? $map[$descendant->getAlphadecimal()] : null;
@@ -376,7 +405,7 @@
                        throw new DataModelException( 'No root exists in the identityMap', 'process-data' );
                }
 
-               return $identityMap[$root];
+               return $identityMap[$root->getAlphadecimal()];
        }
 
        public function fetchFullTree( UUID $nodeId ) {
diff --git a/maintenance/FlowRemoveOldTopics.php b/maintenance/FlowRemoveOldTopics.php
new file mode 100644
index 0000000..a2371f0
--- /dev/null
+++ b/maintenance/FlowRemoveOldTopics.php
@@ -0,0 +1,270 @@
+<?php
+
+use Flow\Container;
+use Flow\Data\ManagerGroup;
+use Flow\Data\Utils\RawSql;
+use Flow\DbFactory;
+use Flow\Model\AbstractRevision;
+use Flow\Model\Header;
+use Flow\Model\PostRevision;
+use Flow\Model\UUID;
+use Flow\Model\Workflow;
+use Flow\Repository\TreeRepository;
+
+require_once ( getenv( 'MW_INSTALL_PATH' ) !== false
+       ? getenv( 'MW_INSTALL_PATH' ) . '/maintenance/Maintenance.php'
+       : dirname( __FILE__ ) . '/../../../maintenance/Maintenance.php' );
+
+/**
+ * @ingroup Maintenance
+ */
+class FlowRemoveOldTopics extends Maintenance {
+       /**
+        * @var ManagerGroup
+        */
+       protected $storage;
+
+       /**
+        * @var TreeRepository
+        */
+       protected $treeRepo;
+
+       /**
+        * @var DbFactory
+        */
+       protected $dbFactory;
+
+       public function __construct() {
+               parent::__construct();
+
+               $this->mDescription = "Deletes old topics";
+
+               $this->addOption( 'date', 'Date cutoff (in any format understood by wfTimestamp), topics older than this date will be deleted.', true, true );
+
+               $this->setBatchSize( 10 );
+       }
+
+       public function execute() {
+               $this->storage = Container::get( 'storage' );
+               $this->treeRepo = Container::get( 'repository.tree' );
+               $this->dbFactory = Container::get( 'db.factory' );
+
+               $timestamp = wfTimestamp( TS_MW, $this->getOption( 'date' ) );
+
+               $this->removeHeader( $timestamp );
+               $this->removeWorkflows( $timestamp );
+       }
+
+       protected function removeHeader( $timestamp ) {
+               $dbr = $this->dbFactory->getDB( DB_SLAVE );
+
+               // we don't store a timestamp with revisions - the id also holds date
+               // info, so that's what we should compare against
+               $endId = UUID::getComparisonUUID( $timestamp );
+
+               // start from around unix epoch - there can be no Flow data before that
+               $startId = UUID::getComparisonUUID( '1' );
+               do {
+                       /** @var Header[] $revisions */
+                       $revisions = $this->storage->find(
+                               'Header',
+                               array(
+                                       'rev_user_wiki' => wfWikiId(),
+                                       'rev_type' => 'header',
+                                       new RawSql( 'rev_id > ' . $dbr->addQuotes( $startId->getBinary() ) ),
+                                       new RawSql( 'rev_id < ' . $dbr->addQuotes( $endId->getBinary() ) ),
+                                       // only fetch original post at this point: we still need to
+                                       // narrow down the results
+                                       'rev_parent_id' => null,
+                               ),
+                               array( 'limit' => $this->mBatchSize )
+                       );
+
+                       if ( empty( $revisions ) ) {
+                               break;
+                       }
+
+                       // prepare for next batch, which will start at this
+                       /** @var UUID $startId */
+                       $startId = end( $revisions )->getRevisionId();
+
+                       // we've now found all first revisions prior to a certain date, but we
+                       // don't want to remove those that have revisions after that date cutoff
+                       // (we don't want to break history)
+                       // let's see if any has revisions more recent than timestamp
+                       $conds = array();
+                       $uuids = array();
+                       foreach ( $revisions as $revision ) {
+                               // keep track of UUIDs we may want to delete
+                               $uuids[$revision->getCollectionId()->getAlphadecimal()] = $revision->getCollectionId();
+
+                               $conds[] = array(
+                                       'rev_user_wiki' => wfWikiId(),
+                                       'rev_type' => 'header',
+                                       new RawSql( 'rev_id >= ' . $dbr->addQuotes( $endId->getBinary() ) ),
+                                       'rev_type_id' => $revision->getCollectionId()->getBinary(),
+                               );
+                       }
+
+                       /** @var Header[] $recent */
+                       $recent = $this->storage->findMulti( 'Header', $conds, array( 'limit' => 1 ) );
+
+                       // now exclude collection ids where there's a revision that is more
+                       // recent than the timestamp cutoff
+                       foreach ( $recent as $revisions ) {
+                               foreach ( $revisions as $revision ) {
+                                       unset( $uuids[$revision->getCollectionId()->getAlphadecimal()] );
+                               }
+                       }
+
+                       // by now, there may be nothing left to remove, so move on to the
+                       // next batch...
+                       if ( empty( $uuids ) ) {
+                               continue;
+                       }
+
+                       $revisions = $this->storage->find(
+                               'Header',
+                               array(
+                                       'rev_user_wiki' => wfWikiId(),
+                                       'rev_type' => 'header',
+                                       'rev_type_id' => UUID::convertUUIDs( $uuids ),
+                               )
+                       );
+
+                       $this->output( 'Removing ' . count( $revisions ) . ' header revisions from ' . count( $uuids ) . ' headers (up to ' . $startId->getTimestamp() . ")\n" );
+
+                       foreach ( $revisions as $revision ) {
+                               $this->removeReferences( $revision );
+                       }
+
+                       $this->multiRemove( $revisions );
+
+                       $this->dbFactory->waitForSlaves();
+               } while ( !empty( $revisions ) );
+       }
+
+       /**
+        * @param string $timestamp Timestamp in TS_MW format
+        * @throws \Flow\Exception\FlowException
+        */
+       protected function removeWorkflows( $timestamp ) {
+               $dbr = $this->dbFactory->getDB( DB_SLAVE );
+
+               // start from around unix epoch - there can be no Flow data before that
+               $startId = UUID::getComparisonUUID( '1' );
+               do {
+                       $workflows = $this->storage->find(
+                               'Workflow',
+                               array(
+                                       new RawSql( 'workflow_id > ' . $dbr->addQuotes( $startId->getBinary() ) ),
+                                       'workflow_wiki' => wfWikiId(),
+                                       'workflow_type' => 'topic',
+                                       new RawSql( 'workflow_last_update_timestamp < ' . $dbr->addQuotes( $timestamp ) ),
+                               ),
+                               array( 'limit' => $this->mBatchSize )
+                       );
+
+                       if ( empty( $workflows ) ) {
+                               break;
+                       }
+
+                       // prepare for next batch
+                       /** @var UUID $startId */
+                       $startId = end( $workflows )->getId();
+
+                       $this->dbFactory->getDB( DB_MASTER )->begin();
+                       foreach ( $workflows as $workflow ) {
+                               $this->removeSummary( $workflow );
+                               $this->removePosts( $workflow );
+                               $this->removeTopicList( $workflow );
+                       }
+
+                       $this->output( 'Removing ' . count( $workflows ) . ' topic workflows (up to ' . $startId->getTimestamp() . ")\n" );
+                       $this->multiRemove( $workflows );
+                       $this->dbFactory->getDB( DB_MASTER )->commit();
+
+                       $this->dbFactory->waitForSlaves();
+               } while ( !empty( $workflows ) );
+       }
+
+       protected function removeTopicList( Workflow $workflow ) {
+               $entries = $this->storage->find( 'TopicListEntry', array( 'topic_id' => $workflow->getId() ) );
+               if ( $entries ) {
+                       $this->output( 'Removing ' . count( $entries ) . " topiclist entries.\n" );
+                       $this->multiRemove( $entries );
+               }
+       }
+
+       protected function removeSummary( Workflow $workflow ) {
+               $revisions = $this->storage->find( 'PostSummary', array( 'rev_type_id' => $workflow->getId() ) );
+               if ( $revisions ) {
+                       foreach ( $revisions as $revision ) {
+                               $this->removeReferences( $revision );
+                       }
+
+                       $this->output( 'Removing ' . count( $revisions ) . " summary revisions from 1 topic.\n" );
+                       $this->multiRemove( $revisions );
+               }
+       }
+
+       protected function removePosts( Workflow $workflow ) {
+               // fetch all children (posts) from a topic
+               $subtree = $this->treeRepo->fetchSubtreeIdentityMap( $workflow->getId() );
+
+               // reverse-sort all nodes: that way we'll never delete a parent before
+               // having already deleted a child (which will always be more recent)
+               krsort( $subtree );
+
+               $conds = array();
+               foreach ( $subtree as $id => $data ) {
+                       $conds[] = array( 'rev_type_id' => UUID::create( $id ) );
+               }
+
+               $posts = $this->storage->findMulti( 'PostRevision', $conds );
+               $count = 0;
+               foreach ( $posts as $revisions ) {
+                       /** @var PostRevision[] $revisions */
+                       foreach ( $revisions as $revision ) {
+                               $this->removeReferences( $revision );
+                       }
+
+                       $count += count( $revisions );
+                       $this->multiRemove( $revisions );
+
+                       foreach ( $revisions as $revision ) {
+                               $this->treeRepo->delete( $revision->getCollectionId() );
+                       }
+               }
+               $this->output( 'Removing ' . $count . ' post revisions from ' . count( $posts ) . " posts.\n" );
+       }
+
+       protected function removeReferences( AbstractRevision $revision ) {
+               $wikiReferences = $this->storage->find( 'WikiReference', array(
+                       'ref_src_wiki' => wfWikiId(),
+                       'ref_src_object_type' => $revision->getRevisionType(),
+                       'ref_src_object_id' => $revision->getCollectionId(),
+               ) );
+               if ( $wikiReferences ) {
+                       $this->output( 'Removing ' . count( $wikiReferences ) . " wiki references from 1 revision.\n" );
+                       $this->multiRemove( $wikiReferences );
+               }
+
+               $urlReferences = $this->storage->find( 'URLReference', array(
+                       'ref_src_wiki' => wfWikiId(),
+                       'ref_src_object_type' => $revision->getRevisionType(),
+                       'ref_src_object_id' => $revision->getCollectionId(),
+               ) );
+               if ( $urlReferences ) {
+                       $this->output( 'Removing ' . count( $urlReferences ) . " url references from 1 revision.\n" );
+                       $this->multiRemove( $urlReferences );
+               }
+       }
+
+       protected function multiRemove( array $objects ) {
+               $this->storage->multiRemove( $objects );
+       }
+}
+
+$maintClass = 'FlowRemoveOldTopics';
+require_once( RUN_MAINTENANCE_IF_MAIN );

-- 
To view, visit https://gerrit.wikimedia.org/r/256449
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I593aac084939ef7317ac91ad932da2c23d463ad7
Gerrit-PatchSet: 9
Gerrit-Project: mediawiki/extensions/Flow
Gerrit-Branch: master
Gerrit-Owner: Matthias Mullie <[email protected]>
Gerrit-Reviewer: Catrope <[email protected]>
Gerrit-Reviewer: Mattflaschen <[email protected]>
Gerrit-Reviewer: Matthias Mullie <[email protected]>
Gerrit-Reviewer: Sbisson <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to