Mattflaschen has uploaded a new change for review. https://gerrit.wikimedia.org/r/230723
Change subject: WIP: Eliminate some memory leaks affecting LQT->Flow: ...................................................................... WIP: Eliminate some memory leaks affecting LQT->Flow: * LocalBufferedBagOStuff does not clear its in-process cache after commit, so have the importer use BufferedBagOStuff instead. The only downside is that it won't have reads cached in-process at all, but for the importer's usage pattern this downside seems unlikely to come up much. Also rename in container.php for clarity. * Call ManagerGroup->clear(), which clears all the ObjectManager-s, their mappers, and triggers onAfterClear. I suspect that ObjectManager and CachingObjectMapper together may be a major part of the memory leak. However, this requires being very careful to avoid inserting an object that is already in the database. See clearManagerGroup. * Change UserNameBatch to use MapCacheLRU and limit to 250 per wiki. This may not be strictly required for the importer's use case, since UserNameListener has a onAfterClear listener (which is triggered by ObjectManager->clear). However, there are other uses of UserNameBatch, and this adds an extra layer of protection. * Add some notes about other possible improvements that apparently wouldn't impact the importer, but might impact other code. * Some comment updates I noticed along the way. Bug: T106614 Bug: T108601 Change-Id: I988dff81de9e32da1470a89cc7cc5dfc07ef82e8 --- M container.php M includes/Data/BagOStuff/LocalBufferedBagOStuff.php M includes/Formatter/AbstractQuery.php M includes/Import/Importer.php M includes/Model/WikiReference.php M includes/Repository/UserNameBatch.php M includes/Templating.php 7 files changed, 102 insertions(+), 33 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Flow refs/changes/23/230723/1 diff --git a/container.php b/container.php index 548b562..9bdfdfd 100644 --- a/container.php +++ b/container.php @@ -37,7 +37,7 @@ $c['repository.tree'] = function( $c ) { return new Flow\Repository\TreeRepository( $c['db.factory'], - $c['memcache.buffered'] + $c['memcache.local_buffered'] ); }; @@ -132,14 +132,28 @@ use Flow\Model\PostRevision; use Flow\Model\PostSummary; -$c['memcache.buffered'] = function( $c ) { +// This currently never clears $this->bag, which makes it unusuable for long-running batch. +// Use 'memcache.buffered' for those instead. +$c['memcache.local_buffered'] = function( $c ) { global $wgFlowCacheTime; - // This is the real buffered cached that will allow transactional-like cache + // This is the real buffered cached that will allow transactional-like cache. + // It also caches all reads in-memory. $bufferedCache = new Flow\Data\BagOStuff\LocalBufferedBagOStuff( $c['memcache'] ); // This is Flow's wrapper around it, to have a fixed cache expiry time return new BufferedCache( $bufferedCache, $wgFlowCacheTime ); }; + +$c['memcache.buffered'] = function( $c ) { + global $wgFlowCacheTime; + + // This is the real buffered cached that will allow transactional-like cache + $bufferedCache = new Flow\Data\BagOStuff\BufferedBagOStuff( $c['memcache'] ); + + // This is Flow's wrapper around it, to have a fixed cache expiry time + return new BufferedCache( $bufferedCache, $wgFlowCacheTime ); +}; + // Batched username loader $c['repository.username.query'] = function( $c ) { return new Flow\Repository\UserName\TwoStepUserNameQuery( @@ -173,7 +187,7 @@ }; $c['storage.workflow.indexes.primary'] = function( $c ) { return new UniqueFeatureIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.workflow.backend'], 'flow_workflow:v2:pk', $c['storage.workflow.primary_key'] @@ -181,7 +195,7 @@ }; $c['storage.workflow.indexes.title_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.workflow.backend'], 'flow_workflow:title:v2:', array( 'workflow_wiki', 'workflow_namespace', 'workflow_title_text', 'workflow_type' ), @@ -252,7 +266,7 @@ }; $c['storage.board_history.indexes.primary'] = function( $c ) { return new BoardHistoryIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], // backend storage $c['storage.board_history.backend'], // key prefix @@ -338,7 +352,7 @@ }; $c['storage.header.indexes.primary'] = function( $c ) { return new UniqueFeatureIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.header.backend'], 'flow_header:v2:pk', $c['storage.header.primary_key'] @@ -346,7 +360,7 @@ }; $c['storage.header.indexes.topic_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.header.backend'], 'flow_header:workflow', array( 'rev_type_id' ), @@ -416,7 +430,7 @@ }; $c['storage.post_summary.indexes.primary'] = function( $c ) { return new UniqueFeatureIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.post_summary.backend'], 'flow_post_summary:v2:pk', $c['storage.post_summary.primary_key'] @@ -424,7 +438,7 @@ }; $c['storage.post_summary.indexes.topic_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.post_summary.backend'], 'flow_post_summary:workflow', array( 'rev_type_id' ), @@ -482,7 +496,7 @@ // Lookup from topic_id to its owning board id $c['storage.topic_list.indexes.primary'] = function( $c ) { return new UniqueFeatureIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.topic_list.backend'], 'flow_topic_list:topic', array( 'topic_id' ) @@ -493,7 +507,7 @@ /// In reverse order by topic_id $c['storage.topic_list.indexes.reverse_lookup'] = function( $c ) { return new TopicListTopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.topic_list.backend'], 'flow_topic_list:list', array( 'topic_list_id' ), @@ -503,7 +517,7 @@ /// In reverse order by topic last_updated $c['storage.topic_list.indexes.last_updated'] = function( $c ) { return new TopicListTopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.topic_list.indexes.last_updated.backend'], 'flow_topic_list_last_updated:list', array( 'topic_list_id' ), @@ -595,7 +609,7 @@ }; $c['storage.post.indexes.primary'] = function( $c ) { return new UniqueFeatureIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.post.backend'], 'flow_revision:v4:pk', $c['storage.post.primary_key'] @@ -604,7 +618,7 @@ // Each bucket holds a list of revisions in a single post $c['storage.post.indexes.post_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.post.backend'], 'flow_revision:descendant', array( 'rev_type_id' ), @@ -646,7 +660,7 @@ }; $c['storage.topic_history.indexes.primary'] = function( $c ) { return new UniqueFeatureIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.topic_history.backend'], 'flow_revision:v4:pk', $c['storage.topic_history.primary_key'] @@ -654,7 +668,7 @@ }; $c['storage.topic_history.indexes.topic_lookup'] = function( $c ) { return new TopicHistoryIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.topic_history.backend'], $c['repository.tree'], 'flow_revision:topic:v2', @@ -764,7 +778,7 @@ return new Flow\SubmissionHandler( $c['storage'], $c['db.factory'], - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['deferred_queue'] ); }; @@ -1046,7 +1060,7 @@ }; $c['storage.wiki_reference.indexes.source_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.wiki_reference.backend'], 'flow_ref:wiki:by-source', array( @@ -1061,7 +1075,7 @@ }; $c['storage.wiki_reference.indexes.revision_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.wiki_reference.backend'], 'flow_ref:wiki:by-revision:v2', array( @@ -1113,7 +1127,7 @@ $c['storage.url_reference.indexes.revision_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.url_reference.backend'], 'flow_ref:url:by-source', array( @@ -1128,7 +1142,7 @@ }; $c['storage.url_reference.indexes.source_lookup'] = function( $c ) { return new TopKIndex( - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage.url_reference.backend'], 'flow_ref:url:by-revision:v2', array( @@ -1230,7 +1244,7 @@ $c['board_mover'] = function( $c ) { return new Flow\BoardMover( $c['db.factory'], - $c['memcache.buffered'], + $c['memcache.local_buffered'], $c['storage'], $c['occupation_controller']->getTalkpageManager() ); diff --git a/includes/Data/BagOStuff/LocalBufferedBagOStuff.php b/includes/Data/BagOStuff/LocalBufferedBagOStuff.php index 948b67a..22ae4ac 100755 --- a/includes/Data/BagOStuff/LocalBufferedBagOStuff.php +++ b/includes/Data/BagOStuff/LocalBufferedBagOStuff.php @@ -36,6 +36,10 @@ protected function clearLocal() { // contrary to BufferedBagOStuff, don't clear $this->bag + // + // TODO: Reconsider this, or at least bound $this->bag somehow? This + // *might* be acceptable for web requests, but $this->bag growing + // unbounded can easily cause problems for batch. $this->buffer = array(); $this->committed = array(); } diff --git a/includes/Formatter/AbstractQuery.php b/includes/Formatter/AbstractQuery.php index c506ccc..5c0cd36 100644 --- a/includes/Formatter/AbstractQuery.php +++ b/includes/Formatter/AbstractQuery.php @@ -30,6 +30,8 @@ */ protected $treeRepository; + // Consider converting these in-process caches to MapCacheLRU to avoid + // memory leaks. Should only be an issue if a batch is repeatedly doing queries. /** * @var UUID[] Associative array of post ID to root post's UUID object. */ diff --git a/includes/Import/Importer.php b/includes/Import/Importer.php index 9b13ef9..0f09951 100644 --- a/includes/Import/Importer.php +++ b/includes/Import/Importer.php @@ -282,12 +282,29 @@ /** * Gets the given object from storage * + * WARNING: Before calling this method, ensure that you follow the rule + * given in clearManagerGroup. + * * @param string $type Class name to retrieve * @param UUID $id ID of the object to retrieve * @return Object|false */ public function get( $type, UUID $id ) { return $this->storage->get( $type, $id ); + } + + /** + * Clears information about which objects are loaded, to avoid memory leaks. + * This will also: + * * Clear the mapper associated with each ObjectManager that has been used. + * * Trigger onAfterClear on any listeners. + * + * WARNING: You can *NOT* call ->get before calling clearManagerGroup, then ->put + * after calling clearManagerGroup, on the same object. This will cause a + * duplicate object to be inserted. + */ + public function clearManagerGroup() { + $this->storage->clear(); } /** @@ -589,6 +606,8 @@ $this->importTopic( $topicState, $topic ); $state->commit(); $state->postprocessor->afterTopicImported( $topicState, $topic ); + $state->clearManagerGroup(); + $imported++; } catch ( ImportSourceStoreException $e ) { // errors from the source store are more serious and shuld diff --git a/includes/Model/WikiReference.php b/includes/Model/WikiReference.php index 43c4f08..0fdae05 100644 --- a/includes/Model/WikiReference.php +++ b/includes/Model/WikiReference.php @@ -72,8 +72,14 @@ } /** - * Many loaded references typically point to the same Title, cache those instead - * of generating a bunch of duplicate title classes. + * Gets a title give a namespace number and title text + * + * Many loaded references typically point to the same Title, so we cache those + * instead of generating a bunch of duplicate title classes. + * + * @param int $namespace Namespace number + * @param string $title Title text + * @return Title */ public static function makeTitle( $namespace, $title ) { try { diff --git a/includes/Repository/UserNameBatch.php b/includes/Repository/UserNameBatch.php index e3a86ee..bcd1e4d 100644 --- a/includes/Repository/UserNameBatch.php +++ b/includes/Repository/UserNameBatch.php @@ -12,6 +12,9 @@ * Batch together queries for a bunch of wiki+userid -> username */ class UserNameBatch { + // Maximum number of usernames to cache for each wiki + const USERNAMES_PER_WIKI = 250; + /** * @var UserName\UserNameQuery */ @@ -23,7 +26,8 @@ protected $queued = array(); /** - * @var array[] 2-d map from wiki id and user id to display username or false + * @var array Map from wiki id to MapCacheLRU. MapCacheLRU is a map of user ID (as + * string, though to PHP it's the same anyway...) to username. */ protected $usernames = array(); @@ -35,6 +39,17 @@ $this->query = $query; foreach ( $queued as $wiki => $userIds ) { $this->queued[$wiki] = array_map( 'intval', $userIds ); + } + } + + /* + * Make sure the LRU for the given wiki is in place. + * + * @param string $wiki Wiki identifier + */ + protected function ensureLRU( $wiki ) { + if ( !isset( $this->usernames[$wiki] ) ) { + $this->usernames[$wiki] = new MapCacheLRU( self::USERNAMES_PER_WIKI ); } } @@ -50,9 +65,11 @@ */ public function add( $wiki, $userId, $userName = null ) { $userId = (int)$userId; + + $this->ensureLRU( $wiki ); if ( $userName !== null ) { - $this->usernames[$wiki][$userId] = $userName; - } elseif ( !isset( $this->usernames[$wiki][$userId] ) ) { + $this->usernames[$wiki]->set( (string) $userId, $userName ); + } elseif ( !$this->usernames[$wiki]->has( (string) $userId ) ) { $this->queued[$wiki][] = $userId; } } @@ -79,11 +96,13 @@ if ( $userId === 0 ) { return $userIp; } - if ( !isset( $this->usernames[$wiki][$userId] ) ) { + + $this->ensureLRU( $wiki ); + if ( !$this->usernames[$wiki]->has( (string) $userId ) ) { $this->queued[$wiki][] = $userId; $this->resolve( $wiki ); } - return $this->usernames[$wiki][$userId]; + return $this->usernames[$wiki]->get( (string) $userId ); } /** @@ -105,15 +124,19 @@ } $queued = array_unique( $this->queued[$wiki] ); if ( isset( $this->usernames[$wiki] ) ) { - $queued = array_diff( $queued, array_keys( $this->usernames[$wiki] ) ); + $queued = array_diff( $queued, $this->usernames[$wiki]->getAllKeys() ); + } else { + $this->ensureLRU( $wiki ); } + $res = $this->query->execute( $wiki, $queued ); unset( $this->queued[$wiki] ); if ( $res ) { $usernames = array(); foreach ( $res as $row ) { $id = (int)$row->user_id; - $this->usernames[$wiki][$id] = $usernames[$id] = $row->user_name; + $usernames[$id] = $row->user_name; + $this->usernames[$wiki]->set( (string) $id, $row->user_name ); } $this->resolveUserPages( $wiki, $usernames ); $missing = array_diff( $queued, array_keys( $usernames ) ); @@ -121,7 +144,7 @@ $missing = $queued; } foreach ( $missing as $id ) { - $this->usernames[$wiki][$id] = false; + $this->usernames[$wiki]->set( (string) $id, false ); } } diff --git a/includes/Templating.php b/includes/Templating.php index e3351da..2b71ae6 100644 --- a/includes/Templating.php +++ b/includes/Templating.php @@ -92,6 +92,7 @@ throw new PermissionException( 'Insufficient permissions to see userlinks for rev_id = ' . $revision->getRevisionId()->getAlphadecimal() ); } + // Convert to use MapCacheLRU? // if this specific revision is moderated, its usertext can always be // displayed, since it will be the moderator user static $cache; -- To view, visit https://gerrit.wikimedia.org/r/230723 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I988dff81de9e32da1470a89cc7cc5dfc07ef82e8 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Flow Gerrit-Branch: master Gerrit-Owner: Mattflaschen <mflasc...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits