Milimetric has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/346915 )
Change subject: Simplify and fix EventLogging instrumentation
......................................................................

Simplify and fix EventLogging instrumentation

The ExternalLinksChange schema has instrumentation in this extension.
The former instrumentation tried to optimize for speed and never worked
properly. I simplified the implementation in the hope of starting a
review process. I am very new to MediaWiki development, and so far I
have only used my intuition and the documentation on Manual:Hooks.

Bug: T162365
Change-Id: I455b09fe2e77d3f6faccfdd3e2b4e7940344219e
---
M SpamBlacklistHooks.php
M SpamBlacklist_body.php
2 files changed, 67 insertions(+), 68 deletions(-)


git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/SpamBlacklist refs/changes/15/346915/1

diff --git a/SpamBlacklistHooks.php b/SpamBlacklistHooks.php
index eb0d5dd..352c3b2 100644
--- a/SpamBlacklistHooks.php
+++ b/SpamBlacklistHooks.php
@@ -189,7 +189,7 @@
 	) {
 		if ( $revision ) {
 			BaseBlacklist::getInstance( 'spam' )
-				->doLogging( $user, $wikiPage->getTitle(), $revision->getId() );
+				->doLogging( $user, $wikiPage->getTitle(), $revision->getId(), $content );
 		}
 
 		if ( !BaseBlacklist::isLocalSource( $wikiPage->getTitle() ) ) {
@@ -266,10 +266,10 @@
 			return;
 		}
 
-		// Log the changes, but we only commit them once the deletion has happened.
-		// We do that since the external links table could get cleared before the
-		// ArticleDeleteComplete hook runs
-		$spam->logUrlChanges( $spam->getCurrentLinks( $article->getTitle() ), [], [] );
+		// The external links table could get cleared before the deletion is complete,
+		// so stash the old links here and log them in onArticleDeleteComplete.
+		// NOTE: I would've done that in this class but I'm still unsure about style
+		$spam->stashOldLinks( $article->getTitle() );
 	}
 
 	/**
@@ -285,6 +285,9 @@
 	) {
 		/** @var SpamBlacklist $spam */
 		$spam = BaseBlacklist::getInstance( 'spam' );
-		$spam->doLogging( $user, $page->getTitle(), $page->getLatest() );
+		// Passing null as content means the new version has no links; otherwise,
+		// since the page is just archived (content unchanged), no change would be logged.
+		// NOTE: restoring the page will not log that new links are added
+		$spam->doLogging( $user, $page->getTitle(), $page->getLatest(), null );
 	}
 }
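For reviewers following the hook changes above: the delete path is now split into a stash step (ArticleDelete) and a log step (ArticleDeleteComplete), because the externallinks rows for the page are gone by the time the second hook fires. Here is a minimal sketch of that ordering, not part of the patch; only the two method names come from the diff, the class and bodies are simplified stand-ins.

<?php
// Sketch only: why the links are captured before deletion and consumed after.
class StashThenLogSketch {
	/** @var string[] snapshot taken while the externallinks rows still exist */
	private $stashedOldLinks = array();

	// ArticleDelete: the rows are still in the table, so snapshot them now.
	public function stashOldLinks( $title ) {
		$this->stashedOldLinks = $this->getCurrentLinks( $title );
	}

	// ArticleDeleteComplete: the rows are gone; the stash is the only record left.
	// On delete $content is null, so every stashed link diffs out as a 'remove'.
	public function doLogging( $user, $title, $revId, $content ) {
		$oldLinks = $this->stashedOldLinks;
		$this->stashedOldLinks = array(); // clear, so a later save re-queries the table
		// ... diff $oldLinks against the (now empty) new links and emit events
	}

	private function getCurrentLinks( $title ) {
		return array( 'http://example.org/' ); // stand-in for the externallinks query
	}
}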
diff --git a/SpamBlacklist_body.php b/SpamBlacklist_body.php
index 5dc807d..65ab5bd 100644
--- a/SpamBlacklist_body.php
+++ b/SpamBlacklist_body.php
@@ -11,10 +11,10 @@
 	const STASH_AGE_DYING = 150;
 
 	/**
-	 * Changes to external links, for logging purposes
-	 * @var array[]
-	 */
-	private $urlChangeLog = array();
+	 * Old links can be purged before logging, so stash them here and clear after use
+	 * @var array[]
+	 */
+	private $stashedOldLinks = array();
 
 	/**
 	 * Returns the code for the blacklist implementation
@@ -55,12 +55,6 @@
 		$statsd = MediaWikiServices::getInstance()->getStatsdDataFactory();
 		$cache = ObjectCache::getLocalClusterInstance();
 
-		// If there are no new links, and we are logging,
-		// mark all of the current links as being removed.
-		if ( !$links && $this->isLoggingEnabled() ) {
-			$this->logUrlChanges( $this->getCurrentLinks( $title ), [], [] );
-		}
-
 		if ( !$links ) {
 			return false;
 		}
@@ -95,25 +89,7 @@
 		if ( count( $blacklists ) ) {
 			// poor man's anti-spoof, see bug 12896
 			$newLinks = array_map( array( $this, 'antiSpoof' ), $links );
-
-			$oldLinks = array();
-			if ( $title !== null ) {
-				$oldLinks = $this->getCurrentLinks( $title );
-				$addedLinks = array_diff( $newLinks, $oldLinks );
-			} else {
-				// can't load old links, so treat all links as added.
-				$addedLinks = $newLinks;
-			}
-
-			wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', ', $oldLinks ) );
 			wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', ', $newLinks ) );
-			wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( ', ', $addedLinks ) );
-
-			if ( !$preventLog ) {
-				$this->logUrlChanges( $oldLinks, $newLinks, $addedLinks );
-			}
-
-			$links = implode( "\n", $addedLinks );
 
 			# Strip whitelisted URLs from the match
 			if( is_array( $whitelists ) ) {
@@ -180,26 +156,8 @@
 		return $wgSpamBlacklistEventLogging && class_exists( 'EventLogging' );
 	}
 
-	/**
-	 * Diff added/removed urls and generate events for them
-	 *
-	 * @param string[] $oldLinks
-	 * @param string[] $newLinks
-	 * @param string[] $addedLinks
-	 */
-	public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
-		if ( !$this->isLoggingEnabled() ) {
-			return;
-		}
-
-		$removedLinks = array_diff( $oldLinks, $newLinks );
-		foreach ( $addedLinks as $url ) {
-			$this->logUrlChange( $url, 'insert' );
-		}
-
-		foreach ( $removedLinks as $url ) {
-			$this->logUrlChange( $url, 'remove' );
-		}
+	public function stashOldLinks( Title $title ) {
+		$this->stashedOldLinks = $this->getCurrentLinks( $title );
 	}
 
 	/**
@@ -209,23 +167,56 @@
 	 * @param Title $title
 	 * @param int $revId
 	 */
-	public function doLogging( User $user, Title $title, $revId ) {
+	public function doLogging( User $user, Title $title, $revId, $content ) {
 		if ( !$this->isLoggingEnabled() ) {
 			return;
 		}
 
-		$baseInfo = array(
-			'revId' => $revId,
-			'pageId' => $title->getArticleID(),
-			'pageNamespace' => $title->getNamespace(),
-			'userId' => $user->getId(),
-			'userText' => $user->getName(),
-		);
-		$changes = $this->urlChangeLog;
-		// Empty the changes queue in case this function gets called more than once
-		$this->urlChangeLog = array();
+		// NOTE: this may be duplicate work in the critical save path, is that ok?
+		// I tested changing 7000 links and it doesn't slow things down too much.
+		$oldLinks = array();
+		if ( empty( $this->stashedOldLinks ) && $title !== null ) {
+			$oldLinks = $this->getCurrentLinks( $title );
+		} else {
+			$oldLinks = $this->stashedOldLinks;
+			$this->stashedOldLinks = array();
+		}
 
-		DeferredUpdates::addCallableUpdate( function() use ( $changes, $baseInfo ) {
+		DeferredUpdates::addCallableUpdate( function() use ( $user, $title, $revId, $oldLinks, $content ) {
+
+			// Moving as much logic as possible into the deferred update to keep the save quick
+			sort( $oldLinks );
+
+			$newLinks = array();
+			if ( $title !== null ) {
+				if ( $content !== null ) {
+					$newLinks = $this->getNewLinks( $title, $content );
+					sort( $newLinks );
+				}
+			}
+
+			$addedLinks = array_diff( $newLinks, $oldLinks );
+			$removedLinks = array_diff( $oldLinks, $newLinks );
+
+			$inserts = array_map( function ( $a ) {
+				return $this->getLogInfo( $a, 'insert' );
+			}, $addedLinks );
+
+			$removes = array_map( function ( $a ) {
+				return $this->getLogInfo( $a, 'remove' );
+			}, $removedLinks );
+
+			$changes = array_merge( $inserts, $removes );
+
+			$baseInfo = array(
+				'revId' => $revId,
+				'pageId' => $title->getArticleID(),
+				'pageNamespace' => $title->getNamespace(),
+				'userId' => $user->getId(),
+				'userText' => $user->getName(),
+			);
+
 			foreach ( $changes as $change ) {
 				EventLogging::logEvent(
 					'ExternalLinksChange',
@@ -242,7 +233,7 @@
 	 * @param string $url
 	 * @param string $action 'insert' or 'remove'
 	 */
-	private function logUrlChange( $url, $action ) {
+	private function getLogInfo( $url, $action ) {
 		$parsed = wfParseUrl( $url );
 		if ( !isset( $parsed['host'] ) ) {
 			wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
@@ -256,8 +247,13 @@
 			'query' => isset( $parsed['query'] ) ? $parsed['query'] : '',
 			'fragment' => isset( $parsed['fragment'] ) ? $parsed['fragment'] : '',
 		);
+		return $info;
+	}
 
-		$this->urlChangeLog[] = $info;
+	function getNewLinks( Title $title, Content $content ) {
+		$parserOptions = $content->getContentHandler()->makeParserOptions( 'canonical' );
+		$output = $content->getParserOutput( $title, null, $parserOptions );
+		return array_keys( $output->getExternalLinks() );
 	}
 
 	/**
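The core of the new doLogging() is a symmetric set difference computed inside the deferred update. Pulled out as a standalone function it can be run and checked in isolation; diffLinkChanges() is a hypothetical name for this sketch, not a method in the patch.

<?php
// Sketch only: the added/removed computation from the deferred update.
function diffLinkChanges( array $oldLinks, array $newLinks ) {
	$changes = array();
	// present after the edit but not before => 'insert'
	foreach ( array_diff( $newLinks, $oldLinks ) as $url ) {
		$changes[] = array( 'url' => $url, 'action' => 'insert' );
	}
	// present before the edit but not after => 'remove'
	foreach ( array_diff( $oldLinks, $newLinks ) as $url ) {
		$changes[] = array( 'url' => $url, 'action' => 'remove' );
	}
	return $changes;
}

// One link kept, one removed, one added:
print_r( diffLinkChanges(
	array( 'http://a.example/', 'http://b.example/' ),
	array( 'http://b.example/', 'http://c.example/' )
) );
// => insert http://c.example/, remove http://a.example/

Note that array_diff() does not require sorted input, so the sort() calls in the patch affect only the order of the emitted events, not which events are emitted.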
-- 
To view, visit https://gerrit.wikimedia.org/r/346915
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I455b09fe2e77d3f6faccfdd3e2b4e7940344219e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/SpamBlacklist
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>
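One last sketch for the renamed getLogInfo() helper in the diff above, with PHP's parse_url() standing in for MediaWiki's wfParseUrl() so it runs outside the wiki. Only the 'query' and 'fragment' fields of the returned row are visible in the hunk; the rest is elided here rather than guessed.

<?php
// Sketch only: per-URL event row construction, mirroring getLogInfo().
function getLogInfoSketch( $url, $action ) {
	$parsed = parse_url( $url ); // stand-in for wfParseUrl()
	if ( !isset( $parsed['host'] ) ) {
		// the patch logs "Unable to parse $url" here; what it returns
		// afterwards is outside the visible hunk
		return null;
	}
	return array(
		// ... other fields (and where $action lands) are cut off in the hunk ...
		'query' => isset( $parsed['query'] ) ? $parsed['query'] : '',
		'fragment' => isset( $parsed['fragment'] ) ? $parsed['fragment'] : '',
	);
}

print_r( getLogInfoSketch( 'https://example.org/wiki/Page?x=1#top', 'insert' ) );
// => array with 'query' => 'x=1' and 'fragment' => 'top'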