Milimetric has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/346915 )

Change subject: Simplify and fix EventLogging instrumentation
......................................................................

Simplify and fix EventLogging instrumentation

The ExternalLinksChange schema has instrumentation in this extension.
The former instrumentation was trying to optimize for speed and never
worked properly.  I simplified the implementation in the hope of
starting a review process.  I am very new at mediawiki development and
so far I only used my intuition and the documentation on Manual:Hooks

Bug: T162365
Change-Id: I455b09fe2e77d3f6faccfdd3e2b4e7940344219e
---
M SpamBlacklistHooks.php
M SpamBlacklist_body.php
2 files changed, 67 insertions(+), 68 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/SpamBlacklist 
refs/changes/15/346915/1

diff --git a/SpamBlacklistHooks.php b/SpamBlacklistHooks.php
index eb0d5dd..352c3b2 100644
--- a/SpamBlacklistHooks.php
+++ b/SpamBlacklistHooks.php
@@ -189,7 +189,7 @@
        ) {
                if ( $revision ) {
                        BaseBlacklist::getInstance( 'spam' )
-                               ->doLogging( $user, $wikiPage->getTitle(), 
$revision->getId() );
+                               ->doLogging( $user, $wikiPage->getTitle(), 
$revision->getId(), $content );
                }
 
                if ( !BaseBlacklist::isLocalSource( $wikiPage->getTitle() ) ) {
@@ -266,10 +266,10 @@
                        return;
                }
 
-               // Log the changes, but we only commit them once the deletion 
has happened.
-               // We do that since the external links table could get cleared 
before the
-               // ArticleDeleteComplete hook runs
-               $spam->logUrlChanges( $spam->getCurrentLinks( 
$article->getTitle() ), [], [] );
+               // The external links table could get cleared before the 
deletion is complete,
+               // so stash the old links to log them in onArticleDeleteComplete
+               // NOTE: I would've done that in this class but I'm still 
unsure about style
+               $spam->stashOldLinks( $article->getTitle() );
        }
 
        /**
@@ -285,6 +285,9 @@
        ) {
                /** @var SpamBlacklist $spam */
                $spam = BaseBlacklist::getInstance( 'spam' );
-               $spam->doLogging( $user, $page->getTitle(), $page->getLatest() 
);
+               // passing null as content means the new version has no links
+               // otherwise since the page is just archived there would be no 
change
+               // NOTE: restoring the page will not log that new links are 
added
+               $spam->doLogging( $user, $page->getTitle(), $page->getLatest(), 
null );
        }
 }
diff --git a/SpamBlacklist_body.php b/SpamBlacklist_body.php
index 5dc807d..65ab5bd 100644
--- a/SpamBlacklist_body.php
+++ b/SpamBlacklist_body.php
@@ -11,10 +11,10 @@
        const STASH_AGE_DYING = 150;
 
        /**
-        * Changes to external links, for logging purposes
-        * @var array[]
-        */
-       private $urlChangeLog = array();
+       * old links can be purged before logging, stash here and clear after 
using
+       * @var array[]
+       */
+       private $stashedOldLinks = array();
 
        /**
         * Returns the code for the blacklist implementation
@@ -55,12 +55,6 @@
                $statsd = 
MediaWikiServices::getInstance()->getStatsdDataFactory();
                $cache = ObjectCache::getLocalClusterInstance();
 
-               // If there are no new links, and we are logging,
-               // mark all of the current links as being removed.
-               if ( !$links && $this->isLoggingEnabled() ) {
-                       $this->logUrlChanges( $this->getCurrentLinks( $title ), 
[], [] );
-               }
-
                if ( !$links ) {
                        return false;
                }
@@ -95,25 +89,7 @@
                if ( count( $blacklists ) ) {
                        // poor man's anti-spoof, see bug 12896
                        $newLinks = array_map( array( $this, 'antiSpoof' ), 
$links );
-
-                       $oldLinks = array();
-                       if ( $title !== null ) {
-                               $oldLinks = $this->getCurrentLinks( $title );
-                               $addedLinks = array_diff( $newLinks, $oldLinks 
);
-                       } else {
-                               // can't load old links, so treat all links as 
added.
-                               $addedLinks = $newLinks;
-                       }
-
-                       wfDebugLog( 'SpamBlacklist', "Old URLs: " . implode( ', 
', $oldLinks ) );
                        wfDebugLog( 'SpamBlacklist', "New URLs: " . implode( ', 
', $newLinks ) );
-                       wfDebugLog( 'SpamBlacklist', "Added URLs: " . implode( 
', ', $addedLinks ) );
-
-                       if ( !$preventLog ) {
-                               $this->logUrlChanges( $oldLinks, $newLinks, 
$addedLinks );
-                       }
-
-                       $links = implode( "\n", $addedLinks );
 
                        # Strip whitelisted URLs from the match
                        if( is_array( $whitelists ) ) {
@@ -180,26 +156,8 @@
                return $wgSpamBlacklistEventLogging && class_exists( 
'EventLogging' );
        }
 
-       /**
-        * Diff added/removed urls and generate events for them
-        *
-        * @param string[] $oldLinks
-        * @param string[] $newLinks
-        * @param string[] $addedLinks
-        */
-       public function logUrlChanges( $oldLinks, $newLinks, $addedLinks ) {
-               if ( !$this->isLoggingEnabled() ) {
-                       return;
-               }
-
-               $removedLinks = array_diff( $oldLinks, $newLinks );
-               foreach ( $addedLinks as $url ) {
-                       $this->logUrlChange( $url, 'insert' );
-               }
-
-               foreach ( $removedLinks as $url ) {
-                       $this->logUrlChange( $url, 'remove' );
-               }
+       public function stashOldLinks( Title $title ) {
+               $this->stashedOldLinks = $this->getCurrentLinks( $title );
        }
 
        /**
@@ -209,23 +167,56 @@
         * @param Title $title
         * @param int $revId
         */
-       public function doLogging( User $user, Title $title, $revId ) {
+       public function doLogging( User $user, Title $title, $revId, $content ) 
{
                if ( !$this->isLoggingEnabled() ) {
                        return;
                }
 
-               $baseInfo = array(
-                       'revId' => $revId,
-                       'pageId' => $title->getArticleID(),
-                       'pageNamespace' => $title->getNamespace(),
-                       'userId' => $user->getId(),
-                       'userText' => $user->getName(),
-               );
-               $changes = $this->urlChangeLog;
-               // Empty the changes queue in case this function gets called 
more than once
-               $this->urlChangeLog = array();
+               // NOTE: this may be duplicate work in the critical save path,
+               // is that ok?  I tested changing 7000 links and it doesn't 
slow too much
+               $oldLinks = array();
 
-               DeferredUpdates::addCallableUpdate( function() use ( $changes, 
$baseInfo ) {
+               if ( empty( $this->$stashedOldLinks ) && $title !== null ) {
+                       $oldLinks = $this->getCurrentLinks( $title );
+               } else {
+                       $oldLinks = $this->$stashedOldLinks;
+                       $this->$stashedOldLinks = array();
+               }
+
+               DeferredUpdates::addCallableUpdate( function() use ( $user, 
$title, $revId, $oldLinks ) {
+
+                       // Moving as much logic as possible in deferred update 
to make save quick
+                       sort($oldLinks);
+
+                       $newLinks = array();
+                       if ( $title !== null ) {
+                               if ($content !== null) {
+                                       $newLinks = $this->getNewLinks( $title, 
$content );
+                                       sort($newLinks);
+                               }
+                       }
+
+                       $addedLinks = array_diff( $newLinks, $oldLinks );
+                       $removedLinks = array_diff( $oldLinks, $newLinks );
+
+                       $inserts = array_map(function ($a) {
+                               return $this->getLogInfo($a, 'insert');
+                       }, $addedLinks);
+
+                       $removes = array_map(function ($a) {
+                               return $this->getLogInfo($a, 'remove');
+                       }, $removedLinks);
+
+                       $changes = array_merge($inserts, $removes);
+
+                       $baseInfo = array(
+                               'revId' => $revId,
+                               'pageId' => $title->getArticleID(),
+                               'pageNamespace' => $title->getNamespace(),
+                               'userId' => $user->getId(),
+                               'userText' => $user->getName(),
+                       );
+
                        foreach ( $changes as $change ) {
                                EventLogging::logEvent(
                                        'ExternalLinksChange',
@@ -242,7 +233,7 @@
         * @param string $url
         * @param string $action 'insert' or 'remove'
         */
-       private function logUrlChange( $url, $action ) {
+       private function getLogInfo( $url, $action ) {
                $parsed = wfParseUrl( $url );
                if ( !isset( $parsed['host'] ) ) {
                        wfDebugLog( 'SpamBlacklist', "Unable to parse $url" );
@@ -256,8 +247,13 @@
                        'query' => isset( $parsed['query'] ) ? $parsed['query'] 
: '',
                        'fragment' => isset( $parsed['fragment'] ) ? 
$parsed['fragment'] : '',
                );
+               return $info;
+       }
 
-               $this->urlChangeLog[] = $info;
+       function getNewLinks( Title $title, Content $content ) {
+               $parserOptions = 
$content->getContentHandler()->makeParserOptions( 'canonical' );
+               $output = $content->getParserOutput( $title, null, 
$parserOptions );
+               return array_keys( $output->getExternalLinks() );
        }
 
        /**

-- 
To view, visit https://gerrit.wikimedia.org/r/346915
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I455b09fe2e77d3f6faccfdd3e2b4e7940344219e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/SpamBlacklist
Gerrit-Branch: master
Gerrit-Owner: Milimetric <dandree...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to