jenkins-bot has submitted this change and it was merged.

Change subject: Simplify incoming_links counting from es query to mysql
......................................................................


Simplify incoming_links counting from es query to mysql

Started working up this patch to allow changing the analysis chain used
for incoming_links counting. After more consideration will probably
add a new field to incoming_links rather than changing the existing
chain, but this patch still seemed usefull as it simplifys how the
incoming links are calculated and uses the canonical data source.

This might be slightly slower, but likely not much. The previous method
used an elasticsearch multi-search to do many documents at once. We
rarely index multiple documents at once though, only during a full
reindex that forces parsing (as opposed to in-place reindex). This
should be minimal load on the mysql slaves, the query is full covered
by the existing backlinks index. On a manual test of a few larger
pages the query to count ~150k incoming links to a page plus it's
50 reidrects is in the 100-150ms range.

Change-Id: Id7aaea59dd1ad5a7cb770c226cbf6ef7194233e8
---
M CirrusSearch.php
M includes/BuildDocument/RedirectsAndIncomingLinks.php
M includes/Updater.php
3 files changed, 33 insertions(+), 121 deletions(-)

Approvals:
  Smalyshev: Looks good to me, but someone else must approve
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  DCausse: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 2b330c9..a285eec 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -958,7 +958,6 @@
 /**
  * Hooks
  */
-$wgHooks[ 'CirrusSearchBuildDocumentFinishBatch'][] = 
'CirrusSearch\BuildDocument\RedirectsAndIncomingLinks::finishBatch';
 $wgHooks[ 'CirrusSearchBuildDocumentLinks'][] = 
'CirrusSearch\BuildDocument\RedirectsAndIncomingLinks::buildDocument';
 $wgHooks[ 'AfterImportPage' ][] = 'CirrusSearch\Hooks::onAfterImportPage';
 $wgHooks[ 'ApiBeforeMain' ][] = 'CirrusSearch\Hooks::onApiBeforeMain';
diff --git a/includes/BuildDocument/RedirectsAndIncomingLinks.php 
b/includes/BuildDocument/RedirectsAndIncomingLinks.php
index 78bd317..0751f1c 100644
--- a/includes/BuildDocument/RedirectsAndIncomingLinks.php
+++ b/includes/BuildDocument/RedirectsAndIncomingLinks.php
@@ -32,75 +32,16 @@
  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  * http://www.gnu.org/copyleft/gpl.html
  */
-class RedirectsAndIncomingLinks extends ElasticsearchIntermediary {
+class RedirectsAndIncomingLinks {
        /**
-        * @var SplObjectStorage|null copy of this class kept during batches
-        */
-       private static $externalLinks = null;
-
-       /**
-        * @var SearchConfig
-        */
-       private $config;
-
-       /**
-        * @var \Elastica\Multi\Search
-        */
-       private $linkCountMultiSearch;
-
-       /**
-        * @var callable[]
-        */
-       private $linkCountClosures = array();
-
-       /**
-        * @param SearchConfig $config
-        * @param \Elastica\Document $doc
         * @param Title $title
         * @param Connection $conn
         * @return bool
         */
        public static function buildDocument( \Elastica\Document $doc, Title 
$title, Connection $conn ) {
-               if ( self::$externalLinks === null ) {
-                       self::$externalLinks = new SplObjectStorage;
-               }
-               if ( !isset( self::$externalLinks[$conn] ) ) {
-                       $config = 
MediaWikiServices::getInstance()->getConfigFactory()->makeConfig( 
'CirrusSearch' );
-                       // @todo should there instead be a way to source 
$config from $conn?
-                       self::$externalLinks[$conn] = new self( $config, $conn 
);
-               }
-               /** @var self $externalLink */
-               $externalLink = self::$externalLinks[$conn];
-               $externalLink->realBuildDocument( $doc, $title );
-               return true;
-       }
-
-       /**
-        * @param WikiPage[]
-        * @return bool
-        */
-       public static function finishBatch( $pages ) {
-               if ( self::$externalLinks === null ) {
-                       // Nothing to do as we haven't set up any actions 
during the buildDocument phase
-                       return false;
-               }
-               foreach ( self::$externalLinks as $conn ) {
-                       self::$externalLinks[$conn]->realFinishBatch( $pages );
-               }
-               self::$externalLinks = null;
-               return true;
-       }
-
-       protected function __construct( SearchConfig $config, Connection $conn 
) {
-               parent::__construct( $conn, null, 0 );
-               $this->config = $config;
-               $this->linkCountMultiSearch = new \Elastica\Multi\Search( 
$conn->getClient() );
-       }
-
-       private function realBuildDocument( \Elastica\Document $doc, Title 
$title ) {
                global $wgCirrusSearchIndexedRedirects;
 
-               $outgoingLinksToCount = array( $title->getPrefixedDBkey() );
+               $outgoingLinksToCount = array( $title );
 
                // Gather redirects to this page
                $redirectTitles = $title->getBacklinkCache()
@@ -113,7 +54,7 @@
                                        'namespace' => 
$redirect->getNamespace(),
                                        'title' => $redirect->getText()
                                );
-                               $outgoingLinksToCount[] = 
$redirect->getPrefixedDBKey();
+                               $outgoingLinksToCount[] = $redirect;
                        }
                }
                $doc->set( 'redirect', $redirects );
@@ -123,71 +64,45 @@
                //  #1 Number of redirects to the page
                //  #2 Number of links to the title
                //  #3 Number of links to all the redirects
+               $incomingCount = self::countIncomingLinks( 
$outgoingLinksToCount );
 
-               // #1 we have a list of the "first" 
$wgCirrusSearchIndexedRedirects redirect so we just count it:
-               $redirectCount = count( $redirects );
-
-               // #2 and #3 we count the number of links to the page with 
Elasticsearch.
-               // Since we only have $wgCirrusSearchIndexedRedirects we only 
count that many terms.
-               $this->linkCountMultiSearch->addSearch( $this->buildCount( 
$outgoingLinksToCount ) );
-               $this->linkCountClosures[] = function ( $count ) use( $doc, 
$redirectCount ) {
-                       $doc->set( 'incoming_links', $count + $redirectCount );
-               };
-       }
-
-       /**
-        * @param WikiPage[] $pages
-        */
-       private function realFinishBatch( array $pages ) {
-               $linkCountClosureCount = count( $this->linkCountClosures );
-               if ( $linkCountClosureCount ) {
-                       try {
-                               $pageCount = count( $pages );
-                               $this->start( "counting links to {pageCount} 
pages", array(
-                                       'pageCount' => $pageCount,
-                                       'queryType' => 'count_links',
-                                       'query' => $pageCount,
-                               ) );
-                               $result = $this->linkCountMultiSearch->search();
-                               $this->success();
-                               for ( $index = 0; $index < 
$linkCountClosureCount; $index++ ) {
-                                       $this->linkCountClosures[ $index ]( 
$result[ $index ]->getTotalHits() );
-                               }
-                       } catch ( \Elastica\Exception\ExceptionInterface $e ) {
-                               // Note that we still return the pages and 
execute the update here, we just complain
-                               $this->failure( $e );
-                               $pageIds = array_map( function( WikiPage $page 
) {
-                                       return $page->getId();
-                               }, $pages );
-                               LoggerFactory::getInstance( 
'CirrusSearchChangeFailed' )->info(
-                                       'Links for page ids: ' . implode( ',', 
$pageIds ) );
-                       }
+               // If there was some sort of failure counting links don't 
attach it to the document, instead allowing
+               // whatever value is already stored to continue (rather than 
sending an incorrect count).
+               if ( $incomingCount !== null ) {
+                       $doc->set( 'incoming_links', $incomingCount );
                }
+
+               return true;
        }
 
        /**
-        * Build a Search that will count all pages that link to $titles.
+        * Count the number of incoming links to $titles. This could alternately
+        * be calculated with the BacklinkCache, but that only handles a single
+        * title at a time which is very inefficient for our use case (querying
+        * potentially hundreds of titles for counts). Instead this query gets 
it
+        * done in one single query which the database should be able to 
optimize
+        * for (and use the existing backlink index).
         *
-        * @param string[] $titles title in prefixedDBKey form
-        * @return \Elastica\Search that counts all pages that link to $titles
+        * @param Title[] $titles
+        * @return int|null The number of incoming links, or null on failure
         */
-       private function buildCount( array $titles ) {
+       private static function countIncomingLinks( array $titles ) {
+               $dbr = wfGetDB( DB_SLAVE );
 
-               $bool = new BoolQuery();
-               $bool->addFilter( new Terms( 'outgoing_link', $titles ) );
+               foreach ( $titles as $title ) {
+                       $conditions[] = $dbr->makeList( array(
+                               'pl_namespace' => $title->getNamespace(),
+                               'pl_title' => $title->getDBkey(),
+                       ), LIST_AND );
+               }
 
-               $indexPrefix = $this->config->get( 
SearchConfig::INDEX_BASE_NAME );
-               $type = $this->connection->getPageType( $indexPrefix );
-               $search = new \Elastica\Search( $type->getIndex()->getClient() 
);
-               $search->addIndex( $type->getIndex() );
-               $search->addType( $type );
-               $search->setOption(
-                       \Elastica\Search::OPTION_SEARCH_TYPE,
-                       \Elastica\Search::OPTION_SEARCH_TYPE_COUNT
-               );
-               $search->setQuery( $bool );
-               $search->getQuery()->addParam( 'stats', 'link_count' );
+               $condition = $dbr->makeList( $conditions, LIST_OR );
 
-               return $search;
+               $res = $dbr->selectField( 'pagelinks', 'count(1)', $condition, 
__METHOD__ );
+               if ( $res === false ) {
+                       return null;
+               }
+
+               return (int) $res;
        }
 }
diff --git a/includes/Updater.php b/includes/Updater.php
index c3f7710..389eb7e 100644
--- a/includes/Updater.php
+++ b/includes/Updater.php
@@ -333,8 +333,6 @@
                        $documents[] = $doc;
                }
 
-               MWHooks::run( 'CirrusSearchBuildDocumentFinishBatch', array( 
$pages ) );
-
                return $documents;
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/305325
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id7aaea59dd1ad5a7cb770c226cbf6ef7194233e8
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to