Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/79789


Change subject: Return most relevant prefix search results.
......................................................................

Return most relevant prefix search results.

Relevance is calculated in much the same way as lsearchd - based on the
number of incoming links.

This solution will also create some knock-on work, namely that we'll
have to reindex pages when the number of links to them changes
substantially.  Exactly what that means is up for debate, but unless we
do something, a page's boost won't reflect its new popularity until the
page itself is next changed and reindexed.

For bug 52886.

Change-Id: I099849fa1d9b5cd4bedef2266ac8f7ec3e31e990
---
M CirrusSearch.body.php
M CirrusSearchMappingConfigBuilder.php
M CirrusSearchUpdater.php
M forceSearchIndex.php
M updateOneSearchIndexConfig.php
5 files changed, 74 insertions(+), 32 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/89/79789/1

diff --git a/CirrusSearch.body.php b/CirrusSearch.body.php
index 2673975..4d3c30c 100644
--- a/CirrusSearch.body.php
+++ b/CirrusSearch.body.php
@@ -109,14 +109,20 @@
 
                // Query params
                $query->setLimit( $limit );
-               $query->setFilter( CirrusSearch::buildNamespaceFilter( $ns ) );
-               $indexType = CirrusSearch::pickIndexTypeFromNamespaces( $ns );
+               $mainFilter = new \Elastica\Filter\Bool();
+               $mainFilter->addMust( CirrusSearch::buildNamespaceFilter( $ns ) 
);
+               $prefixFilterQuery = new \Elastica\Filter\Query();
                $match = new \Elastica\Query\Match();
                $match->setField( 'title.prefix', array(
                        'query' => substr( $search, 0, 
CirrusSearch::MAX_PREFIX_SEARCH ),
                        'analyzer' => 'prefix_query'
                ) );
-               $query->setQuery( $match );
+               $mainFilter->addMust( new \Elastica\Filter\Query( $match ) );
+               $query->setFilter( $mainFilter );
+               // This query doesn't have a score because it is all filters so 
force sorting on the boost
+               $query->setSort( array( 'boost' ) );
+
+               $indexType = CirrusSearch::pickIndexTypeFromNamespaces( $ns );
 
                // Perform the search
                $work = new PoolCounterWorkViaCallback( 'CirrusSearch-Search', 
"_elasticsearch", array(
@@ -387,7 +393,7 @@
                        }
                        CirrusSearchUpdater::updateRevisions( array( array(
                                'rev' => $revision,
-                               'text' => $text
+                               'text' => $text,
                        ) ) );
                        CirrusSearch::$updated[] = $id;
                }
diff --git a/CirrusSearchMappingConfigBuilder.php 
b/CirrusSearchMappingConfigBuilder.php
index 68d7772..91928fd 100644
--- a/CirrusSearchMappingConfigBuilder.php
+++ b/CirrusSearchMappingConfigBuilder.php
@@ -31,14 +31,20 @@
                // Note never to set something as type='object' here because 
that isn't returned by elasticsearch
                // and is infered anyway.
                return array(
-                       'title' => $this->buildStringField( 'title', array( 
'suggest', 'prefix' ), true ),
-                       'text' => $this->buildStringField( 'text', array( 
'suggest' ), true ),
-                       'category' => $this->buildStringField(),
-                       'redirect' => array(
-                               'properties' => array(
-                                       'title' => $this->buildStringField( 
'title', null, true )
+                       'properties' => array(
+                               'title' => $this->buildStringField( 'title', 
array( 'suggest', 'prefix' ), true ),
+                               'text' => $this->buildStringField( 'text', 
array( 'suggest' ), true ),
+                               'category' => $this->buildStringField(),
+                               'redirect' => array(
+                                       'properties' => array(
+                                               'title' => 
$this->buildStringField( 'title', null, true )
+                                       )
                                )
-                       )
+                       ),
+                       '_boost' => array(
+                               'name' => 'boost',
+                               'null_value' => 1.0,
+                       ),
                );
        }
 
diff --git a/CirrusSearchUpdater.php b/CirrusSearchUpdater.php
index 482b951..d44ccfb 100644
--- a/CirrusSearchUpdater.php
+++ b/CirrusSearchUpdater.php
@@ -23,7 +23,12 @@
         *
         * @param array $pageData An array of revisions and their pre-processed
         * data. The format is as follows:
-        *   array( array( 'rev' => $revision, 'text' => $text ), ... )
+        *   array(
+        *     array(
+        *       'rev' => current revision object
+        *       'text' => text of the current page
+        *     )
+        *   )
         */
        public static function updateRevisions( $pageData ) {
                wfProfileIn( __METHOD__ );
@@ -31,7 +36,7 @@
                $contentDocuments = array();
                $generalDocuments = array();
                foreach ( $pageData as $page ) {
-                       $document = 
CirrusSearchUpdater::buildDocumentforRevision( $page['rev'], $page['text'] );
+                       $document = 
CirrusSearchUpdater::buildDocumentforRevision( $page );
                        if ( MWNamespace::isContent( $document->get( 
'namespace' ) ) ) {
                                $contentDocuments[] = $document;
                        } else {
@@ -66,9 +71,11 @@
                wfProfileOut( __METHOD__ );
        }
 
-       public static function buildDocumentforRevision( $revision, $text ) {
+       public static function buildDocumentforRevision( $page ) {
                global $wgCirrusSearchIndexedRedirects;
                wfProfileIn( __METHOD__ );
+               $revision = $page[ 'rev' ];
+               $text = $page[ 'text' ];
                $title = $revision->getTitle();
                $article = new Article( $title, $revision->getId() );
                $parserOutput = $article->getParserOutput( $revision->getId() );
@@ -87,6 +94,17 @@
                        );
                }
 
+               // Calculate the query boost.
+               $backlinkCache = new BacklinkCache( $revision->getTitle() );
+               $links = $backlinkCache->getNumLinks( 'pagelinks' );
+               // Boost should increase at log speed with number of links but 
should never be 0 so just
+               // peg it at one until log( $links ) > 1.
+               if ( $links > 2 ) {
+                       $boost = log( $links );
+               } else {
+                       $boost = 1;
+               }
+
                $doc = new \Elastica\Document( $revision->getPage(), array(
                        'namespace' => $title->getNamespace(),
                        'title' => $title->getText(),
@@ -94,7 +112,8 @@
                        'textLen' => $revision->getSize(),
                        'timestamp' => wfTimestamp( TS_ISO_8601, 
$revision->getTimestamp() ),
                        'category' => $categories,
-                       'redirect' => $redirects
+                       'redirect' => $redirects,
+                       'boost' => $boost,
                ) );
 
                wfProfileOut( __METHOD__ );
diff --git a/forceSearchIndex.php b/forceSearchIndex.php
index 0105930..de6ab4f 100644
--- a/forceSearchIndex.php
+++ b/forceSearchIndex.php
@@ -149,14 +149,18 @@
                        }
                        $res = $dbr->select(
                                array( 'revision', 'text', 'page' ),
-                               array_merge( Revision::selectFields(), 
Revision::selectTextFields(), Revision::selectPageFields() ),
-                                       "$minId < page_id"
-                                       . $toIdPart
-                                       . ' AND rev_text_id = old_id'
-                                       . ' AND rev_id = page_latest'
-                                       . ' AND page_is_redirect = 0',
-                                       // Note that we attempt to filter out 
redirects because everything about the redirect
-                                       // will be covered when we index the 
page to which it points.
+                               array_merge(
+                                       Revision::selectFields(),
+                                       Revision::selectTextFields(),
+                                       Revision::selectPageFields()
+                               ),
+                               "$minId < page_id"
+                               . $toIdPart
+                               . ' AND rev_text_id = old_id'
+                               . ' AND rev_id = page_latest'
+                               . ' AND page_is_redirect = 0',
+                               // Note that we attempt to filter out redirects 
because everything about the redirect
+                               // will be covered when we index the page to 
which it points.
                                __METHOD__,
                                array( 'ORDER BY' => 'page_id',
                                       'LIMIT' => $this->mBatchSize )
@@ -166,13 +170,17 @@
                        $maxUpdate = $dbr->addQuotes( $dbr->timestamp( 
$maxUpdate ) );
                        $res = $dbr->select(
                                array( 'revision', 'text', 'page' ),
-                               array_merge( Revision::selectFields(), 
Revision::selectTextFields(), Revision::selectPageFields() ),
-                                       'page_id = rev_page'
-                                       . ' AND rev_text_id = old_id'
-                                       . ' AND rev_id = page_latest'
-                                       . " AND ( ( $minUpdate = rev_timestamp 
AND $minId < page_id ) OR $minUpdate < rev_timestamp )"
-                                       . " AND rev_timestamp <= $maxUpdate",
-                                       // Note that redirects are allowed here 
so we can pick up redirects made during search downtime
+                               array_merge(
+                                       Revision::selectFields(),
+                                       Revision::selectTextFields(),
+                                       Revision::selectPageFields()
+                               ),
+                               'page_id = rev_page'
+                               . ' AND rev_text_id = old_id'
+                               . ' AND rev_id = page_latest'
+                               . " AND ( ( $minUpdate = rev_timestamp AND 
$minId < page_id ) OR $minUpdate < rev_timestamp )"
+                               . " AND rev_timestamp <= $maxUpdate",
+                               // Note that redirects are allowed here so we 
can pick up redirects made during search downtime
                                __METHOD__,
                                array( 'ORDER BY' => 'rev_timestamp, rev_page',
                                       'LIMIT' => $this->mBatchSize )
diff --git a/updateOneSearchIndexConfig.php b/updateOneSearchIndexConfig.php
index 56718fc..179337a 100644
--- a/updateOneSearchIndexConfig.php
+++ b/updateOneSearchIndexConfig.php
@@ -184,12 +184,15 @@
                $this->output( $this->indent . "\tValidating mapping for page 
type..." );
                $requiredPageMappings = 
CirrusSearchMappingConfigBuilder::build();
                if ( array_key_exists( 'page', $actualMappings) &&
-                               $this->vmActualMatchRequired( $actualMappings[ 
'page' ][ 'properties' ], $requiredPageMappings ) ) {
+                               $this->vmActualMatchRequired( $actualMappings[ 
'page' ], $requiredPageMappings ) ) {
                        $this->output( "ok\n" );
                } else {
                        $this->output( "different..." );
                        // TODO Conflict resolution here might leave old 
portions of mappings
-                       $action = new \Elastica\Type\Mapping( 
$this->getPageType(), $requiredPageMappings );
+                       $action = new \Elastica\Type\Mapping( 
$this->getPageType() );
+                       foreach ( $requiredPageMappings as $key => $value ) {
+                               $action->setParam( $key, $value );
+                       }
                        try {
                                $action->send();
                                $this->output( "corrected\n" );

-- 
To view, visit https://gerrit.wikimedia.org/r/79789
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I099849fa1d9b5cd4bedef2266ac8f7ec3e31e990
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to