Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/79789
Change subject: Return most relevant prefix search results. ...................................................................... Return most relevant prefix search results. Relevance is calculated in much the same way as lsearchd - based on the number of incoming links. This solution will also create some knock-on work, namely that we'll have to reindex pages when the number of links to them changes substantially. Exactly what that means is up for debate, but if we do nothing, the index won't notice that a page has become popular until the page itself is changed. For bug 52886. Change-Id: I099849fa1d9b5cd4bedef2266ac8f7ec3e31e990 --- M CirrusSearch.body.php M CirrusSearchMappingConfigBuilder.php M CirrusSearchUpdater.php M forceSearchIndex.php M updateOneSearchIndexConfig.php 5 files changed, 74 insertions(+), 32 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/89/79789/1 diff --git a/CirrusSearch.body.php b/CirrusSearch.body.php index 2673975..4d3c30c 100644 --- a/CirrusSearch.body.php +++ b/CirrusSearch.body.php @@ -109,14 +109,20 @@ // Query params $query->setLimit( $limit ); - $query->setFilter( CirrusSearch::buildNamespaceFilter( $ns ) ); - $indexType = CirrusSearch::pickIndexTypeFromNamespaces( $ns ); + $mainFilter = new \Elastica\Filter\Bool(); + $mainFilter->addMust( CirrusSearch::buildNamespaceFilter( $ns ) ); + $prefixFilterQuery = new \Elastica\Filter\Query(); $match = new \Elastica\Query\Match(); $match->setField( 'title.prefix', array( 'query' => substr( $search, 0, CirrusSearch::MAX_PREFIX_SEARCH ), 'analyzer' => 'prefix_query' ) ); - $query->setQuery( $match ); + $mainFilter->addMust( new \Elastica\Filter\Query( $match ) ); + $query->setFilter( $mainFilter ); + // This query doesn't have a score because it is all filters so force sorting on the boost + $query->setSort( array( 'boost' ) ); + + $indexType = CirrusSearch::pickIndexTypeFromNamespaces( $ns ); // Perform the search $work = new PoolCounterWorkViaCallback( 
'CirrusSearch-Search', "_elasticsearch", array( @@ -387,7 +393,7 @@ } CirrusSearchUpdater::updateRevisions( array( array( 'rev' => $revision, - 'text' => $text + 'text' => $text, ) ) ); CirrusSearch::$updated[] = $id; } diff --git a/CirrusSearchMappingConfigBuilder.php b/CirrusSearchMappingConfigBuilder.php index 68d7772..91928fd 100644 --- a/CirrusSearchMappingConfigBuilder.php +++ b/CirrusSearchMappingConfigBuilder.php @@ -31,14 +31,20 @@ // Note never to set something as type='object' here because that isn't returned by elasticsearch // and is infered anyway. return array( - 'title' => $this->buildStringField( 'title', array( 'suggest', 'prefix' ), true ), - 'text' => $this->buildStringField( 'text', array( 'suggest' ), true ), - 'category' => $this->buildStringField(), - 'redirect' => array( - 'properties' => array( - 'title' => $this->buildStringField( 'title', null, true ) + 'properties' => array( + 'title' => $this->buildStringField( 'title', array( 'suggest', 'prefix' ), true ), + 'text' => $this->buildStringField( 'text', array( 'suggest' ), true ), + 'category' => $this->buildStringField(), + 'redirect' => array( + 'properties' => array( + 'title' => $this->buildStringField( 'title', null, true ) + ) ) - ) + ), + '_boost' => array( + 'name' => 'boost', + 'null_value' => 1.0, + ), ); } diff --git a/CirrusSearchUpdater.php b/CirrusSearchUpdater.php index 482b951..d44ccfb 100644 --- a/CirrusSearchUpdater.php +++ b/CirrusSearchUpdater.php @@ -23,7 +23,12 @@ * * @param array $pageData An array of revisions and their pre-processed * data. The format is as follows: - * array( array( 'rev' => $revision, 'text' => $text ), ... 
) + * array( + * array( + * 'rev' => current revision object + * 'text' => text of the current page + * ) + * ) */ public static function updateRevisions( $pageData ) { wfProfileIn( __METHOD__ ); @@ -31,7 +36,7 @@ $contentDocuments = array(); $generalDocuments = array(); foreach ( $pageData as $page ) { - $document = CirrusSearchUpdater::buildDocumentforRevision( $page['rev'], $page['text'] ); + $document = CirrusSearchUpdater::buildDocumentforRevision( $page ); if ( MWNamespace::isContent( $document->get( 'namespace' ) ) ) { $contentDocuments[] = $document; } else { @@ -66,9 +71,11 @@ wfProfileOut( __METHOD__ ); } - public static function buildDocumentforRevision( $revision, $text ) { + public static function buildDocumentforRevision( $page ) { global $wgCirrusSearchIndexedRedirects; wfProfileIn( __METHOD__ ); + $revision = $page[ 'rev' ]; + $text = $page[ 'text' ]; $title = $revision->getTitle(); $article = new Article( $title, $revision->getId() ); $parserOutput = $article->getParserOutput( $revision->getId() ); @@ -87,6 +94,17 @@ ); } + // Calculate the query boost. + $backlinkCache = new BacklinkCache( $revision->getTitle() ); + $links = $backlinkCache->getNumLinks( 'pagelinks' ); + // Boost should increase at log speed with number of links but should never be 0 so just + // peg it at one until log( $links ) > 1. 
+ if ( $links > 2 ) { + $boost = log( $links ); + } else { + $boost = 1; + } + $doc = new \Elastica\Document( $revision->getPage(), array( 'namespace' => $title->getNamespace(), 'title' => $title->getText(), @@ -94,7 +112,8 @@ 'textLen' => $revision->getSize(), 'timestamp' => wfTimestamp( TS_ISO_8601, $revision->getTimestamp() ), 'category' => $categories, - 'redirect' => $redirects + 'redirect' => $redirects, + 'boost' => $boost, ) ); wfProfileOut( __METHOD__ ); diff --git a/forceSearchIndex.php b/forceSearchIndex.php index 0105930..de6ab4f 100644 --- a/forceSearchIndex.php +++ b/forceSearchIndex.php @@ -149,14 +149,18 @@ } $res = $dbr->select( array( 'revision', 'text', 'page' ), - array_merge( Revision::selectFields(), Revision::selectTextFields(), Revision::selectPageFields() ), - "$minId < page_id" - . $toIdPart - . ' AND rev_text_id = old_id' - . ' AND rev_id = page_latest' - . ' AND page_is_redirect = 0', - // Note that we attempt to filter out redirects because everything about the redirect - // will be covered when we index the page to which it points. + array_merge( + Revision::selectFields(), + Revision::selectTextFields(), + Revision::selectPageFields() + ), + "$minId < page_id" + . $toIdPart + . ' AND rev_text_id = old_id' + . ' AND rev_id = page_latest' + . ' AND page_is_redirect = 0', + // Note that we attempt to filter out redirects because everything about the redirect + // will be covered when we index the page to which it points. __METHOD__, array( 'ORDER BY' => 'page_id', 'LIMIT' => $this->mBatchSize ) @@ -166,13 +170,17 @@ $maxUpdate = $dbr->addQuotes( $dbr->timestamp( $maxUpdate ) ); $res = $dbr->select( array( 'revision', 'text', 'page' ), - array_merge( Revision::selectFields(), Revision::selectTextFields(), Revision::selectPageFields() ), - 'page_id = rev_page' - . ' AND rev_text_id = old_id' - . ' AND rev_id = page_latest' - . " AND ( ( $minUpdate = rev_timestamp AND $minId < page_id ) OR $minUpdate < rev_timestamp )" - . 
" AND rev_timestamp <= $maxUpdate", - // Note that redirects are allowed here so we can pick up redirects made during search downtime + array_merge( + Revision::selectFields(), + Revision::selectTextFields(), + Revision::selectPageFields() + ), + 'page_id = rev_page' + . ' AND rev_text_id = old_id' + . ' AND rev_id = page_latest' + . " AND ( ( $minUpdate = rev_timestamp AND $minId < page_id ) OR $minUpdate < rev_timestamp )" + . " AND rev_timestamp <= $maxUpdate", + // Note that redirects are allowed here so we can pick up redirects made during search downtime __METHOD__, array( 'ORDER BY' => 'rev_timestamp, rev_page', 'LIMIT' => $this->mBatchSize ) diff --git a/updateOneSearchIndexConfig.php b/updateOneSearchIndexConfig.php index 56718fc..179337a 100644 --- a/updateOneSearchIndexConfig.php +++ b/updateOneSearchIndexConfig.php @@ -184,12 +184,15 @@ $this->output( $this->indent . "\tValidating mapping for page type..." ); $requiredPageMappings = CirrusSearchMappingConfigBuilder::build(); if ( array_key_exists( 'page', $actualMappings) && - $this->vmActualMatchRequired( $actualMappings[ 'page' ][ 'properties' ], $requiredPageMappings ) ) { + $this->vmActualMatchRequired( $actualMappings[ 'page' ], $requiredPageMappings ) ) { $this->output( "ok\n" ); } else { $this->output( "different..." 
); // TODO Conflict resolution here might leave old portions of mappings - $action = new \Elastica\Type\Mapping( $this->getPageType(), $requiredPageMappings ); + $action = new \Elastica\Type\Mapping( $this->getPageType() ); + foreach ( $requiredPageMappings as $key => $value ) { + $action->setParam( $key, $value ); + } try { $action->send(); $this->output( "corrected\n" ); -- To view, visit https://gerrit.wikimedia.org/r/79789 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I099849fa1d9b5cd4bedef2266ac8f7ec3e31e990 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits