Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/224612
Change subject: Revert "Revert "Remove lots of dynamic groovy"" ...................................................................... Revert "Revert "Remove lots of dynamic groovy"" We can have this once we upgrade to Elasticsearch 1.6.0! This reverts commit bab556ff409d875d416851e0c7f90f9c968a34a4. Change-Id: I827f5c54b93db5229daf7caacecbf82159ba6502 --- M CirrusSearch.php A includes/Extra/Filter/IdHashMod.php M includes/Hooks.php M includes/Maintenance/Reindexer.php M includes/Maintenance/Validators/CacheWarmersValidator.php M includes/Searcher.php M includes/Updater.php M tests/jenkins/FullyFeaturedConfig.php M tests/jenkins/Jenkins.php 9 files changed, 107 insertions(+), 69 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/12/224612/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 2c20a0b..1bc2701 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -104,6 +104,15 @@ // 'max_terms_in_all_queries' => 128, // ) // ); +// This turns on noop-detection for updates and is compatible with +// wikimedia-extra versions 1.3.1, 1.4.2, and 1.5.0: +// $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] = true; +// This turns on field_value_factors for rankings and is compatible with +// wikimedia-extra versions 1.3.1, 1.4.2, and 1.5.0: +// $wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] = true; +// This allows forking on reindexing and is compatible with wikimedia-extra +// versions 1.3.1, 1.4.2, and 1.5.0 +// $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true; // Should CirrusSearch try to support regular expressions with insource:? // These can be really expensive, but mostly ok, especially if you have the @@ -565,6 +574,7 @@ $wgAutoloadClasses['CirrusSearch\Connection'] = $includes . 'Connection.php'; $wgAutoloadClasses['CirrusSearch\Dump'] = $includes . 'Dump.php'; $wgAutoloadClasses['CirrusSearch\ElasticsearchIntermediary'] = $includes . 'ElasticsearchIntermediary.php'; +$wgAutoloadClasses['CirrusSearch\Extra\Filter\IdHashMod'] = $extraFilterDir . 'IdHashMod.php'; $wgAutoloadClasses['CirrusSearch\Extra\Filter\SourceRegex'] = $extraFilterDir . 'SourceRegex.php'; $wgAutoloadClasses['CirrusSearch\ForceSearchIndex'] = __DIR__ . '/maintenance/forceSearchIndex.php'; $wgAutoloadClasses['CirrusSearch\Hooks'] = $includes . 'Hooks.php'; diff --git a/includes/Extra/Filter/IdHashMod.php b/includes/Extra/Filter/IdHashMod.php new file mode 100644 index 0000000..eeba709 --- /dev/null +++ b/includes/Extra/Filter/IdHashMod.php @@ -0,0 +1,36 @@ +<?php + +namespace CirrusSearch\Extra\Filter; + +use Elastica\Filter\AbstractFilter; + +/** + * Creates an IdHashMod filter. + * + * @link https://github.com/wikimedia/search-extra/blob/master/docs/source_regex.md + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ +class IdHashMod extends AbstractFilter { + /** + * @param int $mod modulus to use. Number of chunks to cut the data into. + * @param int $match value to match. Must be less than $mod. Its the + * current chunk number. + */ + public function __construct( $mod, $match ) { + $this->setParam( 'mod', $mod )->setParam( 'match', $match ); + } +} diff --git a/includes/Hooks.php b/includes/Hooks.php index 0890f18..9301907 100644 --- a/includes/Hooks.php +++ b/includes/Hooks.php @@ -345,7 +345,7 @@ /** * Hooked to delegate prefix searching to Searcher. - * @param int $namespace namespace to search + * @param int[] $namespaces namespace to search * @param string $search search text * @param int $limit maximum number of titles to return * @param array(string) $results outbound variable with string versions of titles diff --git a/includes/Maintenance/Reindexer.php b/includes/Maintenance/Reindexer.php index bf22b90..ad2713b 100644 --- a/includes/Maintenance/Reindexer.php +++ b/includes/Maintenance/Reindexer.php @@ -7,7 +7,6 @@ use Elastica\Document; use Elastica\Exception\Connection\HttpException; use Elastica\Exception\ExceptionInterface; -use Elastica\Filter\Script; use Elastica\Index; use Elastica\Query; use Elastica\Type; @@ -130,6 +129,8 @@ * @param float $acceptableCountDeviation */ public function reindex( $processes = 1, $refreshInterval = 1, $retryAttempts = 5, $chunkSize = 100, $acceptableCountDeviation = .05 ) { + global $wgCirrusSearchWikimediaExtraPlugin; + // Set some settings that should help io load during bulk indexing. We'll have to // optimize after this to consolidate down to a proper number of shards but that is // is worth the price. total_shards_per_node will help to make sure that each shard @@ -144,6 +145,11 @@ ) ); if ( $processes > 1 ) { + if ( !isset( $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] ) || + !$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] ) { + $this->error( "Can't use multiple processes without \$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true", 1 ); + } + $fork = new ReindexForkController( $processes ); $forkResult = $fork->start(); // Forking clears the timeout so we have to reinstate it. @@ -259,17 +265,14 @@ $messagePrefix = "\t\t[$childNumber] "; $this->outputIndented( $messagePrefix . "Starting child process reindex\n" ); // Note that it is not ok to abs(_uid.hashCode) because hashCode(Integer.MIN_VALUE) == Integer.MIN_VALUE - $filter = new Script( array( - 'script' => "(doc['_uid'].value.hashCode() & Integer.MAX_VALUE) % $children == $childNumber", - 'lang' => 'groovy' - ) ); + $filter = new \CirrusSearch\Extra\Filter\IdHashMod( $children, $childNumber ); } $properties = $this->mappingConfig[$oldType->getName()]['properties']; try { $query = new Query(); $query->setFields( array( '_id', '_source' ) ); if ( $filter ) { - $query->setFilter( $filter ); + $query->setQuery( new \Elastica\Query\Filtered( new \Elastica\Query\MatchAll(), $filter ) ); } // Note here we dump from the current index (using the alias) so we can use Connection::getPageType diff --git a/includes/Maintenance/Validators/CacheWarmersValidator.php b/includes/Maintenance/Validators/CacheWarmersValidator.php index 42ab657..72ffdd9 100644 --- a/includes/Maintenance/Validators/CacheWarmersValidator.php +++ b/includes/Maintenance/Validators/CacheWarmersValidator.php @@ -81,7 +81,7 @@ // filters. That is probably OK because most searches don't use one. // It'd be overeager. null - // Null user because we won't be logging anything about the user. + // Null user because we won't be logging anything about the user. ); $searcher->setReturnQuery( true ); $searcher->setResultsType( new FullTextResultsType( FullTextResultsType::HIGHLIGHT_ALL ) ); diff --git a/includes/Searcher.php b/includes/Searcher.php index 872367d..b2d7c36 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -1420,6 +1420,7 @@ */ private function installBoosts() { global $wgCirrusSearchFunctionRescoreWindowSize, + $wgCirrusSearchWikimediaExtraPlugin, $wgCirrusSearchLanguageWeight, $wgLanguageCode; @@ -1436,26 +1437,38 @@ // Customize score by boosting based on incoming links count if ( $this->boostLinks ) { - $incomingLinks = "(doc['incoming_links'].isEmpty() ? 0 : doc['incoming_links'].value)"; - $scoreBoostGroovy = "log10($incomingLinks + 2)"; - $functionScore->addScriptScoreFunction( new \Elastica\Script( $scoreBoostGroovy, null, 'groovy' ) ); $useFunctionScore = true; + if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] ) && + $wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] ) { + $functionScore->addFunction( 'field_value_factor_with_default', array( + 'field' => 'incoming_links', + 'modifier' => 'log2p', + 'missing' => 0, + ) ); + } else { + $scoreBoostExpression = "log10(doc['incoming_links'].value + 2)"; + $functionScore->addScriptScoreFunction( new \Elastica\Script( $scoreBoostExpression, null, 'expression' ) ); + } } // Customize score by decaying a portion by time since last update if ( $this->preferRecentDecayPortion > 0 && $this->preferRecentHalfLife > 0 ) { // Convert half life for time in days to decay constant for time in milliseconds. $decayConstant = log( 2 ) / $this->preferRecentHalfLife / 86400000; - // e^ct - 1 where t is last modified time - now which is negative - $exponentialDecayGroovy = "Math.expm1($decayConstant * (doc['timestamp'].value - Instant.now().getMillis()))"; - // p(e^ct - 1) + $parameters = array( + 'decayConstant' => $decayConstant, + 'decayPortion' => $this->preferRecentDecayPortion, + 'nonDecayPortion' => 1 - $this->preferRecentDecayPortion, + 'now' => time() * 1000 + ); + + // e^ct where t is last modified time - now which is negative + $exponentialDecayExpression = "exp(decayConstant * (doc['timestamp'].value - now))"; if ( $this->preferRecentDecayPortion !== 1.0 ) { - $exponentialDecayGroovy = "$exponentialDecayGroovy * $this->preferRecentDecayPortion"; + $exponentialDecayExpression = "$exponentialDecayExpression * decayPortion + nonDecayPortion"; } - // p(e^ct - 1) + 1 which is easier to calculate than, but reduces to 1 - p + pe^ct - // Which breaks the score into an unscaled portion (1 - p) and a scaled portion (p) - $exponentialDecayGroovy = "$exponentialDecayGroovy + 1"; - $functionScore->addScriptScoreFunction( new \Elastica\Script( $exponentialDecayGroovy, null, 'groovy' ) ); + $functionScore->addScriptScoreFunction( new \Elastica\Script( $exponentialDecayExpression, + $parameters, 'expression' ) ); $useFunctionScore = true; } diff --git a/includes/Updater.php b/includes/Updater.php index 78e3c8f..e2cb1f5 100644 --- a/includes/Updater.php +++ b/includes/Updater.php @@ -172,6 +172,8 @@ * @return int Number of documents updated of -1 if there was an error */ public function updatePages( $pages, $shardTimeout, $clientSideTimeout, $flags ) { + global $wgCirrusSearchWikimediaExtraPlugin; + // Don't update the same page twice. We shouldn't, but meh $pageIds = array(); $pages = array_filter( $pages, function( $page ) use ( &$pageIds ) { @@ -191,10 +193,11 @@ $allData = array_fill_keys( Connection::getAllIndexTypes(), array() ); foreach ( $this->buildDocumentsForPages( $pages, $flags ) as $document ) { $suffix = Connection::getIndexSuffixForNamespace( $document->get( 'namespace' ) ); - $allData[$suffix][] = $this->docToScript( $document ); - // To quickly switch back to sending doc as upsert instead of script, remove the line above - // and switch to the one below: - // $allData[$suffix][] = $document; + if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] ) && + $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] ) { + $document = $this->docToSuperDetectNoopScript( $document ); + } + $allData[$suffix][] = $document; } $count = 0; foreach( $allData as $indexType => $data ) { @@ -357,47 +360,17 @@ return $documents; } - private function docToScript( $doc ) { - $scriptText = <<<GROOVY -changed = false; - -GROOVY; + /** + * Converts a document into a call to super_detect_noop from the wikimedia-extra plugin. + */ + private function docToSuperDetectNoopScript( $doc ) { $params = $doc->getParams(); - foreach ( $doc->getData() as $key => $value ) { - if ( $key === 'incoming_links' ) { - // incoming links has to be more then 20% incorrect before we update it. - $scriptText .= <<<GROOVY -if ( ctx._source.$key == null ) { - thisChanged = true; -} else if ( $key == 0 ) { - thisChanged = ctx._source.$key != 0; -} else { - thisChanged = abs( ctx._source.$key - $key ) / $key > 0.2; -} -if ( thisChanged ) { - changed = true; - ctx._source.$key = $key; -} + $params[ 'source' ] = $doc->getData(); + $params[ 'detectors' ] = array( + 'incoming_links' => 'within 20%', + ); -GROOVY; - } else { - $scriptText .= <<<GROOVY -if ( changed || ctx._source.$key != $key ) { - changed = true; - ctx._source.$key = $key; -} - -GROOVY; - } - $params[ $key ] = $value; - } - $scriptText .= <<<GROOVY -if ( !changed ) { - ctx.op = "none"; -} - -GROOVY; - $script = new \Elastica\Script( $scriptText, $params, 'groovy' ); + $script = new \Elastica\Script( 'super_detect_noop', $params, 'native' ); if ( $doc->getDocAsUpsert() ) { $script->setUpsert( $doc ); } diff --git a/tests/jenkins/FullyFeaturedConfig.php b/tests/jenkins/FullyFeaturedConfig.php index cb67fc0..b3e7fe8 100644 --- a/tests/jenkins/FullyFeaturedConfig.php +++ b/tests/jenkins/FullyFeaturedConfig.php @@ -32,7 +32,16 @@ ) ); $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] = true; +$wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] = true; +$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true; +$wgJobQueueAggregator = array( + 'class' => 'JobQueueAggregatorRedis', + 'redisServer' => 'localhost', + 'redisConfig' => array( + 'password' => null, + ), +); if ( class_exists( 'PoolCounter_Client' ) ) { // If the pool counter is around set up prod like pool counter settings diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php index f75783c..8fd5acd 100644 --- a/tests/jenkins/Jenkins.php +++ b/tests/jenkins/Jenkins.php @@ -63,13 +63,7 @@ 'password' => null, ), ); -$wgJobQueueAggregator = array( - 'class' => 'JobQueueAggregatorRedis', - 'redisServer' => 'localhost', - 'redisConfig' => array( - 'password' => null, - ), -); + $wgCiteEnablePopups = true; $wgExtraNamespaces[ 760 ] = 'Mó'; -- To view, visit https://gerrit.wikimedia.org/r/224612 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I827f5c54b93db5229daf7caacecbf82159ba6502 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits