Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/224612

Change subject: Revert "Revert "Remove lots of dynamic groovy""
......................................................................

Revert "Revert "Remove lots of dynamic groovy""

We can have this once we upgrade to Elasticsearch 1.6.0!

This reverts commit bab556ff409d875d416851e0c7f90f9c968a34a4.

Change-Id: I827f5c54b93db5229daf7caacecbf82159ba6502
---
M CirrusSearch.php
A includes/Extra/Filter/IdHashMod.php
M includes/Hooks.php
M includes/Maintenance/Reindexer.php
M includes/Maintenance/Validators/CacheWarmersValidator.php
M includes/Searcher.php
M includes/Updater.php
M tests/jenkins/FullyFeaturedConfig.php
M tests/jenkins/Jenkins.php
9 files changed, 107 insertions(+), 69 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/12/224612/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 2c20a0b..1bc2701 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -104,6 +104,15 @@
 //             'max_terms_in_all_queries' => 128,
 //     )
 // );
+// This turns on noop-detection for updates and is compatible with
+// wikimedia-extra versions 1.3.1, 1.4.2, and 1.5.0:
+// $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] = true;
+// This turns on field_value_factors for rankings and is compatible with
+// wikimedia-extra versions 1.3.1, 1.4.2, and 1.5.0:
+// $wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] = 
true;
+// This allows forking on reindexing and is compatible with wikimedia-extra
+// versions 1.3.1, 1.4.2, and 1.5.0
+// $wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
 
 // Should CirrusSearch try to support regular expressions with insource:?
 // These can be really expensive, but mostly ok, especially if you have the
@@ -565,6 +574,7 @@
 $wgAutoloadClasses['CirrusSearch\Connection'] = $includes . 'Connection.php';
 $wgAutoloadClasses['CirrusSearch\Dump'] = $includes . 'Dump.php';
 $wgAutoloadClasses['CirrusSearch\ElasticsearchIntermediary'] = $includes . 
'ElasticsearchIntermediary.php';
+$wgAutoloadClasses['CirrusSearch\Extra\Filter\IdHashMod'] = $extraFilterDir . 
'IdHashMod.php';
 $wgAutoloadClasses['CirrusSearch\Extra\Filter\SourceRegex'] = $extraFilterDir 
. 'SourceRegex.php';
 $wgAutoloadClasses['CirrusSearch\ForceSearchIndex'] = __DIR__ . 
'/maintenance/forceSearchIndex.php';
 $wgAutoloadClasses['CirrusSearch\Hooks'] = $includes . 'Hooks.php';
diff --git a/includes/Extra/Filter/IdHashMod.php 
b/includes/Extra/Filter/IdHashMod.php
new file mode 100644
index 0000000..eeba709
--- /dev/null
+++ b/includes/Extra/Filter/IdHashMod.php
@@ -0,0 +1,36 @@
+<?php
+
+namespace CirrusSearch\Extra\Filter;
+
+use Elastica\Filter\AbstractFilter;
+
+/**
+ * Creates an IdHashMod filter.
+ *
+ * @link 
https://github.com/wikimedia/search-extra/blob/master/docs/source_regex.md
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+class IdHashMod extends AbstractFilter {
+    /**
+     * @param int $mod modulus to use. Number of chunks to cut the data into.
+     * @param int $match value to match. Must be less than $mod. Its the
+     *     current chunk number.
+     */
+    public function __construct( $mod, $match ) {
+        $this->setParam( 'mod', $mod )->setParam( 'match', $match );
+    }
+}
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 0890f18..9301907 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -345,7 +345,7 @@
 
        /**
         * Hooked to delegate prefix searching to Searcher.
-        * @param int $namespace namespace to search
+        * @param int[] $namespaces namespace to search
         * @param string $search search text
         * @param int $limit maximum number of titles to return
         * @param array(string) $results outbound variable with string versions 
of titles
diff --git a/includes/Maintenance/Reindexer.php 
b/includes/Maintenance/Reindexer.php
index bf22b90..ad2713b 100644
--- a/includes/Maintenance/Reindexer.php
+++ b/includes/Maintenance/Reindexer.php
@@ -7,7 +7,6 @@
 use Elastica\Document;
 use Elastica\Exception\Connection\HttpException;
 use Elastica\Exception\ExceptionInterface;
-use Elastica\Filter\Script;
 use Elastica\Index;
 use Elastica\Query;
 use Elastica\Type;
@@ -130,6 +129,8 @@
         * @param float $acceptableCountDeviation
         */
        public function reindex( $processes = 1, $refreshInterval = 1, 
$retryAttempts = 5, $chunkSize = 100, $acceptableCountDeviation = .05 ) {
+               global $wgCirrusSearchWikimediaExtraPlugin;
+
                // Set some settings that should help io load during bulk 
indexing.  We'll have to
                // optimize after this to consolidate down to a proper number 
of shards but that is
                // is worth the price.  total_shards_per_node will help to make 
sure that each shard
@@ -144,6 +145,11 @@
                ) );
 
                if ( $processes > 1 ) {
+                       if ( !isset( $wgCirrusSearchWikimediaExtraPlugin[ 
'id_hash_mod_filter' ] ) ||
+                                       !$wgCirrusSearchWikimediaExtraPlugin[ 
'id_hash_mod_filter' ] ) {
+                               $this->error( "Can't use multiple processes 
without \$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true", 1 
);
+                       }
+
                        $fork = new ReindexForkController( $processes );
                        $forkResult = $fork->start();
                        // Forking clears the timeout so we have to reinstate 
it.
@@ -259,17 +265,14 @@
                        $messagePrefix = "\t\t[$childNumber] ";
                        $this->outputIndented( $messagePrefix . "Starting child 
process reindex\n" );
                        // Note that it is not ok to abs(_uid.hashCode) because 
hashCode(Integer.MIN_VALUE) == Integer.MIN_VALUE
-                       $filter = new Script( array(
-                               'script' => "(doc['_uid'].value.hashCode() & 
Integer.MAX_VALUE) % $children == $childNumber",
-                               'lang' => 'groovy'
-                       ) );
+                       $filter = new \CirrusSearch\Extra\Filter\IdHashMod( 
$children, $childNumber );
                }
                $properties = 
$this->mappingConfig[$oldType->getName()]['properties'];
                try {
                        $query = new Query();
                        $query->setFields( array( '_id', '_source' ) );
                        if ( $filter ) {
-                               $query->setFilter( $filter );
+                               $query->setQuery( new \Elastica\Query\Filtered( 
new \Elastica\Query\MatchAll(), $filter ) );
                        }
 
                        // Note here we dump from the current index (using the 
alias) so we can use Connection::getPageType
diff --git a/includes/Maintenance/Validators/CacheWarmersValidator.php 
b/includes/Maintenance/Validators/CacheWarmersValidator.php
index 42ab657..72ffdd9 100644
--- a/includes/Maintenance/Validators/CacheWarmersValidator.php
+++ b/includes/Maintenance/Validators/CacheWarmersValidator.php
@@ -81,7 +81,7 @@
                        // filters. That is probably OK because most searches 
don't use one.
                        // It'd be overeager.
                        null
-               // Null user because we won't be logging anything about the 
user.
+                       // Null user because we won't be logging anything about 
the user.
                );
                $searcher->setReturnQuery( true );
                $searcher->setResultsType( new FullTextResultsType( 
FullTextResultsType::HIGHLIGHT_ALL ) );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 872367d..b2d7c36 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -1420,6 +1420,7 @@
         */
        private function installBoosts() {
                global $wgCirrusSearchFunctionRescoreWindowSize,
+                       $wgCirrusSearchWikimediaExtraPlugin,
                        $wgCirrusSearchLanguageWeight,
                        $wgLanguageCode;
 
@@ -1436,26 +1437,38 @@
 
                // Customize score by boosting based on incoming links count
                if ( $this->boostLinks ) {
-                       $incomingLinks = "(doc['incoming_links'].isEmpty() ? 0 
: doc['incoming_links'].value)";
-                       $scoreBoostGroovy = "log10($incomingLinks + 2)";
-                       $functionScore->addScriptScoreFunction( new 
\Elastica\Script( $scoreBoostGroovy, null, 'groovy' ) );
                        $useFunctionScore = true;
+                       if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 
'field_value_factor_with_default' ] ) &&
+                                       $wgCirrusSearchWikimediaExtraPlugin[ 
'field_value_factor_with_default' ] ) {
+                               $functionScore->addFunction( 
'field_value_factor_with_default', array(
+                                       'field' => 'incoming_links',
+                                       'modifier' => 'log2p',
+                                       'missing' => 0,
+                               ) );
+                       } else {
+                               $scoreBoostExpression = 
"log10(doc['incoming_links'].value + 2)";
+                               $functionScore->addScriptScoreFunction( new 
\Elastica\Script( $scoreBoostExpression, null, 'expression' ) );
+                       }
                }
 
                // Customize score by decaying a portion by time since last 
update
                if ( $this->preferRecentDecayPortion > 0 && 
$this->preferRecentHalfLife > 0 ) {
                        // Convert half life for time in days to decay constant 
for time in milliseconds.
                        $decayConstant = log( 2 ) / $this->preferRecentHalfLife 
/ 86400000;
-                       // e^ct - 1 where t is last modified time - now which 
is negative
-                       $exponentialDecayGroovy = "Math.expm1($decayConstant * 
(doc['timestamp'].value - Instant.now().getMillis()))";
-                       // p(e^ct - 1)
+                       $parameters = array(
+                               'decayConstant' => $decayConstant,
+                               'decayPortion' => 
$this->preferRecentDecayPortion,
+                               'nonDecayPortion' => 1 - 
$this->preferRecentDecayPortion,
+                               'now' => time() * 1000
+                       );
+
+                       // e^ct where t is last modified time - now which is 
negative
+                       $exponentialDecayExpression = "exp(decayConstant * 
(doc['timestamp'].value - now))";
                        if ( $this->preferRecentDecayPortion !== 1.0 ) {
-                               $exponentialDecayGroovy = 
"$exponentialDecayGroovy * $this->preferRecentDecayPortion";
+                               $exponentialDecayExpression = 
"$exponentialDecayExpression * decayPortion + nonDecayPortion";
                        }
-                       // p(e^ct - 1) + 1 which is easier to calculate than, 
but reduces to 1 - p + pe^ct
-                       // Which breaks the score into an unscaled portion (1 - 
p) and a scaled portion (p)
-                       $exponentialDecayGroovy = "$exponentialDecayGroovy + 1";
-                       $functionScore->addScriptScoreFunction( new 
\Elastica\Script( $exponentialDecayGroovy, null, 'groovy' ) );
+                       $functionScore->addScriptScoreFunction( new 
\Elastica\Script( $exponentialDecayExpression,
+                               $parameters, 'expression' ) );
                        $useFunctionScore = true;
                }
 
diff --git a/includes/Updater.php b/includes/Updater.php
index 78e3c8f..e2cb1f5 100644
--- a/includes/Updater.php
+++ b/includes/Updater.php
@@ -172,6 +172,8 @@
         * @return int Number of documents updated of -1 if there was an error
         */
        public function updatePages( $pages, $shardTimeout, $clientSideTimeout, 
$flags ) {
+               global $wgCirrusSearchWikimediaExtraPlugin;
+
                // Don't update the same page twice. We shouldn't, but meh
                $pageIds = array();
                $pages = array_filter( $pages, function( $page ) use ( 
&$pageIds ) {
@@ -191,10 +193,11 @@
                $allData = array_fill_keys( Connection::getAllIndexTypes(), 
array() );
                foreach ( $this->buildDocumentsForPages( $pages, $flags ) as 
$document ) {
                        $suffix = Connection::getIndexSuffixForNamespace( 
$document->get( 'namespace' ) );
-                       $allData[$suffix][] = $this->docToScript( $document );
-                       // To quickly switch back to sending doc as upsert 
instead of script, remove the line above
-                       // and switch to the one below:
-                       // $allData[$suffix][] = $document;
+                       if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 
'super_detect_noop' ] ) &&
+                                       $wgCirrusSearchWikimediaExtraPlugin[ 
'super_detect_noop' ] ) {
+                               $document = $this->docToSuperDetectNoopScript( 
$document );
+                       }
+                       $allData[$suffix][] = $document;
                }
                $count = 0;
                foreach( $allData as $indexType => $data ) {
@@ -357,47 +360,17 @@
                return $documents;
        }
 
-       private function docToScript( $doc ) {
-               $scriptText = <<<GROOVY
-changed = false;
-
-GROOVY;
+       /**
+        * Converts a document into a call to super_detect_noop from the 
wikimedia-extra plugin.
+        */
+       private function docToSuperDetectNoopScript( $doc ) {
                $params = $doc->getParams();
-               foreach ( $doc->getData() as $key => $value ) {
-                       if ( $key === 'incoming_links' ) {
-                               // incoming links has to be more then 20% 
incorrect before we update it.
-                               $scriptText .= <<<GROOVY
-if ( ctx._source.$key == null ) {
-       thisChanged = true;
-} else if ( $key == 0 ) {
-       thisChanged = ctx._source.$key != 0;
-} else {
-       thisChanged = abs( ctx._source.$key - $key ) / $key > 0.2;
-}
-if ( thisChanged ) {
-       changed = true;
-       ctx._source.$key = $key;
-}
+               $params[ 'source' ] = $doc->getData();
+               $params[ 'detectors' ] = array(
+                       'incoming_links' => 'within 20%',
+               );
 
-GROOVY;
-                       } else {
-                               $scriptText .= <<<GROOVY
-if ( changed || ctx._source.$key != $key ) {
-       changed = true;
-       ctx._source.$key = $key;
-}
-
-GROOVY;
-                       }
-                       $params[ $key ] = $value;
-               }
-               $scriptText .= <<<GROOVY
-if ( !changed ) {
-       ctx.op = "none";
-}
-
-GROOVY;
-               $script = new \Elastica\Script( $scriptText, $params, 'groovy' 
);
+               $script = new \Elastica\Script( 'super_detect_noop', $params, 
'native' );
                if ( $doc->getDocAsUpsert() ) {
                        $script->setUpsert( $doc );
                }
diff --git a/tests/jenkins/FullyFeaturedConfig.php 
b/tests/jenkins/FullyFeaturedConfig.php
index cb67fc0..b3e7fe8 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -32,7 +32,16 @@
        )
 );
 $wgCirrusSearchWikimediaExtraPlugin[ 'super_detect_noop' ] = true;
+$wgCirrusSearchWikimediaExtraPlugin[ 'field_value_factor_with_default' ] = 
true;
+$wgCirrusSearchWikimediaExtraPlugin[ 'id_hash_mod_filter' ] = true;
 
+$wgJobQueueAggregator = array(
+       'class'       => 'JobQueueAggregatorRedis',
+       'redisServer' => 'localhost',
+       'redisConfig' => array(
+               'password' => null,
+       ),
+);
 
 if ( class_exists( 'PoolCounter_Client' ) ) {
        // If the pool counter is around set up prod like pool counter settings
diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php
index f75783c..8fd5acd 100644
--- a/tests/jenkins/Jenkins.php
+++ b/tests/jenkins/Jenkins.php
@@ -63,13 +63,7 @@
                'password' => null,
        ),
 );
-$wgJobQueueAggregator = array(
-       'class'       => 'JobQueueAggregatorRedis',
-       'redisServer' => 'localhost',
-       'redisConfig' => array(
-               'password' => null,
-       ),
-);
+
 $wgCiteEnablePopups = true;
 $wgExtraNamespaces[ 760 ] = 'Mó';
 

-- 
To view, visit https://gerrit.wikimedia.org/r/224612
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I827f5c54b93db5229daf7caacecbf82159ba6502
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to