jenkins-bot has submitted this change and it was merged.

Change subject: Prefer articles in a user's language on multilingual wikis
......................................................................


Prefer articles in a user's language on multilingual wikis

Bug: 66829
Change-Id: I52402fb7e2d3c3ee56c64322c13dbec1e92c52d0
---
M CirrusSearch.php
M includes/BuildDocument/PageDataBuilder.php
M includes/ElasticsearchIntermediary.php
M includes/MappingConfigBuilder.php
M includes/Searcher.php
M tests/browser/features/full_text.feature
A tests/browser/features/relevancy.feature
M tests/browser/features/step_definitions/search_steps.rb
M tests/browser/features/support/hooks.rb
M tests/jenkins/Jenkins.php
10 files changed, 109 insertions(+), 40 deletions(-)

Approvals:
  Chad: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index da9ac57..4cc2484 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -239,6 +239,16 @@
 // Default weight of a talk namespace relative to its corresponding non-talk 
namespace.
 $wgCirrusSearchTalkNamespaceWeight = 0.25;
 
+// Default weight of language field for multilingual wikis.
+// 'user' is the weight given to the user's language
+// 'wiki' is the weight given to the wiki's content language
+// If your wiki is only one language you can leave these at 0; otherwise try 
setting them
+// to something like 5.0 for 'user' and 2.5 for 'wiki'
+$wgCirrusSearchLanguageWeight = array(
+       'user' => 0.0,
+       'wiki' => 0.0,
+);
+
 // Portion of an article's score that decays with time since it's last update. 
 Defaults to 0
 // meaning don't decay the score at all unless prefer-recent: prefixes the 
query.
 $wgCirrusSearchPreferRecentDefaultDecayPortion = 0;
diff --git a/includes/BuildDocument/PageDataBuilder.php 
b/includes/BuildDocument/PageDataBuilder.php
index f384174..26409d8 100644
--- a/includes/BuildDocument/PageDataBuilder.php
+++ b/includes/BuildDocument/PageDataBuilder.php
@@ -45,6 +45,10 @@
                                $this->templates();
                }
 
+               // All content types have a language
+               $this->doc->add( 'language',
+                       $this->title->getPageLanguage()->getCode() );
+
                return $this->doc;
        }
 
diff --git a/includes/ElasticsearchIntermediary.php 
b/includes/ElasticsearchIntermediary.php
index 4e5aae6..ba959d8 100644
--- a/includes/ElasticsearchIntermediary.php
+++ b/includes/ElasticsearchIntermediary.php
@@ -24,10 +24,10 @@
  */
 class ElasticsearchIntermediary {
        /**
-        * @var string|null the name or ip of the user for which we're 
performing this search or null in the case of
+        * @var User|null user for which we're performing this search or null 
in the case of
         * requests kicked off by jobs
         */
-       private $user = 'nobody';
+       protected $user;
        /**
         * @var float|null start time of current request or null if none is 
running
         */
@@ -58,9 +58,7 @@
         * slow.  0 means none count as slow.
         */
        protected function __construct( $user, $slowSeconds ) {
-               if ( $user ) {
-                       $this->user = 'User:' . $user->getName(); // name is 
the ip address of anonymous users
-               }
+               $this->user = $user;
                $this->slowMillis = round( 1000 * $slowSeconds );
        }
 
@@ -154,9 +152,7 @@
                // Now log and clear our state.
                wfDebugLog( 'CirrusSearchRequests', $logMessage );
                if ( $this->slowMillis && $took >= $this->slowMillis ) {
-                       if ( $this->user ) {
-                               $logMessage .= " for $this->user";
-                       }
+                       $logMessage .= $this->user ? ' for ' . 
$this->user->getName() : '';
                        wfDebugLog( 'CirrusSearchSlowRequests', $logMessage );
                }
                $this->requestStart = null;
diff --git a/includes/MappingConfigBuilder.php 
b/includes/MappingConfigBuilder.php
index e771a2f..5b3d17b 100644
--- a/includes/MappingConfigBuilder.php
+++ b/includes/MappingConfigBuilder.php
@@ -33,7 +33,7 @@
         * and change the minor version when it changes but isn't
         * incompatible
         */
-       const VERSION = '1.3';
+       const VERSION = '1.4';
 
        /**
         * Whether to allow prefix searches to match on any word
@@ -145,7 +145,8 @@
                                'suggest' => array(
                                        'type' => 'string',
                                        'analyzer' => 'suggest',
-                               )
+                               ),
+                               'language' => $this->buildKeywordField(),
                        ),
                );
                wfRunHooks( 'CirrusSearchMappingConfig', array( &$config, $this 
) );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 0b010d6..9f0d9cb 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -6,6 +6,7 @@
 use \MWNamespace;
 use \PoolCounterWorkViaCallback;
 use \ProfileSection;
+use \RequestContext;
 use \Sanitizer;
 use \Status;
 use \Title;
@@ -1197,7 +1198,9 @@
         * If there is any boosting to be done munge the the current query to 
get it right.
         */
        private function installBoosts() {
-               global $wgCirrusSearchFunctionRescoreWindowSize;
+               global $wgCirrusSearchFunctionRescoreWindowSize,
+                       $wgCirrusSearchLanguageWeight,
+                       $wgLanguageCode;
 
                // Quick note:  At the moment ".isEmpty()" is _much_ faster 
then ".empty".  Never
                // use ".empty".  See 
https://github.com/elasticsearch/elasticsearch/issues/5086
@@ -1265,6 +1268,26 @@
                        }
                }
 
+               // Boost pages in a user's language
+               // I suppose using $wgLang would've been more evil than this, 
but
+               // only marginally so. Find some real context to use here.
+               $userLang = RequestContext::getMain()->getLanguage()->getCode();
+               if ( $wgCirrusSearchLanguageWeight['user'] ) {
+                       $functionScore->addBoostFactorFunction(
+                               $wgCirrusSearchLanguageWeight['user'],
+                               new \Elastica\Filter\Term( array( 'language' => 
$userLang ) )
+                       );
+                       $useFunctionScore = true;
+               }
+               // And a wiki's language, if it's different
+               if ( $userLang != $wgLanguageCode && 
$wgCirrusSearchLanguageWeight['wiki'] ) {
+                       $functionScore->addBoostFactorFunction(
+                               $wgCirrusSearchLanguageWeight['wiki'],
+                               new \Elastica\Filter\Term( array( 'language' => 
$wgLanguageCode ) )
+                       );
+                       $useFunctionScore = true;
+               }
+
                if ( !$useFunctionScore ) {
                        // Nothing to do
                        return;
diff --git a/tests/browser/features/full_text.feature 
b/tests/browser/features/full_text.feature
index 6bca3fa..4bb33e6 100644
--- a/tests/browser/features/full_text.feature
+++ b/tests/browser/features/full_text.feature
@@ -153,34 +153,6 @@
     When I search for incategory:ManyRedirectsTest Many Redirects Test
     Then Manyredirectstarget is the first search result
 
-  @relevancy
-  Scenario: Results are sorted in the order we expect
-    When I search for Relevancytest
-    Then Relevancytest is the first search result
-    And Relevancytestviaredirect is the second search result
-    And Relevancytestviacategory is the third search result
-    And Relevancytestviaheading is the fourth search result
-    And Relevancytestviaopening is the fifth search result
-    And Relevancytestviatext is the sixth search result
-    And Relevancytestviaauxtext is the seventh search result
-
-  @relevancy
-  Scenario: Two word searches are sorted in the order we expect
-    When I search for Relevancytwo Wordtest
-    Then Relevancytwo Wordtest is the first search result
-    And Wordtest Relevancytwo is the second search result
-
-  @relevancy
-  Scenario: Results are effected by the namespace boost
-    When I search for all:Relevancynamespacetest
-    Then Relevancynamespacetest is the first search result
-    And Talk:Relevancynamespacetest is the second search result
-    And File:Relevancynamespacetest is the third search result
-    And Help:Relevancynamespacetest is the fourth search result
-    And File talk:Relevancynamespacetest is the fifth search result
-    And User talk:Relevancynamespacetest is the sixth search result
-    And Template:Relevancynamespacetest is the seventh search result
-
   @fallback_finder
   Scenario: I can find things that Elasticsearch typically thinks of as word 
breaks in the title
     When I search for $US
diff --git a/tests/browser/features/relevancy.feature 
b/tests/browser/features/relevancy.feature
new file mode 100644
index 0000000..a6840ae
--- /dev/null
+++ b/tests/browser/features/relevancy.feature
@@ -0,0 +1,40 @@
+@relevancy
+Feature: Result scoring
+  Background:
+    Given I am at a random page
+
+  Scenario: Results are sorted based on what part of the page matches: title, 
redirect, category, etc
+    When I search for Relevancytest
+    Then Relevancytest is the first search result
+    And Relevancytestviaredirect is the second search result
+    And Relevancytestviacategory is the third search result
+    And Relevancytestviaheading is the fourth search result
+    And Relevancytestviaopening is the fifth search result
+    And Relevancytestviatext is the sixth search result
+    And Relevancytestviaauxtext is the seventh search result
+
+  Scenario: Words in order are worth more than words out of order
+    When I search for Relevancytwo Wordtest
+    Then Relevancytwo Wordtest is the first search result
+    And Wordtest Relevancytwo is the second search result
+
+  Scenario: Results are sorted based on namespace: main, talk, file, help, 
file talk, etc
+    When I search for all:Relevancynamespacetest
+    Then Relevancynamespacetest is the first search result
+    And Talk:Relevancynamespacetest is the second search result
+    And File:Relevancynamespacetest is the third search result
+    And Help:Relevancynamespacetest is the fourth search result
+    And File talk:Relevancynamespacetest is the fifth search result
+    And User talk:Relevancynamespacetest is the sixth search result
+    And Template:Relevancynamespacetest is the seventh search result
+
+  Scenario: When the user doesn't set a language results are sorted with wiki language 
ahead of other languages
+    When I search for Relevancylanguagetest
+    Then Relevancylanguagetest/en is the first search result
+
+  Scenario: When the user has a language results are sorted with user language 
ahead of wiki language ahead of other languages
+    When I search for Relevancylanguagetest
+    And I switch the language to ja
+    Then Relevancylanguagetest/ja is the first search result
+    And Relevancylanguagetest/en is the second search result
+    And Relevancylanguagetest/ar is the third search result
diff --git a/tests/browser/features/step_definitions/search_steps.rb 
b/tests/browser/features/step_definitions/search_steps.rb
index 1218e89..813ac20 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -51,6 +51,9 @@
     end
   end
 end
+When(/^I switch the language to (.+)$/) do |language|
+  @browser.goto("#{@browser.url}&uselang=#{language}")
+end
 When(/^I click the (.*) link$/) do |text|
   @browser.link(:text => text).click
 end
diff --git a/tests/browser/features/support/hooks.rb 
b/tests/browser/features/support/hooks.rb
index 71e3310..2b71a15 100644
--- a/tests/browser/features/support/hooks.rb
+++ b/tests/browser/features/support/hooks.rb
@@ -419,6 +419,9 @@
       And a page named File talk:Relevancynamespacetest exists
       And a page named User talk:Relevancynamespacetest exists
       And a page named Template:Relevancynamespacetest exists
+      And a page named Relevancylanguagetest/ja exists
+      And a page named Relevancylanguagetest/en exists
+      And a page named Relevancylanguagetest/ar exists
     }
   end
   $relevancy = true
diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php
index cdaf3a4..020c429 100644
--- a/tests/jenkins/Jenkins.php
+++ b/tests/jenkins/Jenkins.php
@@ -36,7 +36,7 @@
 $wgAutoloadClasses[ 'CirrusSearch\Jenkins\NukeAllIndexes' ] = __DIR__ . 
'/nukeAllIndexes.php';
 $wgHooks[ 'LoadExtensionSchemaUpdates' ][] = 
'CirrusSearch\Jenkins\Jenkins::installDatabaseUpdatePostActions';
 $wgHooks[ 'BeforeInitialize' ][] = 
'CirrusSearch\Jenkins\Jenkins::recyclePruneAndUndelayJobs';
-
+$wgHooks[ 'PageContentLanguage' ][] = 
'CirrusSearch\Jenkins\Jenkins::setLanguage';
 
 // Dependencies
 // Jenkins will automatically load these for us but it makes this file more 
generally useful
@@ -83,6 +83,9 @@
 $wgShowExceptionDetails = true;
 $wgCirrusSearchShowScore = true;
 
+$wgCirrusSearchLanguageWeight[ 'user' ] = 10.0;
+$wgCirrusSearchLanguageWeight[ 'wiki' ] = 5.0;
+
 class Jenkins {
        /**
         * Installs maintenance scripts that provide a clean Elasticsearch 
index for testing.
@@ -101,4 +104,18 @@
                        $jobQueue->recyclePruneAndUndelayJobs();
                }
        }
+
+       /**
+        * If the page ends in '/<language code>' then set the page's language 
to that code.
+        * @param Title $title page title object
+        * @param string|Language $pageLang the page content language (either 
an object or a language code)
+        * @param Language $wgLang the user language
+        */
+       public static function setLanguage( $title, &$pageLang, $wgLang ) {
+               $matches = array();
+               if ( preg_match( '/\/..$/', $title->getText(), $matches ) ) {
+                       $pageLang = substr( $matches[ 0 ], 1 );
+               }
+               return true;
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/140866
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I52402fb7e2d3c3ee56c64322c13dbec1e92c52d0
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Chad <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Nemo bis <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to