jenkins-bot has submitted this change and it was merged.
Change subject: Prefer articles in a user's language on multilingual wikis
......................................................................
Prefer articles in a user's language on multilingual wikis
Bug: 66829
Change-Id: I52402fb7e2d3c3ee56c64322c13dbec1e92c52d0
---
M CirrusSearch.php
M includes/BuildDocument/PageDataBuilder.php
M includes/ElasticsearchIntermediary.php
M includes/MappingConfigBuilder.php
M includes/Searcher.php
M tests/browser/features/full_text.feature
A tests/browser/features/relevancy.feature
M tests/browser/features/step_definitions/search_steps.rb
M tests/browser/features/support/hooks.rb
M tests/jenkins/Jenkins.php
10 files changed, 109 insertions(+), 40 deletions(-)
Approvals:
Chad: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CirrusSearch.php b/CirrusSearch.php
index da9ac57..4cc2484 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -239,6 +239,16 @@
// Default weight of a talk namespace relative to its corresponding non-talk
namespace.
$wgCirrusSearchTalkNamespaceWeight = 0.25;
+// Default weight of language field for multilingual wikis.
+// 'user' is the weight given to the user's language
+// 'wiki' is the weight given to the wiki's content language
+// If your wiki is only one language you can leave these at 0, otherwise try
setting it
+// to something like 5.0 for 'user' and 2.5 for 'wiki'
+$wgCirrusSearchLanguageWeight = array(
+ 'user' => 0.0,
+ 'wiki' => 0.0,
+);
+
// Portion of an article's score that decays with time since its last update.
Defaults to 0
// meaning don't decay the score at all unless prefer-recent: prefixes the
query.
$wgCirrusSearchPreferRecentDefaultDecayPortion = 0;
diff --git a/includes/BuildDocument/PageDataBuilder.php
b/includes/BuildDocument/PageDataBuilder.php
index f384174..26409d8 100644
--- a/includes/BuildDocument/PageDataBuilder.php
+++ b/includes/BuildDocument/PageDataBuilder.php
@@ -45,6 +45,10 @@
$this->templates();
}
+ // All content types have a language
+ $this->doc->add( 'language',
+ $this->title->getPageLanguage()->getCode() );
+
return $this->doc;
}
diff --git a/includes/ElasticsearchIntermediary.php
b/includes/ElasticsearchIntermediary.php
index 4e5aae6..ba959d8 100644
--- a/includes/ElasticsearchIntermediary.php
+++ b/includes/ElasticsearchIntermediary.php
@@ -24,10 +24,10 @@
*/
class ElasticsearchIntermediary {
/**
- * @var string|null the name or ip of the user for which we're
performing this search or null in the case of
+ * @var User|null user for which we're performing this search or null
in the case of
* requests kicked off by jobs
*/
- private $user = 'nobody';
+ protected $user;
/**
* @var float|null start time of current request or null if none is
running
*/
@@ -58,9 +58,7 @@
* slow. 0 means none count as slow.
*/
protected function __construct( $user, $slowSeconds ) {
- if ( $user ) {
- $this->user = 'User:' . $user->getName(); // name is
the ip address of anonymous users
- }
+ $this->user = $user;
$this->slowMillis = round( 1000 * $slowSeconds );
}
@@ -154,9 +152,7 @@
// Now log and clear our state.
wfDebugLog( 'CirrusSearchRequests', $logMessage );
if ( $this->slowMillis && $took >= $this->slowMillis ) {
- if ( $this->user ) {
- $logMessage .= " for $this->user";
- }
+ $logMessage .= $this->user ? ' for ' .
$this->user->getName() : '';
wfDebugLog( 'CirrusSearchSlowRequests', $logMessage );
}
$this->requestStart = null;
diff --git a/includes/MappingConfigBuilder.php
b/includes/MappingConfigBuilder.php
index e771a2f..5b3d17b 100644
--- a/includes/MappingConfigBuilder.php
+++ b/includes/MappingConfigBuilder.php
@@ -33,7 +33,7 @@
* and change the minor version when it changes but isn't
* incompatible
*/
- const VERSION = '1.3';
+ const VERSION = '1.4';
/**
* Whether to allow prefix searches to match on any word
@@ -145,7 +145,8 @@
'suggest' => array(
'type' => 'string',
'analyzer' => 'suggest',
- )
+ ),
+ 'language' => $this->buildKeywordField(),
),
);
wfRunHooks( 'CirrusSearchMappingConfig', array( &$config, $this
) );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 0b010d6..9f0d9cb 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -6,6 +6,7 @@
use \MWNamespace;
use \PoolCounterWorkViaCallback;
use \ProfileSection;
+use \RequestContext;
use \Sanitizer;
use \Status;
use \Title;
@@ -1197,7 +1198,9 @@
* If there is any boosting to be done munge the current query to
get it right.
*/
private function installBoosts() {
- global $wgCirrusSearchFunctionRescoreWindowSize;
+ global $wgCirrusSearchFunctionRescoreWindowSize,
+ $wgCirrusSearchLanguageWeight,
+ $wgLanguageCode;
// Quick note: At the moment ".isEmpty()" is _much_ faster
than ".empty". Never
// use ".empty". See
https://github.com/elasticsearch/elasticsearch/issues/5086
@@ -1265,6 +1268,26 @@
}
}
+ // Boost pages in a user's language
+ // I suppose using $wgLang would've been more evil than this,
but
+ // only marginally so. Find some real context to use here.
+ $userLang = RequestContext::getMain()->getLanguage()->getCode();
+ if ( $wgCirrusSearchLanguageWeight['user'] ) {
+ $functionScore->addBoostFactorFunction(
+ $wgCirrusSearchLanguageWeight['user'],
+ new \Elastica\Filter\Term( array( 'language' =>
$userLang ) )
+ );
+ $useFunctionScore = true;
+ }
+ // And a wiki's language, if it's different
+ if ( $userLang != $wgLanguageCode &&
$wgCirrusSearchLanguageWeight['wiki'] ) {
+ $functionScore->addBoostFactorFunction(
+ $wgCirrusSearchLanguageWeight['wiki'],
+ new \Elastica\Filter\Term( array( 'language' =>
$wgLanguageCode ) )
+ );
+ $useFunctionScore = true;
+ }
+
if ( !$useFunctionScore ) {
// Nothing to do
return;
diff --git a/tests/browser/features/full_text.feature
b/tests/browser/features/full_text.feature
index 6bca3fa..4bb33e6 100644
--- a/tests/browser/features/full_text.feature
+++ b/tests/browser/features/full_text.feature
@@ -153,34 +153,6 @@
When I search for incategory:ManyRedirectsTest Many Redirects Test
Then Manyredirectstarget is the first search result
- @relevancy
- Scenario: Results are sorted in the order we expect
- When I search for Relevancytest
- Then Relevancytest is the first search result
- And Relevancytestviaredirect is the second search result
- And Relevancytestviacategory is the third search result
- And Relevancytestviaheading is the fourth search result
- And Relevancytestviaopening is the fifth search result
- And Relevancytestviatext is the sixth search result
- And Relevancytestviaauxtext is the seventh search result
-
- @relevancy
- Scenario: Two word searches are sorted in the order we expect
- When I search for Relevancytwo Wordtest
- Then Relevancytwo Wordtest is the first search result
- And Wordtest Relevancytwo is the second search result
-
- @relevancy
- Scenario: Results are effected by the namespace boost
- When I search for all:Relevancynamespacetest
- Then Relevancynamespacetest is the first search result
- And Talk:Relevancynamespacetest is the second search result
- And File:Relevancynamespacetest is the third search result
- And Help:Relevancynamespacetest is the fourth search result
- And File talk:Relevancynamespacetest is the fifth search result
- And User talk:Relevancynamespacetest is the sixth search result
- And Template:Relevancynamespacetest is the seventh search result
-
@fallback_finder
Scenario: I can find things that Elasticsearch typically thinks of as word
breaks in the title
When I search for $US
diff --git a/tests/browser/features/relevancy.feature
b/tests/browser/features/relevancy.feature
new file mode 100644
index 0000000..a6840ae
--- /dev/null
+++ b/tests/browser/features/relevancy.feature
@@ -0,0 +1,40 @@
+@relevancy
+Feature: Result scoring
+ Background:
+ Given I am at a random page
+
+ Scenario: Results are sorted based on what part of the page matches: title,
redirect, category, etc
+ When I search for Relevancytest
+ Then Relevancytest is the first search result
+ And Relevancytestviaredirect is the second search result
+ And Relevancytestviacategory is the third search result
+ And Relevancytestviaheading is the fourth search result
+ And Relevancytestviaopening is the fifth search result
+ And Relevancytestviatext is the sixth search result
+ And Relevancytestviaauxtext is the seventh search result
+
+ Scenario: Words in order are worth more than words out of order
+ When I search for Relevancytwo Wordtest
+ Then Relevancytwo Wordtest is the first search result
+ And Wordtest Relevancytwo is the second search result
+
+ Scenario: Results are sorted based on namespace: main, talk, file, help,
file talk, etc
+ When I search for all:Relevancynamespacetest
+ Then Relevancynamespacetest is the first search result
+ And Talk:Relevancynamespacetest is the second search result
+ And File:Relevancynamespacetest is the third search result
+ And Help:Relevancynamespacetest is the fourth search result
+ And File talk:Relevancynamespacetest is the fifth search result
+ And User talk:Relevancynamespacetest is the sixth search result
+ And Template:Relevancynamespacetest is the seventh search result
+
+ Scenario: When the user doesn't set a language results are sorted with wiki language
ahead of other languages
+ When I search for Relevancylanguagetest
+ Then Relevancylanguagetest/en is the first search result
+
+ Scenario: When the user has a language results are sorted with user language
ahead of wiki language ahead of other languages
+ When I search for Relevancylanguagetest
+ And I switch the language to ja
+ Then Relevancylanguagetest/ja is the first search result
+ And Relevancylanguagetest/en is the second search result
+ And Relevancylanguagetest/ar is the third search result
diff --git a/tests/browser/features/step_definitions/search_steps.rb
b/tests/browser/features/step_definitions/search_steps.rb
index 1218e89..813ac20 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -51,6 +51,9 @@
end
end
end
+When(/^I switch the language to (.+)$/) do |language|
+ @browser.goto("#{@browser.url}&uselang=#{language}")
+end
When(/^I click the (.*) link$/) do |text|
@browser.link(:text => text).click
end
diff --git a/tests/browser/features/support/hooks.rb
b/tests/browser/features/support/hooks.rb
index 71e3310..2b71a15 100644
--- a/tests/browser/features/support/hooks.rb
+++ b/tests/browser/features/support/hooks.rb
@@ -419,6 +419,9 @@
And a page named File talk:Relevancynamespacetest exists
And a page named User talk:Relevancynamespacetest exists
And a page named Template:Relevancynamespacetest exists
+ And a page named Relevancylanguagetest/ja exists
+ And a page named Relevancylanguagetest/en exists
+ And a page named Relevancylanguagetest/ar exists
}
end
$relevancy = true
diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php
index cdaf3a4..020c429 100644
--- a/tests/jenkins/Jenkins.php
+++ b/tests/jenkins/Jenkins.php
@@ -36,7 +36,7 @@
$wgAutoloadClasses[ 'CirrusSearch\Jenkins\NukeAllIndexes' ] = __DIR__ .
'/nukeAllIndexes.php';
$wgHooks[ 'LoadExtensionSchemaUpdates' ][] =
'CirrusSearch\Jenkins\Jenkins::installDatabaseUpdatePostActions';
$wgHooks[ 'BeforeInitialize' ][] =
'CirrusSearch\Jenkins\Jenkins::recyclePruneAndUndelayJobs';
-
+$wgHooks[ 'PageContentLanguage' ][] =
'CirrusSearch\Jenkins\Jenkins::setLanguage';
// Dependencies
// Jenkins will automatically load these for us but it makes this file more
generally useful
@@ -83,6 +83,9 @@
$wgShowExceptionDetails = true;
$wgCirrusSearchShowScore = true;
+$wgCirrusSearchLanguageWeight[ 'user' ] = 10.0;
+$wgCirrusSearchLanguageWeight[ 'wiki' ] = 5.0;
+
class Jenkins {
/**
* Installs maintenance scripts that provide a clean Elasticsearch
index for testing.
@@ -101,4 +104,18 @@
$jobQueue->recyclePruneAndUndelayJobs();
}
}
+
+ /**
+ * If the page ends in '/<language code>' then set the page's language
to that code.
+ * @param Title $title page title object
+ * @param string|Language $pageLang the page content language (either
an object or a language code)
+ * @param Language $wgLang the user language
+ */
+ public static function setLanguage( $title, &$pageLang, $wgLang ) {
+ $matches = array();
+ if ( preg_match( '/\/..$/', $title->getText(), $matches ) ) {
+ $pageLang = substr( $matches[ 0 ], 1 );
+ }
+ return true;
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/140866
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I52402fb7e2d3c3ee56c64322c13dbec1e92c52d0
Gerrit-PatchSet: 5
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Chad <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Nemo bis <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits