jenkins-bot has submitted this change and it was merged. Change subject: Use unicode plugin when present ......................................................................
Use unicode plugin when present We replace the lowercase filter with ICU Unicode case folding, which should do the same thing _and_ fold together similar characters. Change-Id: Ied112d1c942045e357f9e7ad5f97400377ce164c --- M includes/AnalysisConfigBuilder.php M tests/browser/features/full_text.feature M tests/browser/features/step_definitions/search_steps.rb M tests/browser/features/support/hooks.rb 4 files changed, 56 insertions(+), 2 deletions(-) Approvals: Chad: Looks good to me, approved jenkins-bot: Verified diff --git a/includes/AnalysisConfigBuilder.php b/includes/AnalysisConfigBuilder.php index 52ac4ee..41a42d4 100644 --- a/includes/AnalysisConfigBuilder.php +++ b/includes/AnalysisConfigBuilder.php @@ -29,13 +29,18 @@ * and change the minor version when it changes but isn't * incompatible */ - const VERSION = '0.6'; + const VERSION = '0.7'; /** * Language code we're building analysis for * @var string */ private $language; + + /** + * @var boolean is the icu plugin available? + */ + private $icu; /** * Constructor @@ -49,6 +54,7 @@ $this->elasticsearchLanguageAnalyzers = array_merge( $this->elasticsearchLanguageAnalyzers, $extra ); } } + $this->icu = in_array( 'analysis-icu', $plugins ); } /** @@ -65,7 +71,7 @@ * Build an analysis config with sane defaults. 
*/ private function defaults() { - return array( + $defaults = array( 'analyzer' => array( 'text' => array( 'type' => $this->getDefaultTextAnalyzerType(), @@ -165,6 +171,22 @@ ), ), ); + foreach ( $defaults[ 'analyzer' ] as &$analyzer ) { + if ( $analyzer[ 'type' ] === 'default' ) { + $analyzer = array( + 'type' => 'custom', + 'tokenizer' => 'standard', + 'filter' => array( 'standard', 'lowercase' ), + ); + } + } + if ( $this->icu ) { + $defaults[ 'filter' ][ 'icu_normalizer' ] = array( + 'type' => 'icu_normalizer', + 'name' => 'nfkc_cf', + ); + } + return $defaults; } /** @@ -215,6 +237,20 @@ if ( $config[ 'analyzer' ][ 'text_search' ][ 'type' ] === 'hebrew' ) { $config[ 'analyzer' ][ 'text_search' ][ 'type' ] = 'hebrew_exact'; } + break; + } + if ( $this->icu ) { + foreach ( $config[ 'analyzer' ] as &$analyzer ) { + if ( !isset( $analyzer[ 'filter' ] ) ) { + continue; + } + $analyzer[ 'filter' ] = array_map( function( $filter ) { + if ( $filter === 'lowercase' ) { + return 'icu_normalizer'; + } + return $filter; + }, $analyzer[ 'filter' ] ); + } } return $config; } diff --git a/tests/browser/features/full_text.feature b/tests/browser/features/full_text.feature index 8f20239..df3dcef 100644 --- a/tests/browser/features/full_text.feature +++ b/tests/browser/features/full_text.feature @@ -225,3 +225,14 @@ | Africa | África | | AlphaBeta | AlphaBeta | | ÁlphaBeta | none | + + @unicode_normalization + Scenario Outline: Searching for similar unicode characters finds all variants + When I search for <term> + Then there are 4 search results + Examples: + | term | + | वाङ्मय | + | वाङ्मय | + | वाङ्मय | + | वाङ्मय | diff --git a/tests/browser/features/step_definitions/search_steps.rb b/tests/browser/features/step_definitions/search_steps.rb index c386647..ccca030 100644 --- a/tests/browser/features/step_definitions/search_steps.rb +++ b/tests/browser/features/step_definitions/search_steps.rb @@ -171,6 +171,9 @@ Then(/^there are no search results$/) do 
on(SearchResultsPage).first_result_element.should_not exist end +Then(/^there are (\d+) search results$/) do |results| + on(SearchResultsPage).search_results_element.items.should == results.to_i +end Then(/^within (\d+) seconds searching for (.*) yields (.*) as the first result$/) do |seconds, term, title| within(seconds) do step("I search for " + term) diff --git a/tests/browser/features/support/hooks.rb b/tests/browser/features/support/hooks.rb index f2a0335..1e2c8d9 100644 --- a/tests/browser/features/support/hooks.rb +++ b/tests/browser/features/support/hooks.rb @@ -11,6 +11,10 @@ And a page named Two Words exists with contents ffnonesenseword catapult {{Template_Test}} anotherword [[Category:TwoWords]] [[Category:Categorywith Twowords]] And a page named AlphaBeta exists with contents [[Category:Alpha]] [[Category:Beta]] And a page named IHaveATwoWordCategory exists with contents [[Category:CategoryWith ASpace]] + And a page named वाङ्मय exists + And a page named वाङ्मय exists + And a page named वाङ्मय exists + And a page named वाङ्मय exists } $setup_main = true end -- To view, visit https://gerrit.wikimedia.org/r/132226 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ied112d1c942045e357f9e7ad5f97400377ce164c Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Chad <ch...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits