Thcipriani has uploaded a new change for review. https://gerrit.wikimedia.org/r/307350
Change subject: Revert "Revert "CirrusSearch BM25 A/B test config"" ...................................................................... Revert "Revert "CirrusSearch BM25 A/B test config"" This reverts commit 15e4dcafcbc053de433767181a601bd1e8b7ec5a. Change-Id: Ia26197e9b7c6e496d8d7e21b629bb0725974286b --- M tests/cirrusTest.php M wmf-config/CirrusSearch-common.php 2 files changed, 309 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config refs/changes/50/307350/1 diff --git a/tests/cirrusTest.php b/tests/cirrusTest.php index 06268d5..7772c42 100644 --- a/tests/cirrusTest.php +++ b/tests/cirrusTest.php @@ -101,6 +101,9 @@ // variables that would have been setup elsewhere, perhaps in mediawiki // default settings or by CommonSettings.php, or by CirrusSearch.php, // but none of those are a part of this repository + $wgCirrusSearchRescoreProfiles = array(); + $wgCirrusSearchRescoreFunctionScoreChains = array(); + $wgCirrusSearchFullTextQueryBuilderProfiles = array(); $wgJobTypeConf = array( 'default' => array() ); $wgCirrusSearchWeights = array(); $wgCirrusSearchNamespaceWeights = array(); diff --git a/wmf-config/CirrusSearch-common.php b/wmf-config/CirrusSearch-common.php index e20cbea..0b73b1c 100644 --- a/wmf-config/CirrusSearch-common.php +++ b/wmf-config/CirrusSearch-common.php @@ -33,6 +33,312 @@ # Enable user testing $wgCirrusSearchUserTesting = $wmgCirrusSearchUserTesting; +# BM25 A/B test, enabled only on enwiki to avoid conflicts with +# TextCat language detection +if ( $wgDBname === 'enwiki' ) { +# UserTesting requires that a var exists in $GLOBALS before setting it +# All extra vars needed to customize rescore weights + $wgCirrusSearchPageViewsW = 1.0; + $wgCirrusSearchPageViewsK = 1.0; + $wgCirrusSearchPageViewsA = 1.0; + $wgCirrusSearchIncLinksW = 1.0; + $wgCirrusSearchIncLinksK = 1.0; + $wgCirrusSearchIncLinksA = 1.0; + $wgCirrusSearchIncLinksAloneW = 1.0; + $wgCirrusSearchIncLinksAloneK = 1.0; + 
$wgCirrusSearchIncLinksAloneA = 1.0; + + $wgCirrusSearchUserTesting['bm25'] = [ + 'sampleRate' => 0, + 'globals' => [ + 'wgCirrusSearchBoostTemplates' => [], + 'wgCirrusSearchRescoreProfiles' => $wgCirrusSearchRescoreProfiles + [ + 'wsum_inclinks' => [ + 'supported_namespaces' => 'all', + 'rescore' => [ + [ + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'total', + 'type' => 'function_score', + 'function_chain' => 'wsum_inclinks' + ], + [ + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'multiply', + 'type' => 'function_score', + 'function_chain' => 'optional_chain' + ], + ], + ], + 'wsum_inclinks_pv' => [ + 'supported_namespaces' => 'content', + 'fallback_profile' => 'wsum_inclinks', + 'rescore' => [ + [ + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'total', + 'type' => 'function_score', + 'function_chain' => 'wsum_inclinks_pv' + ], + [ + 'window' => 8192, + 'window_size_override' => 'CirrusSearchFunctionRescoreWindowSize', + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'multiply', + 'type' => 'function_score', + 'function_chain' => 'optional_chain' + ], + ], + ], + ], + 'wgCirrusSearchRescoreFunctionScoreChains' => $wgCirrusSearchRescoreFunctionScoreChains + [ + 'wsum_inclinks' => [ + 'functions' => [ + [ + 'type' => 'satu', + 'weight' => [ + 'value' => 1.2, + 'config_override' => 'CirrusSearchIncLinksAloneW', + 'uri_param_override' => 'cirrusIncLinksAloneW', + ], + 'params' => [ + 'field' => 'incoming_links', + 'k' => [ + 'value' => 10, + 'config_override' => 'CirrusSearchIncLinksAloneK', + 'uri_param_override' => 'cirrusIncLinksAloneK', + ], + 'a' => [ + 'value' => 1, + 'config_override' 
=> 'CirrusSearchIncLinksAloneA', + 'uri_param_override' => 'cirrusIncLinksAloneA', + ] + ], + ], + ], + ], + 'wsum_inclinks_pv' => [ + 'score_mode' => 'sum', + 'boost_mode' => 'sum', + 'functions' => [ + [ + 'type' => 'satu', + 'weight' => [ + 'value' => 1.8, + 'config_override' => 'CirrusSearchPageViewsW', + 'uri_param_override' => 'cirrusPageViewsW', + ], + 'params' => [ + 'field' => 'popularity_score', + 'k' => [ + 'value' => 0.0000007, + 'config_override' => 'CirrusSearchPageViewsK', + 'uri_param_override' => 'cirrusPageViewsK', + ], + 'a' => [ + 'value' => 1, + 'config_override' => 'CirrusSearchPageViewsA', + 'uri_param_override' => 'cirrusPageViewsA', + ], + ], + ], + [ + 'type' => 'satu', + 'weight' => [ + 'value' => 0.6, + 'config_override' => 'CirrusSearchIncLinksW', + 'uri_param_override' => 'cirrusIncLinkssW', + ], + 'params' => [ + 'field' => 'incoming_links', + 'k' => [ + 'value' => 10, + 'config_override' => 'CirrusSearchIncLinksK', + 'uri_param_override' => 'cirrusIncLinksK', + ], + 'a' => [ + 'value' => 1, + 'config_override' => 'CirrusSearchIncLinksA', + 'uri_param_override' => 'cirrusIncLinksA', + ], + ], + ], + ], + ], + ], + 'wgCirrusSearchFullTextQueryBuilderProfiles' => $wgCirrusSearchFullTextQueryBuilderProfiles + [ + 'perfield_builder' => [ + 'builder_class' => \CirrusSearch\Query\FullTextSimpleMatchQueryBuilder::class, + 'settings' => [ + 'default_min_should_match' => '1', + 'default_query_type' => 'most_fields', + 'default_stem_weight' => 3.0, + 'fields' => [ + 'title' => 0.3, + 'redirect.title' => [ + 'boost' => 0.27, + 'in_dismax' => 'redirects_or_shingles' + ], + 'suggest' => [ + 'is_plain' => true, + 'boost' => 0.20, + 'in_dismax' => 'redirects_or_shingles', + ], + 'category' => 0.05, + 'heading' => 0.05, + 'text' => [ + 'boost' => 0.6, + 'in_dismax' => 'text_and_opening_text', + ], + 'opening_text' => [ + 'boost' => 0.5, + 'in_dismax' => 'text_and_opening_text', + ], + 'auxiliary_text' => 0.05, + 'file_text' => 0.5, + ], + 
'phrase_rescore_fields' => [ + // very low (don't forget it's multiplied by 10 by default) + // Use the all field to avoid loading positions on another field, + // score is roughly the same when used on text + 'all' => 0.03, + 'all.plain' => 0.05, + ], + ], + ], + ], + ], + 'buckets' => [ + // Prod settings on eqiad + // nDCG@5 0.2772 (enwiki scores excluded) + 'control' => [ + 'trigger' => 'bm25:control', + 'globals' => [ + 'wgCirrusSearchFullTextQueryBuilderProfile' => 'default', + 'wgCirrusSearchExtraBackendLatency' => 30000, + ], + ], + // BM25+allfield and QueryString, inclinks as a sum + // nDCG@5 0.2689 (enwiki scores excluded) + 'bm25_allfield' => [ + 'trigger' => 'bm25:allfield', + 'globals' => [ + 'wgCirrusSearchDefaultCluster' => 'codfw', + 'wgCirrusSearchFullTextQueryBuilderProfile' => 'default', + 'wgCirrusSearchPhraseSuggestReverseField' => [ + 'build' => true, + 'use' => false, + ], + 'wgCirrusSearchIgnoreOnWikiBoostTemplates' => true, + // set only here because only needed for reindexing + 'wgCirrusSearchSimilarityProfile' => [ + 'similarity' => [ + 'arrays' => [ + 'type' => 'BM25', + 'k1' => 1.2, + 'b' => 0.3, + ], + 'default' => [ + 'type' => 'BM25', + 'k1' => 1.2, + 'b' => 0.75, + ], + ], + 'fields' => [ + '__default__' => 'default', + 'category' => 'arrays', + 'heading' => 'arrays', + 'redirect' => 'arrays', + 'suggest' => 'arrays', + ], + ], + 'wgCirrusSearchRescoreProfile' => 'wsum_inclinks', + 'wgCirrusSearchIncLinksAloneW' => 1.3, + 'wgCirrusSearchIncLinksAloneK' => 30, + 'wgCirrusSearchIncLinksAloneA' => 0.7, + ] + ], + // BM25, perfield and SimpleMatch Query builder, inclinks as a sum + // nDCG@5 0.3371 (enwiki scores excluded) + 'bm25_inclinks' => [ + 'trigger' => 'bm25:inclinks', + 'globals' => [ + 'wgCirrusSearchDefaultCluster' => 'codfw', + 'wgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder', + 'wgCirrusSearchIgnoreOnWikiBoostTemplates' => true, + 'wgCirrusSearchPhraseSuggestReverseField' => [ + 'build' => true, + 'use' 
=> false, + ], + 'wgCirrusSearchRescoreProfile' => 'wsum_inclinks', + 'wgCirrusSearchIncLinksAloneW' => 6.5, + 'wgCirrusSearchIncLinksAloneK' => 30, + 'wgCirrusSearchIncLinksAloneA' => 0.7, + ] + ], + // BM25, perfield and SimpleMatch Query builder, inclinks+pop score as a sum + // nDCG@5 0.3368 (enwiki scores excluded) + 'bm25_inclinks_pv' => [ + 'trigger' => 'bm25:inclinks_pv', + 'globals' => [ + 'wgCirrusSearchDefaultCluster' => 'codfw', + 'wgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder', + 'wgCirrusSearchIgnoreOnWikiBoostTemplates' => true, + 'wgCirrusSearchPhraseSuggestReverseField' => [ + 'build' => true, + 'use' => false, + ], + 'wgCirrusSearchRescoreProfile' => 'wsum_inclinks_pv', + 'wgCirrusSearchPageViewsW' => 1.5, + 'wgCirrusSearchPageViewsK' => 8E-6, + 'wgCirrusSearchPageViewsA' => 0.8, + 'wgCirrusSearchIncLinksW' => 5.0, + 'wgCirrusSearchIncLinksK' => 30, + 'wgCirrusSearchIncLinksA' => 0.7, + 'wgCirrusSearchIncLinksAloneW' => 6.5, + 'wgCirrusSearchIncLinksAloneK' => 30, + 'wgCirrusSearchIncLinksAloneA' => 0.7, + ] + ], + // BM25, perfield and SimpleMatch Query builder, inclinks+pop score as a sum + // nDCG@5 0.3368 (enwiki scores excluded) + // Reverse field enabled for DYM + 'bm25_inclinks_pv_rev' => [ + 'trigger' => 'bm25:inclinks_pv_rev', + 'globals' => [ + 'wgCirrusSearchDefaultCluster' => 'codfw', + 'wgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder', + 'wgCirrusSearchPhraseSuggestReverseField' => [ + 'build' => true, + 'use' => true, + ], + 'wgCirrusSearchIgnoreOnWikiBoostTemplates' => true, + 'wgCirrusSearchPageViewsW' => 1.5, + 'wgCirrusSearchPageViewsK' => 8E-6, + 'wgCirrusSearchPageViewsA' => 0.8, + 'wgCirrusSearchIncLinksW' => 5.0, + 'wgCirrusSearchIncLinksK' => 30, + 'wgCirrusSearchIncLinksA' => 0.7, + 'wgCirrusSearchRescoreProfile' => 'wsum_inclinks_pv', + 'wgCirrusSearchIncLinksAloneW' => 6.5, + 'wgCirrusSearchIncLinksAloneK' => 30, + 'wgCirrusSearchIncLinksAloneA' => 0.7, + ], + ], + ], + ]; +} + # Turn off 
leading wildcard matches, they are a very slow and inefficient query $wgCirrusSearchAllowLeadingWildcard = false; -- To view, visit https://gerrit.wikimedia.org/r/307350 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia26197e9b7c6e496d8d7e21b629bb0725974286b Gerrit-PatchSet: 1 Gerrit-Project: operations/mediawiki-config Gerrit-Branch: master Gerrit-Owner: Thcipriani <tcipri...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits