jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/375087 )
Change subject: Fix display for fields which have alias in non-target language ...................................................................... Fix display for fields which have alias in non-target language Also supports new highlighter result syntax: 0:0-XX:YY|Text. For complete correctness, needs deployment of: https://gerrit.wikimedia.org/r/#/c/371747/ But should partially work without it while not distinguishing aliases/labels for non-target languages. Bug: T173231 Change-Id: Ib415113608a6e1c852932ffadd5e07c6642611da --- M repo/includes/Search/Elastic/ElasticTermResult.php M repo/tests/phpunit/data/entitySearch/search_de-ch-en.expected M repo/tests/phpunit/data/entitySearch/search_de-ch.expected M repo/tests/phpunit/data/entitySearch/search_de-ch_strict.expected M repo/tests/phpunit/data/entitySearch/search_en.expected M repo/tests/phpunit/data/entitySearch/search_en_strict.expected M repo/tests/phpunit/data/entitySearch/search_zh-de-ch.expected M repo/tests/phpunit/data/entitySearch/search_zh.expected M repo/tests/phpunit/includes/Search/Elastic/ElasticTermResultTest.php 9 files changed, 262 insertions(+), 68 deletions(-) Approvals: jenkins-bot: Verified DCausse: Looks good to me, approved diff --git a/repo/includes/Search/Elastic/ElasticTermResult.php b/repo/includes/Search/Elastic/ElasticTermResult.php index 6d1b043..c0c379c 100644 --- a/repo/includes/Search/Elastic/ElasticTermResult.php +++ b/repo/includes/Search/Elastic/ElasticTermResult.php @@ -105,14 +105,20 @@ 'type' => 'experimental', 'fragmenter' => "none", 'number_of_fragments' => 0, - 'options' => [ 'skip_if_last_matched' => true ], + 'options' => [ + 'skip_if_last_matched' => true, + 'return_snippets_and_offsets' => true + ], ]; } - $config['fields']['labels_all.prefix'] = [ + $config['fields']['labels.*.prefix'] = [ 'type' => 'experimental', 'fragmenter' => "none", 'number_of_fragments' => 0, - 'options' => [ 'skip_if_last_matched' => true ], + 'options' => [ + 'skip_if_last_matched' => true, + 'return_snippets_and_offsets' => true + ], ]; return $config; @@ -172,7 +178,6 @@ // Highlight part contains information about what has actually been matched. $highlight = $r->getHighlights(); - $matchedTermType = 'label'; $displayLabel = $this->findTermForDisplay( $sourceData, 'labels' ); $displayDescription = $this->findTermForDisplay( $sourceData, 'descriptions' ); @@ -184,25 +189,8 @@ // Something went wrong, we don't have any highlighting data continue; } else { - // We matched the actual label - find which language it is and - // whether it's main label (always first) or alias (not first) - $term = reset( $highlight ); - $term = $term[0]; // Highlighter returns array - $field = key( $highlight ); - if ( preg_match( '/^labels\.(.+?)\.prefix$/', $field, $match ) ) { - $langCode = $match[1]; - if ( !empty( $sourceData['labels'][$langCode] ) ) { - // Primary label always comes first, so if it's not the first one, - // it's an alias. - if ( $sourceData['labels'][$langCode][0] !== $term ) { - $matchedTermType = 'alias'; - } - } - } else { - // This is weird since we didn't ask to match anything else, - // but we'll return it anyway for debugging. - $langCode = 'unknown'; - } + list( $matchedTermType, $langCode, $term ) = + $this->extractTermFromHighlight( $highlight, $sourceData ); $matchedTerm = new Term( $langCode, $term ); } @@ -221,6 +209,60 @@ } /** + * New highlighter pattern. + * The new highlighter can return offsets as: 1:1-XX:YY|Text Snippet + * or even SNIPPET_START:MATCH1_START-MATCH1_END,MATCH2_START-MATCH2_END,...:SNIPPET_END|Text + */ + const HIGHLIGHT_PATTERN = '/^\d+:\d+-\d+(?:,\d+-\d+)*:\d+\|(.+)/'; + + /** + * Extract term, language and type from highlighter results. + * @param array $highlight Data from highlighter + * @param array $sourceData Data from _source + * @return array [ type, language, term ] + */ + private function extractTermFromHighlight( array $highlight, $sourceData ) { + /** + * Highlighter returns: + * { + * labels.en.prefix: [ + * "metre" // or "0:0-5:5|metre" + * ] + * } + */ + $matchedTermType = 'label'; + $term = reset( $highlight ); // Take the first one + $term = $term[0]; // Highlighter returns array + $field = key( $highlight ); + if ( preg_match( '/^labels\.([^.]+)\.prefix$/', $field, $match ) ) { + $langCode = $match[1]; + if ( preg_match( self::HIGHLIGHT_PATTERN, $term, $termMatch ) ) { + $isFirst = ( $term[0] === '0' ); + $term = $termMatch[1]; + } else { + $isFirst = true; + } + if ( !empty( $sourceData['labels'][$langCode] ) ) { + // Here we have match in one of the languages we asked for. + // Primary label always comes first, so if it's not the first one, + // it's an alias. + if ( $sourceData['labels'][$langCode][0] !== $term ) { + $matchedTermType = 'alias'; + } + } else { + // Here we have match in one of the "other" languages. + // If it's the first one in the list, it's label, otherwise it is alias. + $matchedTermType = $isFirst ? 'label' : 'alias'; + } + } else { + // This is weird since we didn't ask to match anything else, + // but we'll return it anyway for debugging. + $langCode = 'unknown'; + } + return [ $matchedTermType, $langCode, $term ]; + } + + /** * @return TermSearchResult[] Empty set of search results */ public function createEmptyResult() { diff --git a/repo/tests/phpunit/data/entitySearch/search_de-ch-en.expected b/repo/tests/phpunit/data/entitySearch/search_de-ch-en.expected index 59b17af..48d79ef 100644 --- a/repo/tests/phpunit/data/entitySearch/search_de-ch-en.expected +++ b/repo/tests/phpunit/data/entitySearch/search_de-ch-en.expected @@ -171,7 +171,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.de.prefix": { @@ -179,7 +180,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.en.prefix": { @@ -187,15 +189,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/data/entitySearch/search_de-ch.expected b/repo/tests/phpunit/data/entitySearch/search_de-ch.expected index 47d506d..48ef5ea 100644 --- a/repo/tests/phpunit/data/entitySearch/search_de-ch.expected +++ b/repo/tests/phpunit/data/entitySearch/search_de-ch.expected @@ -175,7 +175,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.de.prefix": { @@ -183,7 +184,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.en.prefix": { @@ -191,15 +193,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/data/entitySearch/search_de-ch_strict.expected b/repo/tests/phpunit/data/entitySearch/search_de-ch_strict.expected index e2fd15d..bc3b388 100644 --- a/repo/tests/phpunit/data/entitySearch/search_de-ch_strict.expected +++ b/repo/tests/phpunit/data/entitySearch/search_de-ch_strict.expected @@ -105,7 +105,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.de.prefix": { @@ -113,7 +114,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.en.prefix": { @@ -121,15 +123,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/data/entitySearch/search_en.expected b/repo/tests/phpunit/data/entitySearch/search_en.expected index fa1d28b..857c6bf 100644 --- a/repo/tests/phpunit/data/entitySearch/search_en.expected +++ b/repo/tests/phpunit/data/entitySearch/search_en.expected @@ -111,15 +111,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/data/entitySearch/search_en_strict.expected b/repo/tests/phpunit/data/entitySearch/search_en_strict.expected index 3030ae9..c6a9848 100644 --- a/repo/tests/phpunit/data/entitySearch/search_en_strict.expected +++ b/repo/tests/phpunit/data/entitySearch/search_en_strict.expected @@ -101,15 +101,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/data/entitySearch/search_zh-de-ch.expected b/repo/tests/phpunit/data/entitySearch/search_zh-de-ch.expected index a952591..008723b 100644 --- a/repo/tests/phpunit/data/entitySearch/search_zh-de-ch.expected +++ b/repo/tests/phpunit/data/entitySearch/search_zh-de-ch.expected @@ -385,7 +385,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-hans.prefix": { @@ -393,7 +394,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-hant.prefix": { @@ -401,7 +403,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-cn.prefix": { @@ -409,7 +412,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-tw.prefix": { @@ -417,7 +421,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-hk.prefix": { @@ -425,7 +430,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-sg.prefix": { @@ -433,7 +439,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-mo.prefix": { @@ -441,7 +448,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-my.prefix": { @@ -449,7 +457,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.en.prefix": { @@ -457,15 +466,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/data/entitySearch/search_zh.expected b/repo/tests/phpunit/data/entitySearch/search_zh.expected index ae8061e..88effae 100644 --- a/repo/tests/phpunit/data/entitySearch/search_zh.expected +++ b/repo/tests/phpunit/data/entitySearch/search_zh.expected @@ -399,7 +399,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-hans.prefix": { @@ -407,7 +408,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-hant.prefix": { @@ -415,7 +417,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-cn.prefix": { @@ -423,7 +426,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-tw.prefix": { @@ -431,7 +435,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-hk.prefix": { @@ -439,7 +444,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-sg.prefix": { @@ -447,7 +453,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-mo.prefix": { @@ -455,7 +462,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.zh-my.prefix": { @@ -463,7 +471,8 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, "labels.en.prefix": { @@ -471,15 +480,17 @@ "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } }, - "labels_all.prefix": { + "labels.*.prefix": { "type": "experimental", "fragmenter": "none", "number_of_fragments": 0, "options": { - "skip_if_last_matched": true + "skip_if_last_matched": true, + "return_snippets_and_offsets": true } } } diff --git a/repo/tests/phpunit/includes/Search/Elastic/ElasticTermResultTest.php b/repo/tests/phpunit/includes/Search/Elastic/ElasticTermResultTest.php index 7cb9ed2..24c3743 100644 --- a/repo/tests/phpunit/includes/Search/Elastic/ElasticTermResultTest.php +++ b/repo/tests/phpunit/includes/Search/Elastic/ElasticTermResultTest.php @@ -208,6 +208,120 @@ 'matchedType' => 'alias' ], ], + 'other language, label' => [ + [ 'en' ], + [ 'en' ], + [ + '_source' => [ + 'title' => 'Q56', + 'labels' => [ + 'en' => [ 'Name', 'And alias' ], + ], + ], + 'highlight' => [ 'labels.de.prefix' => [ 'Something else' ] ] + ], + [ + 'id' => 'Q56', + 'label' => [ 'en', 'Name' ], + 'matched' => [ 'de', 'Something else' ], + 'matchedType' => 'label' + ], + ], + 'simple, new highlighter' => [ + [ 'en' ], + [ 'en' ], + [ + '_source' => [ + 'title' => 'Q71', + 'labels' => [ 'en' => [ 'Test 1', 'Test 1 alias' ] ], + 'descriptions' => [ 'en' => 'Describe it' ], + ], + 'highlight' => [ 'labels.en.prefix' => [ '0:0-5:5|Test 1' ] ] + ], + [ + 'id' => 'Q71', + 'label' => [ 'en', 'Test 1' ], + 'description' => [ 'en', 'Describe it' ], + 'matched' => [ 'en', 'Test 1' ], + 'matchedType' => 'label' + ] + ], + 'alias, new highlighter' => [ + [ 'en' ], + [ 'en' ], + [ + '_source' => [ + 'title' => 'Q82', + 'labels' => [ 'en' => [ 'Test 1', 'Alias', 'Another' ] ], + 'descriptions' => [ 'en' => 'Describe it' ], + ], + 'highlight' => [ 'labels.en.prefix' => [ '10:10-15:15|Another' ] ] + ], + [ + 'id' => 'Q82', + 'label' => [ 'en', 'Test 1' ], + 'description' => [ 'en', 'Describe it' ], + 'matched' => [ 'en', 'Another' ], + 'matchedType' => 'alias' + ] + ], + 'other language, new hl' => [ + [ 'en' ], + [ 'en' ], + [ + '_source' => [ + 'title' => 'Q96', + 'labels' => [ + 'en' => [ 'Name', 'And alias' ], + ], + ], + 'highlight' => [ 'labels.de.prefix' => [ '0:0-7:7|Something else' ] ] + ], + [ + 'id' => 'Q96', + 'label' => [ 'en', 'Name' ], + 'matched' => [ 'de', 'Something else' ], + 'matchedType' => 'label' + ], + ], + 'other language, alias, new hl' => [ + [ 'en' ], + [ 'en' ], + [ + '_source' => [ + 'title' => 'Q106', + 'labels' => [ + 'en' => [ 'Name', 'And alias' ], + ], + ], + 'highlight' => [ 'labels.de.prefix' => [ '1:1-8:8|Something else' ] ] + ], + [ + 'id' => 'Q106', + 'label' => [ 'en', 'Name' ], + 'matched' => [ 'de', 'Something else' ], + 'matchedType' => 'alias' + ], + ], + 'alias, new highlighter, extended' => [ + [ 'en' ], + [ 'en' ], + [ + '_source' => [ + 'title' => 'Q117', + 'labels' => [ 'en' => [ 'Test 1', 'Alias', 'Another' ] ], + 'descriptions' => [ 'en' => 'Describe it' ], + ], + 'highlight' => [ 'labels.en.prefix' => [ '10:10-15,20-30,40-45:15|Another' ] ] + ], + [ + 'id' => 'Q117', + 'label' => [ 'en', 'Test 1' ], + 'description' => [ 'en', 'Describe it' ], + 'matched' => [ 'en', 'Another' ], + 'matchedType' => 'alias' + ] + ], ]; } -- To view, visit https://gerrit.wikimedia.org/r/375087 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ib415113608a6e1c852932ffadd5e07c6642611da Gerrit-PatchSet: 8 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Smalyshev <[email protected]> Gerrit-Reviewer: DCausse <[email protected]> Gerrit-Reviewer: Daniel Kinzler <[email protected]> Gerrit-Reviewer: EBernhardson <[email protected]> Gerrit-Reviewer: Smalyshev <[email protected]> Gerrit-Reviewer: Thiemo Mättig (WMDE) <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
