Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/387025 )
Change subject: Add special case handling for some forms of IDs ...................................................................... Add special case handling for some forms of IDs Cases handled: - q42 - (q42) - leading/trailing spaces - http://www.wikidata.org/entity/Q42 Bug: T179045 Bug: T179061 Bug: T179130 Change-Id: Icd588ab550a9d62f7f70603e2869e600bcc0f629 --- M repo/includes/Search/Elastic/EntitySearchElastic.php A repo/tests/phpunit/data/entitySearch/search_par.expected A repo/tests/phpunit/data/entitySearch/search_par.query A repo/tests/phpunit/data/entitySearch/search_prop.expected A repo/tests/phpunit/data/entitySearch/search_prop.query A repo/tests/phpunit/data/entitySearch/search_url.expected A repo/tests/phpunit/data/entitySearch/search_url.query 7 files changed, 572 insertions(+), 26 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/25/387025/1 diff --git a/repo/includes/Search/Elastic/EntitySearchElastic.php b/repo/includes/Search/Elastic/EntitySearchElastic.php index 7452a2f..18c9aa1 100644 --- a/repo/includes/Search/Elastic/EntitySearchElastic.php +++ b/repo/includes/Search/Elastic/EntitySearchElastic.php @@ -12,6 +12,7 @@ use Language; use WebRequest; use Wikibase\DataModel\Entity\EntityIdParser; +use Wikibase\DataModel\Entity\EntityIdParsingException; use Wikibase\LanguageFallbackChainFactory; use Wikibase\Lib\Interactors\TermSearchResult; use Wikibase\Repo\Api\EntitySearchHelper; @@ -139,6 +140,8 @@ $query = new BoolQuery(); $context->setOriginalSearchTerm( $text ); + // Drop leading spaces + $text = ltrim( $text ); if ( empty( $this->contentModelMap[$entityType] ) ) { $context->setResultsPossible( false ); $context->addWarning( 'wikibase-search-bad-entity-type', $entityType ); @@ -191,8 +194,8 @@ $labelsQuery->addFilter( $labelsFilter ); $labelsQuery->addMust( $dismax ); // TODO: this is a bit hacky, better way would be to make the field case-insensitive - // or add new subfield which is 
case-insensitive - $titleMatch = new Term( [ 'title.keyword' => strtoupper( $text ) ] ); + // or add new subfield which is case-insensitive + $titleMatch = new Term( [ 'title.keyword' => $this->normalizeId( $text ) ] ); // Match either labels or exact match to title $query->addShould( $labelsQuery ); @@ -206,37 +209,43 @@ } /** - * Create constant score query for a field. - * @param string $field - * @param string|double $boost - * @param string $text - * @return ConstantScore + * Parse entity ID or return null + * @param $text + * @return null|\Wikibase\DataModel\Entity\EntityId */ - private function makeConstScoreQuery( $field, $boost, $text ) { - $csquery = new ConstantScore(); - $csquery->setFilter( new Match( $field, $text ) ); - $csquery->setBoost( $boost ); - return $csquery; + private function parseOrNull($text) { + try { + $id = $this->idParser->parse( $text ); + } + catch ( EntityIdParsingException $ex ) { + return null; + } + return $id; } /** - * Get suitable rescore profile. - * If internal config has non, return just the name and let RescoureBuilder handle it. 
- * @return string|array + * If the text looks like ID, normalize it to ID title + * Cases handled: + * - q42 + * - (q42) + * - leading/trailing spaces + * - http://www.wikidata.org/entity/Q42 + * @param string $text + * @return string Normalized ID or original string */ - private function getRescoreProfile() { - - $rescoreProfile = $this->request->getVal( 'cirrusRescoreProfile' ); - if ( !$rescoreProfile && isset( $this->settings['defaultPrefixRescoreProfile'] ) ) { - $rescoreProfile = $this->settings['defaultPrefixRescoreProfile']; + private function normalizeId( $text ) { + $text = strtoupper( str_replace( [ '(', ')' ], '', trim( $text ) ) ); + $id = $this->parseOrNull( $text ); + if ( $id ) { + return $id->getSerialization(); } - if ( !$rescoreProfile ) { - $rescoreProfile = self::DEFAULT_RESCORE_PROFILE; + if ( preg_match( '/\b(\w+)$/', $text, $matches ) && $matches[1] ) { + $id = $this->parseOrNull( $matches[1] ); + if ( $id ) { + return $id->getSerialization(); + } } - if ( $this->settings['rescoreProfiles'][$rescoreProfile] ) { - return $this->settings['rescoreProfiles'][$rescoreProfile]; - } - return $rescoreProfile; + return $text; } /** diff --git a/repo/tests/phpunit/data/entitySearch/search_par.expected b/repo/tests/phpunit/data/entitySearch/search_par.expected new file mode 100644 index 0000000..fde2463 --- /dev/null +++ b/repo/tests/phpunit/data/entitySearch/search_par.expected @@ -0,0 +1,172 @@ +{ + "description": "wikibase_prefix search for '(p128) '", + "params": { + "timeout": "20s" + }, + "query": { + "query": { + "bool": { + "should": [ + { + "bool": { + "filter": [ + { + "match": { + "labels_all.prefix": "(p128) " + } + } + ], + "must": [ + { + "dis_max": { + "tie_breaker": 0, + "queries": [ + { + "constant_score": { + "filter": { + "match": { + "labels.en.near_match": "(p128) " + } + }, + "boost": 2 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels.en.near_match_folded": "(p128) " + } + }, + "boost": 1.8 + } + }, + { + 
"constant_score": { + "filter": { + "match": { + "labels.en.prefix": "(p128) " + } + }, + "boost": 1.1 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels_all.near_match_folded": "(p128) " + } + }, + "boost": 0.001 + } + } + ] + } + } + ] + } + }, + { + "term": { + "title.keyword": "P128" + } + } + ], + "minimum_should_match": 1, + "filter": [ + { + "term": { + "content_model": "wikibase-item" + } + } + ] + } + }, + "_source": [ + "namespace", + "title", + "labels.en", + "descriptions.en" + ], + "stored_fields": [], + "highlight": { + "pre_tags": [ + "" + ], + "post_tags": [ + "" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "matched_fields": [ + "title.keyword" + ] + }, + "labels.en.prefix": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "options": { + "skip_if_last_matched": true, + "return_snippets_and_offsets": true + } + }, + "labels.*.prefix": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "options": { + "skip_if_last_matched": true, + "return_snippets_and_offsets": true + } + } + } + }, + "size": 10, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "total", + "rescore_query": { + "function_score": { + "score_mode": "sum", + "functions": [ + { + "script_score": { + "script": { + "inline": "pow(doc['incoming_links'].value , 2) \/ ( pow(doc['incoming_links'].value, 2) + pow(50,2))", + "lang": "expression" + } + }, + "weight": 0.6 + }, + { + "script_score": { + "script": { + "inline": "pow(doc['sitelink_count'].value , 2) \/ ( pow(doc['sitelink_count'].value, 2) + pow(20,2))", + "lang": "expression" + } + }, + "weight": 0.4 + } + ] + } + } + } + } + ], + "stats": [ + "wikibase-prefix" + ] + }, + "options": { + "timeout": "20s" + } +} \ No newline at end of file diff --git a/repo/tests/phpunit/data/entitySearch/search_par.query 
b/repo/tests/phpunit/data/entitySearch/search_par.query new file mode 100644 index 0000000..53d59f8 --- /dev/null +++ b/repo/tests/phpunit/data/entitySearch/search_par.query @@ -0,0 +1,7 @@ +{ + "search": "(p128) ", + "language": "en", + "userLang": "en", + "type": "item", + "strictlanguage": false +} diff --git a/repo/tests/phpunit/data/entitySearch/search_prop.expected b/repo/tests/phpunit/data/entitySearch/search_prop.expected new file mode 100644 index 0000000..420dee0 --- /dev/null +++ b/repo/tests/phpunit/data/entitySearch/search_prop.expected @@ -0,0 +1,172 @@ +{ + "description": "wikibase_prefix search for '\tp42'", + "params": { + "timeout": "20s" + }, + "query": { + "query": { + "bool": { + "should": [ + { + "bool": { + "filter": [ + { + "match": { + "labels_all.prefix": "p42" + } + } + ], + "must": [ + { + "dis_max": { + "tie_breaker": 0, + "queries": [ + { + "constant_score": { + "filter": { + "match": { + "labels.en.near_match": "p42" + } + }, + "boost": 2 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels.en.near_match_folded": "p42" + } + }, + "boost": 1.8 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels.en.prefix": "p42" + } + }, + "boost": 1.1 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels_all.near_match_folded": "p42" + } + }, + "boost": 0.001 + } + } + ] + } + } + ] + } + }, + { + "term": { + "title.keyword": "P42" + } + } + ], + "minimum_should_match": 1, + "filter": [ + { + "term": { + "content_model": "wikibase-item" + } + } + ] + } + }, + "_source": [ + "namespace", + "title", + "labels.en", + "descriptions.en" + ], + "stored_fields": [], + "highlight": { + "pre_tags": [ + "" + ], + "post_tags": [ + "" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "matched_fields": [ + "title.keyword" + ] + }, + "labels.en.prefix": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "options": { + 
"skip_if_last_matched": true, + "return_snippets_and_offsets": true + } + }, + "labels.*.prefix": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "options": { + "skip_if_last_matched": true, + "return_snippets_and_offsets": true + } + } + } + }, + "size": 10, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "total", + "rescore_query": { + "function_score": { + "score_mode": "sum", + "functions": [ + { + "script_score": { + "script": { + "inline": "pow(doc['incoming_links'].value , 2) \/ ( pow(doc['incoming_links'].value, 2) + pow(50,2))", + "lang": "expression" + } + }, + "weight": 0.6 + }, + { + "script_score": { + "script": { + "inline": "pow(doc['sitelink_count'].value , 2) \/ ( pow(doc['sitelink_count'].value, 2) + pow(20,2))", + "lang": "expression" + } + }, + "weight": 0.4 + } + ] + } + } + } + } + ], + "stats": [ + "wikibase-prefix" + ] + }, + "options": { + "timeout": "20s" + } +} \ No newline at end of file diff --git a/repo/tests/phpunit/data/entitySearch/search_prop.query b/repo/tests/phpunit/data/entitySearch/search_prop.query new file mode 100644 index 0000000..6f73cae --- /dev/null +++ b/repo/tests/phpunit/data/entitySearch/search_prop.query @@ -0,0 +1,7 @@ +{ + "search": " p42", + "language": "en", + "userLang": "en", + "type": "item", + "strictlanguage": false +} diff --git a/repo/tests/phpunit/data/entitySearch/search_url.expected b/repo/tests/phpunit/data/entitySearch/search_url.expected new file mode 100644 index 0000000..6f9ab27 --- /dev/null +++ b/repo/tests/phpunit/data/entitySearch/search_url.expected @@ -0,0 +1,172 @@ +{ + "description": "wikibase_prefix search for 'https:\/\/www.wikidata.org\/wiki\/Q56'", + "params": { + "timeout": "20s" + }, + "query": { + "query": { + "bool": { + "should": [ + { + "bool": { + "filter": [ + { + "match": { + "labels_all.prefix": "https:\/\/www.wikidata.org\/wiki\/Q56" + } + } + ], + "must": [ + { + 
"dis_max": { + "tie_breaker": 0, + "queries": [ + { + "constant_score": { + "filter": { + "match": { + "labels.en.near_match": "https:\/\/www.wikidata.org\/wiki\/Q56" + } + }, + "boost": 2 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels.en.near_match_folded": "https:\/\/www.wikidata.org\/wiki\/Q56" + } + }, + "boost": 1.8 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels.en.prefix": "https:\/\/www.wikidata.org\/wiki\/Q56" + } + }, + "boost": 1.1 + } + }, + { + "constant_score": { + "filter": { + "match": { + "labels_all.near_match_folded": "https:\/\/www.wikidata.org\/wiki\/Q56" + } + }, + "boost": 0.001 + } + } + ] + } + } + ] + } + }, + { + "term": { + "title.keyword": "Q56" + } + } + ], + "minimum_should_match": 1, + "filter": [ + { + "term": { + "content_model": "wikibase-item" + } + } + ] + } + }, + "_source": [ + "namespace", + "title", + "labels.en", + "descriptions.en" + ], + "stored_fields": [], + "highlight": { + "pre_tags": [ + "" + ], + "post_tags": [ + "" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "matched_fields": [ + "title.keyword" + ] + }, + "labels.en.prefix": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "options": { + "skip_if_last_matched": true, + "return_snippets_and_offsets": true + } + }, + "labels.*.prefix": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 0, + "options": { + "skip_if_last_matched": true, + "return_snippets_and_offsets": true + } + } + } + }, + "size": 10, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "total", + "rescore_query": { + "function_score": { + "score_mode": "sum", + "functions": [ + { + "script_score": { + "script": { + "inline": "pow(doc['incoming_links'].value , 2) \/ ( pow(doc['incoming_links'].value, 2) + pow(50,2))", + "lang": "expression" + } + }, + 
"weight": 0.6 + }, + { + "script_score": { + "script": { + "inline": "pow(doc['sitelink_count'].value , 2) \/ ( pow(doc['sitelink_count'].value, 2) + pow(20,2))", + "lang": "expression" + } + }, + "weight": 0.4 + } + ] + } + } + } + } + ], + "stats": [ + "wikibase-prefix" + ] + }, + "options": { + "timeout": "20s" + } +} \ No newline at end of file diff --git a/repo/tests/phpunit/data/entitySearch/search_url.query b/repo/tests/phpunit/data/entitySearch/search_url.query new file mode 100644 index 0000000..1758414 --- /dev/null +++ b/repo/tests/phpunit/data/entitySearch/search_url.query @@ -0,0 +1,7 @@ +{ + "search": "https://www.wikidata.org/wiki/Q56", + "language": "en", + "userLang": "en", + "type": "item", + "strictlanguage": false +} -- To view, visit https://gerrit.wikimedia.org/r/387025 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Icd588ab550a9d62f7f70603e2869e600bcc0f629 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Smalyshev <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
