Smalyshev has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/387025 )

Change subject: Add special case handling for some forms of IDs
......................................................................

Add special case handling for some forms of IDs

Cases handled:
 - q42
 - (q42)
 - leading/trailing spaces
 - http://www.wikidata.org/entity/Q42

Bug: T179045
Bug: T179061
Bug: T179130
Change-Id: Icd588ab550a9d62f7f70603e2869e600bcc0f629
---
M repo/includes/Search/Elastic/EntitySearchElastic.php
A repo/tests/phpunit/data/entitySearch/search_par.expected
A repo/tests/phpunit/data/entitySearch/search_par.query
A repo/tests/phpunit/data/entitySearch/search_prop.expected
A repo/tests/phpunit/data/entitySearch/search_prop.query
A repo/tests/phpunit/data/entitySearch/search_url.expected
A repo/tests/phpunit/data/entitySearch/search_url.query
7 files changed, 572 insertions(+), 26 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/25/387025/1

diff --git a/repo/includes/Search/Elastic/EntitySearchElastic.php 
b/repo/includes/Search/Elastic/EntitySearchElastic.php
index 7452a2f..18c9aa1 100644
--- a/repo/includes/Search/Elastic/EntitySearchElastic.php
+++ b/repo/includes/Search/Elastic/EntitySearchElastic.php
@@ -12,6 +12,7 @@
 use Language;
 use WebRequest;
 use Wikibase\DataModel\Entity\EntityIdParser;
+use Wikibase\DataModel\Entity\EntityIdParsingException;
 use Wikibase\LanguageFallbackChainFactory;
 use Wikibase\Lib\Interactors\TermSearchResult;
 use Wikibase\Repo\Api\EntitySearchHelper;
@@ -139,6 +140,8 @@
                $query = new BoolQuery();
 
                $context->setOriginalSearchTerm( $text );
+               // Drop leading spaces
+               $text = ltrim( $text );
                if ( empty( $this->contentModelMap[$entityType] ) ) {
                        $context->setResultsPossible( false );
                        $context->addWarning( 
'wikibase-search-bad-entity-type', $entityType );
@@ -191,8 +194,8 @@
                $labelsQuery->addFilter( $labelsFilter );
                $labelsQuery->addMust( $dismax );
                // TODO: this is a bit hacky, better way would be to make the 
field case-insensitive
-               // or add new subfield which is case-insensitive
-               $titleMatch = new Term( [ 'title.keyword' => strtoupper( $text 
) ] );
+               // or add new subfield which is case-insensitive
+               $titleMatch = new Term( [ 'title.keyword' => 
$this->normalizeId( $text ) ] );
 
                // Match either labels or exact match to title
                $query->addShould( $labelsQuery );
@@ -206,37 +209,43 @@
        }
 
        /**
-        * Create constant score query for a field.
-        * @param string $field
-        * @param string|double $boost
-        * @param string $text
-        * @return ConstantScore
+        * Parse entity ID or return null
+        * @param $text
+        * @return null|\Wikibase\DataModel\Entity\EntityId
         */
-       private function makeConstScoreQuery( $field, $boost, $text ) {
-               $csquery = new ConstantScore();
-               $csquery->setFilter( new Match( $field, $text ) );
-               $csquery->setBoost( $boost );
-               return $csquery;
+       private function parseOrNull($text) {
+               try {
+                       $id = $this->idParser->parse( $text );
+               }
+               catch ( EntityIdParsingException $ex ) {
+                       return null;
+               }
+               return $id;
        }
 
        /**
-        * Get suitable rescore profile.
-        * If internal config has non, return just the name and let 
RescoureBuilder handle it.
-        * @return string|array
+        * If the text looks like ID, normalize it to ID title
+        * Cases handled:
+        * - q42
+        * - (q42)
+        * - leading/trailing spaces
+        * - http://www.wikidata.org/entity/Q42
+        * @param string $text
+        * @return string Normalized ID or original string
         */
-       private function getRescoreProfile() {
-
-               $rescoreProfile = $this->request->getVal( 
'cirrusRescoreProfile' );
-               if ( !$rescoreProfile && isset( 
$this->settings['defaultPrefixRescoreProfile'] ) ) {
-                       $rescoreProfile = 
$this->settings['defaultPrefixRescoreProfile'];
+       private function normalizeId( $text ) {
+               $text = strtoupper( str_replace( [ '(', ')' ], '', trim( $text 
) ) );
+               $id = $this->parseOrNull( $text );
+               if ( $id ) {
+                       return $id->getSerialization();
                }
-               if ( !$rescoreProfile ) {
-                       $rescoreProfile = self::DEFAULT_RESCORE_PROFILE;
+               if ( preg_match( '/\b(\w+)$/', $text, $matches ) && $matches[1] 
) {
+                       $id = $this->parseOrNull( $matches[1] );
+                       if ( $id ) {
+                               return $id->getSerialization();
+                       }
                }
-               if ( $this->settings['rescoreProfiles'][$rescoreProfile] ) {
-                       return 
$this->settings['rescoreProfiles'][$rescoreProfile];
-               }
-               return $rescoreProfile;
+               return $text;
        }
 
        /**
diff --git a/repo/tests/phpunit/data/entitySearch/search_par.expected 
b/repo/tests/phpunit/data/entitySearch/search_par.expected
new file mode 100644
index 0000000..fde2463
--- /dev/null
+++ b/repo/tests/phpunit/data/entitySearch/search_par.expected
@@ -0,0 +1,172 @@
+{
+    "description": "wikibase_prefix search for '(p128) '",
+    "params": {
+        "timeout": "20s"
+    },
+    "query": {
+        "query": {
+            "bool": {
+                "should": [
+                    {
+                        "bool": {
+                            "filter": [
+                                {
+                                    "match": {
+                                        "labels_all.prefix": "(p128) "
+                                    }
+                                }
+                            ],
+                            "must": [
+                                {
+                                    "dis_max": {
+                                        "tie_breaker": 0,
+                                        "queries": [
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.near_match": "(p128) "
+                                                        }
+                                                    },
+                                                    "boost": 2
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.near_match_folded": "(p128) "
+                                                        }
+                                                    },
+                                                    "boost": 1.8
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.prefix": "(p128) "
+                                                        }
+                                                    },
+                                                    "boost": 1.1
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels_all.near_match_folded": "(p128) "
+                                                        }
+                                                    },
+                                                    "boost": 0.001
+                                                }
+                                            }
+                                        ]
+                                    }
+                                }
+                            ]
+                        }
+                    },
+                    {
+                        "term": {
+                            "title.keyword": "P128"
+                        }
+                    }
+                ],
+                "minimum_should_match": 1,
+                "filter": [
+                    {
+                        "term": {
+                            "content_model": "wikibase-item"
+                        }
+                    }
+                ]
+            }
+        },
+        "_source": [
+            "namespace",
+            "title",
+            "labels.en",
+            "descriptions.en"
+        ],
+        "stored_fields": [],
+        "highlight": {
+            "pre_tags": [
+                ""
+            ],
+            "post_tags": [
+                ""
+            ],
+            "fields": {
+                "title": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "matched_fields": [
+                        "title.keyword"
+                    ]
+                },
+                "labels.en.prefix": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "options": {
+                        "skip_if_last_matched": true,
+                        "return_snippets_and_offsets": true
+                    }
+                },
+                "labels.*.prefix": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "options": {
+                        "skip_if_last_matched": true,
+                        "return_snippets_and_offsets": true
+                    }
+                }
+            }
+        },
+        "size": 10,
+        "rescore": [
+            {
+                "window_size": 8192,
+                "query": {
+                    "query_weight": 1,
+                    "rescore_query_weight": 1,
+                    "score_mode": "total",
+                    "rescore_query": {
+                        "function_score": {
+                            "score_mode": "sum",
+                            "functions": [
+                                {
+                                    "script_score": {
+                                        "script": {
+                                            "inline": 
"pow(doc['incoming_links'].value , 2) \/ ( pow(doc['incoming_links'].value, 2) 
+ pow(50,2))",
+                                            "lang": "expression"
+                                        }
+                                    },
+                                    "weight": 0.6
+                                },
+                                {
+                                    "script_score": {
+                                        "script": {
+                                            "inline": 
"pow(doc['sitelink_count'].value , 2) \/ ( pow(doc['sitelink_count'].value, 2) 
+ pow(20,2))",
+                                            "lang": "expression"
+                                        }
+                                    },
+                                    "weight": 0.4
+                                }
+                            ]
+                        }
+                    }
+                }
+            }
+        ],
+        "stats": [
+            "wikibase-prefix"
+        ]
+    },
+    "options": {
+        "timeout": "20s"
+    }
+}
\ No newline at end of file
diff --git a/repo/tests/phpunit/data/entitySearch/search_par.query 
b/repo/tests/phpunit/data/entitySearch/search_par.query
new file mode 100644
index 0000000..53d59f8
--- /dev/null
+++ b/repo/tests/phpunit/data/entitySearch/search_par.query
@@ -0,0 +1,7 @@
+{
+       "search": "(p128) ",
+       "language": "en",
+       "userLang": "en",
+       "type": "item",
+       "strictlanguage": false
+}
diff --git a/repo/tests/phpunit/data/entitySearch/search_prop.expected 
b/repo/tests/phpunit/data/entitySearch/search_prop.expected
new file mode 100644
index 0000000..420dee0
--- /dev/null
+++ b/repo/tests/phpunit/data/entitySearch/search_prop.expected
@@ -0,0 +1,172 @@
+{
+    "description": "wikibase_prefix search for '\tp42'",
+    "params": {
+        "timeout": "20s"
+    },
+    "query": {
+        "query": {
+            "bool": {
+                "should": [
+                    {
+                        "bool": {
+                            "filter": [
+                                {
+                                    "match": {
+                                        "labels_all.prefix": "p42"
+                                    }
+                                }
+                            ],
+                            "must": [
+                                {
+                                    "dis_max": {
+                                        "tie_breaker": 0,
+                                        "queries": [
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.near_match": "p42"
+                                                        }
+                                                    },
+                                                    "boost": 2
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.near_match_folded": "p42"
+                                                        }
+                                                    },
+                                                    "boost": 1.8
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.prefix": "p42"
+                                                        }
+                                                    },
+                                                    "boost": 1.1
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels_all.near_match_folded": "p42"
+                                                        }
+                                                    },
+                                                    "boost": 0.001
+                                                }
+                                            }
+                                        ]
+                                    }
+                                }
+                            ]
+                        }
+                    },
+                    {
+                        "term": {
+                            "title.keyword": "P42"
+                        }
+                    }
+                ],
+                "minimum_should_match": 1,
+                "filter": [
+                    {
+                        "term": {
+                            "content_model": "wikibase-item"
+                        }
+                    }
+                ]
+            }
+        },
+        "_source": [
+            "namespace",
+            "title",
+            "labels.en",
+            "descriptions.en"
+        ],
+        "stored_fields": [],
+        "highlight": {
+            "pre_tags": [
+                ""
+            ],
+            "post_tags": [
+                ""
+            ],
+            "fields": {
+                "title": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "matched_fields": [
+                        "title.keyword"
+                    ]
+                },
+                "labels.en.prefix": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "options": {
+                        "skip_if_last_matched": true,
+                        "return_snippets_and_offsets": true
+                    }
+                },
+                "labels.*.prefix": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "options": {
+                        "skip_if_last_matched": true,
+                        "return_snippets_and_offsets": true
+                    }
+                }
+            }
+        },
+        "size": 10,
+        "rescore": [
+            {
+                "window_size": 8192,
+                "query": {
+                    "query_weight": 1,
+                    "rescore_query_weight": 1,
+                    "score_mode": "total",
+                    "rescore_query": {
+                        "function_score": {
+                            "score_mode": "sum",
+                            "functions": [
+                                {
+                                    "script_score": {
+                                        "script": {
+                                            "inline": 
"pow(doc['incoming_links'].value , 2) \/ ( pow(doc['incoming_links'].value, 2) 
+ pow(50,2))",
+                                            "lang": "expression"
+                                        }
+                                    },
+                                    "weight": 0.6
+                                },
+                                {
+                                    "script_score": {
+                                        "script": {
+                                            "inline": 
"pow(doc['sitelink_count'].value , 2) \/ ( pow(doc['sitelink_count'].value, 2) 
+ pow(20,2))",
+                                            "lang": "expression"
+                                        }
+                                    },
+                                    "weight": 0.4
+                                }
+                            ]
+                        }
+                    }
+                }
+            }
+        ],
+        "stats": [
+            "wikibase-prefix"
+        ]
+    },
+    "options": {
+        "timeout": "20s"
+    }
+}
\ No newline at end of file
diff --git a/repo/tests/phpunit/data/entitySearch/search_prop.query 
b/repo/tests/phpunit/data/entitySearch/search_prop.query
new file mode 100644
index 0000000..6f73cae
--- /dev/null
+++ b/repo/tests/phpunit/data/entitySearch/search_prop.query
@@ -0,0 +1,7 @@
+{
+       "search": "     p42",
+       "language": "en",
+       "userLang": "en",
+       "type": "item",
+       "strictlanguage": false
+}
diff --git a/repo/tests/phpunit/data/entitySearch/search_url.expected 
b/repo/tests/phpunit/data/entitySearch/search_url.expected
new file mode 100644
index 0000000..6f9ab27
--- /dev/null
+++ b/repo/tests/phpunit/data/entitySearch/search_url.expected
@@ -0,0 +1,172 @@
+{
+    "description": "wikibase_prefix search for 
'https:\/\/www.wikidata.org\/wiki\/Q56'",
+    "params": {
+        "timeout": "20s"
+    },
+    "query": {
+        "query": {
+            "bool": {
+                "should": [
+                    {
+                        "bool": {
+                            "filter": [
+                                {
+                                    "match": {
+                                        "labels_all.prefix": 
"https:\/\/www.wikidata.org\/wiki\/Q56"
+                                    }
+                                }
+                            ],
+                            "must": [
+                                {
+                                    "dis_max": {
+                                        "tie_breaker": 0,
+                                        "queries": [
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.near_match": "https:\/\/www.wikidata.org\/wiki\/Q56"
+                                                        }
+                                                    },
+                                                    "boost": 2
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.near_match_folded": "https:\/\/www.wikidata.org\/wiki\/Q56"
+                                                        }
+                                                    },
+                                                    "boost": 1.8
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels.en.prefix": "https:\/\/www.wikidata.org\/wiki\/Q56"
+                                                        }
+                                                    },
+                                                    "boost": 1.1
+                                                }
+                                            },
+                                            {
+                                                "constant_score": {
+                                                    "filter": {
+                                                        "match": {
+                                                            
"labels_all.near_match_folded": "https:\/\/www.wikidata.org\/wiki\/Q56"
+                                                        }
+                                                    },
+                                                    "boost": 0.001
+                                                }
+                                            }
+                                        ]
+                                    }
+                                }
+                            ]
+                        }
+                    },
+                    {
+                        "term": {
+                            "title.keyword": "Q56"
+                        }
+                    }
+                ],
+                "minimum_should_match": 1,
+                "filter": [
+                    {
+                        "term": {
+                            "content_model": "wikibase-item"
+                        }
+                    }
+                ]
+            }
+        },
+        "_source": [
+            "namespace",
+            "title",
+            "labels.en",
+            "descriptions.en"
+        ],
+        "stored_fields": [],
+        "highlight": {
+            "pre_tags": [
+                ""
+            ],
+            "post_tags": [
+                ""
+            ],
+            "fields": {
+                "title": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "matched_fields": [
+                        "title.keyword"
+                    ]
+                },
+                "labels.en.prefix": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "options": {
+                        "skip_if_last_matched": true,
+                        "return_snippets_and_offsets": true
+                    }
+                },
+                "labels.*.prefix": {
+                    "type": "experimental",
+                    "fragmenter": "none",
+                    "number_of_fragments": 0,
+                    "options": {
+                        "skip_if_last_matched": true,
+                        "return_snippets_and_offsets": true
+                    }
+                }
+            }
+        },
+        "size": 10,
+        "rescore": [
+            {
+                "window_size": 8192,
+                "query": {
+                    "query_weight": 1,
+                    "rescore_query_weight": 1,
+                    "score_mode": "total",
+                    "rescore_query": {
+                        "function_score": {
+                            "score_mode": "sum",
+                            "functions": [
+                                {
+                                    "script_score": {
+                                        "script": {
+                                            "inline": 
"pow(doc['incoming_links'].value , 2) \/ ( pow(doc['incoming_links'].value, 2) 
+ pow(50,2))",
+                                            "lang": "expression"
+                                        }
+                                    },
+                                    "weight": 0.6
+                                },
+                                {
+                                    "script_score": {
+                                        "script": {
+                                            "inline": 
"pow(doc['sitelink_count'].value , 2) \/ ( pow(doc['sitelink_count'].value, 2) 
+ pow(20,2))",
+                                            "lang": "expression"
+                                        }
+                                    },
+                                    "weight": 0.4
+                                }
+                            ]
+                        }
+                    }
+                }
+            }
+        ],
+        "stats": [
+            "wikibase-prefix"
+        ]
+    },
+    "options": {
+        "timeout": "20s"
+    }
+}
\ No newline at end of file
diff --git a/repo/tests/phpunit/data/entitySearch/search_url.query 
b/repo/tests/phpunit/data/entitySearch/search_url.query
new file mode 100644
index 0000000..1758414
--- /dev/null
+++ b/repo/tests/phpunit/data/entitySearch/search_url.query
@@ -0,0 +1,7 @@
+{
+       "search": "https://www.wikidata.org/wiki/Q56",
+       "language": "en",
+       "userLang": "en",
+       "type": "item",
+       "strictlanguage": false
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/387025
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icd588ab550a9d62f7f70603e2869e600bcc0f629
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to