jenkins-bot has submitted this change and it was merged. Change subject: Support ORs of incategory ......................................................................
Support ORs of incategory This supports incategory as names of the category or as the page id of the category's wikipage. The latter is to support integration with CatGraph. This patch has a non-strict dependency (nothing will break without it) on I7b7be24f in core. Until that patch is merged users searching for a non-existent category by id will be offered to create the page matching the search query when they should not. Bug: T89823 Change-Id: Iad60c33d6a582a1ad005bd45165ffa198637add0 --- M CirrusSearch.php M includes/Searcher.php M tests/browser/features/incategory_api.feature M tests/browser/features/step_definitions/search_steps.rb M tests/browser/features/support/cirrus_search_api_helper.rb M tests/browser/features/support/hooks.rb 6 files changed, 89 insertions(+), 5 deletions(-) Approvals: Manybubbles: Looks good to me, approved Daniel Kinzler: Looks good to me, but someone else must approve jenkins-bot: Verified diff --git a/CirrusSearch.php b/CirrusSearch.php index 3f9a99b..cffc871 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -529,6 +529,8 @@ */ $wgCirrusSearchEnableSearchLogging = false; +// The maximum number of incategory:a|b|c items to OR together. +$wgCirrusSearchMaxIncategoryOptions = 100; $includes = __DIR__ . "/includes/"; $apiDir = $includes . 'Api/'; diff --git a/includes/Searcher.php b/includes/Searcher.php index b077bfc..f067b9d 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -2,6 +2,7 @@ namespace CirrusSearch; use Elastica; +use \Category; use \CirrusSearch; use \CirrusSearch\Extra\Filter\SourceRegex; use \CirrusSearch\Search\Escaper; @@ -497,10 +498,12 @@ // The {7,15} keeps this from having horrible performance on big strings $escaper = $this->escaper; $fuzzyQuery = $this->fuzzyQuery; + $isEmptyQuery = false; $this->extractSpecialSyntaxFromTerm( '/(?<key>[a-z\\-]{7,15}):\s*(?<value>"(?:[^"]|(?:(?<=\\\)"))+"|[^ "]+) ?/', function ( $matches ) use ( $searcher, $escaper, &$filters, &$notFilters, &$boostTemplates, - &$searchContainedSyntax, &$fuzzyQuery, &$highlightSource ) { + &$searchContainedSyntax, &$fuzzyQuery, &$highlightSource, &$isEmptyQuery ) { + global $wgCirrusSearchMaxIncategoryOptions; $key = $matches['key']; $value = $matches['value']; // Note that if the user supplied quotes they are not removed $value = str_replace( '\"', '"', $value ); @@ -543,7 +546,13 @@ $searchContainedSyntax = true; return ''; case 'incategory': - $filterDestination[] = $searcher->matchPage( 'category.lowercase_keyword', $value ); + $categories = array_slice( explode( '|', $value ), 0, $wgCirrusSearchMaxIncategoryOptions ); + $categoryFilters = $searcher->matchPageCategories( $categories ); + if ( $categoryFilters === null ) { + $isEmptyQuery = true; + } else { + $filterDestination[] = $categoryFilters; + } $searchContainedSyntax = true; return ''; case 'insource': @@ -559,6 +568,9 @@ } } ); + if ( $isEmptyQuery ) { + return Status::newGood( new SearchResultSet( true ) ); + } $this->filters = $filters; $this->notFilters = $notFilters; $this->boostTemplates = $boostTemplates; @@ -754,6 +766,38 @@ } /** + * Builds an or between many categories that the page could be in. + * @param string[] $categories categories to match + * @return \Elastica\Filter\Bool|null A null return value means all values are filtered + * and an empty result set should be returned. + */ + public function matchPageCategories( $categories ) { + $filter = new \Elastica\Filter\Bool(); + $ids = array(); + $names = array(); + foreach ( $categories as $category ) { + if ( substr( $category, 0, 3 ) === 'id:' ) { + $id = substr( $category, 3 ); + if ( ctype_digit( $id ) ) { + $ids[] = $id; + } + } else { + $names[] = $category; + } + } + foreach ( Title::newFromIds( $ids ) as $title ) { + $names[] = $title->getText(); + } + if ( !$names ) { + return null; + } + foreach( $names as $name ) { + $filter->addShould( $this->matchPage( 'category.lowercase_keyword', $name ) ); + } + return $filter; + } + + /** * Find articles that contain similar text to the provided title array. * @param Title[] $titles array of titles of articles to search for * @param int $options bitset of options: diff --git a/tests/browser/features/incategory_api.feature b/tests/browser/features/incategory_api.feature index 296b18a..20b68e4 100644 --- a/tests/browser/features/incategory_api.feature +++ b/tests/browser/features/incategory_api.feature @@ -7,6 +7,27 @@ And Amazing Catapult is in the api search results But Two Words is not in the api search results + Scenario: incategory: splits on | to create an OR query + When I api search for incategory:weaponry|nothing + Then Catapult is in the api search results + And Amazing Catapult is in the api search results + But Two Words is not in the api search results + + Scenario Outline: incategory: does not fail when the category is unknown + When I api search for incategory:<category> + Then there are no api search results + Examples: + | category | + | doesnotexistatleastihopenot | + | id:2147483600 | + + Scenario: incategory: finds categories by page id + When I locate the page id of Category:Weaponry and store it as %weaponry_id% + And I api search for incategory:id:%weaponry_id% + Then Catapult is in the api search results + And Amazing Catapult is in the api search results + But Two Words is not in the api search results + Scenario: incategory: works on categories from templates When I api search for incategory:templatetagged incategory:twowords Then Two Words is the first api search result @@ -42,11 +63,11 @@ Scenario: incategory: can be combined with other text When I api search for incategory:weaponry amazing Then Amazing Catapult is the first api search result - + Scenario: -incategory: excludes pages with the category When I api search for -incategory:weaponry incategory:twowords Then Two Words is the first api search result - + Scenario: incategory: can handle a space after the : When I api search for incategory: weaponry Then Catapult is in the api search results diff --git a/tests/browser/features/step_definitions/search_steps.rb b/tests/browser/features/step_definitions/search_steps.rb index 01f6eef..7eefe4c 100644 --- a/tests/browser/features/step_definitions/search_steps.rb +++ b/tests/browser/features/step_definitions/search_steps.rb @@ -9,11 +9,19 @@ When(/^I go search for (.*)$/) do |search| visit(SearchResultsPage, using_params: { search: search }) end +Before do + @search_vars = { + "idiographic_whitspace" => "u\3000".force_encoding("utf-8") + } +end +When(/^I locate the page id of (.*) and store it as (%.*%)$/) do |title, varname| + @search_vars[varname] = page_id_of title +end # rubocop:disable LineLength When(/^I api search( with disabled incoming link weighting)?(?: with offset (\d+))?(?: in the (.*) language)?(?: in namespaces? (\d+(?: \d+)*))? for (.*)$/) do |incoming_links, offset, lang, namespaces, search| begin @api_result = search_for( - search.gsub("%idiographic_whitespace%", "\u3000".force_encoding("utf-8")), + search.gsub(/%[^ ]+%/, @search_vars), sroffset: offset, srnamespace: (namespaces || "0").split(/ /), uselang: lang, diff --git a/tests/browser/features/support/cirrus_search_api_helper.rb b/tests/browser/features/support/cirrus_search_api_helper.rb index 400d091..630988a 100644 --- a/tests/browser/features/support/cirrus_search_api_helper.rb +++ b/tests/browser/features/support/cirrus_search_api_helper.rb @@ -93,4 +93,12 @@ token_type: "edit" ) end + + # Locate a category id + def page_id_of(title) + res = api.query( + titles: title + ) + res["query"]["pages"].first[1]["pageid"] + end end diff --git a/tests/browser/features/support/hooks.rb b/tests/browser/features/support/hooks.rb index e1b37c5..1991b6d 100644 --- a/tests/browser/features/support/hooks.rb +++ b/tests/browser/features/support/hooks.rb @@ -11,6 +11,7 @@ And a page named Links To Catapult exists with contents [[Catapult]] And a page named Catapult exists with contents ♙ asdf [[Category:Weaponry]] And a page named Amazing Catapult exists with contents test [[Catapult]] [[Category:Weaponry]] + And a page named Category:Weaponry exists with contents Weaponry refers to any items designed or used to attack and kill or destroy other people and property. And a page named Two Words exists with contents ffnonesenseword catapult {{Template_Test}} anotherword [[Category:TwoWords]] [[Category:Categorywith Twowords]] [[Category:Categorywith " Quote]] And a page named AlphaBeta exists with contents [[Category:Alpha]] [[Category:Beta]] And a page named IHaveATwoWordCategory exists with contents [[Category:CategoryWith ASpace]] -- To view, visit https://gerrit.wikimedia.org/r/191622 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Iad60c33d6a582a1ad005bd45165ffa198637add0 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Jkroll <jkr...@toolserver.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: TMg <mr.h...@gmx.de> Gerrit-Reviewer: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de> Gerrit-Reviewer: Tobias Gritschacher <tobias.gritschac...@wikimedia.de> Gerrit-Reviewer: WMDE-Fisch <christoph.fisc...@wikimedia.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits