jenkins-bot has submitted this change and it was merged. Change subject: Extract ParserOutput search index data fields from WikiTextContentHandler ......................................................................
Extract ParserOutput search index data fields from WikiTextContentHandler Bug: T142491 Change-Id: I69b010b893135e53fac7f16f4b927b8fbcba06d2 --- M autoload.php M includes/content/ContentHandler.php M includes/content/WikiTextStructure.php M includes/content/WikitextContentHandler.php A includes/search/DummySearchIndexFieldDefinition.php A includes/search/ParserOutputSearchDataExtractor.php M includes/search/SearchIndexFieldDefinition.php M tests/phpunit/includes/content/ContentHandlerTest.php M tests/phpunit/includes/content/WikitextStructureTest.php A tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php 10 files changed, 281 insertions(+), 139 deletions(-) Approvals: Smalyshev: Looks good to me, approved jenkins-bot: Verified diff --git a/autoload.php b/autoload.php index f6edd94..f75c5af 100644 --- a/autoload.php +++ b/autoload.php @@ -372,6 +372,7 @@ 'DoubleRedirectsPage' => __DIR__ . '/includes/specials/SpecialDoubleRedirects.php', 'DoubleReplacer' => __DIR__ . '/includes/libs/replacers/DoubleReplacer.php', 'DummyLinker' => __DIR__ . '/includes/DummyLinker.php', + 'DummySearchIndexFieldDefinition' => __DIR__ . '/includes/search/DummySearchIndexFieldDefinition.php', 'DummyTermColorer' => __DIR__ . '/maintenance/term/MWTerm.php', 'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php', 'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php', @@ -856,6 +857,7 @@ 'MediaWiki\\Logger\\NullSpi' => __DIR__ . '/includes/debug/logger/NullSpi.php', 'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php', 'MediaWiki\\MediaWikiServices' => __DIR__ . '/includes/MediaWikiServices.php', + 'MediaWiki\\Search\\ParserOutputSearchDataExtractor' => __DIR__ . '/includes/search/ParserOutputSearchDataExtractor.php', 'MediaWiki\\Services\\CannotReplaceActiveServiceException' => __DIR__ . '/includes/Services/CannotReplaceActiveServiceException.php', 'MediaWiki\\Services\\ContainerDisabledException' => __DIR__ . '/includes/Services/ContainerDisabledException.php', 'MediaWiki\\Services\\DestructibleService' => __DIR__ . '/includes/Services/DestructibleService.php', diff --git a/includes/content/ContentHandler.php b/includes/content/ContentHandler.php index 7184980..3a75f50 100644 --- a/includes/content/ContentHandler.php +++ b/includes/content/ContentHandler.php @@ -1,4 +1,7 @@ <?php + +use MediaWiki\Search\ParserOutputSearchDataExtractor; + /** * Base class for content handling. * @@ -1251,24 +1254,40 @@ /** * Get fields definition for search index + * + * @todo Expose title, redirect, namespace, text, source_text, text_bytes + * field mappings here. (see T142670 and T143409) + * * @param SearchEngine $engine * @return SearchIndexField[] List of fields this content handler can provide. * @since 1.28 */ public function getFieldsForSearchIndex( SearchEngine $engine ) { - /* Default fields: - /* - * namespace - * namespace_text - * redirect - * source_text - * suggest - * timestamp - * title - * text - * text_bytes - */ - return []; + $fields['category'] = $engine->makeSearchFieldMapping( + 'category', + SearchIndexField::INDEX_TYPE_TEXT + ); + + $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD ); + + $fields['external_link'] = $engine->makeSearchFieldMapping( + 'external_link', + SearchIndexField::INDEX_TYPE_KEYWORD + ); + + $fields['outgoing_link'] = $engine->makeSearchFieldMapping( + 'outgoing_link', + SearchIndexField::INDEX_TYPE_KEYWORD + ); + + $fields['template'] = $engine->makeSearchFieldMapping( + 'template', + SearchIndexField::INDEX_TYPE_KEYWORD + ); + + $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD ); + + return $fields; } /** @@ -1298,16 +1317,26 @@ */ public function getDataForSearchIndex( WikiPage $page, ParserOutput $output, SearchEngine $engine ) { - $fields = []; + $fieldData = []; $content = $page->getContent(); + if ( $content ) { + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $fieldData['category'] = $searchDataExtractor->getCategories( $output ); + $fieldData['external_link'] = $searchDataExtractor->getExternalLinks( $output ); + $fieldData['outgoing_link'] = $searchDataExtractor->getOutgoingLinks( $output ); + $fieldData['template'] = $searchDataExtractor->getTemplates( $output ); + $text = $content->getTextForSearchIndex(); - $fields['text'] = $text; - $fields['source_text'] = $text; - $fields['text_bytes'] = $content->getSize(); + + $fieldData['text'] = $text; + $fieldData['source_text'] = $text; + $fieldData['text_bytes'] = $content->getSize(); } - Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, $output, $engine ] ); - return $fields; + + Hooks::run( 'SearchDataForIndex', [ &$fieldData, $this, $page, $output, $engine ] ); + return $fieldData; } /** diff --git a/includes/content/WikiTextStructure.php b/includes/content/WikiTextStructure.php index e83c213..9768d36 100644 --- a/includes/content/WikiTextStructure.php +++ b/includes/content/WikiTextStructure.php @@ -59,50 +59,6 @@ } /** - * Get categories in the text. - * @return string[] - */ - public function categories() { - $categories = []; - foreach ( array_keys( $this->parserOutput->getCategories() ) as $key ) { - $categories[] = Category::newFromName( $key )->getTitle()->getText(); - } - return $categories; - } - - /** - * Get outgoing links. - * @return string[] - */ - public function outgoingLinks() { - $outgoingLinks = []; - foreach ( $this->parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) { - foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) { - $outgoingLinks[] = - Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey(); - } - } - return $outgoingLinks; - } - - /** - * Get templates in the text. - * @return string[] - */ - public function templates() { - $templates = []; - foreach ( $this->parserOutput->getTemplates() as $tNS => $templatesInNS ) { - foreach ( array_keys( $templatesInNS ) as $tDbKey ) { - $templateTitle = Title::makeTitleSafe( $tNS, $tDbKey ); - if ( $templateTitle && $templateTitle->exists() ) { - $templates[] = $templateTitle->getPrefixedText(); - } - } - } - return $templates; - } - - /** * Get headings on the page. * @return string[] * First strip out things that look like references. We can't use HTML filtering because diff --git a/includes/content/WikitextContentHandler.php b/includes/content/WikitextContentHandler.php index 9baf643..e51d246 100644 --- a/includes/content/WikitextContentHandler.php +++ b/includes/content/WikitextContentHandler.php @@ -111,13 +111,6 @@ public function getFieldsForSearchIndex( SearchEngine $engine ) { $fields = parent::getFieldsForSearchIndex( $engine ); - $fields['category'] = - $engine->makeSearchFieldMapping( 'category', SearchIndexField::INDEX_TYPE_TEXT ); - $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD ); - - $fields['external_link'] = - $engine->makeSearchFieldMapping( 'external_link', SearchIndexField::INDEX_TYPE_KEYWORD ); - $fields['heading'] = $engine->makeSearchFieldMapping( 'heading', SearchIndexField::INDEX_TYPE_TEXT ); $fields['heading']->setFlag( SearchIndexField::FLAG_SCORING ); @@ -129,13 +122,6 @@ $engine->makeSearchFieldMapping( 'opening_text', SearchIndexField::INDEX_TYPE_TEXT ); $fields['opening_text']->setFlag( SearchIndexField::FLAG_SCORING | SearchIndexField::FLAG_NO_HIGHLIGHT ); - - $fields['outgoing_link'] = - $engine->makeSearchFieldMapping( 'outgoing_link', SearchIndexField::INDEX_TYPE_KEYWORD ); - - $fields['template'] = - $engine->makeSearchFieldMapping( 'template', SearchIndexField::INDEX_TYPE_KEYWORD ); - $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD ); // FIXME: this really belongs in separate file handler but files // do not have separate handler. Sadness. @@ -165,11 +151,7 @@ $fields = parent::getDataForSearchIndex( $page, $parserOutput, $engine ); $structure = new WikiTextStructure( $parserOutput ); - $fields['external_link'] = array_keys( $parserOutput->getExternalLinks() ); - $fields['category'] = $structure->categories(); $fields['heading'] = $structure->headings(); - $fields['outgoing_link'] = $structure->outgoingLinks(); - $fields['template'] = $structure->templates(); // text fields $fields['opening_text'] = $structure->getOpeningText(); $fields['text'] = $structure->getMainText(); // overwrites one from ContentHandler diff --git a/includes/search/DummySearchIndexFieldDefinition.php b/includes/search/DummySearchIndexFieldDefinition.php new file mode 100644 index 0000000..a2a6760 --- /dev/null +++ b/includes/search/DummySearchIndexFieldDefinition.php @@ -0,0 +1,30 @@ +<?php + +/** + * Dummy implementation of SearchIndexFieldDefinition for testing purposes. + * + * @since 1.28 + */ +class DummySearchIndexFieldDefinition extends SearchIndexFieldDefinition { + + /** + * @param SearchEngine $engine + * + * @return array + */ + public function getMapping( SearchEngine $engine ) { + $mapping = [ + 'name' => $this->name, + 'type' => $this->type, + 'flags' => $this->flags, + 'subfields' => [] + ]; + + foreach ( $this->subfields as $subfield ) { + $mapping['subfields'][] = $subfield->getMapping(); + } + + return $mapping; + } + +} diff --git a/includes/search/ParserOutputSearchDataExtractor.php b/includes/search/ParserOutputSearchDataExtractor.php new file mode 100644 index 0000000..df653f1 --- /dev/null +++ b/includes/search/ParserOutputSearchDataExtractor.php @@ -0,0 +1,92 @@ +<?php + +namespace MediaWiki\Search; + +use Category; +use ParserOutput; +use Title; + +/** + * Extracts data from ParserOutput for indexing in the search engine. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + * + * @since 1.28 + */ +class ParserOutputSearchDataExtractor { + + /** + * Get a list of categories, as an array with title text strings. + * + * @return string[] + */ + public function getCategories( ParserOutput $parserOutput ) { + $categories = []; + + foreach ( $parserOutput->getCategoryLinks() as $key ) { + $categories[] = Category::newFromName( $key )->getTitle()->getText(); + } + + return $categories; + } + + /** + * Get a list of external links from ParserOutput, as an array of strings. + * + * @return string[] + */ + public function getExternalLinks( ParserOutput $parserOutput ) { + return array_keys( $parserOutput->getExternalLinks() ); + } + + /** + * Get a list of outgoing wiki links (including interwiki links), as + * an array of prefixed title strings. + * + * @return string[] + */ + public function getOutgoingLinks( ParserOutput $parserOutput ) { + $outgoingLinks = []; + + foreach ( $parserOutput->getLinks() as $linkedNamespace => $namespaceLinks ) { + foreach ( array_keys( $namespaceLinks ) as $linkedDbKey ) { + $outgoingLinks[] = + Title::makeTitle( $linkedNamespace, $linkedDbKey )->getPrefixedDBkey(); + } + } + + return $outgoingLinks; + } + + /** + * Get a list of templates used in the ParserOutput content, as prefixed title strings + * + * @return string[] + */ + public function getTemplates( ParserOutput $parserOutput ) { + $templates = []; + + foreach ( $parserOutput->getTemplates() as $tNS => $templatesInNS ) { + foreach ( array_keys( $templatesInNS ) as $tDbKey ) { + $templateTitle = Title::makeTitle( $tNS, $tDbKey ); + $templates[] = $templateTitle->getPrefixedText(); + } + } + + return $templates; + } + +} diff --git a/includes/search/SearchIndexFieldDefinition.php b/includes/search/SearchIndexFieldDefinition.php index 3a86c82..8a06b65 100644 --- a/includes/search/SearchIndexFieldDefinition.php +++ b/includes/search/SearchIndexFieldDefinition.php @@ -2,8 +2,10 @@ /** * Basic infrastructure of the field definition. - * Specific engines will need to override it at least for getMapping, - * but can reuse other parts. + * + * Specific engines should extend this class and at at least, + * override the getMapping method, but can reuse other parts. + * * @since 1.28 */ abstract class SearchIndexFieldDefinition implements SearchIndexField { @@ -115,4 +117,12 @@ $this->subfields = $subfields; return $this; } + + /** + * @param SearchEngine $engine + * + * @return array + */ + abstract public function getMapping( SearchEngine $engine ); + } diff --git a/tests/phpunit/includes/content/ContentHandlerTest.php b/tests/phpunit/includes/content/ContentHandlerTest.php index bb9050f..561f211 100644 --- a/tests/phpunit/includes/content/ContentHandlerTest.php +++ b/tests/phpunit/includes/content/ContentHandlerTest.php @@ -415,6 +415,32 @@ $this->assertInstanceOf( $handlerClass, $handler ); } + public function testGetFieldsForSearchIndex() { + $searchEngine = $this->newSearchEngine(); + + $handler = ContentHandler::getForModelID( CONTENT_MODEL_WIKITEXT ); + + $fields = $handler->getFieldsForSearchIndex( $searchEngine ); + + $this->assertArrayHasKey( 'category', $fields ); + $this->assertArrayHasKey( 'external_link', $fields ); + $this->assertArrayHasKey( 'outgoing_link', $fields ); + $this->assertArrayHasKey( 'template', $fields ); + } + + private function newSearchEngine() { + $searchEngine = $this->getMockBuilder( 'SearchEngine' ) + ->getMock(); + + $searchEngine->expects( $this->any() ) + ->method( 'makeSearchFieldMapping' ) + ->will( $this->returnCallback( function( $name, $type ) { + return new DummySearchIndexFieldDefinition( $name, $type ); + } ) ); + + return $searchEngine; + } + /** * @covers ContentHandler::getDataForSearchIndex */ @@ -425,7 +451,7 @@ $this->setTemporaryHook( 'SearchDataForIndex', function ( &$fields, ContentHandler $handler, WikiPage $page, ParserOutput $output, - SearchEngine $engine ) { + SearchEngine $engine ) { $fields['testDataField'] = 'test content'; } ); diff --git a/tests/phpunit/includes/content/WikitextStructureTest.php b/tests/phpunit/includes/content/WikitextStructureTest.php index 6d83057..4301fb8 100644 --- a/tests/phpunit/includes/content/WikitextStructureTest.php +++ b/tests/phpunit/includes/content/WikitextStructureTest.php @@ -25,61 +25,6 @@ return new WikiTextStructure( $this->getParserOutput( $text ) ); } - public function testCategories() { - $text = <<<END -We also have a {{Template}} and an {{Another template}} in addition. -This text also has [[Category:Some Category| ]] and then [[Category:Yet another category]]. -And [[Category:Some Category| this category]] is repeated. -END; - $struct = $this->getStructure( $text ); - $cats = $struct->categories(); - $this->assertCount( 2, $cats ); - $this->assertContains( "Some Category", $cats ); - $this->assertContains( "Yet another category", $cats ); - } - - public function testOutgoingLinks() { - $text = <<<END -Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked twice. -We also have [[File:Image.jpg|image]]. -We also have a {{Template}} and an {{Another template}} in addition. -Some templates are {{lowercase}}. -And [[Some_Page]] is linked again. -It also has [[Category:Some Category| ]] and then [[Category:Yet another category]]. -Also link to a [[Talk:TestTitle|talk page]] is here. -END; - $struct = $this->getStructure( $text ); - $links = $struct->outgoingLinks(); - $this->assertContains( "Some_Page", $links ); - $this->assertContains( "Template:Template", $links ); - $this->assertContains( "Template:Another_template", $links ); - $this->assertContains( "Template:Lowercase", $links ); - $this->assertContains( "Talk:TestTitle", $links ); - $this->assertCount( 5, $links ); - } - - public function testTemplates() { - $text = <<<END -We have a {{Template}} and an {{Another template}} in addition. -Some templates are {{lowercase}}. And this {{Template}} is repeated. -Here is {{another_template|with=argument}}. -This is a template that {{Xdoes not exist}}. -END; - $this->setTemporaryHook( 'TitleExists', function ( Title $title, &$exists ) { - $txt = $title->getBaseText(); - if ( $txt[0] != 'X' ) { - $exists = true; - } - return true; - } ); - $struct = $this->getStructure( $text ); - $templates = $struct->templates(); - $this->assertCount( 3, $templates ); - $this->assertContains( "Template:Template", $templates ); - $this->assertContains( "Template:Another template", $templates ); - $this->assertContains( "Template:Lowercase", $templates ); - } - public function testHeadings() { $text = <<<END Some text here diff --git a/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php b/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php new file mode 100644 index 0000000..69d0b76 --- /dev/null +++ b/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php @@ -0,0 +1,70 @@ +<?php + +use MediaWiki\Search\ParserOutputSearchDataExtractor; + +/** + * @group Search + * @covers MediaWiki\Search\ParserOutputSearchDataExtractor + */ +class ParserOutputSearchDataExtractorTest extends MediaWikiLangTestCase { + + public function testGetCategories() { + $categories = [ + 'Foo_bar' => 'Bar', + 'New_page' => '' + ]; + + $parserOutput = new ParserOutput( '', [], $categories ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $this->assertEquals( + [ 'Foo bar', 'New page' ], + $searchDataExtractor->getCategories( $parserOutput ) + ); + } + + public function testGetExternalLinks() { + $parserOutput = new ParserOutput(); + + $parserOutput->addExternalLink( 'https://foo' ); + $parserOutput->addExternalLink( 'https://bar' ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $this->assertEquals( + [ 'https://foo', 'https://bar' ], + $searchDataExtractor->getExternalLinks( $parserOutput ) + ); + } + + public function testGetOutgoingLinks() { + $parserOutput = new ParserOutput(); + + $parserOutput->addLink( Title::makeTitle( NS_MAIN, 'Foo_bar' ), 1 ); + $parserOutput->addLink( Title::makeTitle( NS_HELP, 'Contents' ), 2 ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + // this indexes links with db key + $this->assertEquals( + [ 'Foo_bar', 'Help:Contents' ], + $searchDataExtractor->getOutgoingLinks( $parserOutput ) + ); + } + + public function testGetTemplates() { + $title = Title::makeTitle( NS_TEMPLATE, 'Cite_news' ); + + $parserOutput = new ParserOutput(); + $parserOutput->addTemplate( $title, 10, 100 ); + + $searchDataExtractor = new ParserOutputSearchDataExtractor(); + + $this->assertEquals( + [ 'Template:Cite news' ], + $searchDataExtractor->getTemplates( $parserOutput ) + ); + } + +} -- To view, visit https://gerrit.wikimedia.org/r/303838 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I69b010b893135e53fac7f16f4b927b8fbcba06d2 Gerrit-PatchSet: 23 Gerrit-Project: mediawiki/core Gerrit-Branch: master Gerrit-Owner: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits