jenkins-bot has submitted this change and it was merged.

Change subject: Extract ParserOutput search index data fields from 
WikiTextContentHandler
......................................................................


Extract ParserOutput search index data fields from WikiTextContentHandler

Bug: T142491
Change-Id: I69b010b893135e53fac7f16f4b927b8fbcba06d2
---
M autoload.php
M includes/content/ContentHandler.php
M includes/content/WikiTextStructure.php
M includes/content/WikitextContentHandler.php
A includes/search/DummySearchIndexFieldDefinition.php
A includes/search/ParserOutputSearchDataExtractor.php
M includes/search/SearchIndexFieldDefinition.php
M tests/phpunit/includes/content/ContentHandlerTest.php
M tests/phpunit/includes/content/WikitextStructureTest.php
A tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php
10 files changed, 281 insertions(+), 139 deletions(-)

Approvals:
  Smalyshev: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/autoload.php b/autoload.php
index f6edd94..f75c5af 100644
--- a/autoload.php
+++ b/autoload.php
@@ -372,6 +372,7 @@
        'DoubleRedirectsPage' => __DIR__ . 
'/includes/specials/SpecialDoubleRedirects.php',
        'DoubleReplacer' => __DIR__ . 
'/includes/libs/replacers/DoubleReplacer.php',
        'DummyLinker' => __DIR__ . '/includes/DummyLinker.php',
+       'DummySearchIndexFieldDefinition' => __DIR__ . 
'/includes/search/DummySearchIndexFieldDefinition.php',
        'DummyTermColorer' => __DIR__ . '/maintenance/term/MWTerm.php',
        'Dump7ZipOutput' => __DIR__ . '/includes/export/Dump7ZipOutput.php',
        'DumpBZip2Output' => __DIR__ . '/includes/export/DumpBZip2Output.php',
@@ -856,6 +857,7 @@
        'MediaWiki\\Logger\\NullSpi' => __DIR__ . 
'/includes/debug/logger/NullSpi.php',
        'MediaWiki\\Logger\\Spi' => __DIR__ . '/includes/debug/logger/Spi.php',
        'MediaWiki\\MediaWikiServices' => __DIR__ . 
'/includes/MediaWikiServices.php',
+       'MediaWiki\\Search\\ParserOutputSearchDataExtractor' => __DIR__ . 
'/includes/search/ParserOutputSearchDataExtractor.php',
        'MediaWiki\\Services\\CannotReplaceActiveServiceException' => __DIR__ . 
'/includes/Services/CannotReplaceActiveServiceException.php',
        'MediaWiki\\Services\\ContainerDisabledException' => __DIR__ . 
'/includes/Services/ContainerDisabledException.php',
        'MediaWiki\\Services\\DestructibleService' => __DIR__ . 
'/includes/Services/DestructibleService.php',
diff --git a/includes/content/ContentHandler.php 
b/includes/content/ContentHandler.php
index 7184980..3a75f50 100644
--- a/includes/content/ContentHandler.php
+++ b/includes/content/ContentHandler.php
@@ -1,4 +1,7 @@
 <?php
+
+use MediaWiki\Search\ParserOutputSearchDataExtractor;
+
 /**
  * Base class for content handling.
  *
@@ -1251,24 +1254,40 @@
 
        /**
         * Get fields definition for search index
+        *
+        * @todo Expose title, redirect, namespace, text, source_text, 
text_bytes
+        *       field mappings here. (see T142670 and T143409)
+        *
         * @param SearchEngine $engine
         * @return SearchIndexField[] List of fields this content handler can 
provide.
         * @since 1.28
         */
        public function getFieldsForSearchIndex( SearchEngine $engine ) {
-               /* Default fields:
-               /*
-                * namespace
-                * namespace_text
-                * redirect
-                * source_text
-                * suggest
-                * timestamp
-                * title
-                * text
-                * text_bytes
-                */
-               return [];
+               $fields['category'] = $engine->makeSearchFieldMapping(
+                       'category',
+                       SearchIndexField::INDEX_TYPE_TEXT
+               );
+
+               $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD );
+
+               $fields['external_link'] = $engine->makeSearchFieldMapping(
+                       'external_link',
+                       SearchIndexField::INDEX_TYPE_KEYWORD
+               );
+
+               $fields['outgoing_link'] = $engine->makeSearchFieldMapping(
+                       'outgoing_link',
+                       SearchIndexField::INDEX_TYPE_KEYWORD
+               );
+
+               $fields['template'] = $engine->makeSearchFieldMapping(
+                       'template',
+                       SearchIndexField::INDEX_TYPE_KEYWORD
+               );
+
+               $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD );
+
+               return $fields;
        }
 
        /**
@@ -1298,16 +1317,26 @@
         */
        public function getDataForSearchIndex( WikiPage $page, ParserOutput 
$output,
                                               SearchEngine $engine ) {
-               $fields = [];
+               $fieldData = [];
                $content = $page->getContent();
+
                if ( $content ) {
+                       $searchDataExtractor = new 
ParserOutputSearchDataExtractor();
+
+                       $fieldData['category'] = 
$searchDataExtractor->getCategories( $output );
+                       $fieldData['external_link'] = 
$searchDataExtractor->getExternalLinks( $output );
+                       $fieldData['outgoing_link'] = 
$searchDataExtractor->getOutgoingLinks( $output );
+                       $fieldData['template'] = 
$searchDataExtractor->getTemplates( $output );
+
                        $text = $content->getTextForSearchIndex();
-                       $fields['text'] = $text;
-                       $fields['source_text'] = $text;
-                       $fields['text_bytes'] = $content->getSize();
+
+                       $fieldData['text'] = $text;
+                       $fieldData['source_text'] = $text;
+                       $fieldData['text_bytes'] = $content->getSize();
                }
-               Hooks::run( 'SearchDataForIndex', [ &$fields, $this, $page, 
$output, $engine ] );
-               return $fields;
+
+               Hooks::run( 'SearchDataForIndex', [ &$fieldData, $this, $page, 
$output, $engine ] );
+               return $fieldData;
        }
 
        /**
diff --git a/includes/content/WikiTextStructure.php 
b/includes/content/WikiTextStructure.php
index e83c213..9768d36 100644
--- a/includes/content/WikiTextStructure.php
+++ b/includes/content/WikiTextStructure.php
@@ -59,50 +59,6 @@
        }
 
        /**
-        * Get categories in the text.
-        * @return string[]
-        */
-       public function categories() {
-               $categories = [];
-               foreach ( array_keys( $this->parserOutput->getCategories() ) as 
$key ) {
-                       $categories[] = Category::newFromName( $key 
)->getTitle()->getText();
-               }
-               return $categories;
-       }
-
-       /**
-        * Get outgoing links.
-        * @return string[]
-        */
-       public function outgoingLinks() {
-               $outgoingLinks = [];
-               foreach ( $this->parserOutput->getLinks() as $linkedNamespace 
=> $namespaceLinks ) {
-                       foreach ( array_keys( $namespaceLinks ) as $linkedDbKey 
) {
-                               $outgoingLinks[] =
-                                       Title::makeTitle( $linkedNamespace, 
$linkedDbKey )->getPrefixedDBkey();
-                       }
-               }
-               return $outgoingLinks;
-       }
-
-       /**
-        * Get templates in the text.
-        * @return string[]
-        */
-       public function templates() {
-               $templates = [];
-               foreach ( $this->parserOutput->getTemplates() as $tNS => 
$templatesInNS ) {
-                       foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
-                               $templateTitle = Title::makeTitleSafe( $tNS, 
$tDbKey );
-                               if ( $templateTitle && $templateTitle->exists() 
) {
-                                       $templates[] = 
$templateTitle->getPrefixedText();
-                               }
-                       }
-               }
-               return $templates;
-       }
-
-       /**
         * Get headings on the page.
         * @return string[]
         * First strip out things that look like references.  We can't use HTML 
filtering because
diff --git a/includes/content/WikitextContentHandler.php 
b/includes/content/WikitextContentHandler.php
index 9baf643..e51d246 100644
--- a/includes/content/WikitextContentHandler.php
+++ b/includes/content/WikitextContentHandler.php
@@ -111,13 +111,6 @@
        public function getFieldsForSearchIndex( SearchEngine $engine ) {
                $fields = parent::getFieldsForSearchIndex( $engine );
 
-               $fields['category'] =
-                       $engine->makeSearchFieldMapping( 'category', 
SearchIndexField::INDEX_TYPE_TEXT );
-               $fields['category']->setFlag( SearchIndexField::FLAG_CASEFOLD );
-
-               $fields['external_link'] =
-                       $engine->makeSearchFieldMapping( 'external_link', 
SearchIndexField::INDEX_TYPE_KEYWORD );
-
                $fields['heading'] =
                        $engine->makeSearchFieldMapping( 'heading', 
SearchIndexField::INDEX_TYPE_TEXT );
                $fields['heading']->setFlag( SearchIndexField::FLAG_SCORING );
@@ -129,13 +122,6 @@
                        $engine->makeSearchFieldMapping( 'opening_text', 
SearchIndexField::INDEX_TYPE_TEXT );
                $fields['opening_text']->setFlag( 
SearchIndexField::FLAG_SCORING |
                                                  
SearchIndexField::FLAG_NO_HIGHLIGHT );
-
-               $fields['outgoing_link'] =
-                       $engine->makeSearchFieldMapping( 'outgoing_link', 
SearchIndexField::INDEX_TYPE_KEYWORD );
-
-               $fields['template'] =
-                       $engine->makeSearchFieldMapping( 'template', 
SearchIndexField::INDEX_TYPE_KEYWORD );
-               $fields['template']->setFlag( SearchIndexField::FLAG_CASEFOLD );
 
                // FIXME: this really belongs in separate file handler but files
                // do not have separate handler. Sadness.
@@ -165,11 +151,7 @@
                $fields = parent::getDataForSearchIndex( $page, $parserOutput, 
$engine );
 
                $structure = new WikiTextStructure( $parserOutput );
-               $fields['external_link'] = array_keys( 
$parserOutput->getExternalLinks() );
-               $fields['category'] = $structure->categories();
                $fields['heading'] = $structure->headings();
-               $fields['outgoing_link'] = $structure->outgoingLinks();
-               $fields['template'] = $structure->templates();
                // text fields
                $fields['opening_text'] = $structure->getOpeningText();
                $fields['text'] = $structure->getMainText(); // overwrites one 
from ContentHandler
diff --git a/includes/search/DummySearchIndexFieldDefinition.php 
b/includes/search/DummySearchIndexFieldDefinition.php
new file mode 100644
index 0000000..a2a6760
--- /dev/null
+++ b/includes/search/DummySearchIndexFieldDefinition.php
@@ -0,0 +1,30 @@
+<?php
+
+/**
+ * Dummy implementation of SearchIndexFieldDefinition for testing purposes.
+ *
+ * @since 1.28
+ */
+class DummySearchIndexFieldDefinition extends SearchIndexFieldDefinition {
+
+       /**
+        * Describe this field and its subfields as a plain array.
+        * @param SearchEngine $engine Forwarded to each subfield's getMapping()
+        * @return array With keys 'name', 'type', 'flags' and 'subfields'
+        */
+       public function getMapping( SearchEngine $engine ) {
+               $mapping = [
+                       'name' => $this->name,
+                       'type' => $this->type,
+                       'flags' => $this->flags,
+                       'subfields' => []
+               ];
+
+               foreach ( $this->subfields as $subfield ) {
+                       $mapping['subfields'][] = $subfield->getMapping( $engine );
+               }
+
+               return $mapping;
+       }
+
+}
diff --git a/includes/search/ParserOutputSearchDataExtractor.php 
b/includes/search/ParserOutputSearchDataExtractor.php
new file mode 100644
index 0000000..df653f1
--- /dev/null
+++ b/includes/search/ParserOutputSearchDataExtractor.php
@@ -0,0 +1,92 @@
+<?php
+
+namespace MediaWiki\Search;
+
+use Category;
+use ParserOutput;
+use Title;
+
+/**
+ * Extracts data from ParserOutput for indexing in the search engine.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ *
+ * @since 1.28
+ */
+class ParserOutputSearchDataExtractor {
+
+       /**
+        * Get a list of categories, as an array with title text strings.
+        *
+        * @return string[]
+        */
+       public function getCategories( ParserOutput $parserOutput ) {
+               $categories = [];
+
+               foreach ( $parserOutput->getCategoryLinks() as $key ) {
+                       $categories[] = Category::newFromName( $key 
)->getTitle()->getText();
+               }
+
+               return $categories;
+       }
+
+       /**
+        * Get a list of external links from ParserOutput, as an array of 
strings.
+        *
+        * @return string[]
+        */
+       public function getExternalLinks( ParserOutput $parserOutput ) {
+               return array_keys( $parserOutput->getExternalLinks() );
+       }
+
+       /**
+        * Get a list of outgoing wiki links (including interwiki links), as
+        * an array of prefixed title strings.
+        *
+        * @return string[]
+        */
+       public function getOutgoingLinks( ParserOutput $parserOutput ) {
+               $outgoingLinks = [];
+
+               foreach ( $parserOutput->getLinks() as $linkedNamespace => 
$namespaceLinks ) {
+                       foreach ( array_keys( $namespaceLinks ) as $linkedDbKey 
) {
+                               $outgoingLinks[] =
+                                       Title::makeTitle( $linkedNamespace, 
$linkedDbKey )->getPrefixedDBkey();
+                       }
+               }
+
+               return $outgoingLinks;
+       }
+
+       /**
+        * Get a list of templates used in the ParserOutput content, as 
prefixed title strings
+        *
+        * @return string[]
+        */
+       public function getTemplates( ParserOutput $parserOutput ) {
+               $templates = [];
+
+               foreach ( $parserOutput->getTemplates() as $tNS => 
$templatesInNS ) {
+                       foreach ( array_keys( $templatesInNS ) as $tDbKey ) {
+                               $templateTitle = Title::makeTitle( $tNS, 
$tDbKey );
+                               $templates[] = 
$templateTitle->getPrefixedText();
+                       }
+               }
+
+               return $templates;
+       }
+
+}
diff --git a/includes/search/SearchIndexFieldDefinition.php 
b/includes/search/SearchIndexFieldDefinition.php
index 3a86c82..8a06b65 100644
--- a/includes/search/SearchIndexFieldDefinition.php
+++ b/includes/search/SearchIndexFieldDefinition.php
@@ -2,8 +2,10 @@
 
 /**
  * Basic infrastructure of the field definition.
- * Specific engines will need to override it at least for getMapping,
- * but can reuse other parts.
+ *
+ * Specific engines should extend this class and, at the very least,
+ * override the getMapping method, but can reuse other parts.
+ *
  * @since 1.28
  */
 abstract class SearchIndexFieldDefinition implements SearchIndexField {
@@ -115,4 +117,12 @@
                $this->subfields = $subfields;
                return $this;
        }
+
+       /**
+        * @param SearchEngine $engine
+        *
+        * @return array
+        */
+       abstract public function getMapping( SearchEngine $engine );
+
 }
diff --git a/tests/phpunit/includes/content/ContentHandlerTest.php 
b/tests/phpunit/includes/content/ContentHandlerTest.php
index bb9050f..561f211 100644
--- a/tests/phpunit/includes/content/ContentHandlerTest.php
+++ b/tests/phpunit/includes/content/ContentHandlerTest.php
@@ -415,6 +415,32 @@
                $this->assertInstanceOf( $handlerClass, $handler );
        }
 
+       public function testGetFieldsForSearchIndex() {
+               $searchEngine = $this->newSearchEngine();
+
+               $handler = ContentHandler::getForModelID( 
CONTENT_MODEL_WIKITEXT );
+
+               $fields = $handler->getFieldsForSearchIndex( $searchEngine );
+
+               $this->assertArrayHasKey( 'category', $fields );
+               $this->assertArrayHasKey( 'external_link', $fields );
+               $this->assertArrayHasKey( 'outgoing_link', $fields );
+               $this->assertArrayHasKey( 'template', $fields );
+       }
+
+       private function newSearchEngine() {
+               $searchEngine = $this->getMockBuilder( 'SearchEngine' )
+                       ->getMock();
+
+               $searchEngine->expects( $this->any() )
+                       ->method( 'makeSearchFieldMapping' )
+                       ->will( $this->returnCallback( function( $name, $type ) 
{
+                                       return new 
DummySearchIndexFieldDefinition( $name, $type );
+                       } ) );
+
+               return $searchEngine;
+       }
+
        /**
         * @covers ContentHandler::getDataForSearchIndex
         */
@@ -425,7 +451,7 @@
 
                $this->setTemporaryHook( 'SearchDataForIndex',
                        function ( &$fields, ContentHandler $handler, WikiPage 
$page, ParserOutput $output,
-                                  SearchEngine $engine ) {
+                                          SearchEngine $engine ) {
                                $fields['testDataField'] = 'test content';
                        } );
 
diff --git a/tests/phpunit/includes/content/WikitextStructureTest.php 
b/tests/phpunit/includes/content/WikitextStructureTest.php
index 6d83057..4301fb8 100644
--- a/tests/phpunit/includes/content/WikitextStructureTest.php
+++ b/tests/phpunit/includes/content/WikitextStructureTest.php
@@ -25,61 +25,6 @@
                return new WikiTextStructure( $this->getParserOutput( $text ) );
        }
 
-       public function testCategories() {
-               $text = <<<END
-We also have a {{Template}} and an {{Another template}} in addition. 
-This text also has [[Category:Some Category| ]] and then [[Category:Yet 
another category]].
-And [[Category:Some Category| this category]] is repeated.
-END;
-               $struct = $this->getStructure( $text );
-               $cats = $struct->categories();
-               $this->assertCount( 2, $cats );
-               $this->assertContains( "Some Category", $cats );
-               $this->assertContains( "Yet another category", $cats );
-       }
-
-       public function testOutgoingLinks() {
-               $text = <<<END
-Here I add link to [[Some Page]]. And [[Some Page|This same page]] gets linked 
twice. 
-We also have [[File:Image.jpg|image]].
-We also have a {{Template}} and an {{Another template}} in addition. 
-Some templates are {{lowercase}}.
-And [[Some_Page]] is linked again. 
-It also has [[Category:Some Category| ]] and then [[Category:Yet another 
category]].
-Also link to a [[Talk:TestTitle|talk page]] is here. 
-END;
-               $struct = $this->getStructure( $text );
-               $links = $struct->outgoingLinks();
-               $this->assertContains( "Some_Page", $links );
-               $this->assertContains( "Template:Template", $links );
-               $this->assertContains( "Template:Another_template", $links );
-               $this->assertContains( "Template:Lowercase", $links );
-               $this->assertContains( "Talk:TestTitle", $links );
-               $this->assertCount( 5, $links );
-       }
-
-       public function testTemplates() {
-               $text = <<<END
-We have a {{Template}} and an {{Another template}} in addition. 
-Some templates are {{lowercase}}. And this {{Template}} is repeated. 
-Here is {{another_template|with=argument}}.
-This is a template that {{Xdoes not exist}}.
-END;
-               $this->setTemporaryHook( 'TitleExists', function ( Title 
$title, &$exists ) {
-                       $txt = $title->getBaseText();
-                       if ( $txt[0] != 'X' ) {
-                               $exists = true;
-                       }
-                       return true;
-               } );
-               $struct = $this->getStructure( $text );
-               $templates = $struct->templates();
-               $this->assertCount( 3, $templates );
-               $this->assertContains( "Template:Template", $templates );
-               $this->assertContains( "Template:Another template", $templates 
);
-               $this->assertContains( "Template:Lowercase", $templates );
-       }
-
        public function testHeadings() {
                $text = <<<END
 Some text here
diff --git 
a/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php 
b/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php
new file mode 100644
index 0000000..69d0b76
--- /dev/null
+++ b/tests/phpunit/includes/search/ParserOutputSearchDataExtractorTest.php
@@ -0,0 +1,70 @@
+<?php
+
+use MediaWiki\Search\ParserOutputSearchDataExtractor;
+
+/**
+ * @group Search
+ * @covers MediaWiki\Search\ParserOutputSearchDataExtractor
+ */
+class ParserOutputSearchDataExtractorTest extends MediaWikiLangTestCase {
+
+       public function testGetCategories() {
+               $categories = [
+                       'Foo_bar' => 'Bar',
+                       'New_page' => ''
+               ];
+
+               $parserOutput = new ParserOutput( '', [], $categories );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               $this->assertEquals(
+                       [ 'Foo bar', 'New page' ],
+                       $searchDataExtractor->getCategories( $parserOutput )
+               );
+       }
+
+       public function testGetExternalLinks() {
+               $parserOutput = new ParserOutput();
+
+               $parserOutput->addExternalLink( 'https://foo' );
+               $parserOutput->addExternalLink( 'https://bar' );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               $this->assertEquals(
+                       [ 'https://foo', 'https://bar' ],
+                       $searchDataExtractor->getExternalLinks( $parserOutput )
+               );
+       }
+
+       public function testGetOutgoingLinks() {
+               $parserOutput = new ParserOutput();
+
+               $parserOutput->addLink( Title::makeTitle( NS_MAIN, 'Foo_bar' ), 
1 );
+               $parserOutput->addLink( Title::makeTitle( NS_HELP, 'Contents' 
), 2 );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               // this indexes links with db key
+               $this->assertEquals(
+                       [ 'Foo_bar', 'Help:Contents' ],
+                       $searchDataExtractor->getOutgoingLinks( $parserOutput )
+               );
+       }
+
+       public function testGetTemplates() {
+               $title = Title::makeTitle( NS_TEMPLATE, 'Cite_news' );
+
+               $parserOutput = new ParserOutput();
+               $parserOutput->addTemplate( $title, 10, 100 );
+
+               $searchDataExtractor = new ParserOutputSearchDataExtractor();
+
+               $this->assertEquals(
+                       [ 'Template:Cite news' ],
+                       $searchDataExtractor->getTemplates( $parserOutput )
+               );
+       }
+
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/303838
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I69b010b893135e53fac7f16f4b927b8fbcba06d2
Gerrit-PatchSet: 23
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: Aude <aude.w...@gmail.com>
Gerrit-Reviewer: Aude <aude.w...@gmail.com>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to