jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/281077 )

Change subject: Add deleted archive titles indexing and search
......................................................................


Add deleted archive titles indexing and search

Query in Searcher.php may need some tuning yet.

Bug: T109561
Change-Id: I48382f24d91b4421b6a1754d7f30c18b173263db
Depends-On: Id6099fe9fbf18481068a6f0a329bbde0d218135f
---
M CirrusSearch.php
M autoload.php
M docs/settings.txt
M includes/CirrusSearch.php
M includes/Connection.php
M includes/DataSender.php
M includes/Hooks.php
A includes/Job/DeleteArchive.php
M includes/Job/DeletePages.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Searcher.php
M includes/Updater.php
M maintenance/forceSearchIndex.php
M maintenance/updateOneSearchIndexConfig.php
M tests/browser/features/step_definitions/search_steps.rb
A tests/browser/features/support/pages/undelete_page.rb
M tests/browser/features/update_general_api.feature
M tests/jenkins/FullyFeaturedConfig.php
M tests/unit/SearcherTest.php
A tests/unit/fixtures/archiveSearch/namespaced.expected
A tests/unit/fixtures/archiveSearch/namespaced.query
A tests/unit/fixtures/archiveSearch/simple.expected
A tests/unit/fixtures/archiveSearch/simple.query
A tests/unit/fixtures/archiveSearch/spaces.expected
A tests/unit/fixtures/archiveSearch/spaces.query
25 files changed, 625 insertions(+), 12 deletions(-)

Approvals:
  EBernhardson: Looks good to me, approved
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 613839d..31ef36c 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1107,6 +1107,7 @@
 $wgHooks[ 'ArticleDelete' ][] = 'CirrusSearch\Hooks::onArticleDelete';
 $wgHooks[ 'ArticleDeleteComplete' ][] = 
'CirrusSearch\Hooks::onArticleDeleteComplete';
 $wgHooks[ 'ArticleRevisionVisibilitySet' ][] = 
'CirrusSearch\Hooks::onRevisionDelete';
+$wgHooks[ 'ArticleUndelete' ][] = 'CirrusSearch\Hooks::onArticleUndelete';
 $wgHooks[ 'BeforeInitialize' ][] = 'CirrusSearch\Hooks::onBeforeInitialize';
 $wgHooks[ 'GetBetaFeaturePreferences' ][] = 
'CirrusSearch\Hooks::getBetaFeaturePreferences';
 $wgHooks[ 'GetPreferences' ][] = 'CirrusSearch\Hooks::onGetPreferences';
@@ -1138,6 +1139,7 @@
 $wgJobClasses[ 'cirrusSearchOtherIndex' ] = 'CirrusSearch\Job\OtherIndex';
 $wgJobClasses[ 'cirrusSearchElasticaWrite' ] = 
'CirrusSearch\Job\ElasticaWrite';
 $wgJobClasses[ 'cirrusSearchCheckerJob' ] = 'CirrusSearch\Job\CheckerJob';
+$wgJobClasses[ 'cirrusSearchDeleteArchive' ] = 
'CirrusSearch\Job\DeleteArchive';
 
 /**
  * Actions
@@ -1213,6 +1215,15 @@
  */
 $wgCirrusSearchExtraIndexSettings = [];
 
+/**
+ * Whether to index deleted pages for archiving.
+ */
+$wgCirrusSearchIndexDeletes = false;
+/**
+ * Enable archive search.
+ */
+$wgCirrusSearchEnableArchive = false;
+
 /*
  * Please update docs/settings.txt if you add new values!
  */
diff --git a/autoload.php b/autoload.php
index 095fbb7..93276b4 100644
--- a/autoload.php
+++ b/autoload.php
@@ -59,6 +59,7 @@
        'CirrusSearch\\Iterator\\CallbackIterator' => __DIR__ . 
'/includes/iterator/CallbackIterator.php',
        'CirrusSearch\\Iterator\\IteratorDecorator' => __DIR__ . 
'/includes/iterator/IteratorDecorator.php',
        'CirrusSearch\\Job\\CheckerJob' => __DIR__ . 
'/includes/Job/CheckerJob.php',
+       'CirrusSearch\\Job\\DeleteArchive' => __DIR__ . 
'/includes/Job/DeleteArchive.php',
        'CirrusSearch\\Job\\DeletePages' => __DIR__ . 
'/includes/Job/DeletePages.php',
        'CirrusSearch\\Job\\ElasticaWrite' => __DIR__ . 
'/includes/Job/ElasticaWrite.php',
        'CirrusSearch\\Job\\IncomingLinkCount' => __DIR__ . 
'/includes/Job/IncomingLinkCount.php',
diff --git a/docs/settings.txt b/docs/settings.txt
index f3663a8..6902115 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -1474,3 +1474,17 @@
         'search.slowlog.threshold.fetch.info' => '1s',
         'search.slowlog.threshold.fetch.info' => '800ms',
     ];
+
+; $wgCirrusSearchEnableArchive
+Default:
+    $wgCirrusSearchEnableArchive = false;
+
+Enable searching for deleted pages in the ElasticSearch indexed archive.
+
+; $wgCirrusSearchIndexDeletes
+
+Default:
+    $wgCirrusSearchIndexDeletes = false;
+
+Whether deleted pages are indexed for archive search when a page is deleted. Note
+that archived pages can also be added to the archive index manually.
\ No newline at end of file
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 915cdc8..ecfd76f 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -775,4 +775,35 @@
        public function setDumpAndDie( $dumpAndDie ) {
                $this->dumpAndDie = $dumpAndDie;
        }
+
+       /**
+        * Perform a title search in the article archive.
+        *
+        * @param string $term Raw search term
+        * @return Status<Title[]>
+        */
+       public function searchArchiveTitle ( $term ) {
+
+               if ( !$this->config->get( 'CirrusSearchEnableArchive' ) ) {
+                       return Status::newGood( [] );
+               }
+
+               $term = trim( $term );
+
+               if ( empty( $term ) ) {
+                       return Status::newGood( [] );
+               }
+
+               $searcher = new Searcher( $this->connection, $this->offset, 
$this->limit, $this->config, $this->namespaces,
+                               null, $this->indexBaseName );
+               $searcher->setOptionsFromRequest( $this->request );
+
+               $status = $searcher->searchArchive( $term );
+               if ( $status->isOK() && $searcher->isReturnRaw() ) {
+                       $status->setResult( true,
+                               $searcher->processRawReturn( 
$status->getValue(), $this->request, $this->dumpAndDie ) );
+               }
+               return $status;
+       }
+
 }
diff --git a/includes/Connection.php b/includes/Connection.php
index d3f27be..f6e67b7 100644
--- a/includes/Connection.php
+++ b/includes/Connection.php
@@ -64,6 +64,12 @@
        const TITLE_SUGGEST_TYPE_NAME = 'titlesuggest';
 
        /**
+        * Name of the archive type
+        * @var string
+        */
+       const ARCHIVE_TYPE_NAME = 'archive';
+
+       /**
         * @var SearchConfig
         */
        protected $config;
@@ -215,6 +221,15 @@
        }
 
        /**
+        * Fetch the Elastica Type for archive.
+        * @param mixed $name basename of index
+        * @return \Elastica\Type
+        */
+       public function getArchiveType( $name ) {
+               return $this->getIndex( $name, 'general' )->getType( 
self::ARCHIVE_TYPE_NAME );
+       }
+
+       /**
         * Get all index types we support, content, general, plus custom ones
         *
         * @return string[]
diff --git a/includes/DataSender.php b/includes/DataSender.php
index 8a3cf35..88c9206 100644
--- a/includes/DataSender.php
+++ b/includes/DataSender.php
@@ -159,6 +159,7 @@
        /**
         * @param string $indexType type of index to which to send $data
         * @param (\Elastica\Script|\Elastica\Document)[] $data documents to 
send
+        * @param string $elasticType Mapping type to use for the document
         * @return Status
         */
        public function sendData( $indexType, $data, $elasticType = 
Connection::PAGE_TYPE_NAME ) {
@@ -259,6 +260,7 @@
         *
         * @param string[] $docIds elasticsearch document ids to delete
         * @param string|null $indexType index from which to delete.  null 
means all.
+        * @param string|null $elasticType Mapping type to use for the 
document. null means all types.
         * @return Status
         */
        public function sendDeletes( $docIds, $indexType = null, $elasticType = 
null ) {
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 74b5fcc..d3d1a11 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -826,4 +826,29 @@
                );
                return true;
        }
+
+
+       /**
+        * When an article is undeleted, check the archive for other deleted
+        * instances of the title; if none remain, drop it from the archive index.
+        * @param Title $title
+        * @param bool $create
+        * @param string $comment
+        * @param string $oldPageId
+        * @param array $restoredPages
+        * @return bool
+        */
+       public static function onArticleUndelete( Title $title, $create, 
$comment, $oldPageId, $restoredPages ) {
+               global $wgCirrusSearchIndexDeletes;
+               if ( !$wgCirrusSearchIndexDeletes ) {
+                       // Not indexing, thus nothing to remove here.
+                       return true;
+               }
+               JobQueueGroup::singleton()->push(
+                       new Job\DeleteArchive( $title, [ 'docIds' => 
$restoredPages ] )
+               );
+               return true;
+
+       }
+
 }
diff --git a/includes/Job/DeleteArchive.php b/includes/Job/DeleteArchive.php
new file mode 100644
index 0000000..fb9560b
--- /dev/null
+++ b/includes/Job/DeleteArchive.php
@@ -0,0 +1,44 @@
+<?php
+
+namespace CirrusSearch\Job;
+
+use CirrusSearch\Connection;
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Job wrapper for deleting pages from archive.
+ */
+class DeleteArchive extends Job {
+       public function __construct( $title, $params ) {
+               parent::__construct( $title, $params );
+
+               // Don't remove dupes since we do checks that may return 
different results
+               // Also, deletes are idempotent so it's no problem if we delete 
twice.
+               $this->removeDuplicates = false;
+       }
+
+       protected function doJob() {
+               $archive = new \PageArchive( $this->title );
+               $docs = $this->params['docIds'];
+
+               // Remove page IDs that still have archived revs
+               foreach ( $archive->listRevisions() as $rev ) {
+                       unset( $docs[$rev['ar_page_id']] );
+               }
+
+               if ( empty( $docs ) ) {
+                       // If we have more deleted instances of the same title, 
no need to bother.
+                       return true;
+               }
+
+               $updater = $this->createUpdater();
+               $updater->deletePages(
+                       [ $this->title ],
+                       array_keys( $docs ),
+                       Connection::GENERAL_INDEX_TYPE,
+                       Connection::ARCHIVE_TYPE_NAME
+               );
+
+               return true;
+       }
+}
diff --git a/includes/Job/DeletePages.php b/includes/Job/DeletePages.php
index 406a232..17294dd 100644
--- a/includes/Job/DeletePages.php
+++ b/includes/Job/DeletePages.php
@@ -43,8 +43,20 @@
        }
 
        protected function doJob() {
+               global $wgCirrusSearchIndexDeletes;
                $updater = $this->createUpdater();
                $indexType = isset( $this->params[ 'indexType' ] ) ? 
$this->params[ 'indexType' ] : null;
-               return $updater->deletePages( [ $this->title ], [ 
$this->params[ 'docId' ] ], $indexType );
+               $updater->deletePages( [ $this->title ], [ 
$this->params['docId'] ], $indexType );
+
+               if ( $wgCirrusSearchIndexDeletes ) {
+                       $success = $updater->archivePages( [
+                               [
+                                       'title' => $this->title,
+                                       'page' => $this->params['docId'],
+                               ],
+                       ] );
+               }
+
+               return true;
        }
 }
diff --git a/includes/Maintenance/MappingConfigBuilder.php 
b/includes/Maintenance/MappingConfigBuilder.php
index c0be235..60acaa8 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -270,6 +270,18 @@
                        ],
                ];
 
+               $config[ 'archive' ] = [
+                       'dynamic' => false,
+                       '_all' => [ 'enabled' => false ],
+                       'properties' => [
+                               'namespace' => $page['properties']['namespace'],
+                               'title' => $page['properties']['title'],
+                               'wiki' => $page['properties']['wiki'],
+                       ],
+               ];
+               // Do not use copy settings for archive
+               unset( $config['archive']['properties']['title']['copy_to'] );
+
                Hooks::run( 'CirrusSearchMappingConfig', [ &$config, $this ] );
 
                return $config;
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 50c8069..9db56c6 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -4,12 +4,15 @@
 
 use CirrusSearch\Query\SimpleKeywordFeature;
 use CirrusSearch\Search\FullTextResultsType;
+use CirrusSearch\Search\TitleResultsType;
 use CirrusSearch\Search\ResultsType;
 use CirrusSearch\Search\RescoreBuilder;
 use CirrusSearch\Search\SearchContext;
 use CirrusSearch\Query\FullTextQueryBuilder;
 use CirrusSearch\Elastica\MultiSearch as MultiSearch;
 use Elastica\Exception\RuntimeException;
+use Elastica\Query\BoolQuery;
+use Elastica\Query\MultiMatch;
 use Language;
 use MediaWiki\Logger\LoggerFactory;
 use MediaWiki\MediaWikiServices;
@@ -122,6 +125,12 @@
         * @var SearchContext
         */
        protected $searchContext;
+
+       /**
+        * Indexing type we'll be using.
+        * @var string|\Elastica\Type
+        */
+       private $pageType;
 
        /**
         * Constructor
@@ -559,6 +568,8 @@
                        $query->addParam( 'stats', $syntax );
                }
                switch ( $this->sort ) {
+               case 'just_match':
+                       // Use just matching scores, without any rescoring, and 
default sort.
                case 'relevance':
                        break;  // The default
                case 'title_asc':
@@ -579,6 +590,9 @@
                                'missing' => '_last',
                        ] ] );
                        break;
+               case 'none':
+                       $query->setSort( [ '_doc' ] );
+                       break;
                default:
                        LoggerFactory::getInstance( 'CirrusSearch' )->warning(
                                "Invalid sort type: {sort}",
@@ -595,8 +609,12 @@
                        $queryOptions[\Elastica\Search::OPTION_SEARCH_TYPE] = 
\Elastica\Search::OPTION_SEARCH_TYPE_DFS_QUERY_THEN_FETCH;
                }
 
-               $indexType = $this->connection->pickIndexTypeForNamespaces( 
$namespaces );
-               $pageType = $this->connection->getPageType( 
$this->indexBaseName, $indexType );
+               if ( $this->pageType ) {
+                       $pageType = $this->pageType;
+               } else {
+                       $indexType = 
$this->connection->pickIndexTypeForNamespaces( $namespaces );
+                       $pageType = $this->connection->getPageType( 
$this->indexBaseName, $indexType );
+               }
 
                $search = $pageType->createSearch( $query, $queryOptions );
                foreach ( $extraIndexes as $i ) {
@@ -1070,4 +1088,48 @@
 
                return $result;
        }
+
+       /**
+        * Search titles in archive
+        * @param string $term
+        * @return Status<Title[]>
+        */
+       public function searchArchive( $term ) {
+               list( $term, $fuzzyUnused ) = 
$this->searchContext->escaper()->fixupWholeQueryString( $term );
+               $this->setResultsType( new TitleResultsType( $this->config ) );
+
+               $this->pageType = $this->connection->getArchiveType( 
$this->indexBaseName );
+
+               // Setup the search query
+               $query = new BoolQuery();
+
+               $multi = new MultiMatch();
+               $multi->setType( 'best_fields' );
+               $multi->setTieBreaker( 0 );
+               $multi->setQuery( $term );
+               $multi->setFields( [
+                       'title.near_match^100',
+                       'title.near_match_asciifolding^75',
+                       'title.plain^50',
+                       'title^25'
+               ] );
+               $multi->setOperator( 'AND' );
+
+               $fuzzy = new \Elastica\Query\Match();
+               $fuzzy->setFieldQuery( 'title.plain', $term );
+               $fuzzy->setFieldFuzziness( 'title.plain', 'AUTO' );
+               $fuzzy->setFieldOperator( 'title.plain', 'AND' );
+
+               $query->addShould( $multi );
+               $query->addShould( $fuzzy );
+               $query->setMinimumShouldMatch( 1 );
+
+               $this->sort = 'just_match';
+
+               $this->searchContext->setMainQuery( $query );
+               $this->searchContext->addSyntaxUsed( 'archive' );
+
+               return $this->searchOne();
+       }
+
 }
diff --git a/includes/Updater.php b/includes/Updater.php
index 0b20675..9fe1108 100644
--- a/includes/Updater.php
+++ b/includes/Updater.php
@@ -233,26 +233,84 @@
         *
         * @param Title[] $titles List of titles to delete.  If empty then
         *      other index maintenance is skipped.
-        * @param integer[] $docIds List of elasticsearch document ids to delete
-        * @param string $indexType index from which to delete
-        * @return bool True if nothing happened or we successfully deleted, 
false on failure
+        * @param int[]|string[] $docIds List of elasticsearch document ids to 
delete
+        * @param string|null $indexType index from which to delete.  null 
means all.
+        * @param string $elasticType Mapping type to use for the document
+        * @return bool Always returns true.
         */
-       public function deletePages( $titles, $docIds, $indexType = null ) {
+       public function deletePages( $titles, $docIds, $indexType = null, 
$elasticType = null ) {
                Job\OtherIndex::queueIfRequired( $titles, 
$this->writeToClusterName );
                $job = new Job\ElasticaWrite(
                        $titles ? reset( $titles ) : Title::makeTitle( 0, "" ),
                        [
                                'method' => 'sendDeletes',
-                               'arguments' => [ $docIds, $indexType ],
+                               'arguments' => [ $docIds, $indexType, 
$elasticType ],
                                'cluster' => $this->writeToClusterName,
                        ]
                );
                // This job type will insert itself into the job queue
                // with a delay if writes to ES are currently paused
                $job->run();
+
+               return true;
        }
 
        /**
+        * Add documents to archive index.
+        * @param array $archived
+        * @return bool
+        */
+       public function archivePages( $archived ) {
+               if ( !$this->searchConfig->getElement( 
'CirrusSearchIndexDeletes' ) ) {
+                       // Disabled by config - don't do anything
+                       return true;
+               }
+               $docs = $this->buildArchiveDocuments( $archived );
+               $head = reset( $archived );
+               foreach ( array_chunk( $docs, 10 ) as $chunked ) {
+                       $job = new Job\ElasticaWrite(
+                               $head['title'],
+                               [
+                                       'method' => 'sendData',
+                                       'arguments' => [ 
Connection::GENERAL_INDEX_TYPE, $chunked, Connection::ARCHIVE_TYPE_NAME ],
+                                       'cluster' => $this->writeToClusterName
+                               ]
+                       );
+                       $job->run();
+               }
+
+               return true;
+       }
+
+       /**
+        * Build Elastica documents for archived pages.
+        * @param array $archived
+        * @return \Elastica\Document[]
+        */
+       private function buildArchiveDocuments( array $archived ) {
+               $docs = [];
+               foreach ( $archived as $delete ) {
+                       if ( !isset( $delete['title'] ) ) {
+                               // These come from pages that still exist, but 
are redirects.
+                               // This is non-obvious and we probably need a 
better way...
+                               continue;
+                       }
+                       /** @var Title $title */
+                       $title = $delete['title'];
+                       $doc = new \Elastica\Document( $delete['page'], [
+                               'namespace' => $title->getNamespace(),
+                               'title' => $title->getText(),
+                               'wiki' => wfWikiId(),
+                       ] );
+                       $doc->setDocAsUpsert( true );
+                       $doc->setRetryOnConflict( 
$this->searchConfig->getElement( 'CirrusSearchUpdateConflictRetryCount' ) );
+
+                       $docs[] = $doc;
+               }
+
+               return $docs;
+       }
+       /**
         * @param \WikiPage[] $pages
         * @param int $flags
         * @return \Elastica\Document[]
diff --git a/maintenance/forceSearchIndex.php b/maintenance/forceSearchIndex.php
index 399bfe2..d53404f 100644
--- a/maintenance/forceSearchIndex.php
+++ b/maintenance/forceSearchIndex.php
@@ -46,6 +46,7 @@
        public $toDate = null;
        public $toId = null;
        public $indexUpdates;
+       public $archiveOnly;
        public $limit;
        public $queue;
        public $maxJobs;
@@ -79,6 +80,7 @@
                $this->addOption( 'toId', 'Stop indexing at a specific page_id. 
 Not useful with --deletes or --from or --to.', false, true );
                $this->addOption( 'ids', 'List of page ids (comma separated) to 
reindex. Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
                $this->addOption( 'deletes', 'If this is set then just index 
deletes, not updates or creates.', false );
+               $this->addOption( 'archiveOnly', 'Don\'t delete pages, only 
index them into the archive. Only useful with --deletes', false, false );
                $this->addOption( 'limit', 'Maximum number of pages to process 
before exiting the script. Default to unlimited.', false, true );
                $this->addOption( 'buildChunks', 'Instead of running the script 
spit out commands that can be farmed out to ' .
                        'different processes or machines to rebuild the index.  
Works with fromId and toId, not from and to.  ' .
@@ -128,6 +130,7 @@
                }
                $this->toId = $this->getOption( 'toId' );
                $this->indexUpdates = !$this->getOption( 'deletes', false );
+               $this->archiveOnly = (bool) $this->getOption( 'archiveOnly', 
false );
                $this->limit = $this->getOption( 'limit' );
                $buildChunks = $this->getOption( 'buildChunks' );
                if ( $buildChunks !== null ) {
@@ -189,7 +192,10 @@
                        } else {
                                $size = count( $batch['titlesToDelete'] );
                                $updater = $this->createUpdater();
-                               $updater->deletePages( 
$batch['titlesToDelete'], $batch['docIdsToDelete'] );
+                               $updater->archivePages( $batch['archive'] );
+                               if ( !$this->archiveOnly ) {
+                                       $updater->deletePages( 
$batch['titlesToDelete'], $batch['docIdsToDelete'] );
+                               }
                        }
 
 
@@ -353,14 +359,22 @@
                return new CallbackIterator( $it, function ( $batch ) {
                        $titlesToDelete = [];
                        $docIdsToDelete = [];
+                       $archive = [];
                        foreach ( $batch as $row ) {
-                               $titlesToDelete[] = Title::makeTitle( 
$row->ar_namespace, $row->ar_title );
-                               $docIdsToDelete[] = 
$this->getSearchConfig()->makeId( $row->ar_page_id );
+                               $title = Title::makeTitle( $row->ar_namespace, 
$row->ar_title );
+                               $id = $this->getSearchConfig()->makeId( 
$row->ar_page_id );
+                               $titlesToDelete[] = $title;
+                               $docIdsToDelete[] = $id;
+                               $archive[] = [
+                                       'title' => $title,
+                                       'page' => $id,
+                               ];
                        }
 
                        return [
                                'titlesToDelete' => $titlesToDelete,
                                'docIdsToDelete' => $docIdsToDelete,
+                               'archive' => $archive,
                                'endingAt' => isset( $row )
                                        ? ( new MWTimestamp( $row->ar_timestamp 
) )->getTimestamp( TS_ISO_8601 )
                                        : 'unknown',
diff --git a/maintenance/updateOneSearchIndexConfig.php 
b/maintenance/updateOneSearchIndexConfig.php
index 9accb8c..3e29b01 100644
--- a/maintenance/updateOneSearchIndexConfig.php
+++ b/maintenance/updateOneSearchIndexConfig.php
@@ -379,7 +379,11 @@
                        $this->optimizeIndexForExperimentalHighlighter,
                        $this->availablePlugins,
                        $this->getMappingConfig(),
-                       [ 'page' => $this->getPageType(), 'namespace' => 
$this->getNamespaceType() ],
+                       [
+                               'page' => $this->getPageType(),
+                               'namespace' => $this->getNamespaceType(),
+                               'archive' => $this->getArchiveType()
+                       ],
                        $this
                );
                $validator->printDebugCheckConfig( $this->printDebugCheckConfig 
);
@@ -537,6 +541,15 @@
        }
 
        /**
+        * Get the archive type being updated by the search config.
+        *
+        * @return Elastica\Type
+        */
+       protected function getArchiveType() {
+               return $this->getIndex()->getType( 
Connection::ARCHIVE_TYPE_NAME );
+       }
+
+       /**
         * @return Elastica\Type
         */
        protected function getOldPageType() {
diff --git a/tests/browser/features/step_definitions/search_steps.rb 
b/tests/browser/features/step_definitions/search_steps.rb
index 9e16a59..c015b6d 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -612,6 +612,20 @@
     token_type: false
   )
 end
+When(/^within (\d+) seconds I search deleted pages for (.*)/) do |seconds, 
search|
+  within(seconds) do
+    with_browser do
+      visit(SpecialUndeletePage)
+      on(SpecialUndeletePage).search_input = search
+      on(SpecialUndeletePage).search_button
+    end
+  end
+end
+Then(/^deleted page search returns (.*) as first result/) do |expected|
+  result = on(SpecialUndeletePage).first_result
+  result = result.gsub(/\s+\(\d+ revisions? deleted\)$/, "") unless result.nil?
+  result.should == expected
+end
 
 def within(seconds)
   end_time = Time.new + Integer(seconds)
diff --git a/tests/browser/features/support/pages/undelete_page.rb 
b/tests/browser/features/support/pages/undelete_page.rb
new file mode 100644
index 0000000..4116c9a
--- /dev/null
+++ b/tests/browser/features/support/pages/undelete_page.rb
@@ -0,0 +1,13 @@
+# Page object for Special:Undelete with fuzzy archive title search.
+class SpecialUndeletePage
+  include PageObject
+
+  page_url "Special:Undelete?fuzzy=1"
+
+  button(:search_button, id: "searchUndelete")
+  text_field(:search_input, id: "prefix")
+  ul(:search_results, id: "undeleteResultsList")
+  li(:first_result, class: "undeleteResult", index: 0)
+  li(:second_result, class: "undeleteResult", index: 1)
+  links(:all_results, class: "undeleteResult") { |page| 
page.search_results_element.link_elements }
+end
diff --git a/tests/browser/features/update_general_api.feature 
b/tests/browser/features/update_general_api.feature
index b50c0a0..4f770bb 100644
--- a/tests/browser/features/update_general_api.feature
+++ b/tests/browser/features/update_general_api.feature
@@ -62,3 +62,11 @@
     When I move Move%{epoch} From4 to User:Move%{epoch} To4 and do not leave a 
redirect via api
     Then within 20 seconds api searching for User:Move%{epoch} To4 yields 
User:Move%{epoch} To4 as the first result
       And within 20 seconds api searching for Move%{epoch} To4 yields none as 
the first result
+
+  Scenario: Deleted pages are added to archive index
+    Given a page named DeleteMeTest exists
+     And I am logged in
+    Then within 20 seconds api searching for DeleteMeTest yields DeleteMeTest 
as the first result
+    When I delete DeleteMeTest
+     And within 20 seconds I search deleted pages for deltemetest
+    Then deleted page search returns DeleteMeTest as first result
diff --git a/tests/jenkins/FullyFeaturedConfig.php 
b/tests/jenkins/FullyFeaturedConfig.php
index c035d52..f848050 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -90,3 +90,6 @@
                'maxqueue' => 200,
        );
 }
+
+$wgCirrusSearchIndexDeletes = true;
+$wgCirrusSearchEnableArchive = true;
diff --git a/tests/unit/SearcherTest.php b/tests/unit/SearcherTest.php
index c9fcb2c..8650aab 100644
--- a/tests/unit/SearcherTest.php
+++ b/tests/unit/SearcherTest.php
@@ -9,6 +9,7 @@
  * @group CirrusSearch
  */
 class SearcherTest extends CirrusTestCase {
+
        public function searchTextProvider() {
                $configs = [
                        'default' => [],
@@ -40,6 +41,7 @@
 
                return $tests;
        }
+
 
        /**
         * @dataProvider searchTextProvider
@@ -155,4 +157,71 @@
 
                return $query;
        }
+
+       public function archiveFixtureProvider() {
+               $tests = [];
+               foreach ( glob( __DIR__ . '/fixtures/archiveSearch/*.query' ) 
as $queryFile ) {
+                       $testName = substr( basename( $queryFile ), 0, - 6 );
+                       $query = file_get_contents( $queryFile );
+                       // Remove trailing newline
+                       $query = preg_replace( '/\n$/', '', $query );
+                       $expectedFile = substr( $queryFile, 0, - 5 ) . 
'expected';
+                       $expected =
+                               is_file( $expectedFile ) ? json_decode( 
file_get_contents( $expectedFile ), true )
+                                       // Flags test to generate a new fixture
+                                       : $expectedFile;
+                       $tests[$testName] = [
+                               $expected,
+                               $query,
+                       ];
+
+               }
+               return $tests;
+       }
+
+       /**
+        * @dataProvider archiveFixtureProvider
+        * @param $expected
+        * @param $query
+        */
+       public function testArchiveQuery( $expected, $query ) {
+               $this->setMwGlobals( [
+                               'wgCirrusSearchIndexBaseName' => 'wiki',
+                               
'wgCirrusSearchQueryStringMaxDeterminizedStates' => 500,
+                               'wgContentNamespaces' => [ NS_MAIN ],
+                               'wgCirrusSearchEnableArchive' => true,
+               ] );
+
+               \RequestContext::getMain()->setRequest( new \FauxRequest( [
+                       'cirrusDumpQuery' => 1,
+               ] ) );
+
+               $title = Title::newFromText( $query );
+               if ( $title ) {
+                       $ns = $title->getNamespace();
+                       $termMain = $title->getText();
+               } else {
+                       $ns = 0;
+                       $termMain = $query;
+               }
+
+               $engine = new \CirrusSearch();
+               $engine->setLimitOffset( 20, 0 );
+               $engine->setNamespaces( [ $ns ] );
+               $engine->setDumpAndDie( false );
+               $elasticQuery = $engine->searchArchiveTitle( $termMain 
)->getValue();
+               $decodedQuery = json_decode( $elasticQuery, true );
+               unset( $decodedQuery['path'] );
+
+               if ( is_string( $expected ) ) {
+                       // Flag to generate a new fixture.
+                       file_put_contents( $expected, json_encode( 
$decodedQuery, JSON_PRETTY_PRINT ) );
+               } else {
+                       // Repeat normalizations applied to $elasticQuery
+                       unset( $expected['path'] );
+
+                       // Finally compare some things
+                       $this->assertEquals( $expected, $decodedQuery, 
$elasticQuery );
+               }
+       }
 }
diff --git a/tests/unit/fixtures/archiveSearch/namespaced.expected 
b/tests/unit/fixtures/archiveSearch/namespaced.expected
new file mode 100644
index 0000000..00c0e3b
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/namespaced.expected
@@ -0,0 +1,63 @@
+{
+    "description": "archive search for '{query}'",
+    "params": {
+        "timeout": "20s",
+        "search_type": "dfs_query_then_fetch"
+    },
+    "query": {
+        "_source": [
+            "namespace",
+            "title",
+            "namespace_text",
+            "wiki"
+        ],
+        "stored_fields": [],
+        "query": {
+            "bool": {
+                "should": [
+                    {
+                        "multi_match": {
+                            "type": "best_fields",
+                            "tie_breaker": 0,
+                            "query": "Content",
+                            "fields": [
+                                "title.near_match^100",
+                                "title.near_match_asciifolding^75",
+                                "title.plain^50",
+                                "title^25"
+                            ],
+                            "operator": "AND"
+                        }
+                    },
+                    {
+                        "match": {
+                            "title.plain": {
+                                "query": "Content",
+                                "fuzziness": "AUTO",
+                                "operator": "AND"
+                            }
+                        }
+                    }
+                ],
+                "minimum_should_match": 1,
+                "filter": [
+                    {
+                        "terms": {
+                            "namespace": [
+                                12
+                            ]
+                        }
+                    }
+                ]
+            }
+        },
+        "size": 20,
+        "stats": [
+            "archive"
+        ]
+    },
+    "options": {
+        "timeout": "20s",
+        "search_type": "dfs_query_then_fetch"
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/archiveSearch/namespaced.query 
b/tests/unit/fixtures/archiveSearch/namespaced.query
new file mode 100644
index 0000000..819b5fd
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/namespaced.query
@@ -0,0 +1 @@
+Help:Content
diff --git a/tests/unit/fixtures/archiveSearch/simple.expected 
b/tests/unit/fixtures/archiveSearch/simple.expected
new file mode 100644
index 0000000..178cd37
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/simple.expected
@@ -0,0 +1,63 @@
+{
+    "description": "archive search for '{query}'",
+    "params": {
+        "timeout": "20s",
+        "search_type": "dfs_query_then_fetch"
+    },
+    "query": {
+        "_source": [
+            "namespace",
+            "title",
+            "namespace_text",
+            "wiki"
+        ],
+        "stored_fields": [],
+        "query": {
+            "bool": {
+                "should": [
+                    {
+                        "multi_match": {
+                            "type": "best_fields",
+                            "tie_breaker": 0,
+                            "query": "Cheese",
+                            "fields": [
+                                "title.near_match^100",
+                                "title.near_match_asciifolding^75",
+                                "title.plain^50",
+                                "title^25"
+                            ],
+                            "operator": "AND"
+                        }
+                    },
+                    {
+                        "match": {
+                            "title.plain": {
+                                "query": "Cheese",
+                                "fuzziness": "AUTO",
+                                "operator": "AND"
+                            }
+                        }
+                    }
+                ],
+                "minimum_should_match": 1,
+                "filter": [
+                    {
+                        "terms": {
+                            "namespace": [
+                                0
+                            ]
+                        }
+                    }
+                ]
+            }
+        },
+        "size": 20,
+        "stats": [
+            "archive"
+        ]
+    },
+    "options": {
+        "timeout": "20s",
+        "search_type": "dfs_query_then_fetch"
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/archiveSearch/simple.query 
b/tests/unit/fixtures/archiveSearch/simple.query
new file mode 100644
index 0000000..70de1df
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/simple.query
@@ -0,0 +1 @@
+cheese
diff --git a/tests/unit/fixtures/archiveSearch/spaces.expected 
b/tests/unit/fixtures/archiveSearch/spaces.expected
new file mode 100644
index 0000000..32a351c
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/spaces.expected
@@ -0,0 +1,63 @@
+{
+    "description": "archive search for '{query}'",
+    "params": {
+        "timeout": "20s",
+        "search_type": "dfs_query_then_fetch"
+    },
+    "query": {
+        "_source": [
+            "namespace",
+            "title",
+            "namespace_text",
+            "wiki"
+        ],
+        "stored_fields": [],
+        "query": {
+            "bool": {
+                "should": [
+                    {
+                        "multi_match": {
+                            "type": "best_fields",
+                            "tie_breaker": 0,
+                            "query": "Two And two",
+                            "fields": [
+                                "title.near_match^100",
+                                "title.near_match_asciifolding^75",
+                                "title.plain^50",
+                                "title^25"
+                            ],
+                            "operator": "AND"
+                        }
+                    },
+                    {
+                        "match": {
+                            "title.plain": {
+                                "query": "Two And two",
+                                "fuzziness": "AUTO",
+                                "operator": "AND"
+                            }
+                        }
+                    }
+                ],
+                "minimum_should_match": 1,
+                "filter": [
+                    {
+                        "terms": {
+                            "namespace": [
+                                0
+                            ]
+                        }
+                    }
+                ]
+            }
+        },
+        "size": 20,
+        "stats": [
+            "archive"
+        ]
+    },
+    "options": {
+        "timeout": "20s",
+        "search_type": "dfs_query_then_fetch"
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/archiveSearch/spaces.query 
b/tests/unit/fixtures/archiveSearch/spaces.query
new file mode 100644
index 0000000..58b7c75
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/spaces.query
@@ -0,0 +1 @@
+two And two

-- 
To view, visit https://gerrit.wikimedia.org/r/281077
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I48382f24d91b4421b6a1754d7f30c18b173263db
Gerrit-PatchSet: 49
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to