jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/281077 )
Change subject: Add deleted archive titles indexing and search
......................................................................
Add deleted archive titles indexing and search
Query in Searcher.php may need some tuning yet.
Bug: T109561
Change-Id: I48382f24d91b4421b6a1754d7f30c18b173263db
Depends-On: Id6099fe9fbf18481068a6f0a329bbde0d218135f
---
M CirrusSearch.php
M autoload.php
M docs/settings.txt
M includes/CirrusSearch.php
M includes/Connection.php
M includes/DataSender.php
M includes/Hooks.php
A includes/Job/DeleteArchive.php
M includes/Job/DeletePages.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Searcher.php
M includes/Updater.php
M maintenance/forceSearchIndex.php
M maintenance/updateOneSearchIndexConfig.php
M tests/browser/features/step_definitions/search_steps.rb
A tests/browser/features/support/pages/undelete_page.rb
M tests/browser/features/update_general_api.feature
M tests/jenkins/FullyFeaturedConfig.php
M tests/unit/SearcherTest.php
A tests/unit/fixtures/archiveSearch/namespaced.expected
A tests/unit/fixtures/archiveSearch/namespaced.query
A tests/unit/fixtures/archiveSearch/simple.expected
A tests/unit/fixtures/archiveSearch/simple.query
A tests/unit/fixtures/archiveSearch/spaces.expected
A tests/unit/fixtures/archiveSearch/spaces.query
25 files changed, 625 insertions(+), 12 deletions(-)
Approvals:
EBernhardson: Looks good to me, approved
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 613839d..31ef36c 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1107,6 +1107,7 @@
$wgHooks[ 'ArticleDelete' ][] = 'CirrusSearch\Hooks::onArticleDelete';
$wgHooks[ 'ArticleDeleteComplete' ][] =
'CirrusSearch\Hooks::onArticleDeleteComplete';
$wgHooks[ 'ArticleRevisionVisibilitySet' ][] =
'CirrusSearch\Hooks::onRevisionDelete';
+$wgHooks[ 'ArticleUndelete' ][] = 'CirrusSearch\Hooks::onArticleUndelete';
$wgHooks[ 'BeforeInitialize' ][] = 'CirrusSearch\Hooks::onBeforeInitialize';
$wgHooks[ 'GetBetaFeaturePreferences' ][] =
'CirrusSearch\Hooks::getBetaFeaturePreferences';
$wgHooks[ 'GetPreferences' ][] = 'CirrusSearch\Hooks::onGetPreferences';
@@ -1138,6 +1139,7 @@
$wgJobClasses[ 'cirrusSearchOtherIndex' ] = 'CirrusSearch\Job\OtherIndex';
$wgJobClasses[ 'cirrusSearchElasticaWrite' ] =
'CirrusSearch\Job\ElasticaWrite';
$wgJobClasses[ 'cirrusSearchCheckerJob' ] = 'CirrusSearch\Job\CheckerJob';
+$wgJobClasses[ 'cirrusSearchDeleteArchive' ] =
'CirrusSearch\Job\DeleteArchive';
/**
* Actions
@@ -1213,6 +1215,15 @@
*/
$wgCirrusSearchExtraIndexSettings = [];
+/**
+ * Whether to index deleted pages for archiving.
+ */
+$wgCirrusSearchIndexDeletes = false;
+/**
+ * Enable archive search.
+ */
+$wgCirrusSearchEnableArchive = false;
+
/*
* Please update docs/settings.txt if you add new values!
*/
diff --git a/autoload.php b/autoload.php
index 095fbb7..93276b4 100644
--- a/autoload.php
+++ b/autoload.php
@@ -59,6 +59,7 @@
'CirrusSearch\\Iterator\\CallbackIterator' => __DIR__ .
'/includes/iterator/CallbackIterator.php',
'CirrusSearch\\Iterator\\IteratorDecorator' => __DIR__ .
'/includes/iterator/IteratorDecorator.php',
'CirrusSearch\\Job\\CheckerJob' => __DIR__ .
'/includes/Job/CheckerJob.php',
+ 'CirrusSearch\\Job\\DeleteArchive' => __DIR__ .
'/includes/Job/DeleteArchive.php',
'CirrusSearch\\Job\\DeletePages' => __DIR__ .
'/includes/Job/DeletePages.php',
'CirrusSearch\\Job\\ElasticaWrite' => __DIR__ .
'/includes/Job/ElasticaWrite.php',
'CirrusSearch\\Job\\IncomingLinkCount' => __DIR__ .
'/includes/Job/IncomingLinkCount.php',
diff --git a/docs/settings.txt b/docs/settings.txt
index f3663a8..6902115 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -1474,3 +1474,17 @@
'search.slowlog.threshold.fetch.info' => '1s',
'search.slowlog.threshold.fetch.info' => '800ms',
];
+
+; $wgCirrusSearchEnableArchive
+Default:
+ $wgCirrusSearchEnableArchive = false;
+
+Enable searching for deleted pages in the ElasticSearch indexed archive.
+
+; $wgCirrusSearchIndexDeletes
+
+Default:
+ $wgCirrusSearchIndexDeletes = false;
+
+Whether a page is indexed into the archive for archive search when it is deleted. Note that
+archived pages can also be made searchable by indexing them manually (for example with
+maintenance/forceSearchIndex.php --deletes --archiveOnly).
\ No newline at end of file
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 915cdc8..ecfd76f 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -775,4 +775,35 @@
public function setDumpAndDie( $dumpAndDie ) {
$this->dumpAndDie = $dumpAndDie;
}
+
+	/**
+	 * Perform a title search in the article archive.
+	 *
+	 * Returns an empty good Status without querying when archive search is
+	 * disabled (CirrusSearchEnableArchive) or when the trimmed term is empty.
+	 * When the searcher is in raw-return mode, the Status value is replaced by
+	 * the output of Searcher::processRawReturn().
+	 *
+	 * @param string $term Raw search term
+	 * @return Status<Title[]>
+	 */
+	public function searchArchiveTitle( $term ) {
+		if ( !$this->config->get( 'CirrusSearchEnableArchive' ) ) {
+			return Status::newGood( [] );
+		}
+
+		$term = trim( $term );
+
+		// Compare against '' instead of using empty(): empty( '0' ) is true,
+		// which would make a deleted page titled "0" impossible to search for.
+		if ( $term === '' ) {
+			return Status::newGood( [] );
+		}
+
+		$searcher = new Searcher( $this->connection, $this->offset, $this->limit, $this->config, $this->namespaces,
+			null, $this->indexBaseName );
+		$searcher->setOptionsFromRequest( $this->request );
+
+		$status = $searcher->searchArchive( $term );
+		if ( $status->isOK() && $searcher->isReturnRaw() ) {
+			$status->setResult( true,
+				$searcher->processRawReturn( $status->getValue(), $this->request, $this->dumpAndDie ) );
+		}
+		return $status;
+	}
+
}
diff --git a/includes/Connection.php b/includes/Connection.php
index d3f27be..f6e67b7 100644
--- a/includes/Connection.php
+++ b/includes/Connection.php
@@ -64,6 +64,12 @@
const TITLE_SUGGEST_TYPE_NAME = 'titlesuggest';
/**
+ * Name of the archive type
+ * @var string
+ */
+ const ARCHIVE_TYPE_NAME = 'archive';
+
+ /**
* @var SearchConfig
*/
protected $config;
@@ -215,6 +221,15 @@
}
/**
+ * Fetch the Elastica Type for archive.
+ * @param mixed $name basename of index
+ * @return \Elastica\Type
+ */
+	public function getArchiveType( $name ) {
+		// Archive documents are stored in the 'general' index of the given base name.
+		$generalIndex = $this->getIndex( $name, 'general' );
+		return $generalIndex->getType( self::ARCHIVE_TYPE_NAME );
+	}
+
+ /**
* Get all index types we support, content, general, plus custom ones
*
* @return string[]
diff --git a/includes/DataSender.php b/includes/DataSender.php
index 8a3cf35..88c9206 100644
--- a/includes/DataSender.php
+++ b/includes/DataSender.php
@@ -159,6 +159,7 @@
/**
* @param string $indexType type of index to which to send $data
* @param (\Elastica\Script|\Elastica\Document)[] $data documents to
send
+ * @param string $elasticType Mapping type to use for the document
* @return Status
*/
public function sendData( $indexType, $data, $elasticType =
Connection::PAGE_TYPE_NAME ) {
@@ -259,6 +260,7 @@
*
* @param string[] $docIds elasticsearch document ids to delete
* @param string|null $indexType index from which to delete. null
means all.
+ * @param string|null $elasticType Mapping type to use for the
document. null means all types.
* @return Status
*/
public function sendDeletes( $docIds, $indexType = null, $elasticType =
null ) {
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 74b5fcc..d3d1a11 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -826,4 +826,29 @@
);
return true;
}
+
+
+	/**
+	 * ArticleUndelete hook handler: queue a DeleteArchive job for the
+	 * restored pages when deletes are being indexed.
+	 *
+	 * @param Title $title
+	 * @param bool $create
+	 * @param string $comment
+	 * @param string $oldPageId
+	 * @param array $restoredPages Handed to the job as its 'docIds' parameter
+	 * @return bool Always true
+	 */
+	public static function onArticleUndelete( Title $title, $create, $comment, $oldPageId, $restoredPages ) {
+		global $wgCirrusSearchIndexDeletes;
+		// When deletes are not indexed there is nothing to remove from the archive.
+		if ( $wgCirrusSearchIndexDeletes ) {
+			$job = new Job\DeleteArchive( $title, [ 'docIds' => $restoredPages ] );
+			JobQueueGroup::singleton()->push( $job );
+		}
+		return true;
+	}
+
}
diff --git a/includes/Job/DeleteArchive.php b/includes/Job/DeleteArchive.php
new file mode 100644
index 0000000..fb9560b
--- /dev/null
+++ b/includes/Job/DeleteArchive.php
@@ -0,0 +1,44 @@
+<?php
+
+namespace CirrusSearch\Job;
+
+use CirrusSearch\Connection;
+use MediaWiki\MediaWikiServices;
+
+/**
+ * Job wrapper for deleting pages from archive.
+ *
+ * The 'docIds' job parameter is an array keyed by page id; only its keys
+ * are used as the ids of the documents to delete.
+ */
+class DeleteArchive extends Job {
+	/**
+	 * @param \Title $title Title whose archived documents may be removed
+	 * @param array $params Job parameters; 'docIds' is read by doJob()
+	 */
+	public function __construct( $title, $params ) {
+		parent::__construct( $title, $params );
+
+		// Don't remove dupes since we do checks that may return different results
+		// Also, deletes are idempotent so it's no problem if we delete twice.
+		$this->removeDuplicates = false;
+	}
+
+	/**
+	 * Delete the given page ids from the archive type, skipping every id
+	 * that still has archived revisions for this title.
+	 *
+	 * @return bool Always true
+	 */
+	protected function doJob() {
+		$archive = new \PageArchive( $this->title );
+		$docs = $this->params['docIds'];
+
+		// Remove page IDs that still have archived revs.
+		// listRevisions() yields database row objects, so the column has to be
+		// read as a property; array access on a row object is a fatal error.
+		foreach ( $archive->listRevisions() as $rev ) {
+			unset( $docs[$rev->ar_page_id] );
+		}
+
+		if ( empty( $docs ) ) {
+			// If we have more deleted instances of the same title, no need to bother.
+			return true;
+		}
+
+		$updater = $this->createUpdater();
+		$updater->deletePages(
+			[ $this->title ],
+			array_keys( $docs ),
+			Connection::GENERAL_INDEX_TYPE,
+			Connection::ARCHIVE_TYPE_NAME
+		);
+
+		return true;
+	}
+}
diff --git a/includes/Job/DeletePages.php b/includes/Job/DeletePages.php
index 406a232..17294dd 100644
--- a/includes/Job/DeletePages.php
+++ b/includes/Job/DeletePages.php
@@ -43,8 +43,20 @@
}
protected function doJob() {
+ global $wgCirrusSearchIndexDeletes;
$updater = $this->createUpdater();
$indexType = isset( $this->params[ 'indexType' ] ) ?
$this->params[ 'indexType' ] : null;
- return $updater->deletePages( [ $this->title ], [
$this->params[ 'docId' ] ], $indexType );
+ $updater->deletePages( [ $this->title ], [
$this->params['docId'] ], $indexType );
+
+ if ( $wgCirrusSearchIndexDeletes ) {
+ $success = $updater->archivePages( [
+ [
+ 'title' => $this->title,
+ 'page' => $this->params['docId'],
+ ],
+ ] );
+ }
+
+ return true;
}
}
diff --git a/includes/Maintenance/MappingConfigBuilder.php
b/includes/Maintenance/MappingConfigBuilder.php
index c0be235..60acaa8 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -270,6 +270,18 @@
],
];
+ $config[ 'archive' ] = [
+ 'dynamic' => false,
+ '_all' => [ 'enabled' => false ],
+ 'properties' => [
+ 'namespace' => $page['properties']['namespace'],
+ 'title' => $page['properties']['title'],
+ 'wiki' => $page['properties']['wiki'],
+ ],
+ ];
+ // Do not use copy settings for archive
+ unset( $config['archive']['properties']['title']['copy_to'] );
+
Hooks::run( 'CirrusSearchMappingConfig', [ &$config, $this ] );
return $config;
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 50c8069..9db56c6 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -4,12 +4,15 @@
use CirrusSearch\Query\SimpleKeywordFeature;
use CirrusSearch\Search\FullTextResultsType;
+use CirrusSearch\Search\TitleResultsType;
use CirrusSearch\Search\ResultsType;
use CirrusSearch\Search\RescoreBuilder;
use CirrusSearch\Search\SearchContext;
use CirrusSearch\Query\FullTextQueryBuilder;
use CirrusSearch\Elastica\MultiSearch as MultiSearch;
use Elastica\Exception\RuntimeException;
+use Elastica\Query\BoolQuery;
+use Elastica\Query\MultiMatch;
use Language;
use MediaWiki\Logger\LoggerFactory;
use MediaWiki\MediaWikiServices;
@@ -122,6 +125,12 @@
* @var SearchContext
*/
protected $searchContext;
+
+ /**
+ * Indexing type we'll be using.
+ * @var string|\Elastica\Type
+ */
+ private $pageType;
/**
* Constructor
@@ -559,6 +568,8 @@
$query->addParam( 'stats', $syntax );
}
switch ( $this->sort ) {
+ case 'just_match':
+ // Use just matching scores, without any rescoring, and
default sort.
case 'relevance':
break; // The default
case 'title_asc':
@@ -579,6 +590,9 @@
'missing' => '_last',
] ] );
break;
+ case 'none':
+ $query->setSort( [ '_doc' ] );
+ break;
default:
LoggerFactory::getInstance( 'CirrusSearch' )->warning(
"Invalid sort type: {sort}",
@@ -595,8 +609,12 @@
$queryOptions[\Elastica\Search::OPTION_SEARCH_TYPE] =
\Elastica\Search::OPTION_SEARCH_TYPE_DFS_QUERY_THEN_FETCH;
}
- $indexType = $this->connection->pickIndexTypeForNamespaces(
$namespaces );
- $pageType = $this->connection->getPageType(
$this->indexBaseName, $indexType );
+ if ( $this->pageType ) {
+ $pageType = $this->pageType;
+ } else {
+ $indexType =
$this->connection->pickIndexTypeForNamespaces( $namespaces );
+ $pageType = $this->connection->getPageType(
$this->indexBaseName, $indexType );
+ }
$search = $pageType->createSearch( $query, $queryOptions );
foreach ( $extraIndexes as $i ) {
@@ -1070,4 +1088,48 @@
return $result;
}
+
+	/**
+	 * Search titles in archive
+	 *
+	 * Runs against the archive type with a bool query made of two "should"
+	 * clauses (a weighted multi-field title match and a fuzzy match on
+	 * title.plain), at least one of which has to match.
+	 *
+	 * @param string $term
+	 * @return Status<Title[]>
+	 */
+	public function searchArchive( $term ) {
+		list( $term, $fuzzyUnused ) = $this->searchContext->escaper()->fixupWholeQueryString( $term );
+		$this->setResultsType( new TitleResultsType( $this->config ) );
+
+		// Target the archive type instead of the regular page type
+		$this->pageType = $this->connection->getArchiveType( $this->indexBaseName );
+		// Plain match scores: no rescoring, default sort
+		$this->sort = 'just_match';
+
+		// Weighted match over the title sub-fields; all terms are required
+		$weightedFields = [
+			'title.near_match^100',
+			'title.near_match_asciifolding^75',
+			'title.plain^50',
+			'title^25'
+		];
+		$titleMatch = new MultiMatch();
+		$titleMatch->setType( 'best_fields' );
+		$titleMatch->setTieBreaker( 0 );
+		$titleMatch->setQuery( $term );
+		$titleMatch->setFields( $weightedFields );
+		$titleMatch->setOperator( 'AND' );
+
+		// Fuzzy variant on the plain title field
+		$fuzzyField = 'title.plain';
+		$fuzzyMatch = new \Elastica\Query\Match();
+		$fuzzyMatch->setFieldQuery( $fuzzyField, $term );
+		$fuzzyMatch->setFieldFuzziness( $fuzzyField, 'AUTO' );
+		$fuzzyMatch->setFieldOperator( $fuzzyField, 'AND' );
+
+		// Setup the search query
+		$bool = new BoolQuery();
+		$bool->addShould( $titleMatch );
+		$bool->addShould( $fuzzyMatch );
+		$bool->setMinimumShouldMatch( 1 );
+
+		$this->searchContext->setMainQuery( $bool );
+		$this->searchContext->addSyntaxUsed( 'archive' );
+
+		return $this->searchOne();
+	}
+
}
diff --git a/includes/Updater.php b/includes/Updater.php
index 0b20675..9fe1108 100644
--- a/includes/Updater.php
+++ b/includes/Updater.php
@@ -233,26 +233,84 @@
*
* @param Title[] $titles List of titles to delete. If empty then
skipped other index
* maintenance is skipped.
- * @param integer[] $docIds List of elasticsearch document ids to delete
- * @param string $indexType index from which to delete
- * @return bool True if nothing happened or we successfully deleted,
false on failure
+ * @param int[]|string[] $docIds List of elasticsearch document ids to
delete
+ * @param string|null $indexType index from which to delete. null
means all.
+ * @param string|null $elasticType Mapping type to delete from. null means all types.
+ * @return bool Always returns true.
*/
- public function deletePages( $titles, $docIds, $indexType = null ) {
+ public function deletePages( $titles, $docIds, $indexType = null,
$elasticType = null ) {
Job\OtherIndex::queueIfRequired( $titles,
$this->writeToClusterName );
$job = new Job\ElasticaWrite(
$titles ? reset( $titles ) : Title::makeTitle( 0, "" ),
[
'method' => 'sendDeletes',
- 'arguments' => [ $docIds, $indexType ],
+ 'arguments' => [ $docIds, $indexType,
$elasticType ],
'cluster' => $this->writeToClusterName,
]
);
// This job type will insert itself into the job queue
// with a delay if writes to ES are currently paused
$job->run();
+
+ return true;
}
/**
+ * Add documents to archive index.
+ * @param array $archived
+ * @return bool
+ */
+	public function archivePages( $archived ) {
+		if ( !$this->searchConfig->getElement( 'CirrusSearchIndexDeletes' ) ) {
+			// Disabled by config - don't do anything
+			return true;
+		}
+		$docs = $this->buildArchiveDocuments( $archived );
+		if ( !$docs ) {
+			// Empty input, or no entry carried a title: nothing to send
+			return true;
+		}
+
+		// Entries without a 'title' are skipped when the documents are built,
+		// so take the job title from the first entry that actually has one
+		// rather than blindly from the first entry.
+		$jobTitle = null;
+		foreach ( $archived as $entry ) {
+			if ( isset( $entry['title'] ) ) {
+				$jobTitle = $entry['title'];
+				break;
+			}
+		}
+
+		// Send the documents to the archive type in batches of 10
+		foreach ( array_chunk( $docs, 10 ) as $chunked ) {
+			$job = new Job\ElasticaWrite(
+				$jobTitle,
+				[
+					'method' => 'sendData',
+					'arguments' => [ Connection::GENERAL_INDEX_TYPE, $chunked, Connection::ARCHIVE_TYPE_NAME ],
+					'cluster' => $this->writeToClusterName
+				]
+			);
+			$job->run();
+		}
+
+		return true;
+	}
+
+	/**
+	 * Build Elastica documents for archived pages.
+	 *
+	 * Each entry supplies the document id under 'page' and a Title under
+	 * 'title'; entries without a 'title' produce no document.
+	 *
+	 * @param array $archived
+	 * @return \Elastica\Document[]
+	 */
+	private function buildArchiveDocuments( array $archived ) {
+		$built = [];
+		foreach ( $archived as $entry ) {
+			// Entries without a title come from pages that still exist, but are
+			// redirects. This is non-obvious and we probably need a better way...
+			if ( isset( $entry['title'] ) ) {
+				/** @var Title $pageTitle */
+				$pageTitle = $entry['title'];
+				$fields = [
+					'namespace' => $pageTitle->getNamespace(),
+					'title' => $pageTitle->getText(),
+					'wiki' => wfWikiId(),
+				];
+				$archiveDoc = new \Elastica\Document( $entry['page'], $fields );
+				$archiveDoc->setDocAsUpsert( true );
+				$retries = $this->searchConfig->getElement( 'CirrusSearchUpdateConflictRetryCount' );
+				$archiveDoc->setRetryOnConflict( $retries );
+
+				$built[] = $archiveDoc;
+			}
+		}
+
+		return $built;
+	}
+ /**
* @param \WikiPage[] $pages
* @param int $flags
* @return \Elastica\Document[]
diff --git a/maintenance/forceSearchIndex.php b/maintenance/forceSearchIndex.php
index 399bfe2..d53404f 100644
--- a/maintenance/forceSearchIndex.php
+++ b/maintenance/forceSearchIndex.php
@@ -46,6 +46,7 @@
public $toDate = null;
public $toId = null;
public $indexUpdates;
+ public $archiveOnly;
public $limit;
public $queue;
public $maxJobs;
@@ -79,6 +80,7 @@
$this->addOption( 'toId', 'Stop indexing at a specific page_id.
Not useful with --deletes or --from or --to.', false, true );
$this->addOption( 'ids', 'List of page ids (comma separated) to
reindex. Not allowed with deletes/from/to/fromId/toId/limit.', false, true );
$this->addOption( 'deletes', 'If this is set then just index
deletes, not updates or creates.', false );
+ $this->addOption( 'archiveOnly', 'Don\'t delete pages, only
index them into the archive. Only useful with --deletes', false, false );
$this->addOption( 'limit', 'Maximum number of pages to process
before exiting the script. Default to unlimited.', false, true );
$this->addOption( 'buildChunks', 'Instead of running the script
spit out commands that can be farmed out to ' .
'different processes or machines to rebuild the index.
Works with fromId and toId, not from and to. ' .
@@ -128,6 +130,7 @@
}
$this->toId = $this->getOption( 'toId' );
$this->indexUpdates = !$this->getOption( 'deletes', false );
+ $this->archiveOnly = (bool) $this->getOption( 'archiveOnly',
false );
$this->limit = $this->getOption( 'limit' );
$buildChunks = $this->getOption( 'buildChunks' );
if ( $buildChunks !== null ) {
@@ -189,7 +192,10 @@
} else {
$size = count( $batch['titlesToDelete'] );
$updater = $this->createUpdater();
- $updater->deletePages(
$batch['titlesToDelete'], $batch['docIdsToDelete'] );
+ $updater->archivePages( $batch['archive'] );
+ if ( !$this->archiveOnly ) {
+ $updater->deletePages(
$batch['titlesToDelete'], $batch['docIdsToDelete'] );
+ }
}
@@ -353,14 +359,22 @@
return new CallbackIterator( $it, function ( $batch ) {
$titlesToDelete = [];
$docIdsToDelete = [];
+ $archive = [];
foreach ( $batch as $row ) {
- $titlesToDelete[] = Title::makeTitle(
$row->ar_namespace, $row->ar_title );
- $docIdsToDelete[] =
$this->getSearchConfig()->makeId( $row->ar_page_id );
+ $title = Title::makeTitle( $row->ar_namespace,
$row->ar_title );
+ $id = $this->getSearchConfig()->makeId(
$row->ar_page_id );
+ $titlesToDelete[] = $title;
+ $docIdsToDelete[] = $id;
+ $archive[] = [
+ 'title' => $title,
+ 'page' => $id,
+ ];
}
return [
'titlesToDelete' => $titlesToDelete,
'docIdsToDelete' => $docIdsToDelete,
+ 'archive' => $archive,
'endingAt' => isset( $row )
? ( new MWTimestamp( $row->ar_timestamp
) )->getTimestamp( TS_ISO_8601 )
: 'unknown',
diff --git a/maintenance/updateOneSearchIndexConfig.php
b/maintenance/updateOneSearchIndexConfig.php
index 9accb8c..3e29b01 100644
--- a/maintenance/updateOneSearchIndexConfig.php
+++ b/maintenance/updateOneSearchIndexConfig.php
@@ -379,7 +379,11 @@
$this->optimizeIndexForExperimentalHighlighter,
$this->availablePlugins,
$this->getMappingConfig(),
- [ 'page' => $this->getPageType(), 'namespace' =>
$this->getNamespaceType() ],
+ [
+ 'page' => $this->getPageType(),
+ 'namespace' => $this->getNamespaceType(),
+ 'archive' => $this->getArchiveType()
+ ],
$this
);
$validator->printDebugCheckConfig( $this->printDebugCheckConfig
);
@@ -537,6 +541,15 @@
}
/**
+ * Get the archive type being updated by the search config.
+ *
+ * @return Elastica\Type
+ */
+	protected function getArchiveType() {
+		// The archive type lives in the index currently being updated
+		$index = $this->getIndex();
+		return $index->getType( Connection::ARCHIVE_TYPE_NAME );
+	}
+
+ /**
* @return Elastica\Type
*/
protected function getOldPageType() {
diff --git a/tests/browser/features/step_definitions/search_steps.rb
b/tests/browser/features/step_definitions/search_steps.rb
index 9e16a59..c015b6d 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -612,6 +612,20 @@
token_type: false
)
end
+# Opens Special:Undelete, fills in the search box and submits it, wrapped in
+# the within(seconds) helper defined in this file.
+When(/^within (\d+) seconds I search deleted pages for (.*)/) do |seconds, search|
+  within(seconds) do
+    with_browser do
+      visit(SpecialUndeletePage)
+      on(SpecialUndeletePage).search_input = search
+      on(SpecialUndeletePage).search_button
+    end
+  end
+end
+# Compares the first undelete search result with the expected title.
+Then(/^deleted page search returns (.*) as first result/) do |expected|
+  result = on(SpecialUndeletePage).first_result
+  # Strip the trailing " (N revision(s) deleted)" suffix so only the title is compared
+  result = result.gsub(/\s+\(\d+ revisions? deleted\)$/, "") unless result.nil?
+  result.should == expected
+end
def within(seconds)
end_time = Time.new + Integer(seconds)
diff --git a/tests/browser/features/support/pages/undelete_page.rb
b/tests/browser/features/support/pages/undelete_page.rb
new file mode 100644
index 0000000..4116c9a
--- /dev/null
+++ b/tests/browser/features/support/pages/undelete_page.rb
@@ -0,0 +1,13 @@
+# Page object for Special:Undelete, used to search deleted pages.
+class SpecialUndeletePage
+  include PageObject
+
+  # NOTE(review): fuzzy=1 presumably switches the undelete search to fuzzy matching - confirm
+  page_url "Special:Undelete?fuzzy=1"
+
+  # Search form controls
+  button(:search_button, id: "searchUndelete")
+  text_field(:search_input, id: "prefix")
+  # Result list and its first two entries
+  ul(:search_results, id: "undeleteResultsList")
+  li(:first_result, class: "undeleteResult", index: 0)
+  li(:second_result, class: "undeleteResult", index: 1)
+  # All links found inside the result list
+  links(:all_results, class: "undeleteResult") { |page| page.search_results_element.link_elements }
+end
diff --git a/tests/browser/features/update_general_api.feature
b/tests/browser/features/update_general_api.feature
index b50c0a0..4f770bb 100644
--- a/tests/browser/features/update_general_api.feature
+++ b/tests/browser/features/update_general_api.feature
@@ -62,3 +62,11 @@
When I move Move%{epoch} From4 to User:Move%{epoch} To4 and do not leave a
redirect via api
Then within 20 seconds api searching for User:Move%{epoch} To4 yields
User:Move%{epoch} To4 as the first result
And within 20 seconds api searching for Move%{epoch} To4 yields none as
the first result
+
+ Scenario: Deleted pages are added to archive index
+ Given a page named DeleteMeTest exists
+ And I am logged in
+ Then within 20 seconds api searching for DeleteMeTest yields DeleteMeTest
as the first result
+ When I delete DeleteMeTest
+ And within 20 seconds I search deleted pages for deltemetest
+ Then deleted page search returns DeleteMeTest as first result
diff --git a/tests/jenkins/FullyFeaturedConfig.php
b/tests/jenkins/FullyFeaturedConfig.php
index c035d52..f848050 100644
--- a/tests/jenkins/FullyFeaturedConfig.php
+++ b/tests/jenkins/FullyFeaturedConfig.php
@@ -90,3 +90,6 @@
'maxqueue' => 200,
);
}
+
+$wgCirrusSearchIndexDeletes = true;
+$wgCirrusSearchEnableArchive = true;
diff --git a/tests/unit/SearcherTest.php b/tests/unit/SearcherTest.php
index c9fcb2c..8650aab 100644
--- a/tests/unit/SearcherTest.php
+++ b/tests/unit/SearcherTest.php
@@ -9,6 +9,7 @@
* @group CirrusSearch
*/
class SearcherTest extends CirrusTestCase {
+
public function searchTextProvider() {
$configs = [
'default' => [],
@@ -40,6 +41,7 @@
return $tests;
}
+
/**
* @dataProvider searchTextProvider
@@ -155,4 +157,71 @@
return $query;
}
+
+	/**
+	 * Data provider: one case per fixtures/archiveSearch/*.query file.
+	 *
+	 * @return array[] Map of test name to [ expected, query ]. "expected" is
+	 *  the decoded .expected fixture, or the path of the missing fixture file.
+	 */
+	public function archiveFixtureProvider() {
+		$cases = [];
+		$queryFiles = glob( __DIR__ . '/fixtures/archiveSearch/*.query' );
+		foreach ( $queryFiles as $queryFile ) {
+			// Test name is the file name without its ".query" suffix
+			$name = substr( basename( $queryFile ), 0, -6 );
+			// Remove trailing newline
+			$query = preg_replace( '/\n$/', '', file_get_contents( $queryFile ) );
+			// Swap the "query" extension for "expected"
+			$expectedFile = substr( $queryFile, 0, -5 ) . 'expected';
+			if ( is_file( $expectedFile ) ) {
+				$expected = json_decode( file_get_contents( $expectedFile ), true );
+			} else {
+				// Flags test to generate a new fixture
+				$expected = $expectedFile;
+			}
+			$cases[$name] = [ $expected, $query ];
+		}
+		return $cases;
+	}
+
+	/**
+	 * Build the archive search query for a fixture and compare it with the
+	 * stored expectation, or write a new fixture when none exists yet.
+	 *
+	 * @dataProvider archiveFixtureProvider
+	 * @param array|string $expected Decoded expected query, or the path at
+	 *  which a new fixture should be written
+	 * @param string $query Raw query text from the .query fixture
+	 */
+	public function testArchiveQuery( $expected, $query ) {
+		$this->setMwGlobals( [
+			'wgCirrusSearchIndexBaseName' => 'wiki',
+			'wgCirrusSearchQueryStringMaxDeterminizedStates' => 500,
+			'wgContentNamespaces' => [ NS_MAIN ],
+			'wgCirrusSearchEnableArchive' => true,
+		] );
+
+		// Ask for the query dump; the Status value is JSON-decoded below
+		\RequestContext::getMain()->setRequest( new \FauxRequest( [
+			'cirrusDumpQuery' => 1,
+		] ) );
+
+		// Split the fixture text into a namespace and a bare title when it
+		// parses as a title; otherwise search the main namespace with it as-is
+		$title = Title::newFromText( $query );
+		if ( $title ) {
+			$ns = $title->getNamespace();
+			$termMain = $title->getText();
+		} else {
+			$ns = 0;
+			$termMain = $query;
+		}
+
+		$engine = new \CirrusSearch();
+		$engine->setLimitOffset( 20, 0 );
+		$engine->setNamespaces( [ $ns ] );
+		$engine->setDumpAndDie( false );
+		$elasticQuery = $engine->searchArchiveTitle( $termMain )->getValue();
+		$decodedQuery = json_decode( $elasticQuery, true );
+		// 'path' is dropped from both sides before comparing
+		unset( $decodedQuery['path'] );
+
+		if ( is_string( $expected ) ) {
+			// Flag to generate a new fixture.
+			file_put_contents( $expected, json_encode( $decodedQuery, JSON_PRETTY_PRINT ) );
+		} else {
+			// Repeat normalizations applied to $elasticQuery
+			unset( $expected['path'] );
+
+			// Finally compare some things
+			$this->assertEquals( $expected, $decodedQuery, $elasticQuery );
+		}
+	}
}
diff --git a/tests/unit/fixtures/archiveSearch/namespaced.expected
b/tests/unit/fixtures/archiveSearch/namespaced.expected
new file mode 100644
index 0000000..00c0e3b
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/namespaced.expected
@@ -0,0 +1,63 @@
+{
+ "description": "archive search for '{query}'",
+ "params": {
+ "timeout": "20s",
+ "search_type": "dfs_query_then_fetch"
+ },
+ "query": {
+ "_source": [
+ "namespace",
+ "title",
+ "namespace_text",
+ "wiki"
+ ],
+ "stored_fields": [],
+ "query": {
+ "bool": {
+ "should": [
+ {
+ "multi_match": {
+ "type": "best_fields",
+ "tie_breaker": 0,
+ "query": "Content",
+ "fields": [
+ "title.near_match^100",
+ "title.near_match_asciifolding^75",
+ "title.plain^50",
+ "title^25"
+ ],
+ "operator": "AND"
+ }
+ },
+ {
+ "match": {
+ "title.plain": {
+ "query": "Content",
+ "fuzziness": "AUTO",
+ "operator": "AND"
+ }
+ }
+ }
+ ],
+ "minimum_should_match": 1,
+ "filter": [
+ {
+ "terms": {
+ "namespace": [
+ 12
+ ]
+ }
+ }
+ ]
+ }
+ },
+ "size": 20,
+ "stats": [
+ "archive"
+ ]
+ },
+ "options": {
+ "timeout": "20s",
+ "search_type": "dfs_query_then_fetch"
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/archiveSearch/namespaced.query
b/tests/unit/fixtures/archiveSearch/namespaced.query
new file mode 100644
index 0000000..819b5fd
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/namespaced.query
@@ -0,0 +1 @@
+Help:Content
diff --git a/tests/unit/fixtures/archiveSearch/simple.expected
b/tests/unit/fixtures/archiveSearch/simple.expected
new file mode 100644
index 0000000..178cd37
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/simple.expected
@@ -0,0 +1,63 @@
+{
+ "description": "archive search for '{query}'",
+ "params": {
+ "timeout": "20s",
+ "search_type": "dfs_query_then_fetch"
+ },
+ "query": {
+ "_source": [
+ "namespace",
+ "title",
+ "namespace_text",
+ "wiki"
+ ],
+ "stored_fields": [],
+ "query": {
+ "bool": {
+ "should": [
+ {
+ "multi_match": {
+ "type": "best_fields",
+ "tie_breaker": 0,
+ "query": "Cheese",
+ "fields": [
+ "title.near_match^100",
+ "title.near_match_asciifolding^75",
+ "title.plain^50",
+ "title^25"
+ ],
+ "operator": "AND"
+ }
+ },
+ {
+ "match": {
+ "title.plain": {
+ "query": "Cheese",
+ "fuzziness": "AUTO",
+ "operator": "AND"
+ }
+ }
+ }
+ ],
+ "minimum_should_match": 1,
+ "filter": [
+ {
+ "terms": {
+ "namespace": [
+ 0
+ ]
+ }
+ }
+ ]
+ }
+ },
+ "size": 20,
+ "stats": [
+ "archive"
+ ]
+ },
+ "options": {
+ "timeout": "20s",
+ "search_type": "dfs_query_then_fetch"
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/archiveSearch/simple.query
b/tests/unit/fixtures/archiveSearch/simple.query
new file mode 100644
index 0000000..70de1df
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/simple.query
@@ -0,0 +1 @@
+cheese
diff --git a/tests/unit/fixtures/archiveSearch/spaces.expected
b/tests/unit/fixtures/archiveSearch/spaces.expected
new file mode 100644
index 0000000..32a351c
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/spaces.expected
@@ -0,0 +1,63 @@
+{
+ "description": "archive search for '{query}'",
+ "params": {
+ "timeout": "20s",
+ "search_type": "dfs_query_then_fetch"
+ },
+ "query": {
+ "_source": [
+ "namespace",
+ "title",
+ "namespace_text",
+ "wiki"
+ ],
+ "stored_fields": [],
+ "query": {
+ "bool": {
+ "should": [
+ {
+ "multi_match": {
+ "type": "best_fields",
+ "tie_breaker": 0,
+ "query": "Two And two",
+ "fields": [
+ "title.near_match^100",
+ "title.near_match_asciifolding^75",
+ "title.plain^50",
+ "title^25"
+ ],
+ "operator": "AND"
+ }
+ },
+ {
+ "match": {
+ "title.plain": {
+ "query": "Two And two",
+ "fuzziness": "AUTO",
+ "operator": "AND"
+ }
+ }
+ }
+ ],
+ "minimum_should_match": 1,
+ "filter": [
+ {
+ "terms": {
+ "namespace": [
+ 0
+ ]
+ }
+ }
+ ]
+ }
+ },
+ "size": 20,
+ "stats": [
+ "archive"
+ ]
+ },
+ "options": {
+ "timeout": "20s",
+ "search_type": "dfs_query_then_fetch"
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/archiveSearch/spaces.query
b/tests/unit/fixtures/archiveSearch/spaces.query
new file mode 100644
index 0000000..58b7c75
--- /dev/null
+++ b/tests/unit/fixtures/archiveSearch/spaces.query
@@ -0,0 +1 @@
+two And two
--
To view, visit https://gerrit.wikimedia.org/r/281077
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I48382f24d91b4421b6a1754d7f30c18b173263db
Gerrit-PatchSet: 49
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits