jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/392471 )
Change subject: Add word count statistic for articles ...................................................................... Add word count statistic for articles The community survey asked for this feature, and it was pretty straightforward to add to cirrus. https://meta.wikimedia.org/wiki/2017_Community_Wishlist_Survey/Miscellaneous/Word_count_on_statistics Change-Id: I847f696405b447ab04972ad0215c09d0012c2098 --- M CirrusSearch.php M autoload.php M i18n/en.json M i18n/qqq.json M includes/CirrusSearch.php M includes/Hooks.php A includes/Query/CountContentWordsBuilder.php M includes/Search/ResultsType.php M includes/Search/SearchContext.php M includes/Search/SearchRequestBuilder.php M includes/Searcher.php 11 files changed, 146 insertions(+), 6 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve jenkins-bot: Verified DCausse: Looks good to me, approved diff --git a/CirrusSearch.php b/CirrusSearch.php index bf53382..d133928 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -1302,6 +1302,7 @@ $wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearch\Hooks::onSoftwareInfo'; $wgHooks[ 'SpecialSearchResults' ][] = 'CirrusSearch\Hooks::onSpecialSearchResults'; $wgHooks[ 'SpecialSearchResultsAppend' ][] = 'CirrusSearch\Hooks::onSpecialSearchResultsAppend'; +$wgHooks[ 'SpecialStatsAddExtra'][] = 'CirrusSearch\Hooks::onSpecialStatsAddExtra'; $wgHooks[ 'TitleMove' ][] = 'CirrusSearch\Hooks::onTitleMove'; $wgHooks[ 'TitleMoveComplete' ][] = 'CirrusSearch\Hooks::onTitleMoveComplete'; $wgHooks[ 'UnitTestsList' ][] = 'CirrusSearch\Hooks::onUnitTestsList'; diff --git a/autoload.php b/autoload.php index 1cb17de..130efd6 100644 --- a/autoload.php +++ b/autoload.php @@ -114,6 +114,7 @@ 'CirrusSearch\\Query\\BoostTemplatesFeature' => __DIR__ . '/includes/Query/BoostTemplatesFeature.php', 'CirrusSearch\\Query\\CompSuggestQueryBuilder' => __DIR__ . 
'/includes/Query/CompSuggestQueryBuilder.php', 'CirrusSearch\\Query\\ContentModelFeature' => __DIR__ . '/includes/Query/ContentModelFeature.php', + 'CirrusSearch\\Query\\CountContentWordsBuilder' => __DIR__ . '/includes/Query/CountContentWordsBuilder.php', 'CirrusSearch\\Query\\FileNumericFeature' => __DIR__ . '/includes/Query/FileNumericFeature.php', 'CirrusSearch\\Query\\FileTypeFeature' => __DIR__ . '/includes/Query/FileTypeFeature.php', 'CirrusSearch\\Query\\FullTextQueryBuilder' => __DIR__ . '/includes/Query/FullTextQueryBuilder.php', @@ -194,6 +195,7 @@ 'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ . '/includes/Search/SearchMetricsProvider.php', 'CirrusSearch\\Search\\SearchRequestBuilder' => __DIR__ . '/includes/Search/SearchRequestBuilder.php', 'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ . '/includes/Search/ShortTextIndexField.php', + 'CirrusSearch\\Search\\SingleAggResultsType' => __DIR__ . '/includes/Search/ResultsType.php', 'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ . '/includes/Search/SourceTextIndexField.php', 'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ . '/includes/Search/CrossProjectBlockScorer.php', 'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ . 
'/includes/Search/TeamDraftInterleaver.php', diff --git a/i18n/en.json b/i18n/en.json index ab44497..76ff173 100644 --- a/i18n/en.json +++ b/i18n/en.json @@ -76,5 +76,6 @@ "cirrussearch-explore-similar-languages": "Languages", "cirrussearch-explore-similar-related-none": "No related pages available", "cirrussearch-explore-similar-categories-none": "No categories available", - "cirrussearch-explore-similar-languages-none": "Not available in other languages" + "cirrussearch-explore-similar-languages-none": "Not available in other languages", + "cirrussearch-article-words": "Words in all articles" } diff --git a/i18n/qqq.json b/i18n/qqq.json index 486768c..cac2d2b 100644 --- a/i18n/qqq.json +++ b/i18n/qqq.json @@ -86,5 +86,6 @@ "cirrussearch-explore-similar-languages": "Label for the 'Languages' section of the Explore Similar search results.\n{{Identical|Language}}", "cirrussearch-explore-similar-related-none": "Text shown when there are no related pages in the Explore Similar search results.", "cirrussearch-explore-similar-categories-none": "Text shown when there are no categories in the Explore Similar search results.", - "cirrussearch-explore-similar-languages-none": "Text shown when there are no other languages in the Explore Similar search results." 
+ "cirrussearch-explore-similar-languages-none": "Text shown when there are no other languages in the Explore Similar search results.", + "cirrussearch-article-words": "Text shown on Special:Statistics along with a count of the number of words across all articles (mostly main namespace, but some wikis may have configured additional content namespaces)" } diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php index 089589a..0505a1e 100644 --- a/includes/CirrusSearch.php +++ b/includes/CirrusSearch.php @@ -798,10 +798,7 @@ return Status::newGood( [] ); } - $searcher = new Searcher( $this->connection, $this->offset, $this->limit, $this->config, $this->namespaces, - null, $this->indexBaseName ); - $searcher->setOptionsFromRequest( $this->request ); - + $searcher = $this->makeSearcher(); $status = $searcher->searchArchive( $term ); if ( $status->isOK() && $searcher->isReturnRaw() ) { $status->setResult( true, @@ -810,4 +807,26 @@ return $status; } + /** + * @return Status Contains a single integer indicating the number + * of content words in the wiki + */ + public function countContentWords() { + $this->limit = 1; + $searcher = $this->makeSearcher(); + $status = $searcher->countContentWords(); + + if ( $status->isOK() && $searcher->isReturnRaw() ) { + $status->setResult( true, + $searcher->processRawReturn( $status->getValue(), $this->request, $this->dumpAndDie ) ); + } + return $status; + } + + private function makeSearcher() { + $searcher = new Searcher( $this->connection, $this->offset, $this->limit, $this->config, $this->namespaces, + null, $this->indexBaseName ); + $searcher->setOptionsFromRequest( $this->request ); + return $searcher; + } } diff --git a/includes/Hooks.php b/includes/Hooks.php index 44cbc9d..63a1766 100644 --- a/includes/Hooks.php +++ b/includes/Hooks.php @@ -853,4 +853,16 @@ return true; } + public static function onSpecialStatsAddExtra( &$extraStats, $context ) { + $search = new CirrusSearch(); + + $status = 
$search->countContentWords(); + if ( !$status->isOK() ) { + return; + } + $wordCount = $status->getValue(); + if ( $wordCount !== null ) { + $extraStats['cirrussearch-article-words'] = $wordCount; + } + } } diff --git a/includes/Query/CountContentWordsBuilder.php b/includes/Query/CountContentWordsBuilder.php new file mode 100644 index 0000000..772aa38 --- /dev/null +++ b/includes/Query/CountContentWordsBuilder.php @@ -0,0 +1,28 @@ +<?php + +namespace CirrusSearch\Query; + +use Elastica\Aggregation\Sum; +use CirrusSearch\Search\SingleAggResultsType; +use CirrusSearch\Search\SearchContext; + +/** + * Build a query to sum up the word count of all articles + */ +class CountContentWordsBuilder { + // The count doesn't change all that quickly. Re-run the query + // no more than daily per-wiki. + const CACHE_SECONDS = 86400; + + /** + * @param SearchContext $context the search context + */ + public function build( SearchContext $context ) { + $context->addSyntaxUsed( 'sum_word_count' ); + $context->setResultsType( new SingleAggResultsType( 'word_count' ) ); + $context->setRescoreProfile( 'empty' ); + $context->addAggregation( + ( new Sum( 'word_count' ) )->setField( 'text.word_count' ) ); + $context->setCacheTtl( self::CACHE_SECONDS ); + } +} diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php index 98e3f19..89c96a1 100644 --- a/includes/Search/ResultsType.php +++ b/includes/Search/ResultsType.php @@ -570,3 +570,49 @@ return []; } } + +class SingleAggResultsType implements ResultsType { + /** @var string Name of aggregation */ + private $name; + + /** @param string $name Name of aggregation to return */ + public function __construct( $name ) { + $this->name = $name; + } + + /** + * @return false|string|array corresponding to Elasticsearch source filtering syntax + */ + public function getSourceFiltering() { + return false; + } + + public function getStoredFields() { + return []; + } + + public function getHighlightingConfiguration( array 
$highlightSource ) { + return null; + } + + /** + * @param SearchContext $context + * @param \Elastica\ResultSet $resultSet + * @return mixed|null Type depends on the aggregation performed. For + * a sum this will return an integer. + */ + public function transformElasticsearchResult( SearchContext $context, \Elastica\ResultSet $resultSet ) { + $aggs = $resultSet->getAggregations(); + if ( isset( $aggs[$this->name] ) ) { + return $aggs[$this->name]['value']; + } + return $this->createEmptyResult(); + } + + /** + * @return null + */ + public function createEmptyResult() { + return null; + } +} diff --git a/includes/Search/SearchContext.php b/includes/Search/SearchContext.php index 59e244a..cbdc984 100644 --- a/includes/Search/SearchContext.php +++ b/includes/Search/SearchContext.php @@ -4,6 +4,7 @@ use CirrusSearch\OtherIndexes; use CirrusSearch\SearchConfig; +use Elastica\Aggregation\AbstractAggregation; use Elastica\Query\AbstractQuery; /** @@ -203,6 +204,11 @@ * @var ResultsType Type of the result for the context. 
*/ private $resultsType; + + /** + * @var AbstractAggregation[] Aggregations to perform + */ + private $aggs = []; /** * @param SearchConfig $config @@ -873,4 +879,13 @@ $this->phraseRescoreQuery = $phraseRescoreQuery; $this->isDirty = true; } + + public function addAggregation( AbstractAggregation $agg ) { + $this->aggs[] = $agg; + $this->isDirty = true; + } + + public function getAggregations() { + return $this->aggs; + } } diff --git a/includes/Search/SearchRequestBuilder.php b/includes/Search/SearchRequestBuilder.php index 8e69378..aaadc6a 100644 --- a/includes/Search/SearchRequestBuilder.php +++ b/includes/Search/SearchRequestBuilder.php @@ -69,6 +69,10 @@ $query->setQuery( $this->searchContext->getQuery() ); + foreach ( $this->searchContext->getAggregations() as $agg ) { + $query->addAggregation( $agg ); + } + $highlight = $this->searchContext->getHighlight( $resultsType ); if ( $highlight ) { $query->setHighlight( $highlight ); diff --git a/includes/Searcher.php b/includes/Searcher.php index 20a28e7..3c1752c 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -2,6 +2,7 @@ namespace CirrusSearch; +use CirrusSearch\Query\CountContentWordsBuilder; use CirrusSearch\Query\NearMatchQueryBuilder; use CirrusSearch\Query\PrefixSearchQueryBuilder; use CirrusSearch\Query\SimpleKeywordFeature; @@ -220,6 +221,16 @@ } /** + * Perform a sum over the number of words in the content index + * @return Status status containing a single integer + */ + public function countContentWords() { + ( new CountContentWordsBuilder() )->build( $this->searchContext ); + $this->limit = 1; + return $this->searchOne(); + } + + /** * Perform a prefix search. 
* @param string $term text by which to search * @param string[] $variants variants to search for -- To view, visit https://gerrit.wikimedia.org/r/392471 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I847f696405b447ab04972ad0215c09d0012c2098 Gerrit-PatchSet: 4 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: Tjones <tjo...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits