jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/392471 )

Change subject: Add word count statistic for articles
......................................................................


Add word count statistic for articles

The community survey asked for this feature, and it was pretty
straightforward to add to cirrus.

https://meta.wikimedia.org/wiki/2017_Community_Wishlist_Survey/Miscellaneous/Word_count_on_statistics

Change-Id: I847f696405b447ab04972ad0215c09d0012c2098
---
M CirrusSearch.php
M autoload.php
M i18n/en.json
M i18n/qqq.json
M includes/CirrusSearch.php
M includes/Hooks.php
A includes/Query/CountContentWordsBuilder.php
M includes/Search/ResultsType.php
M includes/Search/SearchContext.php
M includes/Search/SearchRequestBuilder.php
M includes/Searcher.php
11 files changed, 146 insertions(+), 6 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  jenkins-bot: Verified
  DCausse: Looks good to me, approved



diff --git a/CirrusSearch.php b/CirrusSearch.php
index bf53382..d133928 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1302,6 +1302,7 @@
 $wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearch\Hooks::onSoftwareInfo';
 $wgHooks[ 'SpecialSearchResults' ][] = 
'CirrusSearch\Hooks::onSpecialSearchResults';
 $wgHooks[ 'SpecialSearchResultsAppend' ][] = 
'CirrusSearch\Hooks::onSpecialSearchResultsAppend';
+$wgHooks[ 'SpecialStatsAddExtra'][] = 
'CirrusSearch\Hooks::onSpecialStatsAddExtra';
 $wgHooks[ 'TitleMove' ][] = 'CirrusSearch\Hooks::onTitleMove';
 $wgHooks[ 'TitleMoveComplete' ][] = 'CirrusSearch\Hooks::onTitleMoveComplete';
 $wgHooks[ 'UnitTestsList' ][] = 'CirrusSearch\Hooks::onUnitTestsList';
diff --git a/autoload.php b/autoload.php
index 1cb17de..130efd6 100644
--- a/autoload.php
+++ b/autoload.php
@@ -114,6 +114,7 @@
        'CirrusSearch\\Query\\BoostTemplatesFeature' => __DIR__ . 
'/includes/Query/BoostTemplatesFeature.php',
        'CirrusSearch\\Query\\CompSuggestQueryBuilder' => __DIR__ . 
'/includes/Query/CompSuggestQueryBuilder.php',
        'CirrusSearch\\Query\\ContentModelFeature' => __DIR__ . 
'/includes/Query/ContentModelFeature.php',
+       'CirrusSearch\\Query\\CountContentWordsBuilder' => __DIR__ . 
'/includes/Query/CountContentWordsBuilder.php',
        'CirrusSearch\\Query\\FileNumericFeature' => __DIR__ . 
'/includes/Query/FileNumericFeature.php',
        'CirrusSearch\\Query\\FileTypeFeature' => __DIR__ . 
'/includes/Query/FileTypeFeature.php',
        'CirrusSearch\\Query\\FullTextQueryBuilder' => __DIR__ . 
'/includes/Query/FullTextQueryBuilder.php',
@@ -194,6 +195,7 @@
        'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ . 
'/includes/Search/SearchMetricsProvider.php',
        'CirrusSearch\\Search\\SearchRequestBuilder' => __DIR__ . 
'/includes/Search/SearchRequestBuilder.php',
        'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ . 
'/includes/Search/ShortTextIndexField.php',
+       'CirrusSearch\\Search\\SingleAggResultsType' => __DIR__ . 
'/includes/Search/ResultsType.php',
        'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ . 
'/includes/Search/SourceTextIndexField.php',
        'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
        'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ . 
'/includes/Search/TeamDraftInterleaver.php',
diff --git a/i18n/en.json b/i18n/en.json
index ab44497..76ff173 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -76,5 +76,6 @@
        "cirrussearch-explore-similar-languages": "Languages",
        "cirrussearch-explore-similar-related-none": "No related pages 
available",
        "cirrussearch-explore-similar-categories-none": "No categories 
available",
-       "cirrussearch-explore-similar-languages-none": "Not available in other 
languages"
+       "cirrussearch-explore-similar-languages-none": "Not available in other 
languages",
+       "cirrussearch-article-words": "Words in all articles"
 }
diff --git a/i18n/qqq.json b/i18n/qqq.json
index 486768c..cac2d2b 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -86,5 +86,6 @@
        "cirrussearch-explore-similar-languages": "Label for the 'Languages' 
section of the Explore Similar search results.\n{{Identical|Language}}",
        "cirrussearch-explore-similar-related-none": "Text shown when there are 
no related pages in the Explore Similar search results.",
        "cirrussearch-explore-similar-categories-none": "Text shown when there 
are no categories in the Explore Similar search results.",
-       "cirrussearch-explore-similar-languages-none": "Text shown when there 
are no other languages in the Explore Similar search results."
+       "cirrussearch-explore-similar-languages-none": "Text shown when there 
are no other languages in the Explore Similar search results.",
+       "cirrussearch-article-words": "Text shown on Special:Statistics along 
with a count of the number of words across all articles (mostly main namespace, 
but some wikis may have configured additional content namespaces)"
 }
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 089589a..0505a1e 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -798,10 +798,7 @@
                        return Status::newGood( [] );
                }
 
-               $searcher = new Searcher( $this->connection, $this->offset, 
$this->limit, $this->config, $this->namespaces,
-                               null, $this->indexBaseName );
-               $searcher->setOptionsFromRequest( $this->request );
-
+               $searcher = $this->makeSearcher();
                $status = $searcher->searchArchive( $term );
                if ( $status->isOK() && $searcher->isReturnRaw() ) {
                        $status->setResult( true,
@@ -810,4 +807,26 @@
                return $status;
        }
 
+       /**
+        * @return Status Contains a single integer indicating the number
+        *  of content words in the wiki
+        */
+       public function countContentWords() {
+               $this->limit = 1;
+               $searcher = $this->makeSearcher();
+               $status = $searcher->countContentWords();
+
+               if ( $status->isOK() && $searcher->isReturnRaw() ) {
+                       $status->setResult( true,
+                               $searcher->processRawReturn( 
$status->getValue(), $this->request, $this->dumpAndDie ) );
+               }
+               return $status;
+       }
+
+       private function makeSearcher() {
+               $searcher = new Searcher( $this->connection, $this->offset, 
$this->limit, $this->config, $this->namespaces,
+                               null, $this->indexBaseName );
+               $searcher->setOptionsFromRequest( $this->request );
+               return $searcher;
+       }
 }
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 44cbc9d..63a1766 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -853,4 +853,16 @@
                return true;
        }
 
+       public static function onSpecialStatsAddExtra( &$extraStats, $context ) 
{
+               $search = new CirrusSearch();
+
+               $status = $search->countContentWords();
+               if ( !$status->isOK() ) {
+                       return;
+               }
+               $wordCount = $status->getValue();
+               if ( $wordCount !== null ) {
+                       $extraStats['cirrussearch-article-words'] = $wordCount;
+               }
+       }
 }
diff --git a/includes/Query/CountContentWordsBuilder.php 
b/includes/Query/CountContentWordsBuilder.php
new file mode 100644
index 0000000..772aa38
--- /dev/null
+++ b/includes/Query/CountContentWordsBuilder.php
@@ -0,0 +1,28 @@
+<?php
+
+namespace CirrusSearch\Query;
+
+use Elastica\Aggregation\Sum;
+use CirrusSearch\Search\SingleAggResultsType;
+use CirrusSearch\Search\SearchContext;
+
+/**
+ * Build a query to sum up the word count of all articles
+ */
+class CountContentWordsBuilder {
+       // The count doesn't change all that quickly. Re-run the query
+       // no more than daily per-wiki.
+       const CACHE_SECONDS = 86400;
+
+       /**
+        * @param SearchContext $context the search context
+        */
+       public function build( SearchContext $context ) {
+               $context->addSyntaxUsed( 'sum_word_count' );
+               $context->setResultsType( new SingleAggResultsType( 
'word_count' ) );
+               $context->setRescoreProfile( 'empty' );
+               $context->addAggregation(
+                       ( new Sum( 'word_count' ) )->setField( 
'text.word_count' ) );
+               $context->setCacheTtl( self::CACHE_SECONDS );
+       }
+}
diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php
index 98e3f19..89c96a1 100644
--- a/includes/Search/ResultsType.php
+++ b/includes/Search/ResultsType.php
@@ -570,3 +570,49 @@
                return [];
        }
 }
+
+class SingleAggResultsType implements ResultsType {
+       /** @var string Name of aggregation */
+       private $name;
+
+       /** @param string $name Name of aggregation to return */
+       public function __construct( $name ) {
+               $this->name = $name;
+       }
+
+       /**
+        * @return false|string|array corresponding to Elasticsearch source 
filtering syntax
+        */
+       public function getSourceFiltering() {
+               return false;
+       }
+
+       public function getStoredFields() {
+               return [];
+       }
+
+       public function getHighlightingConfiguration( array $highlightSource ) {
+               return null;
+       }
+
+       /**
+        * @param SearchContext $context
+        * @param \Elastica\ResultSet $resultSet
+        * @return mixed|null Type depends on the aggregation performed. For
+        *  a sum this will return an integer.
+        */
+       public function transformElasticsearchResult( SearchContext $context, 
\Elastica\ResultSet $resultSet ) {
+               $aggs = $resultSet->getAggregations();
+               if ( isset( $aggs[$this->name] ) ) {
+                       return $aggs[$this->name]['value'];
+               }
+               return $this->createEmptyResult();
+       }
+
+       /**
+        * @return null
+        */
+       public function createEmptyResult() {
+               return null;
+       }
+}
diff --git a/includes/Search/SearchContext.php 
b/includes/Search/SearchContext.php
index 59e244a..cbdc984 100644
--- a/includes/Search/SearchContext.php
+++ b/includes/Search/SearchContext.php
@@ -4,6 +4,7 @@
 
 use CirrusSearch\OtherIndexes;
 use CirrusSearch\SearchConfig;
+use Elastica\Aggregation\AbstractAggregation;
 use Elastica\Query\AbstractQuery;
 
 /**
@@ -203,6 +204,11 @@
         * @var ResultsType Type of the result for the context.
         */
        private $resultsType;
+
+       /**
+        * @var AbstractAggregation[] Aggregations to perform
+        */
+       private $aggs = [];
 
        /**
         * @param SearchConfig $config
@@ -873,4 +879,13 @@
                $this->phraseRescoreQuery = $phraseRescoreQuery;
                $this->isDirty = true;
        }
+
+       public function addAggregation( AbstractAggregation $agg ) {
+               $this->aggs[] = $agg;
+               $this->isDirty = true;
+       }
+
+       public function getAggregations() {
+               return $this->aggs;
+       }
 }
diff --git a/includes/Search/SearchRequestBuilder.php 
b/includes/Search/SearchRequestBuilder.php
index 8e69378..aaadc6a 100644
--- a/includes/Search/SearchRequestBuilder.php
+++ b/includes/Search/SearchRequestBuilder.php
@@ -69,6 +69,10 @@
 
                $query->setQuery( $this->searchContext->getQuery() );
 
+               foreach ( $this->searchContext->getAggregations() as $agg ) {
+                       $query->addAggregation( $agg );
+               }
+
                $highlight = $this->searchContext->getHighlight( $resultsType );
                if ( $highlight ) {
                        $query->setHighlight( $highlight );
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 20a28e7..3c1752c 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -2,6 +2,7 @@
 
 namespace CirrusSearch;
 
+use CirrusSearch\Query\CountContentWordsBuilder;
 use CirrusSearch\Query\NearMatchQueryBuilder;
 use CirrusSearch\Query\PrefixSearchQueryBuilder;
 use CirrusSearch\Query\SimpleKeywordFeature;
@@ -220,6 +221,16 @@
        }
 
        /**
+        * Perform a sum over the number of words in the content index
+        * @return Status status containing a single integer
+        */
+       public function countContentWords() {
+               ( new CountContentWordsBuilder() )->build( $this->searchContext 
);
+               $this->limit = 1;
+               return $this->searchOne();
+       }
+
+       /**
         * Perform a prefix search.
         * @param string $term text by which to search
         * @param string[] $variants variants to search for

-- 
To view, visit https://gerrit.wikimedia.org/r/392471
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I847f696405b447ab04972ad0215c09d0012c2098
Gerrit-PatchSet: 4
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <guillaume.leder...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: Tjones <tjo...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to