https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114129

Revision: 114129
Author:   maxsem
Date:     2012-03-19 12:19:58 +0000 (Mon, 19 Mar 2012)
Log Message:
-----------
Text extraction rewrite:
* Renamed prop=excerpts --> prop=extracts
* Made it optionally return whole page extracts
* More reasonably structured output: no more dummy 1-element arrays just 
because of API's awkward past. Looks good both in XML and sane formats.
Will rename the file in the next commit.

Modified Paths:
--------------
    trunk/extensions/MobileFrontend/MobileFrontend.php
    trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php

Modified: trunk/extensions/MobileFrontend/MobileFrontend.php
===================================================================
--- trunk/extensions/MobileFrontend/MobileFrontend.php  2012-03-19 11:30:08 UTC 
(rev 114128)
+++ trunk/extensions/MobileFrontend/MobileFrontend.php  2012-03-19 12:19:58 UTC 
(rev 114129)
@@ -52,7 +52,7 @@
 
        'ApiMobileView' => 'api/ApiMobileView',
        'ApiParseExtender' => 'api/ApiParseExtender',
-       'ApiQueryExcerpts' => 'api/ApiQueryExcerpts',
+       'ApiQueryExtracts' => 'api/ApiQueryExcerpts',
 
        'MobileFrontendTemplate' => 'templates/MobileFrontendTemplate',
        'ApplicationTemplate' => 'templates/ApplicationTemplate',
@@ -125,7 +125,7 @@
 
 $wgExtensionFunctions[] = 'efMobileFrontend_Setup';
 
-$wgAPIPropModules['excerpts'] = 'ApiQueryExcerpts';
+$wgAPIPropModules['extracts'] = 'ApiQueryExtracts';
 $wgAPIModules['mobileview'] = 'ApiMobileView';
 
 $wgHooks['APIGetAllowedParams'][] = 'ApiParseExtender::onAPIGetAllowedParams';

Modified: trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php
===================================================================
--- trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php    2012-03-19 
11:30:08 UTC (rev 114128)
+++ trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php    2012-03-19 
12:19:58 UTC (rev 114129)
@@ -1,10 +1,14 @@
 <?php
 
-class ApiQueryExcerpts extends ApiQueryBase {
+class ApiQueryExtracts extends ApiQueryBase {
+       const SECTION_MARKER_START = "\1\2";
+       const SECTION_MARKER_END = "\2\1";
+
        /**
         * @var ParserOptions
         */
        private $parserOptions;
+       private $params;
 
        public function __construct( $query, $moduleName ) {
                parent::__construct( $query, $moduleName, 'ex' );
@@ -17,8 +21,16 @@
                        wfProfileOut( __METHOD__ );
                        return;
                }
-               $params = $this->extractRequestParams();
+               $isXml = $this->getMain()->getPrinter()->getFormat() == 'XML';
+               $result = $this->getResult();
+               $params = $this->params = $this->extractRequestParams();
                $continue = 0;
+               $limit = intval( $params['limit'] );
+               if ( $limit > 1 && !$params['intro'] ) {
+                       $limit = 1;
+                       ///@todo:
+                       //$result->setWarning( "Provided limit was too large 
for requests for whole article extracts, lowered to $limit" );
+               }
                if ( isset( $params['continue'] ) ) {
                        $continue = intval( $params['continue'] );
                        if ( $continue < 0 || $continue > count( $titles ) ) {
@@ -28,15 +40,19 @@
                }
                $count = 0;
                foreach ( $titles as $id => $t ) {
-                       if ( ++$count > $params['limit'] ) {
+                       if ( ++$count > $limit ) {
                                $this->setContinueEnumParameter( 'continue', 
$continue + $count - 1 );
                                break;
                        }
-                       $text = $this->getExcerpt( $t, $params['plaintext'] );
+                       $text = $this->getExtract( $t );
                        if ( isset( $params['length'] ) ) {
-                               $text = $this->trimText( $text, 
$params['length'], $params['plaintext'] );
+                               $text = $this->trimText( $text );
                        }
-                       $fit = $this->addPageSubItem( $id, $text );
+                       if ( $isXml ) {
+                               $fit = $result->addValue( array( 'query', 
'pages', $id ), 'extract', array( '*' => $text ) );
+                       } else {
+                               $fit = $result->addValue( array( 'query', 
'pages', $id ), 'extract', $text );
+                       }
                        if ( !$fit ) {
                                $this->setContinueEnumParameter( 'continue', 
$continue + $count - 1 );
                                break;
@@ -68,7 +84,7 @@
                $data = $api->getResultData();
                foreach ( $pageIds as $id ) {
                        if ( isset( $data['query']['pages'][$id]['excerpts'][0] 
) ) {
-                               $results[$id]['extract'] = 
$data['query']['pages'][$id]['excerpts'][0];
+                               $results[$id]['extract'] = 
$data['query']['pages'][$id]['extract'][0];
                                $results[$id]['extract trimmed'] = false;
                        }
                }
@@ -78,28 +94,63 @@
        /**
         * Returns a processed, but not trimmed excerpt
         * @param Title $title
-        * @return string 
+        * @return string
         */
-       private function getExcerpt( Title $title, $plainText ) {
-               global $wgMemc;
-
+       private function getExtract( Title $title ) {
                wfProfileIn( __METHOD__ );
                $page = WikiPage::factory( $title );
-               $key = wfMemcKey( 'mf', 'excerpt', $plainText, 
$title->getArticleID(), $page->getLatest() );
-               $text = $wgMemc->get( $key );
-               if ( $text !== false ) {
-                       wfProfileOut( __METHOD__ );
-                       return $text;
+
+               $introOnly = $this->params['intro'];
+               $text = $this->getFromCache( $page, $introOnly );
+               // if we need just first section, try retrieving full page and 
getting first section out of it
+               if ( $text === false && $introOnly ) {
+                       $text = $this->getFromCache( $page, false );
+                       if ( $text !== false ) {
+                               $text = $this->getFirstSection( $text, 
$this->params['plaintext'] );
+                       }
                }
-               $text = $this->parse( $page );
-               $text = $this->convertText( $text, $title, $plainText );
-               $wgMemc->set( $key, $text );
+               if ( $text === false ) {
+                       $text = $this->parse( $page );
+                       $text = $this->convertText( $text, $title, 
$this->params['plaintext'] );
+                       $this->setCache( $page, $text );
+               }
                wfProfileOut( __METHOD__ );
                return $text;
        }
 
+       private function cacheKey( WikiPage $page, $introOnly ) {
+               return wfMemcKey( 'mf', 'extract', $page->getLatest(), 
$this->params['plaintext'], $introOnly );
+       }
+
+       private function getFromCache( WikiPage $page, $introOnly ) {
+               global $wgMemc;
+
+               $key = $this->cacheKey( $page, $introOnly );
+               return $wgMemc->get( $key );
+       }
+
+       private function setCache( WikiPage $page, $text ) {
+               global $wgMemc;
+
+               $key = $this->cacheKey( $page, $this->params['intro'] );
+               $wgMemc->set( $key, $text );
+       }
+
+       private function getFirstSection( $text, $plainText ) {
+               if ( $plainText ) {
+                       $regexp = '/^(.*?)(?=' . self::SECTION_MARKER_START . 
')/s';
+               } else {
+                       $regexp = '/^(.*?)(?=<h[1-6]\b)/s';
+               }
+               if ( preg_match( $regexp, $text, $matches ) ) {
+                       wfDebugDieBacktrace();
+                       $text = $matches[0];
+               }
+               return $text;
+       }
+
        /**
-        * Returns HTML of page's zeroth section
+        * Returns page HTML
         * @param WikiPage $page
         * @return string
         */
@@ -113,20 +164,23 @@
                        $pout = ParserCache::singleton()->get( $page, 
$this->parserOptions );
                        if ( $pout ) {
                                $text = $pout->getText();
-                               $s = preg_replace( '/<h[1-6].*$/s', '', $text );
+                               if ( $this->params['intro'] ) {
+                                       $text = $this->getFirstSection( $text, 
false );
+                               }
                                wfProfileOut( __METHOD__ );
-                               return $s;
+                               return $text;
                        }
                }
+               $request = array(
+                       'action' => 'parse',
+                       'page' => $page->getTitle()->getPrefixedText(),
+                       'prop' => 'text'
+               );
+               if ( $this->params['intro'] ) {
+                       $request['section'] = 0;
+               }
                // in case of cache miss, render just the needed section
-               $api = new ApiMain( new FauxRequest(
-                       array(
-                               'action' => 'parse',
-                               'page' => $page->getTitle()->getPrefixedText(),
-                               'section' => 0,
-                               'prop' => 'text'
-                       ) )
-               );
+               $api = new ApiMain( new FauxRequest( $request ) );
                $api->execute();
                $data = $api->getResultData();
                wfProfileOut( __METHOD__ );
@@ -140,23 +194,11 @@
         * @param bool $plainText
         * @return string 
         */
-       private function convertText( $text, Title $title, $plainText ) {
+       private function convertText( $text ) {
                wfProfileIn( __METHOD__ );
-               $fmt = new HtmlFormatter( HtmlFormatter::wrapHTML( $text, false 
), $title, 'XHTML' );
-               $fmt->removeImages();
-               $fmt->remove( array( 'table', 'div', 'sup.reference', 
'span.coordinates',
-                       'span.geo-multi-punct', 'span.geo-nondefault', 
'.noexcerpt', '.error' )
-               );
-               if ( $plainText ) {
-                       $fmt->flattenAllTags();
-               } else {
-                       $fmt->flatten( array( 'span', 'a' ) );
-               }
-               $fmt->filterContent();
+               $fmt = new ExtractFormatter( $text, $this->params['plaintext'], 
$this->params['sectionformat'] );
                $text = $fmt->getText();
-               if ( $plainText ) {
-                       $text = html_entity_decode( $text );
-               }
+
                wfProfileOut( __METHOD__ );
                return trim( $text );
        }
@@ -202,7 +244,12 @@
                                ApiBase::PARAM_MAX => 20,
                                ApiBase::PARAM_MAX2 => 20,
                        ),
+                       'intro' => false,
                        'plaintext' => false,
+                       'sectionformat' => array(
+                               ApiBase::PARAM_TYPE => 
ExtractFormatter::$sectionFormats,
+                               ApiBase::PARAM_DFLT => 'wiki',
+                       ),
                        'continue' => array(
                                ApiBase::PARAM_TYPE => 'integer',
                        ),
@@ -212,14 +259,21 @@
        public function getParamDescription() {
                return array(
                        'length' => 'How many characters to return, actual text 
returned might be slightly longer.',
-                       'limit' => 'How many excerpts to return',
-                       'plaintext' => 'Return excerpts as plaintext instead of 
limited HTML',
+                       'limit' => 'How many extracts to return. ',
+                       'intro' => 'Return only content before the first 
section',
+                       'plaintext' => 'Return extracts as plaintext instead of 
limited HTML',
+                       'sectionformat' => array(
+                               'How to format sections in plaintext mode:',
+                               ' none - No formatting',
+                               ' wiki - Wikitext-style formatting == like this 
==',
+                               " raw - Return in this module's internal 
representation (secton titles prefixed with <ASCII 1><ASCII 2><section 
level><ASCII 2><ASCII 1>",
+                       ),
                        'continue' => 'When more results are available, use 
this to continue',
                );
        }
 
        public function getDescription() {
-               return 'Returns excerpts of the given page(s)';
+               return 'Returns plain-text or limited HTML extracts of the 
given page(s)';
        }
 
        public function getPossibleErrors() {
@@ -230,7 +284,7 @@
 
        public function getExamples() {
                return array(
-                       
'api.php?action=query&prop=excerpts&exlength=175&titles=Therion' => 'Get a 
175-character excerpt',
+                       
'api.php?action=query&prop=extracts&exlength=175&titles=Therion' => 'Get a 
175-character extract',
                );
        }
 
@@ -244,4 +298,72 @@
        }
 }
 
+class ExtractFormatter extends HtmlFormatter {
+       private $plainText;
+       private $sectionFormat;
 
+       public static $sectionFormats = array(
+               'none',
+               'wiki',
+               'raw',
+       );
+
+       public function __construct( $text, $plainText, $sectionFormat ) {
+               parent::__construct( HtmlFormatter::wrapHTML( $text ) );
+               $this->plainText = $plainText;
+               $this->sectionFormat = $sectionFormat;
+
+               $this->removeImages();
+               $this->remove( array( 'table', 'div', '.editsection', 
'sup.reference', 'span.coordinates',
+                       'span.geo-multi-punct', 'span.geo-nondefault', 
'.noexcerpt', '.error' )
+               );
+               if ( $plainText ) {
+                       $this->flattenAllTags();
+               } else {
+                       $this->flatten( array( 'span', 'a' ) );
+               }
+       }
+
+       public function getText( $dummy = null ) {
+               $this->filterContent();
+               $text = parent::getText();
+               if ( $this->plainText ) {
+                       $text = html_entity_decode( $text );
+                       $text = str_replace( "\r", "\n", $text );
+                       $text = preg_replace( "/\n{3,}/", "\n\n", $text );
+                       $text = preg_replace_callback( 
+                               "/" . ApiQueryExtracts::SECTION_MARKER_START . 
'(\d)'. ApiQueryExtracts::SECTION_MARKER_END . "(.*?)$/m",
+                               array( $this, 'sectionCallback' ),
+                               $text
+                       );
+               }
+               return $text;
+       }
+
+       public function onHtmlReady( $html ) {
+               if ( $this->plainText ) {
+                       $html = preg_replace( '/\s*(<h([1-6])\b)/i',
+                               ApiQueryExtracts::SECTION_MARKER_START . '$2' . 
ApiQueryExtracts::SECTION_MARKER_END . '$1' ,
+                               $html
+                       );
+               }
+               return $html;
+       }
+
+       private function sectionCallback( $matches ) {
+               if ( $this->sectionFormat == 'raw' ) {
+                       return $matches[0];
+               }
+               $func = "ExtractFormatter::doSection_{$this->sectionFormat}";
+               return call_user_func( $func, $matches[1], trim( $matches[2] ) 
);
+       }
+
+       private static function doSection_wiki( $level, $text ) {
+               $bars = str_repeat( '=', $level );
+               return "\n$bars $text $bars";
+       }
+
+       private static function doSection_none( $level, $text ) {
+               return "\n$text";
+       }
+}
\ No newline at end of file


_______________________________________________
MediaWiki-CVS mailing list
MediaWiki-CVS@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to