https://www.mediawiki.org/wiki/Special:Code/MediaWiki/114129
Revision: 114129 Author: maxsem Date: 2012-03-19 12:19:58 +0000 (Mon, 19 Mar 2012) Log Message: ----------- Text extraction rewrite: * Renamed prop=excerpts --> prop=extracts * Made it optionally return whole page extracts * More reasonably structured output: no more dummy 1-element arrays just because of API's awkward past. Looks good both in XML and sane formats. Will rename the file in the next commit. Modified Paths: -------------- trunk/extensions/MobileFrontend/MobileFrontend.php trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php Modified: trunk/extensions/MobileFrontend/MobileFrontend.php =================================================================== --- trunk/extensions/MobileFrontend/MobileFrontend.php 2012-03-19 11:30:08 UTC (rev 114128) +++ trunk/extensions/MobileFrontend/MobileFrontend.php 2012-03-19 12:19:58 UTC (rev 114129) @@ -52,7 +52,7 @@ 'ApiMobileView' => 'api/ApiMobileView', 'ApiParseExtender' => 'api/ApiParseExtender', - 'ApiQueryExcerpts' => 'api/ApiQueryExcerpts', + 'ApiQueryExtracts' => 'api/ApiQueryExcerpts', 'MobileFrontendTemplate' => 'templates/MobileFrontendTemplate', 'ApplicationTemplate' => 'templates/ApplicationTemplate', @@ -125,7 +125,7 @@ $wgExtensionFunctions[] = 'efMobileFrontend_Setup'; -$wgAPIPropModules['excerpts'] = 'ApiQueryExcerpts'; +$wgAPIPropModules['extracts'] = 'ApiQueryExtracts'; $wgAPIModules['mobileview'] = 'ApiMobileView'; $wgHooks['APIGetAllowedParams'][] = 'ApiParseExtender::onAPIGetAllowedParams'; Modified: trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php =================================================================== --- trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php 2012-03-19 11:30:08 UTC (rev 114128) +++ trunk/extensions/MobileFrontend/api/ApiQueryExcerpts.php 2012-03-19 12:19:58 UTC (rev 114129) @@ -1,10 +1,14 @@ <?php -class ApiQueryExcerpts extends ApiQueryBase { +class ApiQueryExtracts extends ApiQueryBase { + const SECTION_MARKER_START = "\1\2"; + const SECTION_MARKER_END = "\2\1"; + /** * @var ParserOptions */ private $parserOptions; + private $params; public function __construct( $query, $moduleName ) { parent::__construct( $query, $moduleName, 'ex' ); @@ -17,8 +21,16 @@ wfProfileOut( __METHOD__ ); return; } - $params = $this->extractRequestParams(); + $isXml = $this->getMain()->getPrinter()->getFormat() == 'XML'; + $result = $this->getResult(); + $params = $this->params = $this->extractRequestParams(); $continue = 0; + $limit = intval( $params['limit'] ); + if ( $limit > 1 && !$params['intro'] ) { + $limit = 1; + ///@todo: + //$result->setWarning( "Provided limit was too large for requests for whole article extracts, lowered to $limit" ); + } if ( isset( $params['continue'] ) ) { $continue = intval( $params['continue'] ); if ( $continue < 0 || $continue > count( $titles ) ) { @@ -28,15 +40,19 @@ } $count = 0; foreach ( $titles as $id => $t ) { - if ( ++$count > $params['limit'] ) { + if ( ++$count > $limit ) { $this->setContinueEnumParameter( 'continue', $continue + $count - 1 ); break; } - $text = $this->getExcerpt( $t, $params['plaintext'] ); + $text = $this->getExtract( $t ); if ( isset( $params['length'] ) ) { - $text = $this->trimText( $text, $params['length'], $params['plaintext'] ); + $text = $this->trimText( $text ); } - $fit = $this->addPageSubItem( $id, $text ); + if ( $isXml ) { + $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', array( '*' => $text ) ); + } else { + $fit = $result->addValue( array( 'query', 'pages', $id ), 'extract', $text ); + } if ( !$fit ) { $this->setContinueEnumParameter( 'continue', $continue + $count - 1 ); break; @@ -68,7 +84,7 @@ $data = $api->getResultData(); foreach ( $pageIds as $id ) { if ( isset( $data['query']['pages'][$id]['excerpts'][0] ) ) { - $results[$id]['extract'] = $data['query']['pages'][$id]['excerpts'][0]; + $results[$id]['extract'] = $data['query']['pages'][$id]['extract'][0]; $results[$id]['extract trimmed'] = false; } } @@ -78,28 +94,63 @@ /** * Returns a processed, but not trimmed excerpt * @param Title $title - * @return string + * @return string */ - private function getExcerpt( Title $title, $plainText ) { - global $wgMemc; - + private function getExtract( Title $title ) { wfProfileIn( __METHOD__ ); $page = WikiPage::factory( $title ); - $key = wfMemcKey( 'mf', 'excerpt', $plainText, $title->getArticleID(), $page->getLatest() ); - $text = $wgMemc->get( $key ); - if ( $text !== false ) { - wfProfileOut( __METHOD__ ); - return $text; + + $introOnly = $this->params['intro']; + $text = $this->getFromCache( $page, $introOnly ); + // if we need just first section, try retrieving full page and getting first section out of it + if ( $text === false && $introOnly ) { + $text = $this->getFromCache( $page, false ); + if ( $text !== false ) { + $text = $this->getFirstSection( $text, $this->params['plaintext'] ); + } } - $text = $this->parse( $page ); - $text = $this->convertText( $text, $title, $plainText ); - $wgMemc->set( $key, $text ); + if ( $text === false ) { + $text = $this->parse( $page ); + $text = $this->convertText( $text, $title, $this->params['plaintext'] ); + $this->setCache( $page, $text ); + } wfProfileOut( __METHOD__ ); return $text; } + private function cacheKey( WikiPage $page, $introOnly ) { + return wfMemcKey( 'mf', 'extract', $page->getLatest(), $this->params['plaintext'], $introOnly ); + } + + private function getFromCache( WikiPage $page, $introOnly ) { + global $wgMemc; + + $key = $this->cacheKey( $page, $introOnly ); + return $wgMemc->get( $key ); + } + + private function setCache( WikiPage $page, $text ) { + global $wgMemc; + + $key = $this->cacheKey( $page, $this->params['intro'] ); + $wgMemc->set( $key, $text ); + } + + private function getFirstSection( $text, $plainText ) { + if ( $plainText ) { + $regexp = '/^(.*?)(?=' . self::SECTION_MARKER_START . ')/s'; + } else { + $regexp = '/^(.*?)(?=<h[1-6]\b)/s'; + } + if ( preg_match( $regexp, $text, $matches ) ) { + wfDebugDieBacktrace(); + $text = $matches[0]; + } + return $text; + } + /** - * Returns HTML of page's zeroth section + * Returns page HTML * @param WikiPage $page * @return string */ @@ -113,20 +164,23 @@ $pout = ParserCache::singleton()->get( $page, $this->parserOptions ); if ( $pout ) { $text = $pout->getText(); - $s = preg_replace( '/<h[1-6].*$/s', '', $text ); + if ( $this->params['intro'] ) { + $text = $this->getFirstSection( $text, false ); + } wfProfileOut( __METHOD__ ); - return $s; + return $text; } } + $request = array( + 'action' => 'parse', + 'page' => $page->getTitle()->getPrefixedText(), + 'prop' => 'text' + ); + if ( $this->params['intro'] ) { + $request['section'] = 0; + } // in case of cache miss, render just the needed section - $api = new ApiMain( new FauxRequest( - array( - 'action' => 'parse', - 'page' => $page->getTitle()->getPrefixedText(), - 'section' => 0, - 'prop' => 'text' - ) ) - ); + $api = new ApiMain( new FauxRequest( $request ) ); $api->execute(); $data = $api->getResultData(); wfProfileOut( __METHOD__ ); @@ -140,23 +194,11 @@ * @param bool $plainText * @return string */ - private function convertText( $text, Title $title, $plainText ) { + private function convertText( $text ) { wfProfileIn( __METHOD__ ); - $fmt = new HtmlFormatter( HtmlFormatter::wrapHTML( $text, false ), $title, 'XHTML' ); - $fmt->removeImages(); - $fmt->remove( array( 'table', 'div', 'sup.reference', 'span.coordinates', - 'span.geo-multi-punct', 'span.geo-nondefault', '.noexcerpt', '.error' ) - ); - if ( $plainText ) { - $fmt->flattenAllTags(); - } else { - $fmt->flatten( array( 'span', 'a' ) ); - } - $fmt->filterContent(); + $fmt = new ExtractFormatter( $text, $this->params['plaintext'], $this->params['sectionformat'] ); $text = $fmt->getText(); - if ( $plainText ) { - $text = html_entity_decode( $text ); - } + wfProfileOut( __METHOD__ ); return trim( $text ); } @@ -202,7 +244,12 @@ ApiBase::PARAM_MAX => 20, ApiBase::PARAM_MAX2 => 20, ), + 'intro' => false, 'plaintext' => false, + 'sectionformat' => array( + ApiBase::PARAM_TYPE => ExtractFormatter::$sectionFormats, + ApiBase::PARAM_DFLT => 'wiki', + ), 'continue' => array( ApiBase::PARAM_TYPE => 'integer', ), @@ -212,14 +259,21 @@ public function getParamDescription() { return array( 'length' => 'How many characters to return, actual text returned might be slightly longer.', - 'limit' => 'How many excerpts to return', - 'plaintext' => 'Return excerpts as plaintext instead of limited HTML', + 'limit' => 'How many extracts to return. ', + 'intro' => 'Return only content before the first section', + 'plaintext' => 'Return extracts as plaintext instead of limited HTML', + 'sectionformat' => array( + 'How to format sections in plaintext mode:', + ' none - No formatting', + ' wiki - Wikitext-style formatting == like this ==', + " raw - Return in this module's internal representation (secton titles prefixed with <ASCII 1><ASCII 2><section level><ASCII 2><ASCII 1>", + ), 'continue' => 'When more results are available, use this to continue', ); } public function getDescription() { - return 'Returns excerpts of the given page(s)'; + return 'Returns plain-text or limited HTML extracts of the given page(s)'; } public function getPossibleErrors() { @@ -230,7 +284,7 @@ public function getExamples() { return array( - 'api.php?action=query&prop=excerpts&exlength=175&titles=Therion' => 'Get a 175-character excerpt', + 'api.php?action=query&prop=extracts&exlength=175&titles=Therion' => 'Get a 175-character extract', ); } @@ -244,4 +298,72 @@ } } +class ExtractFormatter extends HtmlFormatter { + private $plainText; + private $sectionFormat; + public static $sectionFormats = array( + 'none', + 'wiki', + 'raw', + ); + + public function __construct( $text, $plainText, $sectionFormat ) { + parent::__construct( HtmlFormatter::wrapHTML( $text ) ); + $this->plainText = $plainText; + $this->sectionFormat = $sectionFormat; + + $this->removeImages(); + $this->remove( array( 'table', 'div', '.editsection', 'sup.reference', 'span.coordinates', + 'span.geo-multi-punct', 'span.geo-nondefault', '.noexcerpt', '.error' ) + ); + if ( $plainText ) { + $this->flattenAllTags(); + } else { + $this->flatten( array( 'span', 'a' ) ); + } + } + + public function getText( $dummy = null ) { + $this->filterContent(); + $text = parent::getText(); + if ( $this->plainText ) { + $text = html_entity_decode( $text ); + $text = str_replace( "\r", "\n", $text ); + $text = preg_replace( "/\n{3,}/", "\n\n", $text ); + $text = preg_replace_callback( + "/" . ApiQueryExtracts::SECTION_MARKER_START . '(\d)'. ApiQueryExtracts::SECTION_MARKER_END . "(.*?)$/m", + array( $this, 'sectionCallback' ), + $text + ); + } + return $text; + } + + public function onHtmlReady( $html ) { + if ( $this->plainText ) { + $html = preg_replace( '/\s*(<h([1-6])\b)/i', + ApiQueryExtracts::SECTION_MARKER_START . '$2' . ApiQueryExtracts::SECTION_MARKER_END . '$1' , + $html + ); + } + return $html; + } + + private function sectionCallback( $matches ) { + if ( $this->sectionFormat == 'raw' ) { + return $matches[0]; + } + $func = "ExtractFormatter::doSection_{$this->sectionFormat}"; + return call_user_func( $func, $matches[1], trim( $matches[2] ) ); + } + + private static function doSection_wiki( $level, $text ) { + $bars = str_repeat( '=', $level ); + return "\n$bars $text $bars"; + } + + private static function doSection_none( $level, $text ) { + return "\n$text"; + } +} \ No newline at end of file _______________________________________________ MediaWiki-CVS mailing list MediaWiki-CVS@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs