Cenarium has uploaded a new change for review. https://gerrit.wikimedia.org/r/278703
Change subject: [WIP] Use custom self-regenerating parser cache for references ...................................................................... [WIP] Use custom self-regenerating parser cache for references Rather than saving the references data in the standard parser cache, this creates a custom parser cache where not only the references data is saved, but also the fully parsed text once it has been requested. The parsed text isn't stored initially, but when a parsed reference is requested, such as through the new Reference API module, parsing is resumed from the half-parsed wikitext stored in the references data, and the result is served to the client and saved in the cache. The aforementioned Reference API module can be used to retrieve the fully parsed HTML of a reference accessed by its id. It also provides the MW timestamp of the last time this reference was changed, which allows clients accessing the ref from an old revision to make sure that the ref content is still valid. Knowing when a reference was changed is possible by comparing the references data between a freshly parsed page and its previously cached references data. This supports FlaggedRevs, which has its own parser cache for stable versions. This requires a couple of commits to MediaWiki core and one to FlaggedRevs. 
Bug: T125329 Bug: T127263 Depends-On: Change-Id: Ib39e26ad762e2da39ba0b3ca08ce1bf558af6de7 Depends-On: Change-Id: Ibb6e1c35ff08302ef7af4572010123b8caaa1980 Change-Id: Idcd8bcd2c59ab6b3f51409e7e8c2b2074d7c15c0 --- M ApiQueryReferences.php A ApiReference.php M CiteHooks.php A CiteParserCache.php A CiteParserOutput.php M Cite_body.php M README.md M extension.json 8 files changed, 473 insertions(+), 17 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Cite refs/changes/03/278703/1 diff --git a/ApiQueryReferences.php b/ApiQueryReferences.php index fcc52de..2ae2902 100644 --- a/ApiQueryReferences.php +++ b/ApiQueryReferences.php @@ -57,7 +57,7 @@ } if ( $checkStable ) { // if using FlaggedRevs and requested, get references for the stable revision - $page = FlaggableWikiPage::getTitleInstance( $title ); + $page = new FlaggableWikiPage( $title ); if ( $page->isReviewable() ) { $storedRefs = Cite::getStableReferences( $page ); if ( $storedRefs === false @@ -92,6 +92,19 @@ $ref['group'] = $group; $ref['reflist'] = $index; + if ( $ref['stripState'] !== null ) { + $ref['stripState'] = unserialize( $ref['stripState'] ); + $ref['stripState'] = $ref['stripState']->getPublicData(); + } else { + unset( $ref['stripState'] ); + } + if ( $ref['linkHolders'] !== null ) { + $ref['linkHolders'] = unserialize( $ref['linkHolders'] ); + $ref['linkHolders'] = $ref['linkHolders']->getPublicData(); + } else { + unset( $ref['linkHolders'] ); + } + $allReferences[$id] = $ref; } } diff --git a/ApiReference.php b/ApiReference.php new file mode 100644 index 0000000..3a1aee2 --- /dev/null +++ b/ApiReference.php @@ -0,0 +1,98 @@ +<?php +/** + * @ingroup API + */ +class ApiReference extends ApiBase { + + protected function isFlaggedRevsEnabled() { + return class_exists( 'FlaggableWikiPage' ); + } + + public function execute() { + global $wgParser; + // The data is hot but user-dependent, like page views, so we set vary cookies + $this->getMain()->setCacheMode( 
'anon-public-user-private' ); + + // Get parameters + $params = $this->extractRequestParams(); + + if ( isset( $params['title'] ) ) { + if ( isset( $params['pageid'] ) ) { + $this->dieUsage( 'Params title and pageid cannot be used at the same time', 'citetitleandpageid' ); + return; + } + $title = Title::newFromText( $params['title'] ); + if ( $title === null || !$title->exists() ) { + $this->dieUsage( 'Title must exist', 'citeinvalidtitle' ); + return; + } + } elseif ( isset( $params['pageid'] ) ) { + $title = Title::newFromId( $params['pageid'] ); + if ( $title === null ) { + $this->dieUsage( 'Title must exist', 'citeinvalidtitle' ); + return; + } + } else { + $this->dieUsage( 'Param title or param pageid must be given', 'citenotitleorpageid' ); + return; + } + $refId = $params['refid']; + + if ( $this->isFlaggedRevsEnabled() && $params['stable'] ) { + $page = new FlaggableWikiPage( $title ); + $parserOptions = $page->makeParserOptions( $this->getContext() ); + if ( $page->isReviewable() ) { + $parserCache = CiteFRParserCacheStable::singleton(); + $ref = $parserCache->getParsedRef( $page, $refId, $parserOptions, $wgParser, + $parserOutput ); + if ( $ref !== false ) { + unset( $ref['index'], $ref['name'], $ref['parsetime'] ); + $this->getResult()->addValue( null, $this->getModuleName(), $ref ); + return; + } + if ( $parserOutput === false + && FlaggedRevs::inclusionSetting() == FR_INCLUDES_CURRENT + && $page->stableVersionIsSynced() + ) { + // the stable revision is identical to the latest version, so try it + $parserCache = CiteParserCache::singleton(); + } else { + return; + } + } else { + // page cannot have stable versions + $parserCache = CiteParserCache::singleton(); + } + } else { + $page = new WikiPage( $title ); + $parserOptions = $page->makeParserOptions( $this->getContext() ); + $parserCache = CiteParserCache::singleton(); + } + $ref = $parserCache->getParsedRef( $page, $refId, $parserOptions, $wgParser ); + if ( $ref !== false ) { + unset( 
$ref['index'], $ref['name'], $ref['parsetime'] ); + $this->getResult()->addValue( null, $this->getModuleName(), $ref ); + } + } + + public function getAllowedParams() { + $params = [ + 'title' => [ + ApiBase::PARAM_TYPE => 'string' + ], + 'pageid' => [ + ApiBase::PARAM_TYPE => 'integer', + ], + 'refid' => [ + ApiBase::PARAM_TYPE => 'string', + ApiBase::PARAM_REQUIRED => true + ], + ]; + if ( $this->isFlaggedRevsEnabled() ) { + $params['stable'] = [ + ApiBase::PARAM_TYPE => 'boolean' + ]; + } + return $params; + } +} diff --git a/CiteHooks.php b/CiteHooks.php index ae82eb5..4959f87 100644 --- a/CiteHooks.php +++ b/CiteHooks.php @@ -94,4 +94,32 @@ } $linksUpdate->getParserOutput()->setExtensionData( Cite::EXT_DATA_KEY, null ); } + + /** + * Callback for ParserCacheBeforeSave hook, called in ParserCache::save + * Generate CiteParserOutput based on ParserOutput + * Then save it to CiteParserCache reusing data from old cached CiteParserOutput if any + */ + public static function onParserCacheBeforeSave( $parserCache, $parserOutputKey, $parserOutput + ) { + global $wgCiteOldReferencesDataCacheExpiry; + + $citeParserOutput = CiteParserOutput::newFromParserOutput( $parserOutput ); + + // no longer needed in main parser output + $parserOutput->setExtensionData( Cite::EXT_DATA_KEY, null ); + + if ( $citeParserOutput === null ) { + // no references + return; + } + + if ( class_exists( 'FRParserCacheStable' ) + && $parserCache instanceof FRParserCacheStable + ) { + CiteFRParserCacheStable::singleton()->save( $parserOutputKey, $citeParserOutput ); + } else { + CiteParserCache::singleton()->save( $parserOutputKey, $citeParserOutput ); + } + } } diff --git a/CiteParserCache.php b/CiteParserCache.php new file mode 100644 index 0000000..671c12c --- /dev/null +++ b/CiteParserCache.php @@ -0,0 +1,196 @@ +<?php +/** + * Self-regenerating cache for CiteParserOutput + */ +class CiteParserCache extends ParserCache { + + /** + * Get an instance of this object + */ + public static function 
singleton() { + global $parserMemc; + return new self( $parserMemc ); + } + + protected function getParserOutputKey( $article, $hash ) { + $key = parent::getParserOutputKey( $article, $hash ); // call super! + return str_replace( ':pcache:', ':cite-pcache:', $key ); + } + + /** + * Called from CiteHooks::onParserCacheBeforeSave, itself called from ParserCache::save + * @param string $parserOutputKey key of generating ParserCache + * @param CiteParserOutput $newParserOutput + */ + public function save( $parserOutputKey, $newParserOutput ) { + global $wgCiteParserCacheExpireTime; + // pcache->cite-pcache OR stable-pcache->stable-cite-pcache + $parserOutputKey = str_replace( 'pcache:', 'cite-pcache:', + $parserOutputKey ); + $oldParserOutput = $this->mMemc->get( $parserOutputKey, BagOStuff::READ_VERIFIED ); + // @todo defer what follows + if ( $oldParserOutput !== false ) { + $newParserOutput->initialize( $oldParserOutput ); + } else { + $newParserOutput->initialize(); + } + $expiry = $newParserOutput->getCacheExpiry(); + $this->mMemc->set( $parserOutputKey, $newParserOutput, $expiry ); + } + + /** + * Override because validation checks are unecessary here. + * We allow a latency of $wgCiteParserCacheExpireTime, checked in getRef. + * Before that, they would have been done already by the main ParserCache and if failed, + * would have resulted in a new parse, ParserCache, thus CiteParserCache, being created. 
+ */ + public function getKey( $article, $popts ) { + $optionsKey = $this->mMemc->get( + $this->getOptionsKey( $article ), BagOStuff::READ_VERIFIED ); + if ( $optionsKey === false ) { + return false; + } + + $usedOptions = $optionsKey->mUsedOptions; + wfDebug( "Parser cache options found.\n" ); + + return $this->getParserOutputKey( + $article, + $popts->optionsHash( $usedOptions, $article->getTitle() ) + ); + } + + /** + * Override because we don't want to call setEditSectionTokens or run the hook + * and validation checks are unecessary here (@see getKey) + */ + public function get( $article, $popts, &$parserOutputKey = null ) { + $parserOutputKey = $this->getKey( $article, $popts ); + if ( $parserOutputKey === false ) { + wfIncrStats( 'pcache.miss.absent' ); + return false; + } + + $value = $this->mMemc->get( $parserOutputKey, BagOStuff::READ_VERIFIED ); + if ( !$value ) { + wfDebug( "ParserOutput cache miss.\n" ); + wfIncrStats( "pcache.miss.absent" ); + return false; + } + + wfDebug( "ParserOutput cache found.\n" ); + wfIncrStats( "pcache.hit" ); + + return $value; + } + + /** + * Provide a fully parsed ref to caller, regenerating it from references data if not + * present in cache then saving it to cache + */ + public function getParsedRef( $article, $refId, $popts, $parser, + &$parserOutput = null, &$parserOutputKey = null + ) { + global $wgCiteParserCacheExpireTime; + $parserOutput = $this->get( $article, $popts, $parserOutputKey ); + if ( $parserOutput === false ) { + return false; + } + + $parsedRef = $parserOutput->getParsedRef( $refId ); + if ( $parsedRef === null ) { + // no ref with this key found + return false; + } + + if ( isset( $parsedRef['text'] ) ) { + // allow some latency since templates within <ref> tags aren't often edited + // so on heavily edited articles, this ref, which we know was not modified, + // won't have to be re-parsed at each edit + $expireLatency = $wgCiteParserCacheExpireTime; + if ( $article->getTouched() < 
$parsedRef['parsetime'] + $expireLatency ) { + // cache hit + wfIncrStats( "cite.pcache.hit" ); + wfDebug( "CiteParserOutput parsed ref found in cache.\n" ); + return $parsedRef; + } + } + + // parse ref from data and regenerate cache + $refData = $parserOutput->getRefData(); + + if ( $refData['halfparsedversion'] !== Parser::HALF_PARSED_VERSION ) { + // parse can't be resumed + return false; + } + + // we must be able to get the ref due to CiteParserOutput::initialize + if ( isset( $refData['references'][$parsedRef['index']][1][$parsedRef['name']] ) ) { + $ref = $refData['references'][$parsedRef['index']][1][$parsedRef['name']]; + } else { + // shouldn't happen + return false; + } + + if ( $ref['stripState'] !== null ) { + $ref['stripState'] = unserialize( $ref['stripState'] ); + } + if ( $ref['linkHolders'] !== null ) { + $ref['linkHolders'] = unserialize( $ref['linkHolders'] ); + } + + // resume parse + $parsedRef['parsetime'] = wfTimestampNow(); + $parsedRef['text'] = $parser->resumeParse( + $ref, + $article->getTitle(), + $popts, + true, // clear parser state + false, // not main + false // no line start + ); + + // save result in parser output + $parserOutput->setParsedRef( $refId, $parsedRef ); + + // stats / debug + wfIncrStats( "cite.pcache.regen" ); + $cacheTime = $parsedRef['parsetime']; + $lastModified = $parsedRef['lastmodified']; + $msg = "Saved ref $refId last modified $lastModified in Cite parser cache" . + " with key $parserOutputKey" . + " and timestamp $cacheTime" . 
+ "\n"; + wfDebug( $msg ); + + // save in cache, decreasing expiry by time passed since original cache time + $originalExpiry = $parserOutput->getCacheExpiry(); + $newExpiry = $originalExpiry - ( wfTimestampNow() - $parserOutput->getCacheTime() ); + if ( $newExpiry > 0 ) { + $this->mMemc->set( $parserOutputKey, $parserOutput, $newExpiry ); + } + + return $parsedRef; + } +} + +/** + * Get parser cache for stable versions of FlaggedRevs + * @see FRParserCacheStable class + */ +class CiteFRParserCacheStable extends CiteParserCache { + public static function singleton() { + global $parserMemc; + return new self( $parserMemc ); + } + + protected function getParserOutputKey( $article, $hash ) { + // pcache->cite-pcache->stable-cite-pcache + $key = parent::getParserOutputKey( $article, $hash ); // call super! + return str_replace( ':cite-pcache:', ':stable-cite-pcache:', $key ); + } + + protected function getOptionsKey( $article ) { + return FRParserCacheStable::getOptionsKey( $article ); + } +} diff --git a/CiteParserOutput.php b/CiteParserOutput.php new file mode 100644 index 0000000..cb829e2 --- /dev/null +++ b/CiteParserOutput.php @@ -0,0 +1,78 @@ +<?php +/** + * + */ +class CiteParserOutput extends CacheTime { + private $mParsedRefs = []; + private $mRefData = []; + + public function initialize( CiteParserOutput $oldOutput = null ) { + $oldReferences = null; + if ( $oldOutput !== false + && $oldOutput->mRefData['refdataversion'] === $this->mRefData['refdataversion'] + ) { + $oldReferences = $oldOutput->mRefData['references']; + } + foreach ( $this->mRefData['references'] as $index => list( $group, $members ) ) { + // allow reuse of the refs in this reflist if it is for the same group + $inheritMembers = $oldReferences !== null && isset( $oldReferences[$index] ) + && $oldReferences[$index][0] === $group; + if ( $inheritMembers ) { + $oldMembers = $oldReferences[$index][1]; + } + foreach ( $members as $name => $ref ) { + // reuse this ref if it contains the same 
content as the corresponding old ref + $inheritRef = $inheritMembers && isset( $oldMembers[$name] ) + && $oldMembers[$name]['text'] === $ref['text'] + && $oldMembers[$name]['StripState'] === $ref['StripState'] + && $oldMembers[$name]['linkHolders'] === $ref['linkHolders']; + if ( is_string( $name ) ) { + $id = Cite::getReferencesKey( $name . '-' . $ref['key'] ); + } else { + $id = Cite::getReferencesKey( $ref['key'] ); + } + if ( $inheritRef ) { + $this->mParsedRefs[$id] = $oldOutput->mParsedRefs[$id]; + } else { + // new ref + $this->mParsedRefs[$id] = array(); + $this->mParsedRefs[$id]['index'] = $index; + $this->mParsedRefs[$id]['name'] = $name; + $this->mParsedRefs[$id]['lastmodified'] = $this->getCacheTime(); + } + } + } + } + + public function getRefData() { + return $this->mRefData; + } + + public static function newFromParserOutput( ParserOutput $parserOutput ) { + $refData = $parserOutput->getExtensionData( Cite::EXT_DATA_KEY ); + if ( $refData === null ) { + return null; + } + $instance = new self(); + $instance->mRefData = $refData; + $instance->mCacheTime = $parserOutput->getCacheTime(); + $instance->mCacheRevisionId = $parserOutput->getCacheRevisionId(); + $instance->mCacheExpiry = $parserOutput->getCacheExpiry(); + return $instance; + } + + /** + * Accessor to parsed text, like ParserOutput::getText, but id-based + * and with additional metadata + */ + public function getParsedRef( $id ) { + if ( isset( $this->mParsedRefs[$id] ) ) { + return $this->mParsedRefs[$id]; + } + return null; + } + + public function setParsedRef( $id, $parsedRef ) { + $this->mParsedRefs[$id] = $parsedRef; + } +} diff --git a/Cite_body.php b/Cite_body.php index 801fb28..4df8621 100644 --- a/Cite_body.php +++ b/Cite_body.php @@ -42,7 +42,7 @@ const EXT_DATA_KEY = 'Cite:References'; /** - * Version number in case we change the data structure in the future + * Bump version number when we incompatibly change the data structure */ const DATA_VERSION_NUMBER = 1; @@ -1188,15 +1188,42 
@@ } static $reflistIndex = 1; - $savedRefs = $this->mParser->getOutput()->getExtensionData( self::EXT_DATA_KEY ); - if ( $savedRefs === null ) { + $refData = $this->mParser->getOutput()->getExtensionData( self::EXT_DATA_KEY ); + if ( $refData === null ) { // Initialize array structure - $savedRefs = array( 'version' => self::DATA_VERSION_NUMBER ); + $refData = array( + 'refdataversion' => self::DATA_VERSION_NUMBER, + 'halfparsedversion' => Parser::HALF_PARSED_VERSION, + ); + } + $refs = array(); + foreach ( $this->mRefs[$group] as $name => $ref ) { + // gather all necessary data + $ref = array_merge( + $ref, + $this->mParser->serializeHalfParsedText( $ref['text'] ) + ); + unset( $ref['version'] ); // HALF_PARSED_VERSION already saved above + if ( $ref['stripState'] !== null ) { + $ref['stripState'] = serialize( $ref['stripState'] ); + } + if ( $ref['linkHolders'] !== null ) { + $ref['linkHolders'] = serialize( $ref['linkHolders'] ); + } + // 'number' uneeded after page parse + unset( $ref['number'] ); + // give expected meaning to 'count' + if ( $ref['count'] < 0 ) { + $ref['count'] = 1; + } else { + $ref['count']++; + } + $refs[$name] = $ref; } // save group - $savedRefs['references'][$reflistIndex++] = array( $group, $this->mRefs[$group] ); + $refData['references'][$reflistIndex++] = array( $group, $refs ); - $this->mParser->getOutput()->setExtensionData( self::EXT_DATA_KEY, $savedRefs ); + $this->mParser->getOutput()->setExtensionData( self::EXT_DATA_KEY, $refData ); } /** @@ -1300,12 +1327,12 @@ return false; } - $parserCache = ParserCache::singleton(); + $parserCache = CiteParserCache::singleton(); $parserOptions = $page->makeParserOptions( 'canonical' ); - // get cached parser output, even if outdated - $parserOutput = $parserCache->get( $page, $parserOptions, true ); + // get cached parser output + $parserOutput = $parserCache->get( $page, $parserOptions ); if ( $parserOutput !== false ) { - return $parserOutput->getExtensionData( self::EXT_DATA_KEY ); + 
return $parserOutput->getRefData(); } if ( $wgCiteStoreReferencesDataInDB ) { @@ -1328,12 +1355,12 @@ return false; } - $parserCache = FRParserCacheStable::singleton(); + $parserCache = CiteFRParserCacheStable::singleton(); $parserOptions = $page->makeParserOptions( 'canonical' ); - // get cached parser output, even if outdated - $parserOutput = $parserCache->get( $page, $parserOptions, true ); + // get cached parser output + $parserOutput = $parserCache->get( $page, $parserOptions ); if ( $parserOutput !== false ) { - return $parserOutput->getExtensionData( self::EXT_DATA_KEY ); + return $parserOutput->getRefData(); } return false; } diff --git a/README.md b/README.md index e7d3384..edfd69e 100644 --- a/README.md +++ b/README.md @@ -11,3 +11,6 @@ other extensions can retrieve them independently of the main article content. * `$wgCiteStoreReferencesDataInDB`: (`$wgCiteStoreReferencesData` required) If set to true, in addition to storing references in the parser cache, they are stored in the page_props table. 
+* `$wgCiteParserCacheExpireTime`: (`$wgCiteStoreReferencesData` required) Duration in seconds +after which the cache of a reference accessed through the Reference Api that was not modified +in wikitext but may have templates included in it modified is set to expire diff --git a/extension.json b/extension.json index 4923ff8..461e167 100644 --- a/extension.json +++ b/extension.json @@ -18,6 +18,11 @@ "cite": "i18n", "ve-cite": "modules/ve-cite/i18n" }, + "APIModules": { + "reference": { + "class": "ApiReference" + } + }, "APIPropModules": { "references": { "class": "ApiQueryReferences" @@ -35,6 +40,9 @@ ], "LinksUpdate": [ "CiteHooks::onLinksUpdate" + ], + "ParserCacheBeforeSave": [ + "CiteHooks::onParserCacheBeforeSave" ] }, "ResourceModules": { @@ -167,14 +175,19 @@ "AllowCiteGroups": true, "CiteCacheReferences": false, "CiteStoreReferencesData": false, - "CiteStoreReferencesDataInDB": false + "CiteStoreReferencesDataInDB": false, + "CiteParserCacheExpireTime": 86400 }, "AutoloadClasses": { + "ApiReference": "ApiReference.php", "ApiQueryReferences": "ApiQueryReferences.php", "Cite": "Cite_body.php", "CiteHooks": "CiteHooks.php", "CiteDataModule": "CiteDataModule.php", - "CiteCSSFileModule": "CiteCSSFileModule.php" + "CiteCSSFileModule": "CiteCSSFileModule.php", + "CiteParserOutput": "CiteParserOutput.php", + "CiteParserCache": "CiteParserCache.php", + "CiteFRParserCacheStable": "CiteParserCache.php" }, "ParserTestFiles": [ "citeParserTests.txt" -- To view, visit https://gerrit.wikimedia.org/r/278703 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Idcd8bcd2c59ab6b3f51409e7e8c2b2074d7c15c0 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Cite Gerrit-Branch: master Gerrit-Owner: Cenarium <cenarium.sy...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits