Aude has uploaded a new change for review. https://gerrit.wikimedia.org/r/248061
Change subject: Script for populating geodata and pageimages ...................................................................... Script for populating geodata and pageimages I am not sure refreshLinks is going to work so well for us, and in case it is a problem, I propose a script for our specific use case. I would still like to add at least some basic tests, if we want/need the script. Also, the LinksUpdate code in core could be refactored somewhat to make bits of the code more re-usable, such as for updating page props. Bug: T114868 Change-Id: I39a439df9cb34c549d90a7ad288f03f7352bbb74 --- A repo/maintenance/rebuildGeoData.php 1 file changed, 244 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/61/248061/3 diff --git a/repo/maintenance/rebuildGeoData.php b/repo/maintenance/rebuildGeoData.php new file mode 100644 index 0000000..48b440c --- /dev/null +++ b/repo/maintenance/rebuildGeoData.php @@ -0,0 +1,244 @@ +<?php + +namespace Wikibase\Repo\Maintenance; + +use DeferredUpdates; +use GeoDataHooks; +use LinksUpdate; +use Maintenance; +use MWException; +use Page; +use ParserOutput; +use SearchUpdate; +use Title; +use WikiPage; +use Wikibase\DataModel\Entity\EntityId; +use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookupException; +use Wikibase\DataModel\Statement\StatementList; +use Wikibase\DataModel\Statement\StatementListProvider; +use Wikibase\EntityContent; +use Wikibase\Repo\Store\EntityIdPager; +use Wikibase\Repo\Store\EntityPerPage; +use Wikibase\Repo\Store\SQL\EntityPerPageIdPager; +use Wikibase\Repo\WikibaseRepo; + +$basePath = getenv( 'MW_INSTALL_PATH' ) !== false + ? getenv( 'MW_INSTALL_PATH' ) : __DIR__ . '/../../../..'; + +require_once $basePath . '/maintenance/Maintenance.php'; + +/** + * Maintenance script for updating GeoData and PageImages page prop. 
+ * + * @since 0.5 + * + * @licence GNU GPL v2+ + * @author Katie Filbert < aude.w...@gmail.com > + */ +class RebuildGeoData extends Maintenance { + + private $entityPerPage; + + private $entityTitleLookup; + + private $propertyDataTypeLookup; + + public function __construct() { + parent::__construct(); + + $this->mDescription = 'Updates GeoData and PageImages page prop.'; + + $this->addOption( 'batch-size', "Number of entities to process per batch", false, true ); + $this->addOption( 'limit', "Maximum number of entities processed", false, true ); + } + + /** + * Do the actual work. + */ + public function execute() { + if ( !class_exists( 'GeoDataHooks' ) ) { + $this->error( 'GeoData extension must be enabled for this script to run.', 1 ); + } + + $this->setServices(); + + $this->processEntities( + $this->makeIdQueryStream(), + (int)$this->getOption( 'batch-size', 100 ), + (int)$this->getOption( 'limit', 0 ) + ); + + $this->output( "Done\n" ); + } + + private function setServices() { + $wikibaseRepo = WikibaseRepo::getDefaultInstance(); + + $this->entityPerPage = $wikibaseRepo->getStore()->newEntityPerPage(); + $this->entityTitleLookup = $wikibaseRepo->getEntityTitleLookup(); + $this->propertyDataTypeLookup = $wikibaseRepo->getPropertyDataTypeLookup(); + } + + private function processEntities( EntityIdPager $idStream, $batchSize, $limit ) { + $entityCount = 0; + + while ( $ids = $idStream->fetchIds( $batchSize ) ) { + foreach ( $ids as $id ) { + $entityCount++; + + $this->processEntityId( $id ); + + if ( $limit > 0 && $entityCount >= $limit ) { + return; + } + } + } + + $this->output( "Processed $entityCount entities\n" ); + } + + private function processEntityId( EntityId $entityId ) { + $title = $this->entityTitleLookup->getTitleForId( $entityId ); + $titleText = $title->getPrefixedText(); + + $content = $this->loadContent( $title ); + + if ( $content === null ) { + // $content could not be loaded or accessed + $this->error( "Failed to load page content for 
$titleText.\n" ); + return; + } + + if ( $this->isRelevant( $content, $title ) ) { + $this->output( "Processing $titleText\n" ); + + // skip generating html + $parserOutput = $content->getParserOutput( $title, null, null, false ); + $linksUpdate = new LinksUpdate( $title, $parserOutput ); + + $this->updateGeoData( $linksUpdate ); + $this->updatePageProps( $linksUpdate ); + + $this->updateSearch( $title, $content ); + } + } + + private function updateGeoData( LinksUpdate $linksUpdate ) { + GeoDataHooks::onLinksUpdate( $linksUpdate ); + } + + private function updatePageProps( LinksUpdate $linksUpdate ) { + $existing = $this->getExistingProperties( $linksUpdate->mId ); + + $propertiesDeletes = $linksUpdate->getPropertyDeletions( $existing ); + + $linksUpdate->incrTableUpdate( 'page_props', 'pp', $propertiesDeletes, + $linksUpdate->getPropertyInsertions( $existing ) ); + } + + /** + * @return array Array of property names and values + */ + private function getExistingProperties( $pageId ) { + $dbr = wfGetDB( DB_MASTER ); + + $res = $dbr->select( + 'page_props', + array( 'pp_propname', 'pp_value' ), + array( 'pp_page' => $pageId ), + __METHOD__, + array() + ); + + $arr = array(); + + foreach ( $res as $row ) { + $arr[$row->pp_propname] = $row->pp_value; + } + + return $arr; + } + + private function updateSearch( Title $title, EntityContent $content ) { + DeferredUpdates::addUpdate( + new SearchUpdate( + $title->getArticleID(), + $title, + $content + ) + ); + + DeferredUpdates::doUpdates( '', 'enqueue' ); + } + + private function loadContent( Title $title ) { + try { + $page = WikiPage::factory( $title ); + } catch ( MWException $ex ) { + // $page does not exist or other error + $this->error( "Page not found for " . $title->getPrefixedText() . 
"\n" ); + return; + } + + return $page->getContent(); + } + + private function isRelevant( EntityContent $content, Title $title ) { + try { + $entity = $content->getEntity(); + } catch ( MWException $ex ) { + // normally happens if EntityContent is a redirect, though we filter these + // out when generating the EntityIdPager so shouldn't happen. + $this->error( 'Failed to load entity for ' . $title->getPrefixedText() . "\n" ); + return false; + } + + if ( !$entity instanceof StatementListProvider ) { + $this->error( "Entity is not a StatementListProvider\n" ); + return false; + } + + $statements = $entity->getStatements(); + + if ( $statements->isEmpty() ) { + return false; + } + + return $this->hasRelevantProperty( $statements ); + } + + private function hasRelevantProperty( StatementList $statements ) { + $propertyIds = $statements->getPropertyIds(); + + foreach ( $propertyIds as $propertyId ) { + try { + $dataType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $propertyId ); + } catch ( PropertyDataTypeLookupException $ex ) { + // property not found, skip + } + + if ( $dataType === 'commonsMedia' || $dataType === 'globe-coordinate' ) { + return true; + } + } + + return false; + } + + /** + * @param string|null $entityType + * + * @return EntityIdPager + */ + private function makeIdQueryStream() { + return new EntityPerPageIdPager( + $this->entityPerPage, + null, // any entity type, @todo could specify StatementListProvider types + EntityPerPage::NO_REDIRECTS + ); + } + +} + +$maintClass = 'Wikibase\Repo\Maintenance\RebuildGeoData'; +require_once RUN_MAINTENANCE_IF_MAIN; -- To view, visit https://gerrit.wikimedia.org/r/248061 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I39a439df9cb34c549d90a7ad288f03f7352bbb74 Gerrit-PatchSet: 3 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Aude <aude.w...@gmail.com> _______________________________________________ 
MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits