Aude has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/248061

Change subject: Script for populating geodata and pageimages
......................................................................

Script for populating geodata and pageimages

I am not sure refreshLinks is going to work
well for us; in case it is a problem,
I propose a script for our specific use case.

I would still like to add at least some basic tests,
if we want/need the script.

Also, the LinksUpdate code in core could be refactored
somewhat to make bits of the code more reusable, such
as for updating page props.

Bug: T114868
Change-Id: I39a439df9cb34c549d90a7ad288f03f7352bbb74
---
A repo/maintenance/rebuildGeoData.php
1 file changed, 244 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/61/248061/3

diff --git a/repo/maintenance/rebuildGeoData.php 
b/repo/maintenance/rebuildGeoData.php
new file mode 100644
index 0000000..48b440c
--- /dev/null
+++ b/repo/maintenance/rebuildGeoData.php
@@ -0,0 +1,244 @@
+<?php
+
+namespace Wikibase\Repo\Maintenance;
+
+use DeferredUpdates;
+use GeoDataHooks;
+use LinksUpdate;
+use Maintenance;
+use MWException;
+use Page;
+use ParserOutput;
+use SearchUpdate;
+use Title;
+use WikiPage;
+use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookupException;
+use Wikibase\DataModel\Statement\StatementList;
+use Wikibase\DataModel\Statement\StatementListProvider;
+use Wikibase\EntityContent;
+use Wikibase\Repo\Store\EntityIdPager;
+use Wikibase\Repo\Store\EntityPerPage;
+use Wikibase\Repo\Store\SQL\EntityPerPageIdPager;
+use Wikibase\Repo\WikibaseRepo;
+
+// Use MW_INSTALL_PATH when it is set in the environment; otherwise fall back
+// to the directory four levels above this script.
+$basePath = getenv( 'MW_INSTALL_PATH' ) === false
+       ? __DIR__ . '/../../../..'
+       : getenv( 'MW_INSTALL_PATH' );
+
+require_once $basePath . '/maintenance/Maintenance.php';
+
+/**
+ * Maintenance script for updating GeoData and PageImages page prop.
+ *
+ * @since 0.5
+ *
+ * @licence GNU GPL v2+
+ * @author Katie Filbert < aude.w...@gmail.com >
+ */
+class RebuildGeoData extends Maintenance {
+
+       private $entityPerPage;
+
+       private $entityTitleLookup;
+
+       private $propertyDataTypeLookup;
+
+       /**
+        * Sets the script description and registers the command line options
+        * 'batch-size' and 'limit'.
+        */
+       public function __construct() {
+               parent::__construct();
+
+               $this->mDescription = 'Updates GeoData and PageImages page prop.';
+
+               $optionDescriptions = array(
+                       'batch-size' => 'Number of entities to process per batch',
+                       'limit' => 'Maximum number of entities processed',
+               );
+
+               foreach ( $optionDescriptions as $name => $description ) {
+                       // Same flags for both options: not required, takes an argument.
+                       $this->addOption( $name, $description, false, true );
+               }
+       }
+
+       /**
+        * Entry point: checks the preconditions, then processes the entities.
+        *
+        * Reads the options 'batch-size' (default 100) and 'limit' (default 0)
+        * and passes them on to processEntities().
+        */
+       public function execute() {
+               if ( !class_exists( 'GeoDataHooks' ) ) {
+                       $this->error( 'GeoData extension must be enabled for this script to run.', 1 );
+               }
+
+               $batchSize = (int)$this->getOption( 'batch-size', 100 );
+               $limit = (int)$this->getOption( 'limit', 0 );
+
+               // A non-numeric value casts to 0. Reject that (and negative values) up
+               // front instead of asking the pager for batches of no ids.
+               if ( $batchSize < 1 ) {
+                       $this->error( 'batch-size must be a positive integer.', 1 );
+               }
+
+               $this->setServices();
+
+               $this->processEntities( $this->makeIdQueryStream(), $batchSize, $limit );
+
+               $this->output( "Done\n" );
+       }
+
+       /**
+        * Fetches the services used by this script from the default WikibaseRepo
+        * instance and stores them in the corresponding private fields.
+        */
+       private function setServices() {
+               $repo = WikibaseRepo::getDefaultInstance();
+               $store = $repo->getStore();
+
+               $this->entityPerPage = $store->newEntityPerPage();
+               $this->entityTitleLookup = $repo->getEntityTitleLookup();
+               $this->propertyDataTypeLookup = $repo->getPropertyDataTypeLookup();
+       }
+
+       /**
+        * Pulls entity ids from the pager in batches and processes each one.
+        *
+        * @param EntityIdPager $idStream source of the entity ids
+        * @param int $batchSize number of ids requested from the pager per fetch
+        * @param int $limit maximum number of entities to process; values of 0 or
+        *  less disable the limit
+        */
+       private function processEntities( EntityIdPager $idStream, $batchSize, $limit ) {
+               $entityCount = 0;
+
+               // The loop ends as soon as the pager returns an empty (falsy) batch.
+               while ( $ids = $idStream->fetchIds( $batchSize ) ) {
+                       foreach ( $ids as $id ) {
+                               $entityCount++;
+
+                               $this->processEntityId( $id );
+
+                               if ( $limit > 0 && $entityCount >= $limit ) {
+                                       // Leave both loops but still reach the summary line below;
+                                       // a plain return here would skip it.
+                                       break 2;
+                               }
+                       }
+               }
+
+               $this->output( "Processed $entityCount entities\n" );
+       }
+
+       /**
+        * Processes a single entity: if its page content can be loaded and is
+        * relevant (see isRelevant()), updates geo data, page props and search.
+        *
+        * @param EntityId $entityId
+        */
+       private function processEntityId( EntityId $entityId ) {
+               $title = $this->entityTitleLookup->getTitleForId( $entityId );
+
+               if ( $title === null ) {
+                       // The title lookup may not find a page for the id; report it and
+                       // move on rather than calling a method on null.
+                       $this->error( 'Failed to find title for ' . $entityId->getSerialization() . ".\n" );
+                       return;
+               }
+
+               $titleText = $title->getPrefixedText();
+
+               $content = $this->loadContent( $title );
+
+               if ( !( $content instanceof EntityContent ) ) {
+                       // $content could not be loaded or accessed, or it is not entity
+                       // content (isRelevant() and updateSearch() require EntityContent)
+                       $this->error( "Failed to load page content for $titleText.\n" );
+                       return;
+               }
+
+               if ( !$this->isRelevant( $content, $title ) ) {
+                       return;
+               }
+
+               $this->output( "Processing $titleText\n" );
+
+               // skip generating html
+               $parserOutput = $content->getParserOutput( $title, null, null, false );
+               $linksUpdate = new LinksUpdate( $title, $parserOutput );
+
+               $this->updateGeoData( $linksUpdate );
+               $this->updatePageProps( $linksUpdate );
+
+               $this->updateSearch( $title, $content );
+       }
+
+       /**
+        * Passes the LinksUpdate to GeoDataHooks::onLinksUpdate().
+        *
+        * NOTE(review): presumably this makes GeoData store the coordinates found
+        * in the parser output -- confirm against the GeoData extension.
+        *
+        * @param LinksUpdate $linksUpdate
+        */
+       private function updateGeoData( LinksUpdate $linksUpdate ) {
+               GeoDataHooks::onLinksUpdate( $linksUpdate );
+       }
+
+       /**
+        * Asks the LinksUpdate for the property deletions and insertions relative
+        * to the page props currently stored for the page, and hands both to
+        * LinksUpdate::incrTableUpdate() for the page_props table.
+        *
+        * @param LinksUpdate $linksUpdate
+        */
+       private function updatePageProps( LinksUpdate $linksUpdate ) {
+               $currentProps = $this->getExistingProperties( $linksUpdate->mId );
+
+               $toDelete = $linksUpdate->getPropertyDeletions( $currentProps );
+               $toInsert = $linksUpdate->getPropertyInsertions( $currentProps );
+
+               $linksUpdate->incrTableUpdate( 'page_props', 'pp', $toDelete, $toInsert );
+       }
+
+       /**
+        * Reads the page_props rows stored for a page.
+        *
+        * @param int $pageId value matched against pp_page
+        *
+        * @return array Map of property name (pp_propname) to value (pp_value)
+        */
+       private function getExistingProperties( $pageId ) {
+               // Queried on the DB_MASTER connection.
+               $db = wfGetDB( DB_MASTER );
+
+               $rows = $db->select(
+                       'page_props',
+                       array( 'pp_propname', 'pp_value' ),
+                       array( 'pp_page' => $pageId ),
+                       __METHOD__
+               );
+
+               $properties = array();
+
+               foreach ( $rows as $row ) {
+                       $properties[$row->pp_propname] = $row->pp_value;
+               }
+
+               return $properties;
+       }
+
+       /**
+        * Adds a SearchUpdate for the page to the deferred updates and then
+        * calls DeferredUpdates::doUpdates() in 'enqueue' mode.
+        *
+        * @param Title $title
+        * @param EntityContent $content
+        */
+       private function updateSearch( Title $title, EntityContent $content ) {
+               $pageId = $title->getArticleID();
+               $searchUpdate = new SearchUpdate( $pageId, $title, $content );
+
+               DeferredUpdates::addUpdate( $searchUpdate );
+               DeferredUpdates::doUpdates( '', 'enqueue' );
+       }
+
+       /**
+        * Creates the WikiPage for the title and returns its content.
+        *
+        * @param Title $title
+        *
+        * @return mixed the result of WikiPage::getContent(), or null if creating
+        *  the WikiPage threw an MWException (an error is reported in that case)
+        */
+       private function loadContent( Title $title ) {
+               $page = null;
+
+               // Only the factory call is guarded; exceptions from getContent() propagate.
+               try {
+                       $page = WikiPage::factory( $title );
+               } catch ( MWException $ex ) {
+                       // $page does not exist or other error
+                       $this->error( 'Page not found for ' . $title->getPrefixedText() . "\n" );
+               }
+
+               if ( $page === null ) {
+                       return null;
+               }
+
+               return $page->getContent();
+       }
+
+       /**
+        * Decides whether the entity held by the content should be processed:
+        * it must be loadable, be a StatementListProvider, have statements, and
+        * pass hasRelevantProperty().
+        *
+        * @param EntityContent $content
+        * @param Title $title only used in the error message
+        *
+        * @return bool
+        */
+       private function isRelevant( EntityContent $content, Title $title ) {
+               try {
+                       $entity = $content->getEntity();
+               } catch ( MWException $ex ) {
+                       // Normally happens if the EntityContent is a redirect. Redirects are
+                       // filtered out when generating the EntityIdPager, so this should not
+                       // happen.
+                       $this->error( 'Failed to load entity for ' . $title->getPrefixedText() . "\n" );
+                       return false;
+               }
+
+               if ( !( $entity instanceof StatementListProvider ) ) {
+                       $this->error( "Entity is not a StatementListProvider\n" );
+                       return false;
+               }
+
+               $statements = $entity->getStatements();
+
+               // An empty statement list is never relevant; otherwise check the properties.
+               return !$statements->isEmpty() && $this->hasRelevantProperty( $statements );
+       }
+
+       /**
+        * Checks whether any property used in the statements has the data type
+        * 'commonsMedia' or 'globe-coordinate'.
+        *
+        * @param StatementList $statements
+        *
+        * @return bool
+        */
+       private function hasRelevantProperty( StatementList $statements ) {
+               $relevantDataTypes = array( 'commonsMedia', 'globe-coordinate' );
+
+               foreach ( $statements->getPropertyIds() as $propertyId ) {
+                       try {
+                               $dataType = $this->propertyDataTypeLookup->getDataTypeIdForProperty( $propertyId );
+                       } catch ( PropertyDataTypeLookupException $ex ) {
+                               // Property not found: skip it. Falling through would compare an
+                               // undefined $dataType, or the one left over from the previous
+                               // property, which could wrongly report a match.
+                               continue;
+                       }
+
+                       if ( in_array( $dataType, $relevantDataTypes, true ) ) {
+                               return true;
+                       }
+               }
+
+               return false;
+       }
+
+       /**
+        * Builds the pager that supplies the entity ids to process. It is created
+        * with no entity type restriction and with EntityPerPage::NO_REDIRECTS.
+        *
+        * @return EntityIdPager
+        */
+       private function makeIdQueryStream() {
+               // any entity type, @todo could specify StatementListProvider types
+               $entityType = null;
+
+               return new EntityPerPageIdPager(
+                       $this->entityPerPage,
+                       $entityType,
+                       EntityPerPage::NO_REDIRECTS
+               );
+       }
+
+}
+
+// Fully qualified name of the maintenance class defined in this file.
+// NOTE(review): presumably picked up by the file included below to run the
+// script when this file is the entry point -- confirm against Maintenance.php.
+$maintClass = 'Wikibase\Repo\Maintenance\RebuildGeoData';
+require_once RUN_MAINTENANCE_IF_MAIN;

-- 
To view, visit https://gerrit.wikimedia.org/r/248061
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I39a439df9cb34c549d90a7ad288f03f7352bbb74
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Aude <aude.w...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to