Smalyshev has uploaded a new change for review. https://gerrit.wikimedia.org/r/305575
Change subject: [WIP] Implement page props in RDF ...................................................................... [WIP] Implement page props in RDF Change-Id: I95868d6ae75c4ebf98ff414200fcdcc2155488f1 Bug: T129046 --- M repo/config/Wikibase.default.php M repo/includes/Dumpers/DumpGenerator.php M repo/includes/Dumpers/RdfDumpGenerator.php M repo/includes/LinkedData/EntityDataSerializationService.php M repo/includes/Rdf/RdfBuilder.php M repo/includes/Rdf/RdfProducer.php M repo/includes/Rdf/RdfVocabulary.php M repo/includes/WikibaseRepo.php M repo/maintenance/dumpRdf.php M repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php M repo/tests/phpunit/includes/Rdf/RdfBuilderTest.php 11 files changed, 211 insertions(+), 45 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/75/305575/1 diff --git a/repo/config/Wikibase.default.php b/repo/config/Wikibase.default.php index df2305b..078c474 100644 --- a/repo/config/Wikibase.default.php +++ b/repo/config/Wikibase.default.php @@ -183,4 +183,10 @@ 'http://www.wikidata.org/entity/Q3359' => 'triton', 'http://www.wikidata.org/entity/Q339' => 'pluto' ], + + // Map between page properties and Wikibase predicates + 'pagePropertiesRdf' => [ + 'wb-sitelinks' => 'sitelinks', + 'wb-claims' => 'claims' + ] ]; diff --git a/repo/includes/Dumpers/DumpGenerator.php b/repo/includes/Dumpers/DumpGenerator.php index 5a1a31d..65f6323 100644 --- a/repo/includes/Dumpers/DumpGenerator.php +++ b/repo/includes/Dumpers/DumpGenerator.php @@ -29,7 +29,7 @@ * @var int The max number of entities to process in a single batch. * Also controls the interval for progress reports. */ - private $batchSize = 100; + protected $batchSize = 100; /** * @var resource File handle for output @@ -214,6 +214,14 @@ } /** + * Do something before dumping a batch of entities + * @param EntityId[] $entities + */ + protected function preBatchDump( $entities ) { + $this->entityPrefetcher->prefetch( $entities ); + } + + /** * Do something before dumping entity * * @param int $dumpCount @@ -273,7 +281,8 @@ $toLoad[] = $entityId; } } - $this->entityPrefetcher->prefetch( $toLoad ); + + $this->preBatchDump( $toLoad ); foreach ( $toLoad as $entityId ) { try { diff --git a/repo/includes/Dumpers/RdfDumpGenerator.php b/repo/includes/Dumpers/RdfDumpGenerator.php index 673db5f..7bab7c3 100644 --- a/repo/includes/Dumpers/RdfDumpGenerator.php +++ b/repo/includes/Dumpers/RdfDumpGenerator.php @@ -11,6 +11,7 @@ use Wikibase\DataModel\Services\Entity\EntityPrefetcher; use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; use Wikibase\Lib\Store\EntityRevisionLookup; +use Wikibase\Lib\Store\EntityTitleLookup; use Wikibase\Lib\Store\StorageException; use Wikibase\DataModel\Services\Lookup\RedirectResolvingEntityLookup; use Wikibase\Lib\Store\RevisionedUnresolvedRedirectException; @@ -49,14 +50,25 @@ private $timestamp; /** - * @param resource $out - * @param EntityRevisionLookup $lookup Must not resolve redirects - * @param RdfBuilder $rdfBuilder - * @param EntityPrefetcher $entityPrefetcher - * - * @throws InvalidArgumentException + * @var \PageProps */ - public function __construct( $out, EntityRevisionLookup $lookup, RdfBuilder $rdfBuilder, EntityPrefetcher $entityPrefetcher ) { + private $pageProps; + + /** + * @var EntityTitleLookup + */ + private $titleLookup; + + /** + * @param resource $out + * @param EntityRevisionLookup $lookup Must not resolve redirects + * @param RdfBuilder $rdfBuilder + * @param EntityPrefetcher $entityPrefetcher + * @param EntityTitleLookup $titleLookup + */ + public function __construct( $out, EntityRevisionLookup $lookup, RdfBuilder $rdfBuilder, + EntityPrefetcher $entityPrefetcher, + EntityTitleLookup $titleLookup ) { parent::__construct( $out, $entityPrefetcher ); if ( $lookup instanceof RedirectResolvingEntityLookup ) { throw new InvalidArgumentException( '$lookup must not resolve redirects!' ); @@ -64,12 +76,17 @@ $this->rdfBuilder = $rdfBuilder; $this->entityRevisionLookup = $lookup; + $this->titleLookup = $titleLookup; } /** * Do something before dumping data */ protected function preDump() { + $this->pageProps = \PageProps::getInstance(); + // TODO: $this->pageProps->ensureCacheSize( $this->batchSize ); + $this->rdfBuilder->setPageProps( $this->pageProps ); + $this->rdfBuilder->startDocument(); $this->rdfBuilder->addDumpHeader( $this->timestamp ); @@ -85,6 +102,18 @@ $footer = $this->rdfBuilder->getRDF(); $this->writeToDump( $footer ); + } + + /** + * Do something before dumping a batch of entities + * @param EntityId[] $entities + */ + protected function preBatchDump( $entities ) { + parent::preBatchDump( $entities ); + $titles = array_map( [ $this->titleLookup, 'getTitleForId' ], $entities ); + $props = array_keys( $this->rdfBuilder->getPageProperties() ); + // Prefetch page props + $this->pageProps->getProperties( $titles, $props ); } /** @@ -109,6 +138,8 @@ $entityRevision->getRevisionId(), $entityRevision->getTimestamp() ); + + $this->rdfBuilder->addEntityPageProps( $entityRevision->getEntity()->getId() ); $this->rdfBuilder->addEntity( $entityRevision->getEntity() @@ -154,16 +185,16 @@ } /** - * @param string $format - * @param resource $output - * @param SiteList $sites - * @param EntityRevisionLookup $entityRevisionLookup - * @param PropertyDataTypeLookup $propertyLookup + * @param string $format + * @param resource $output + * @param SiteList $sites + * @param EntityRevisionLookup $entityRevisionLookup + * @param PropertyDataTypeLookup $propertyLookup * @param ValueSnakRdfBuilderFactory $valueSnakRdfBuilderFactory - * @param EntityPrefetcher $entityPrefetcher - * @param RdfVocabulary $vocabulary - * - * @return self + * @param EntityPrefetcher $entityPrefetcher + * @param RdfVocabulary $vocabulary + * @param EntityTitleLookup $titleLookup + * @return static * @throws MWException */ public static function createDumpGenerator( @@ -174,7 +205,8 @@ PropertyDataTypeLookup $propertyLookup, ValueSnakRdfBuilderFactory $valueSnakRdfBuilderFactory, EntityPrefetcher $entityPrefetcher, - RdfVocabulary $vocabulary + RdfVocabulary $vocabulary, + EntityTitleLookup $titleLookup ) { $rdfWriter = self::getRdfWriter( $format ); if ( !$rdfWriter ) { @@ -183,7 +215,7 @@ $flavor = RdfProducer::PRODUCE_ALL_STATEMENTS | RdfProducer::PRODUCE_TRUTHY_STATEMENTS | RdfProducer::PRODUCE_QUALIFIERS | RdfProducer::PRODUCE_REFERENCES | - RdfProducer::PRODUCE_SITELINKS | RdfProducer::PRODUCE_FULL_VALUES; + RdfProducer::PRODUCE_SITELINKS | RdfProducer::PRODUCE_FULL_VALUES | RdfProducer::PRODUCE_PAGE_PROPS; $rdfBuilder = new RdfBuilder( $sites, @@ -192,10 +224,11 @@ $propertyLookup, $flavor, $rdfWriter, - new HashDedupeBag() + new HashDedupeBag(), + $titleLookup ); - return new self( $output, $entityRevisionLookup, $rdfBuilder, $entityPrefetcher ); + return new self( $output, $entityRevisionLookup, $rdfBuilder, $entityPrefetcher, $titleLookup ); } } diff --git a/repo/includes/LinkedData/EntityDataSerializationService.php b/repo/includes/LinkedData/EntityDataSerializationService.php index 00c405b..ffee62f 100644 --- a/repo/includes/LinkedData/EntityDataSerializationService.php +++ b/repo/includes/LinkedData/EntityDataSerializationService.php @@ -222,6 +222,8 @@ $entityRevision->getTimestamp() ); + $rdfBuilder->addEntityPageProps( $entityRevision->getEntity()->getId() ); + $rdfBuilder->addEntity( $entityRevision->getEntity() ); $rdfBuilder->resolveMentionedEntities( $this->entityLookup ); } @@ -323,6 +325,7 @@ | RdfProducer::PRODUCE_REFERENCES | RdfProducer::PRODUCE_SITELINKS | RdfProducer::PRODUCE_FULL_VALUES + | RdfProducer::PRODUCE_PAGE_PROPS | RdfProducer::PRODUCE_VERSION_INFO; case 'long': return RdfProducer::PRODUCE_ALL_STATEMENTS @@ -363,9 +366,12 @@ $this->propertyLookup, $this->getFlavor( $flavorName ), $rdfWriter, - new HashDedupeBag() + new HashDedupeBag(), + $this->entityTitleLookup ); + $rdfBuilder->setPageProps( \PageProps::getInstance() ); + return $rdfBuilder; } diff --git a/repo/includes/Rdf/RdfBuilder.php b/repo/includes/Rdf/RdfBuilder.php index 850a9b1..20f7ffd 100644 --- a/repo/includes/Rdf/RdfBuilder.php +++ b/repo/includes/Rdf/RdfBuilder.php @@ -2,6 +2,7 @@ namespace Wikibase\Rdf; +use PageProps; use SiteList; use Wikibase\DataModel\Entity\EntityDocument; use Wikibase\DataModel\Entity\EntityId; @@ -10,6 +11,7 @@ use Wikibase\DataModel\Services\Lookup\EntityLookup; use Wikibase\DataModel\Services\Lookup\PropertyDataTypeLookup; use Wikibase\DataModel\Term\FingerprintProvider; +use Wikibase\Lib\Store\EntityTitleLookup; use Wikibase\Lib\Store\RevisionedUnresolvedRedirectException; use Wikimedia\Purtle\RdfWriter; @@ -82,13 +84,24 @@ private $valueSnakRdfBuilderFactory; /** - * @param SiteList $sites - * @param RdfVocabulary $vocabulary + * @var EntityTitleLookup + */ + private $titleLookup; + + /** + * @var PageProps + */ + private $pageProps; + + /** + * @param SiteList $sites + * @param RdfVocabulary $vocabulary * @param ValueSnakRdfBuilderFactory $valueSnakRdfBuilderFactory - * @param PropertyDataTypeLookup $propertyLookup - * @param int $flavor - * @param RdfWriter $writer - * @param DedupeBag $dedupeBag + * @param PropertyDataTypeLookup $propertyLookup + * @param int $flavor + * @param RdfWriter $writer + * @param DedupeBag $dedupeBag + * @param EntityTitleLookup $titleLookup */ public function __construct( SiteList $sites, @@ -97,7 +110,8 @@ PropertyDataTypeLookup $propertyLookup, $flavor, RdfWriter $writer, - DedupeBag $dedupeBag + DedupeBag $dedupeBag, + EntityTitleLookup $titleLookup ) { $this->vocabulary = $vocabulary; $this->propertyLookup = $propertyLookup; @@ -105,6 +119,7 @@ $this->writer = $writer; $this->produceWhat = $flavor; $this->dedupeBag = $dedupeBag ?: new HashDedupeBag(); + $this->titleLookup = $titleLookup; // XXX: move construction of sub-builders to a factory class. $this->termsBuilder = new TermsRdfBuilder( $vocabulary, $writer ); @@ -221,6 +236,15 @@ } /** + * Get map of page properties used by this builder + * + * @return string[] + */ + public function getPageProperties() { + return $this->vocabulary->getPageProperties(); + } + + /** * Should we produce this aspect? * * @param int $what @@ -307,6 +331,48 @@ } /** + * Set page props handler + * @param PageProps $pageProps + * @return $this + */ + public function setPageProps( PageProps $pageProps ) { + $this->pageProps = $pageProps; + return $this; + } + + /** + * Add page props information + * @param EntityId $entityId + */ + public function addEntityPageProps( EntityId $entityId ) { + if ( !$this->pageProps || !$this->shouldProduce( RdfProducer::PRODUCE_PAGE_PROPS ) ) { + return; + } + $title = $this->titleLookup->getTitleForId( $entityId ); + $props = $this->getPageProperties(); + if ( !$title || !$props ) { + return; + } + $propValues = $this->pageProps->getProperties( $title, array_keys( $props ) ); + if ( !$propValues ) { + return; + } + reset( $propValues ); + $entityProps = current( $propValues ); + if ( !$entityProps ) { + return; + } + foreach ( $entityProps as $name => $value ) { + if ( !isset( $props[$name] ) ) { + continue; + } + $this->writer->about( RdfVocabulary::NS_DATA, $entityId ) + ->say( RdfVocabulary::NS_ONTOLOGY, $props[$name] ) + ->value( $value ); + } + } + + /** * Write definition for wdno:P123 class to use as novalue * @param string $id */ diff --git a/repo/includes/Rdf/RdfProducer.php b/repo/includes/Rdf/RdfProducer.php index ba954ef..3449fe8 100644 --- a/repo/includes/Rdf/RdfProducer.php +++ b/repo/includes/Rdf/RdfProducer.php @@ -61,6 +61,11 @@ const PRODUCE_RESOLVED_ENTITIES = 256; /** + * Produce page properties + */ + const PRODUCE_PAGE_PROPS = 1024; + + /** * All options turned on. */ const PRODUCE_ALL = 0xFFFF; diff --git a/repo/includes/Rdf/RdfVocabulary.php b/repo/includes/Rdf/RdfVocabulary.php index 44adf71..ec735b1 100644 --- a/repo/includes/Rdf/RdfVocabulary.php +++ b/repo/includes/Rdf/RdfVocabulary.php @@ -115,22 +115,31 @@ private static $canonicalLanguageCodeCache = array(); /** - * @param string $baseUri Base URI for entity concept URIs. - * @param string $dataUri Base URI for entity description URIs. + * @var string[] + */ + private $pageProps; + + /** + * @param string $baseUri Base URI for entity concept URIs. + * @param string $dataUri Base URI for entity description URIs. * @param string[] $canonicalLanguageCodes Mapping of non-standard to canonical language codes. * @param string[] $dataTypeUris Mapping of property data type IDs to their URIs, * if different from the default mapping. + * @param string[] $pagePropertyDefs Mapping of page props: pageProp => wikibase predicate + * All predicates will be prefixed with wikibase: */ public function __construct( $baseUri, $dataUri, array $canonicalLanguageCodes = array(), - array $dataTypeUris = array() + array $dataTypeUris = array(), + array $pagePropertyDefs = array() ) { $this->baseUri = $baseUri; $this->dataUri = $dataUri; $this->canonicalLanguageCodes = $canonicalLanguageCodes; $this->dataTypeUris = $dataTypeUris; + $this->pageProps = $pagePropertyDefs; if ( substr( $this->baseUri, -7 ) === 'entity/' ) { $topUri = substr( $this->baseUri, 0, -7 ); @@ -300,4 +309,12 @@ return self::ONTOLOGY_BASE_URI . "-" . self::ONTOLOGY_VERSION . ".owl"; } + /** + * Get the map of configured page properties + * @return array + */ + public function getPageProperties() { + return$this->pageProps; + } + } diff --git a/repo/includes/WikibaseRepo.php b/repo/includes/WikibaseRepo.php index 98888c1..46c875a 100644 --- a/repo/includes/WikibaseRepo.php +++ b/repo/includes/WikibaseRepo.php @@ -1006,7 +1006,8 @@ $this->getVocabularyBaseUri(), $entityDataTitle->getCanonicalURL() . '/', $languageCodes, - $this->dataTypeDefinitions->getRdfTypeUris() + $this->dataTypeDefinitions->getRdfTypeUris(), + $this->settings->getSetting( 'pagePropertiesRdf' ) ?: [] ); } diff --git a/repo/maintenance/dumpRdf.php b/repo/maintenance/dumpRdf.php index 6213cbb..c451bcd 100644 --- a/repo/maintenance/dumpRdf.php +++ b/repo/maintenance/dumpRdf.php @@ -8,6 +8,7 @@ use Wikibase\Dumpers\DumpGenerator; use Wikibase\Dumpers\RdfDumpGenerator; use Wikibase\Lib\Store\EntityRevisionLookup; +use Wikibase\Lib\Store\EntityTitleLookup; use Wikibase\Rdf\RdfVocabulary; use Wikibase\Rdf\ValueSnakRdfBuilderFactory; use Wikibase\Repo\Store\EntityPerPage; @@ -59,19 +60,25 @@ */ private $hasHadServicesSet = false; + /** + * @var EntityTitleLookup + */ + private $titleLookup; + public function __construct() { parent::__construct(); $this->addOption( 'format', "Set the dump format.", false, true ); } /** - * @param EntityPerPage $entityPerPage - * @param EntityPrefetcher $entityPrefetcher - * @param SiteStore $siteStore - * @param PropertyDataTypeLookup $propertyDataTypeLookup + * @param EntityPerPage $entityPerPage + * @param EntityPrefetcher $entityPrefetcher + * @param SiteStore $siteStore + * @param PropertyDataTypeLookup $propertyDataTypeLookup * @param ValueSnakRdfBuilderFactory $valueSnakRdfBuilderFactory - * @param EntityRevisionLookup $entityRevisionLookup - * @param RdfVocabulary $rdfVocabulary + * @param EntityRevisionLookup $entityRevisionLookup + * @param RdfVocabulary $rdfVocabulary + * @param EntityTitleLookup $titleLookup */ public function setServices( EntityPerPage $entityPerPage, @@ -80,7 +87,8 @@ PropertyDataTypeLookup $propertyDataTypeLookup, ValueSnakRdfBuilderFactory $valueSnakRdfBuilderFactory, EntityRevisionLookup $entityRevisionLookup, - RdfVocabulary $rdfVocabulary + RdfVocabulary $rdfVocabulary, + EntityTitleLookup $titleLookup ) { parent::setDumpEntitiesServices( $entityPerPage ); $this->entityPrefetcher = $entityPrefetcher; @@ -89,6 +97,7 @@ $this->valueSnakRdfBuilderFactory = $valueSnakRdfBuilderFactory; $this->revisionLookup = $entityRevisionLookup; $this->rdfVocabulary = $rdfVocabulary; + $this->titleLookup = $titleLookup; $this->hasHadServicesSet = true; } @@ -102,7 +111,8 @@ $wikibaseRepo->getPropertyDataTypeLookup(), $wikibaseRepo->getValueSnakRdfBuilderFactory(), $wikibaseRepo->getEntityRevisionLookup( 'uncached' ), - $wikibaseRepo->getRdfVocabulary() + $wikibaseRepo->getRdfVocabulary(), + $wikibaseRepo->getEntityContentFactory() ); } parent::execute(); @@ -133,7 +143,8 @@ $this->propertyDatatypeLookup, $this->valueSnakRdfBuilderFactory, $this->entityPrefetcher, - $this->rdfVocabulary + $this->rdfVocabulary, + $this->titleLookup ); } diff --git a/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php b/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php index 9fa1237..f66f4ad 100644 --- a/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php +++ b/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php @@ -15,10 +15,12 @@ use Wikibase\Dumpers\RdfDumpGenerator; use Wikibase\EntityRevision; use Wikibase\Lib\Store\EntityRevisionLookup; +use Wikibase\Lib\Store\EntityTitleLookup; use Wikibase\Lib\Store\RevisionedUnresolvedRedirectException; use Wikibase\Rdf\RdfVocabulary; use Wikibase\Repo\Tests\Rdf\NTriplesRdfTestHelper; use Wikibase\Repo\WikibaseRepo; +use Wikibase\Store\EntityIdLookup; use Wikibase\Test\Rdf\RdfBuilderTest; use Wikibase\Test\Rdf\RdfBuilderTestData; @@ -131,6 +133,10 @@ // Note: we test against the actual RDF bindings here, so we get actual RDF. $rdfBuilderFactory = WikibaseRepo::getDefaultInstance()->getValueSnakRdfBuilderFactory(); + // Really simple mock that knows no titles + $titleLookup = $this->getMock( EntityTitleLookup::class ); + $titleLookup->method( 'getTitleForId' )->willReturn( null ); + return RdfDumpGenerator::createDumpGenerator( 'ntriples', $out, @@ -143,7 +149,8 @@ self::URI_BASE, self::URI_DATA, array( 'test' => 'en-x-test' ) - ) + ), + $titleLookup ); } diff --git a/repo/tests/phpunit/includes/Rdf/RdfBuilderTest.php b/repo/tests/phpunit/includes/Rdf/RdfBuilderTest.php index 4b6e27c..f4d1859 100644 --- a/repo/tests/phpunit/includes/Rdf/RdfBuilderTest.php +++ b/repo/tests/phpunit/includes/Rdf/RdfBuilderTest.php @@ -71,6 +71,10 @@ // Note: using the actual factory here makes this an integration test! $valueBuilderFactory = WikibaseRepo::getDefaultInstance()->getValueSnakRdfBuilderFactory(); + // Really simple mock that knows no titles + $titleLookup = $this->getMock( EntityTitleLookup::class ); + $titleLookup->method( 'getTitleForId' )->willReturn( null ); + $emitter = new NTriplesRdfWriter(); $builder = new RdfBuilder( $this->getTestData()->getSiteList(), @@ -79,7 +83,8 @@ $this->getTestData()->getMockRepository(), $produce, $emitter, - $dedup + $dedup, + $titleLookup ); $builder->startDocument(); -- To view, visit https://gerrit.wikimedia.org/r/305575 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I95868d6ae75c4ebf98ff414200fcdcc2155488f1 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits