jenkins-bot has submitted this change and it was merged.
Change subject: Add reference/value deduplication
......................................................................
Add reference/value deduplication
Task: T92586
Change-Id: I7f2c882764bcf2bdbdd8409acd4e798ff012005e
---
M repo/includes/Dumpers/RdfDumpGenerator.php
M repo/includes/rdf/RdfBuilder.php
M repo/includes/rdf/RdfSerializer.php
A repo/tests/phpunit/data/rdf/Q7_Q9_dedup.nt
A repo/tests/phpunit/data/rdf/Q9.json
A repo/tests/phpunit/data/rdf/dump_refs.nt
M repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php
M repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
8 files changed, 603 insertions(+), 57 deletions(-)
Approvals:
Smalyshev: Looks good to me, approved
Daniel Kinzler: Looks good to me, approved
jenkins-bot: Verified
diff --git a/repo/includes/Dumpers/RdfDumpGenerator.php
b/repo/includes/Dumpers/RdfDumpGenerator.php
index 52f3143..212a362 100644
--- a/repo/includes/Dumpers/RdfDumpGenerator.php
+++ b/repo/includes/Dumpers/RdfDumpGenerator.php
@@ -161,7 +161,8 @@
$entityLookup,
RdfProducer::PRODUCE_ALL_STATEMENTS |
RdfProducer::PRODUCE_TRUTHY_STATEMENTS |
RdfProducer::PRODUCE_QUALIFIERS |
RdfProducer::PRODUCE_REFERENCES |
- RdfProducer::PRODUCE_SITELINKS |
RdfProducer::PRODUCE_FULL_VALUES
+ RdfProducer::PRODUCE_SITELINKS |
RdfProducer::PRODUCE_FULL_VALUES,
+ new \HashBagOStuff()
);
return new RdfDumpGenerator( $output, $entityRevisionLookup,
$entitySerializer );
}
diff --git a/repo/includes/rdf/RdfBuilder.php b/repo/includes/rdf/RdfBuilder.php
index bd4146f..41c9990 100644
--- a/repo/includes/rdf/RdfBuilder.php
+++ b/repo/includes/rdf/RdfBuilder.php
@@ -8,6 +8,7 @@
use EasyRdf_Namespace;
use EasyRdf_Resource;
use SiteList;
+use BagOStuff;
use Wikibase\DataModel\Entity\BasicEntityIdParser;
use Wikibase\DataModel\Entity\Entity;
use Wikibase\DataModel\Entity\EntityDocument;
@@ -125,21 +126,26 @@
private $produceWhat;
/**
+ * Hash to store seen references/values for deduplication
+ * @var BagOStuff
+ */
+ private $dedupBag;
+
+ /**
*
* @param SiteList $sites
* @param string $baseUri
* @param string $dataUri
* @param PropertyDataTypeLookup $propertyLookup
* @param integer $flavor
- * @param EasyRdf_Graph|null $graph
+ * @param BagOStuff|null $dedupBag Container used for deduplication of
refs/values
*/
public function __construct( SiteList $sites, $baseUri, $dataUri,
- PropertyDataTypeLookup $propertyLookup, $flavor,
EasyRdf_Graph $graph = null ) {
- if ( !$graph ) {
- $graph = new EasyRdf_Graph();
- }
-
- $this->graph = $graph;
+ PropertyDataTypeLookup $propertyLookup, $flavor,
+ BagOStuff $dedupBag = null
+ ) {
+ $this->graph = new EasyRdf_Graph();
+ $this->dedupBag = $dedupBag;
$this->sites = $sites;
$this->baseUri = $baseUri;
@@ -213,18 +219,6 @@
*/
private function getStatementQName( $prefix, Statement $statement ) {
return $prefix . ':' . preg_replace( '/[^\w-]/', '-',
$statement->getGuid() );
- }
-
- /**
- * Returns a qname for the given reference using the given prefix.
- *
- * @param string $prefix use a self::NS_* constant, usually
self::NS_REFERENCE
- * @param Reference $ref
- *
- * @return string
- */
- private function getReferenceQName( $prefix, Reference $ref ) {
- return $prefix . ':' . $ref->getHash();
}
/**
@@ -481,6 +475,24 @@
}
/**
+ * Checks whether $hash is already recorded in the dedup bag and records it if it is not; callers skip re-emitting data that was seen
+ * @param string $hash hash value to check; only its first 5 characters become part of the bag key, so hashes sharing that prefix overwrite each other
+ * @param string $namespace prefix prepended to the shortened hash to build the bag key
+ * @return boolean true only if the bag currently holds exactly $hash under that key; false otherwise, and always false when no bag was provided
+ */
+ private function alreadySeen( $hash, $namespace ) {
+ if( !$this->dedupBag ) {
+ return false; // no dedup bag available: report every hash as not yet seen
+ }
+ $key = $namespace . substr($hash, 0, 5); // key is namespace + first 5 hash characters, not the full hash
+ if( $this->dedupBag->get( $key ) !== $hash ) {
+ $this->dedupBag->set( $key, $hash ); // store the full hash, replacing whatever this key held before
+ return false;
+ }
+ return true;
+ }
+
+ /**
* Adds the given Statement from the given Entity to the RDF graph.
*
* @param EntityId $entityId
@@ -500,8 +512,13 @@
if ( $this->shouldProduce( RdfProducer::PRODUCE_REFERENCES ) ) {
$statementResource = $this->getStatementResource(
$statement );
foreach ( $statement->getReferences() as $ref ) {
//FIXME: split body into separate method
- $refResource = $this->getReferenceResource(
$ref );
- $statementResource->addResource(
self::PROV_QNAME, $refResource );
+ $hash = $ref->getHash();
+ $refQName = self::NS_REFERENCE . ':' . $hash;
+ $statementResource->addResource(
self::PROV_QNAME, $refQName );
+ if( $this->alreadySeen( $hash, 'R' ) ) {
+ continue;
+ }
+ $refResource = $this->graph->resource(
$refQName, array ( self::WIKIBASE_REFERENCE_QNAME ) );
foreach ( $ref->getSnaks() as $refSnak ) {
$this->addSnak( $refResource, $refSnak,
self::NS_VALUE );
}
@@ -557,20 +574,6 @@
}
/**
- * Returns a resource representing the given Reference.
- *
- * @param Reference $ref
- *
- * @return EasyRDF_Resource
- */
- private function getReferenceResource( Reference $ref ) {
- $refQName = $this->getReferenceQName( self::NS_REFERENCE, $ref
);
- return $this->graph->resource( $refQName, array (
- self::WIKIBASE_REFERENCE_QNAME
- ) );
- }
-
- /**
* Adds the given Snak to the RDF graph.
*
* @param EasyRdf_Resource $target Target node to which we're attaching
the snak
@@ -603,8 +606,13 @@
* @param array $props List of properties
*/
private function addExpandedValue( EasyRdf_Resource $target,
$propertyValueQName, DataValue $value, array $props) {
- $node = $this->graph->resource( self::NS_VALUE . ":" .
$value->getHash(), self::WIKIBASE_VALUE_QNAME );
- $target->addResource( $propertyValueQName."-value", $node);
+ $hash = $value->getHash();
+ $vname = self::NS_VALUE . ":" . $hash;
+ $target->addResource( $propertyValueQName."-value", $vname );
+ if( $this->alreadySeen( $hash, 'V' ) ) {
+ return;
+ }
+ $node = $this->graph->resource( $vname,
self::WIKIBASE_VALUE_QNAME );
foreach( $props as $prop => $type ) {
$getter = "get" . ucfirst( $prop );
$data = $value->$getter();
diff --git a/repo/includes/rdf/RdfSerializer.php
b/repo/includes/rdf/RdfSerializer.php
index 23836b6..85b7cee 100644
--- a/repo/includes/rdf/RdfSerializer.php
+++ b/repo/includes/rdf/RdfSerializer.php
@@ -55,6 +55,13 @@
* @var PropertyDataTypeLookup
*/
private $propertyLookup;
+
+ /**
+ * Hash to store seen references/values for deduplication
+ * @var BagOStuff
+ */
+ private $dedupBag;
+
/**
* @param EasyRdf_Format $format
* @param string $baseUri
@@ -71,7 +78,8 @@
SiteList $sites,
PropertyDataTypeLookup $propertyLookup,
EntityLookup $entityLookup,
- $flavor
+ $flavor,
+ \BagOStuff $dedupBag = null
) {
$this->baseUri = $baseUri;
$this->dataUri = $dataUri;
@@ -80,6 +88,7 @@
$this->entityLookup = $entityLookup;
$this->propertyLookup = $propertyLookup;
$this->flavor = $flavor;
+ $this->dedupBag = $dedupBag;
}
/**
@@ -121,7 +130,8 @@
$this->baseUri,
$this->dataUri,
$this->propertyLookup,
- $this->flavor
+ $this->flavor,
+ $this->dedupBag
);
return $builder;
diff --git a/repo/tests/phpunit/data/rdf/Q7_Q9_dedup.nt
b/repo/tests/phpunit/data/rdf/Q7_Q9_dedup.nt
new file mode 100644
index 0000000..5e73c78
--- /dev/null
+++ b/repo/tests/phpunit/data/rdf/Q7_Q9_dedup.nt
@@ -0,0 +1,66 @@
+
+<http://acme.test/Q7> <http://acme.test/P7>
<http://acme.test/statement/TEST-References-2> .
+<http://acme.test/Q7> <http://acme.test/P7>
<http://acme.test/statement/TEST-References> .
+<http://acme.test/Q7> <http://acme.test/assert/P7> "string" .
+<http://acme.test/Q7> <http://acme.test/assert/P7> "string2" .
+<http://acme.test/Q7> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Item> .
+<http://acme.test/Q9> <http://acme.test/P7>
<http://acme.test/statement/TEST-References-2-Dup> .
+<http://acme.test/Q9> <http://acme.test/P7>
<http://acme.test/statement/TEST-References-Dup> .
+<http://acme.test/Q9> <http://acme.test/assert/P7> "string" .
+<http://acme.test/Q9> <http://acme.test/assert/P7> "string2" .
+<http://acme.test/Q9> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Item> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P2> <http://acme.test/Q42> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P2> <http://acme.test/Q666> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P3>
<http://commons.wikimedia.org/wiki/Special:FilePath/Universe.svg> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P3> <http://www.wikidata.org/ontology-0.0.1#Novalue> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P4-value>
<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P4> "Point(12.345
67.89)"^^<http://www.opengis.net/ont/geosparql#wktLiteral> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P5> "\u0431\u0440\u0435\u0434"@ru .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P5> "\u043F\u0440\u0435\u0432\u0435\u0434"@ru .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P5> <http://www.wikidata.org/ontology-0.0.1#Somevalue> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P6-value>
<http://acme.test/value/1e09d673624819aacd170165aae555a1> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P6>
"+19.768000000000000682121026329696178436279296875"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P7> "simplestring" .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P8-value>
<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P8>
"-0200-01-01T00:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P9> <http://url.acme.test/> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Reference> .
+<http://acme.test/statement/TEST-References-2-Dup> <http://acme.test/value/P7>
"string2" .
+<http://acme.test/statement/TEST-References-2-Dup>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References-2-Dup>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References-2-Dup>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/statement/TEST-References-2> <http://acme.test/value/P7>
"string2" .
+<http://acme.test/statement/TEST-References-2>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References-2>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References-2>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/statement/TEST-References-Dup> <http://acme.test/value/P7>
"string" .
+<http://acme.test/statement/TEST-References-Dup>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References-Dup>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References-Dup>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/statement/TEST-References> <http://acme.test/value/P7>
"string" .
+<http://acme.test/statement/TEST-References>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Value> .
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#Amount>
"+19.768000000000000682121026329696178436279296875"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#LowerBound>
"+19.766999999999999459987520822323858737945556640625"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#Unit> "1" .
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#UpperBound>
"+19.76899999999999835154085303656756877899169921875"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Value> .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#CalendarModel>
<http://www.wikidata.org/entity/Q1985727> .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#Precision>
"9"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#Time> "-00000000200-00-00T00:00:00Z" .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#Timezone>
"0"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Value> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Globe>
<http://www.wikidata.org/entity/Q2> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Latitude>
"12.345"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Longitude>
"67.89"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Precision>
"0.1"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+<http://data.acme.test/Q7> <http://creativecommons.org/ns#license>
<http://creativecommons.org/publicdomain/zero/1.0/> .
+<http://data.acme.test/Q7> <http://schema.org/about> <http://acme.test/Q7> .
+<http://data.acme.test/Q7> <http://schema.org/softwareVersion> "0.0.1" .
+<http://data.acme.test/Q7> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://schema.org/Dataset> .
+<http://data.acme.test/Q9> <http://creativecommons.org/ns#license>
<http://creativecommons.org/publicdomain/zero/1.0/> .
+<http://data.acme.test/Q9> <http://schema.org/about> <http://acme.test/Q9> .
+<http://data.acme.test/Q9> <http://schema.org/softwareVersion> "0.0.1" .
+<http://data.acme.test/Q9> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://schema.org/Dataset> .
diff --git a/repo/tests/phpunit/data/rdf/Q9.json
b/repo/tests/phpunit/data/rdf/Q9.json
new file mode 100644
index 0000000..f0300e8
--- /dev/null
+++ b/repo/tests/phpunit/data/rdf/Q9.json
@@ -0,0 +1,356 @@
+{
+ "id": "Q9",
+ "type": "item",
+ "claims": {
+ "P7": [
+ {
+ "id": "TEST-References-Dup",
+ "mainsnak": {
+ "snaktype": "value",
+ "property": "P7",
+ "datatype": "string",
+ "datavalue": {
+ "value": "string",
+ "type": "string"
+ }
+ },
+ "type": "statement",
+ "rank": "normal",
+ "references": [
+ {
+ "hash": "a9059ae62c138ba7535ece019f82fb2e1fb44d13",
+ "snaks": {
+ "P2": [
+ {
+ "snaktype": "value",
+ "property": "P2",
+ "datatype": "wikibase-entityid",
+ "datavalue": {
+ "value": {
+ "entity-type": "item",
+ "numeric-id": 42
+ },
+ "type": "wikibase-entityid"
+ }
+ },
+ {
+ "snaktype": "value",
+ "property": "P2",
+ "datatype": "wikibase-entityid",
+ "datavalue": {
+ "value": {
+ "entity-type": "item",
+ "numeric-id": 666
+ },
+ "type": "wikibase-entityid"
+ }
+ }
+ ],
+ "P3": [
+ {
+ "snaktype": "value",
+ "property": "P3",
+ "datatype": "commonsMedia",
+ "datavalue": {
+ "value": "Universe.svg",
+ "type": "string"
+ }
+ },
+ {
+ "snaktype": "novalue",
+ "property": "P3"
+ }
+ ],
+ "P4": [
+ {
+ "snaktype": "value",
+ "property": "P4",
+ "datatype": "globecoordinate",
+ "datavalue": {
+ "value": {
+ "latitude": 12.345,
+ "longitude": 67.89,
+ "precision": 0.1,
+ "globe":
"http:\/\/www.wikidata.org\/entity\/Q2"
+ },
+ "type": "globecoordinate"
+ }
+ }
+ ],
+ "P5": [
+ {
+ "snaktype": "value",
+ "property": "P5",
+ "datatype": "monolingualtext",
+ "datavalue": {
+ "value": {
+ "text":
"\u043f\u0440\u0435\u0432\u0435\u0434",
+ "language": "ru"
+ },
+ "type": "monolingualtext"
+ }
+ },
+ {
+ "snaktype": "somevalue",
+ "property": "P5"
+ },
+ {
+ "snaktype": "value",
+ "property": "P5",
+ "datatype": "monolingualtext",
+ "datavalue": {
+ "value": {
+ "text": "\u0431\u0440\u0435\u0434",
+ "language": "ru"
+ },
+ "type": "monolingualtext"
+ }
+ }
+ ],
+ "P6": [
+ {
+ "snaktype": "value",
+ "property": "P6",
+ "datatype": "quantity",
+ "datavalue": {
+ "value": {
+ "amount":
"+19.768000000000000682121026329696178436279296875",
+ "unit": "1",
+ "upperBound":
"+19.76899999999999835154085303656756877899169921875",
+ "lowerBound":
"+19.766999999999999459987520822323858737945556640625"
+ },
+ "type": "quantity"
+ }
+ }
+ ],
+ "P7": [
+ {
+ "snaktype": "value",
+ "property": "P7",
+ "datatype": "string",
+ "datavalue": {
+ "value": "simplestring",
+ "type": "string"
+ }
+ }
+ ],
+ "P8": [
+ {
+ "snaktype": "value",
+ "property": "P8",
+ "datatype": "time",
+ "datavalue": {
+ "value": {
+ "time":
"-00000000200-00-00T00:00:00Z",
+ "timezone": 0,
+ "before": 0,
+ "after": 0,
+ "precision": 9,
+ "calendarmodel":
"http:\/\/www.wikidata.org\/entity\/Q1985727"
+ },
+ "type": "time"
+ }
+ }
+ ],
+ "P9": [
+ {
+ "snaktype": "value",
+ "property": "P9",
+ "datatype": "url",
+ "datavalue": {
+ "value": "http:\/\/url.acme.test\/",
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "snaks-order": [
+ "P2",
+ "P3",
+ "P4",
+ "P5",
+ "P6",
+ "P7",
+ "P8",
+ "P9"
+ ]
+ }
+ ]
+ },
+ {
+ "id": "TEST-References-2-Dup",
+ "mainsnak": {
+ "snaktype": "value",
+ "property": "P7",
+ "datatype": "string",
+ "datavalue": {
+ "value": "string2",
+ "type": "string"
+ }
+ },
+ "type": "statement",
+ "rank": "normal",
+ "references": [
+ {
+ "hash": "a9059ae62c138ba7535ece019f82fb2e1fb44d13",
+ "snaks": {
+ "P2": [
+ {
+ "snaktype": "value",
+ "property": "P2",
+ "datatype": "wikibase-entityid",
+ "datavalue": {
+ "value": {
+ "entity-type": "item",
+ "numeric-id": 42
+ },
+ "type": "wikibase-entityid"
+ }
+ },
+ {
+ "snaktype": "value",
+ "property": "P2",
+ "datatype": "wikibase-entityid",
+ "datavalue": {
+ "value": {
+ "entity-type": "item",
+ "numeric-id": 666
+ },
+ "type": "wikibase-entityid"
+ }
+ }
+ ],
+ "P3": [
+ {
+ "snaktype": "value",
+ "property": "P3",
+ "datatype": "commonsMedia",
+ "datavalue": {
+ "value": "Universe.svg",
+ "type": "string"
+ }
+ },
+ {
+ "snaktype": "novalue",
+ "property": "P3"
+ }
+ ],
+ "P4": [
+ {
+ "snaktype": "value",
+ "property": "P4",
+ "datatype": "globecoordinate",
+ "datavalue": {
+ "value": {
+ "latitude": 12.345,
+ "longitude": 67.89,
+ "precision": 0.1,
+ "globe":
"http:\/\/www.wikidata.org\/entity\/Q2"
+ },
+ "type": "globecoordinate"
+ }
+ }
+ ],
+ "P5": [
+ {
+ "snaktype": "value",
+ "property": "P5",
+ "datatype": "monolingualtext",
+ "datavalue": {
+ "value": {
+ "text":
"\u043f\u0440\u0435\u0432\u0435\u0434",
+ "language": "ru"
+ },
+ "type": "monolingualtext"
+ }
+ },
+ {
+ "snaktype": "somevalue",
+ "property": "P5"
+ },
+ {
+ "snaktype": "value",
+ "property": "P5",
+ "datatype": "monolingualtext",
+ "datavalue": {
+ "value": {
+ "text": "\u0431\u0440\u0435\u0434",
+ "language": "ru"
+ },
+ "type": "monolingualtext"
+ }
+ }
+ ],
+ "P6": [
+ {
+ "snaktype": "value",
+ "property": "P6",
+ "datatype": "quantity",
+ "datavalue": {
+ "value": {
+ "amount":
"+19.768000000000000682121026329696178436279296875",
+ "unit": "1",
+ "upperBound":
"+19.76899999999999835154085303656756877899169921875",
+ "lowerBound":
"+19.766999999999999459987520822323858737945556640625"
+ },
+ "type": "quantity"
+ }
+ }
+ ],
+ "P7": [
+ {
+ "snaktype": "value",
+ "property": "P7",
+ "datatype": "string",
+ "datavalue": {
+ "value": "simplestring",
+ "type": "string"
+ }
+ }
+ ],
+ "P8": [
+ {
+ "snaktype": "value",
+ "property": "P8",
+ "datatype": "time",
+ "datavalue": {
+ "value": {
+ "time":
"-00000000200-00-00T00:00:00Z",
+ "timezone": 0,
+ "before": 0,
+ "after": 0,
+ "precision": 9,
+ "calendarmodel":
"http:\/\/www.wikidata.org\/entity\/Q1985727"
+ },
+ "type": "time"
+ }
+ }
+ ],
+ "P9": [
+ {
+ "snaktype": "value",
+ "property": "P9",
+ "datatype": "url",
+ "datavalue": {
+ "value": "http:\/\/url.acme.test\/",
+ "type": "string"
+ }
+ }
+ ]
+ },
+ "snaks-order": [
+ "P2",
+ "P3",
+ "P4",
+ "P5",
+ "P6",
+ "P7",
+ "P8",
+ "P9"
+ ]
+ }
+ ]
+ }
+ ]
+ }
+}
diff --git a/repo/tests/phpunit/data/rdf/dump_refs.nt
b/repo/tests/phpunit/data/rdf/dump_refs.nt
new file mode 100644
index 0000000..2eaf897
--- /dev/null
+++ b/repo/tests/phpunit/data/rdf/dump_refs.nt
@@ -0,0 +1,69 @@
+<http://acme.test/Q7> <http://acme.test/P7>
<http://acme.test/statement/TEST-References-2> .
+<http://acme.test/Q7> <http://acme.test/P7>
<http://acme.test/statement/TEST-References> .
+<http://acme.test/Q7> <http://acme.test/assert/P7> "string" .
+<http://acme.test/Q7> <http://acme.test/assert/P7> "string2" .
+<http://acme.test/Q7> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Item> .
+<http://acme.test/Q9> <http://acme.test/P7>
<http://acme.test/statement/TEST-References-2-Dup> .
+<http://acme.test/Q9> <http://acme.test/P7>
<http://acme.test/statement/TEST-References-Dup> .
+<http://acme.test/Q9> <http://acme.test/assert/P7> "string" .
+<http://acme.test/Q9> <http://acme.test/assert/P7> "string2" .
+<http://acme.test/Q9> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Item> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P2> <http://acme.test/Q42> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P2> <http://acme.test/Q666> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P3> "Universe.svg" .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P3> <http://www.wikidata.org/ontology-0.0.1#Novalue> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P4-value>
<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P4> "Point(12.345
67.89)"^^<http://www.opengis.net/ont/geosparql#wktLiteral> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P5> "\u0431\u0440\u0435\u0434"@ru .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P5> "\u043F\u0440\u0435\u0432\u0435\u0434"@ru .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P5> <http://www.wikidata.org/ontology-0.0.1#Somevalue> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P6-value>
<http://acme.test/value/1e09d673624819aacd170165aae555a1> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P6>
"+19.768000000000000682121026329696178436279296875"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P7> "simplestring" .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P8-value>
<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P8>
"-0200-01-01T00:00:00Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://acme.test/value/P9> "http://url.acme.test/" .
+<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Reference> .
+<http://acme.test/statement/TEST-References-2-Dup> <http://acme.test/value/P7>
"string2" .
+<http://acme.test/statement/TEST-References-2-Dup>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References-2-Dup>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References-2-Dup>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/statement/TEST-References-2> <http://acme.test/value/P7>
"string2" .
+<http://acme.test/statement/TEST-References-2>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References-2>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References-2>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/statement/TEST-References-Dup> <http://acme.test/value/P7>
"string" .
+<http://acme.test/statement/TEST-References-Dup>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References-Dup>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References-Dup>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/statement/TEST-References> <http://acme.test/value/P7>
"string" .
+<http://acme.test/statement/TEST-References>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Statement> .
+<http://acme.test/statement/TEST-References>
<http://www.w3.org/ns/prov#wasDerivedFrom>
<http://acme.test/reference/94c1119b352b4ed0b6d9182b77a9b249f9ec28c8> .
+<http://acme.test/statement/TEST-References>
<http://www.wikidata.org/ontology-0.0.1#Rank>
<http://www.wikidata.org/ontology-0.0.1#NormalRank> .
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Value> .
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#Amount>
"+19.768000000000000682121026329696178436279296875"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#LowerBound>
"+19.766999999999999459987520822323858737945556640625"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#Unit> "1" .
+<http://acme.test/value/1e09d673624819aacd170165aae555a1>
<http://www.wikidata.org/ontology-0.0.1#UpperBound>
"+19.76899999999999835154085303656756877899169921875"^^<http://www.w3.org/2001/XMLSchema#decimal>
.
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Value> .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#CalendarModel>
<http://www.wikidata.org/entity/Q1985727> .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#Precision>
"9"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#Time> "-00000000200-00-00T00:00:00Z" .
+<http://acme.test/value/9b0b2552ae2d72bcd64746da766afcea>
<http://www.wikidata.org/ontology-0.0.1#Timezone>
"0"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://www.wikidata.org/ontology-0.0.1#Value> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Globe>
<http://www.wikidata.org/entity/Q2> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Latitude>
"12.345"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Longitude>
"67.89"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+<http://acme.test/value/aad6b70bccf9875ba61d31c767b7f652>
<http://www.wikidata.org/ontology-0.0.1#Precision>
"0.1"^^<http://www.w3.org/2001/XMLSchema#decimal> .
+<http://data.acme.test/Q7> <http://schema.org/about> <http://acme.test/Q7> .
+<http://data.acme.test/Q7> <http://schema.org/dateModified>
"1970-01-12T13:46:40Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://data.acme.test/Q7> <http://schema.org/version>
"12"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://data.acme.test/Q7> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://schema.org/Dataset> .
+<http://data.acme.test/Q9> <http://schema.org/about> <http://acme.test/Q9> .
+<http://data.acme.test/Q9> <http://schema.org/dateModified>
"1970-01-12T13:46:40Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://data.acme.test/Q9> <http://schema.org/version>
"12"^^<http://www.w3.org/2001/XMLSchema#integer> .
+<http://data.acme.test/Q9> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
<http://schema.org/Dataset> .
+<http://www.wikidata.org/ontology-0.0.1#Dump>
<http://creativecommons.org/ns#license>
<http://creativecommons.org/publicdomain/zero/1.0/> .
+<http://www.wikidata.org/ontology-0.0.1#Dump> <http://schema.org/dateModified>
"1970-01-12T13:46:40Z"^^<http://www.w3.org/2001/XMLSchema#dateTime> .
+<http://www.wikidata.org/ontology-0.0.1#Dump>
<http://schema.org/softwareVersion> "0.0.1" .
+<http://www.wikidata.org/ontology-0.0.1#Dump>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Dataset> .
diff --git a/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php
b/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php
index 29af1a1..3532c79 100644
--- a/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php
+++ b/repo/tests/phpunit/includes/Dumpers/RdfDumpGeneratorTest.php
@@ -8,6 +8,7 @@
use Wikibase\RdfProducer;
use Wikibase\Lib\Store\UnresolvedRedirectException;
use Wikibase\EntityRevision;
+use Wikibase\Test\RdfBuilderTest;
/**
* @covers Wikibase\Dumpers\RdfDumpGenerator
@@ -52,31 +53,20 @@
}
/**
- * @param EntityId[] $ids
- * @param EntityId[] $missingIds
- * @param EntityId[] $redirectedIds
+ * @param Entity[] $entities
*
* @return JsonDumpGenerator
*/
- protected function newDumpGenerator( array $ids = array(), array $missingIds = array(), array $redirectedIds = array() ) {
+ protected function newDumpGenerator( array $entities = array() ) {
$out = fopen( 'php://output', 'w' );
- $jsonTest = new JsonDumpGeneratorTest();
- $entities = $jsonTest->makeEntities( $ids );
$entityLookup = $this->getMock( 'Wikibase\Lib\Store\EntityLookup' );
$entityRevisionLookup = $this->getMock( 'Wikibase\Lib\Store\EntityRevisionLookup' );
$propertyLookup = $this->getMock( 'Wikibase\DataModel\Entity\PropertyDataTypeLookup' );
$entityLookup->expects( $this->any() )
->method( 'getEntity' )
- ->will( $this->returnCallback( function( EntityId $id ) use ( $entities, $missingIds, $redirectedIds ) {
- if ( in_array( $id, $missingIds ) ) {
- return null;
- }
- if ( in_array( $id, $redirectedIds ) ) {
- throw new UnresolvedRedirectException( new ItemId( 'Q123' ) );
- }
-
+ ->will( $this->returnCallback( function( EntityId $id ) use ( $entities ) {
$key = $id->getSerialization();
return $entities[$key];
} ) );
@@ -142,7 +132,9 @@
* @dataProvider idProvider
*/
public function testGenerateDump( array $ids, $dumpname ) {
- $dumper = $this->newDumpGenerator( $ids );
+ $jsonTest = new JsonDumpGeneratorTest();
+ $entities = $jsonTest->makeEntities( $ids );
+ $dumper = $this->newDumpGenerator( $entities );
$dumper->setTimestamp(1000000);
$jsonTest = new JsonDumpGeneratorTest();
$pager = $jsonTest->makeIdPager( $ids );
@@ -155,4 +147,31 @@
}
+ public function loadDataProvider() {
+ return array(
+ 'references' => array( array( new ItemId( 'Q7' ), new ItemId( 'Q9' ) ), 'refs' ),
+ );
+ }
+
+ /**
+ * @dataProvider loadDataProvider
+ */
+ public function testReferenceDedup( array $ids, $dumpname ) {
+ $rdfTest = new RdfBuilderTest();
+ foreach( $ids as $id ) {
+ $id = $id->getSerialization();
+ $entities[$id] = $rdfTest->getEntityData( $id );
+ }
+ $dumper = $this->newDumpGenerator( $entities );
+ $dumper->setTimestamp(1000000);
+ $jsonTest = new JsonDumpGeneratorTest();
+ $pager = $jsonTest->makeIdPager( $ids );
+
+ ob_start();
+ $dumper->generateDump( $pager );
+ $dump = ob_get_clean();
+ $dump = $this->normalizeData($dump);
+ $this->assertEquals($this->getSerializedData($dumpname), $dump);
+ }
+
}
\ No newline at end of file
diff --git a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
index 009bfb9..3b5d9f8 100644
--- a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
@@ -112,13 +112,14 @@
/**
* @return RdfBuilder
*/
- private static function newRdfBuilder( $produce ) {
+ private static function newRdfBuilder( $produce, \BagOStuff $dedup = null ) {
return new RdfBuilder(
self::getSiteList(),
self::URI_BASE,
self::URI_DATA,
self::getMockRepository(),
- $produce
+ $produce,
+ $dedup
);
}
@@ -268,4 +269,20 @@
$this->assertEquals( $this->getSerializedData( 'dumpheader' ), $data);
}
+ public function testDeduplication() {
+ $bag = new \HashBagOStuff();
+ $builder = self::newRdfBuilder( RdfProducer::PRODUCE_ALL, $bag );
+ $builder->addEntity( $this->getEntityData( 'Q7' ) );
+ $data1 = $this->getDataFromBuilder( $builder );
+
+ $builder = self::newRdfBuilder( RdfProducer::PRODUCE_ALL, $bag );
+ $builder->addEntity( $this->getEntityData( 'Q9' ) );
+ $data2 = $this->getDataFromBuilder( $builder );
+
+ $data = array_merge($data1, $data2);
+ sort($data);
+
+ $this->assertArrayEquals($this->getSerializedData( 'Q7_Q9_dedup' ), $data);
+ }
+
}
--
To view, visit https://gerrit.wikimedia.org/r/197785
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7f2c882764bcf2bdbdd8409acd4e798ff012005e
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Daniel Kinzler <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits