Daniel Kinzler has uploaded a new change for review. https://gerrit.wikimedia.org/r/204486
Change subject: Introduce DedupeBag. ...................................................................... Introduce DedupeBag. This introduces a dedicated interface for the dedupe logic previously implemented using a BagOStuff made accessible via a callback. Change-Id: Ie6d4c1527ef00200bc278eac604915bebf285778 --- M repo/includes/rdf/ComplexValueRdfBuilder.php A repo/includes/rdf/DedupeBag.php M repo/includes/rdf/FullStatementRdfBuilder.php A repo/includes/rdf/HashDedupeBag.php A repo/includes/rdf/NullDedupeBag.php M repo/includes/rdf/RdfBuilder.php M repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php M repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php M repo/tests/phpunit/includes/rdf/RdfBuilderTest.php 9 files changed, 189 insertions(+), 88 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/86/204486/1 diff --git a/repo/includes/rdf/ComplexValueRdfBuilder.php b/repo/includes/rdf/ComplexValueRdfBuilder.php index 2da3d87..a872e3e 100644 --- a/repo/includes/rdf/ComplexValueRdfBuilder.php +++ b/repo/includes/rdf/ComplexValueRdfBuilder.php @@ -19,15 +19,14 @@ class ComplexValueRdfBuilder extends SimpleValueRdfBuilder { /** - * @var callable + * @var DedupeBag */ - private $valueSeenCallback = null; + private $dedupeBag; /** * @var RdfWriter */ private $valueWriter; - /** * @param RdfVocabulary $vocabulary @@ -36,21 +35,23 @@ */ public function __construct( RdfVocabulary $vocabulary, RdfWriter $valueWriter, PropertyDataTypeLookup $propertyLookup ) { parent::__construct( $vocabulary, $propertyLookup ); + + $this->dedupeBag = new NullDedupeBag(); $this->valueWriter = $valueWriter; } /** - * @return callable + * @return DedupeBag */ - public function getValueSeenCallback() { - return $this->valueSeenCallback; + public function getDedupeBag() { + return $this->dedupeBag; } /** - * @param callable $valueSeenCallback + * @param DedupeBag $dedupeBag */ - public function setValueSeenCallback( $valueSeenCallback ) { - $this->valueSeenCallback = $valueSeenCallback; + public function setDedupeBag( DedupeBag $dedupeBag ) { + $this->dedupeBag = $dedupeBag; } /** @@ -119,10 +120,8 @@ private function addExpandedValue( DataValue $value, $prefix, array $props ) { $valueLName = $value->getHash(); - if ( $this->valueSeenCallback ) { - if ( call_user_func( $this->valueSeenCallback, $valueLName ) !== false ) { + if ( $this->dedupeBag->alreadySeen( $valueLName, 'V' ) !== false ) { return $valueLName; - } } $this->valueWriter->about( RdfVocabulary::NS_VALUE, $valueLName )->a( RdfVocabulary::NS_ONTOLOGY, 'Value' ); diff --git a/repo/includes/rdf/DedupeBag.php b/repo/includes/rdf/DedupeBag.php new file mode 100644 index 0000000..5d3fe57 --- /dev/null +++ b/repo/includes/rdf/DedupeBag.php @@ -0,0 +1,32 @@ +<?php + +namespace Wikibase; + +/** + * Interface for a facility that avoids duplicates based on value hashes. + * + * @since 0.5 + * @internal + * + * @licence GNU GPL v2+ + * @author Daniel Kinzler + */ +interface DedupeBag { + + /** + * Did we already see this value? If yes, we may need to skip it + * + * @note False negatives are acceptable, while false positives are not. + * This means that implementations are free to return false if it is not + * sure whether the hash was seen before, but should never return true + * if it is not certain that the hash was need before. + * + * @param string $hash Hash to check + * @param string $namespace Optional namespace to allow a compartmentalized bag, + * tracking hashes from multiple value sets. + * + * @return bool + */ + public function alreadySeen( $hash, $namespace = '' ); + +} diff --git a/repo/includes/rdf/FullStatementRdfBuilder.php b/repo/includes/rdf/FullStatementRdfBuilder.php index 9e91760..580ebe1 100644 --- a/repo/includes/rdf/FullStatementRdfBuilder.php +++ b/repo/includes/rdf/FullStatementRdfBuilder.php @@ -30,9 +30,9 @@ private $propertyMentionCallback = null; /** - * @var callable + * @var DedupeBag */ - private $referenceSeenCallback = null; + private $dedupeBag; /** * @var bool @@ -78,6 +78,8 @@ $this->referenceWriter = $writer; $this->valueBuilder = $valueBuilder; + + $this->dedupeBag = new NullDedupeBag(); } /** @@ -95,10 +97,17 @@ } /** - * @return callable + * @return DedupeBag */ - public function getReferenceSeenCallback() { - return $this->referenceSeenCallback; + public function getDedupeBag() { + return $this->dedupeBag; + } + + /** + * @param DedupeBag $dedupeBag + */ + public function setDedupeBag( DedupeBag $dedupeBag ) { + $this->dedupeBag = $dedupeBag; } /** @@ -196,7 +205,7 @@ $this->statementWriter->about( RdfVocabulary::NS_STATEMENT, $statementLName ) ->say( RdfVocabulary::NS_PROV, 'wasDerivedFrom' )->is( RdfVocabulary::NS_REFERENCE, $refLName ); - if ( $this->referenceSeen( $hash ) !== false ) { + if ( $this->dedupeBag->alreadySeen( $hash, 'R' ) !== false ) { continue; } @@ -242,7 +251,6 @@ } else { wfLogWarning( "Unknown rank $rank encountered for $entityId:{$statement->getGuid()}" ); } - } /** @@ -284,21 +292,6 @@ if ( $this->propertyMentionCallback ) { call_user_func( $this->propertyMentionCallback, $propertyId ); } - } - - /** - * @param string $hash - * - * @return bool - */ - private function referenceSeen( $hash ) { - if ( $this->referenceSeenCallback ) { - if ( call_user_func( $this->referenceSeenCallback, $hash ) ) { - return $hash; - } - } - - return false; } /** diff --git a/repo/includes/rdf/HashDedupeBag.php b/repo/includes/rdf/HashDedupeBag.php new file mode 100644 index 0000000..8a68451 --- /dev/null +++ b/repo/includes/rdf/HashDedupeBag.php @@ -0,0 +1,58 @@ +<?php + +namespace Wikibase; + +/** + * Null implementation of DedupeBag. + * + * @since 0.5 + * @internal + * + * @licence GNU GPL v2+ + * @author Daniel Kinzler + */ +class HashDedupeBag implements DedupeBag { + + /** + * @var array + */ + private $bag; + + /** + * @var int + */ + private $cutoff; + + /** + * @param int $cutoff The number of hash characters to use. A larger number means + * less collisions, but a larger bag. The number can be read as an exponent + * to the size of the hash's alphabet, so with a hex hash and $cutoff = 5, + * you'd get a max bag size of 16^5, and a collision probability of 16^-5. + */ + public function __construct( $cutoff = 5 ) { + $this->cutoff = $cutoff; + + $this->bag = array(); + } + + /** + * @see DedupeBag::alreadySeen + * + * Always returns false. + * + * @param string $hash + * @param string $namespace + * + * @return bool + */ + public function alreadySeen( $hash, $namespace = '' ) { + $key = $namespace . substr( $hash, 0, $this->cutoff ); + if ( !isset( $this->bag[$key] ) || $this->bag[$key] !== $hash ) { + $this->bag[$key] = $hash; + return false; + } + + return true; + } + +} diff --git a/repo/includes/rdf/NullDedupeBag.php b/repo/includes/rdf/NullDedupeBag.php new file mode 100644 index 0000000..15dbb05 --- /dev/null +++ b/repo/includes/rdf/NullDedupeBag.php @@ -0,0 +1,30 @@ +<?php + +namespace Wikibase; + +/** + * Null implementation of DedupeBag. + * + * @since 0.5 + * @internal + * + * @licence GNU GPL v2+ + * @author Daniel Kinzler + */ +class NullDedupeBag implements DedupeBag { + + /** + * @see DedupeBag::alreadySeen + * + * Always returns false. + * + * @param string $hash + * @param string $namespace + * + * @return bool + */ + public function alreadySeen( $hash, $namespace = '' ) { + return false; + } + +} diff --git a/repo/includes/rdf/RdfBuilder.php b/repo/includes/rdf/RdfBuilder.php index a453578..886ea13 100644 --- a/repo/includes/rdf/RdfBuilder.php +++ b/repo/includes/rdf/RdfBuilder.php @@ -8,8 +8,6 @@ use Wikibase\DataModel\Entity\Entity; use Wikibase\DataModel\Entity\EntityId; use Wikibase\DataModel\Entity\Property; -use Wikibase\DataModel\SiteLink; -use Wikibase\DataModel\Snak\PropertyValueSnak; use Wikibase\DataModel\Entity\PropertyDataTypeLookup; use Wikibase\Lib\Store\EntityLookup; use Wikimedia\Purtle\RdfWriter; @@ -49,8 +47,7 @@ private $writer; /** - * Hash to store seen references/values for deduplication - * @var BagOStuff + * @var DedupeBag */ private $dedupBag; @@ -73,12 +70,12 @@ * @param PropertyDataTypeLookup $propertyLookup * @param integer $flavor * @param RdfWriter $writer - * @param BagOStuff|null $dedupBag Container used for deduplication of refs/values + * @param DedupeBag $dedupBag */ public function __construct( SiteList $sites, RdfVocabulary $vocabulary, PropertyDataTypeLookup $propertyLookup, $flavor, RdfWriter $writer, - BagOStuff $dedupBag = null + DedupeBag $dedupBag ) { $this->vocabulary = $vocabulary; $this->dedupBag = $dedupBag; @@ -130,15 +127,12 @@ if ( $this->shouldProduce( RdfProducer::PRODUCE_FULL_VALUES ) ) { $self = $this; // PHP 5.3 compat - $valueSeen = function( $hash ) use ( $self ) { - return $self->alreadySeen( $hash, 'V' ); - }; // NOTE: us sub-writers for nested structures $valueWriter = $writer->sub(); $statementValueBuilder = new ComplexValueRdfBuilder( $vocabulary, $valueWriter, $propertyLookup ); - $statementValueBuilder->setValueSeenCallback( $valueSeen ); + $statementValueBuilder->setDedupeBag( $this->dedupBag ); if ( $this->shouldProduce( RdfProducer::PRODUCE_RESOLVED_ENTITIES ) ) { $statementValueBuilder->setEntityMentionCallback( array( $this, 'entityMentioned' ) ); @@ -177,12 +171,9 @@ $statementValueBuilder = $this->newSnakValueBuilder( $vocabulary, $propertyLookup, $writer ); $self = $this; // PHP 5.3 compat - $referenceSeen = function ( $hash ) use ( $self ) { - return $self->alreadySeen( $hash, 'R' ); - }; $statementBuilder = new FullStatementRdfBuilder( $vocabulary, $writer, $statementValueBuilder ); - $statementBuilder->setReferenceSeenCallback( $referenceSeen ); + $statementBuilder->setDedupeBag( $this->dedupBag ); if ( $this->shouldProduce( RdfProducer::PRODUCE_PROPERTIES ) ) { $statementBuilder->setPropertyMentionCallback( array( $this, 'entityMentioned' ) ); @@ -268,28 +259,6 @@ private function entityResolved( EntityId $entityId ) { $prefixedId = $entityId->getSerialization(); $this->entitiesResolved[$prefixedId] = true; - } - - /** - * Did we already see this value? If yes, we may need to skip it - * - * @todo Make callback private once we drop PHP 5.3 compat. - * - * @param string $hash hash value to check - * @param string $namespace - * - * @return bool - */ - public function alreadySeen( $hash, $namespace ) { - if ( !$this->dedupBag ) { - return false; - } - $key = $namespace . substr($hash, 0, 5); - if ( $this->dedupBag->get( $key ) !== $hash ) { - $this->dedupBag->set( $key, $hash ); - return false; - } - return true; } /** diff --git a/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php b/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php index 0b5f26b..affa007 100644 --- a/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php +++ b/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php @@ -14,6 +14,9 @@ use Wikibase\DataModel\Entity\EntityIdValue; use Wikibase\DataModel\Entity\ItemId; use Wikibase\DataModel\Entity\PropertyId; +use Wikibase\DedupeBag; +use Wikibase\HashDedupeBag; +use Wikibase\NullDedupeBag; use Wikimedia\Purtle\RdfWriter; use Wikibase\Rdf\Test\RdfBuilderTestData; use Wikibase\RdfVocabulary; @@ -56,22 +59,22 @@ * * @return ComplexValueRdfBuilder */ - private function newBuilder( array &$mentioned = array(), array $valuesSeen = array() ) { + private function newBuilder( array &$mentioned = array(), DedupeBag $bag = null ) { $entityMentioned = function( EntityId $id ) use ( &$mentioned ) { $key = $id->getSerialization(); $mentioned[$key] = $id; }; + if ( !$bag ) { + $bag = new NullDedupeBag(); + } + $vocabulary = $this->getTestData()->getVocabulary(); $valueWriter = $this->getTestData()->getNTriplesWriter(); - $valueSeenCallback = function( $hash ) use ( $valuesSeen ) { - return in_array( $hash, $valuesSeen ); - }; - $builder = new ComplexValueRdfBuilder( $vocabulary, $valueWriter, $this->getTestData()->getMockRepository() ); $builder->setEntityMentionCallback( $entityMentioned ); - $builder->setValueSeenCallback( $valueSeenCallback ); + $builder->setDedupeBag( $bag ); // HACK: glue on the value writer as a public field, so we can evaluate it later. $builder->test_value_writer = $valueWriter; @@ -338,7 +341,10 @@ $writer->about( RdfVocabulary::NS_ENTITY, 'Q11' ); $mentioned = array(); - $seen = array( $value->getHash() ); + + $seen = new HashDedupeBag(); + $seen->alreadySeen( $value->getHash(), 'V' ); + $builder = $this->newBuilder( $mentioned, $seen ); $builder->addSnakValue( $writer, $propertyId, $value, RdfVocabulary::NS_DIRECT_CLAIM ); diff --git a/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php b/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php index ce5f0b3..96520fc 100644 --- a/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php +++ b/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php @@ -4,6 +4,9 @@ use Wikibase\ComplexValueRdfBuilder; use Wikibase\DataModel\Entity\EntityId; +use Wikibase\DedupeBag; +use Wikibase\HashDedupeBag; +use Wikibase\NullDedupeBag; use Wikimedia\Purtle\NTriplesRdfWriter; use Wikibase\Rdf\Test\RdfBuilderTestData; use Wikibase\RdfProducer; @@ -44,11 +47,11 @@ /** * @param int $flavor Bitmap for the output flavor, use RdfProducer::PRODUCE_XXX constants. * @param EntityId[] &$mentioned Receives any entity IDs being mentioned. - * @param string[] $referencesSeen A list of reference hashes that should be considered "already seen". + * @param DedupeBag $dedupe A bag of reference hashes that should be considered "already seen". * * @return FullStatementRdfBuilder */ - private function newBuilder( $flavor, array &$mentioned = array(), array $referencesSeen = array() ) { + private function newBuilder( $flavor, array &$mentioned = array(), DedupeBag $dedupe = null ) { $vocabulary = $this->getTestData()->getVocabulary(); $writer = $this->getTestData()->getNTriplesWriter(); @@ -58,6 +61,10 @@ $mentioned[$key] = $id; }; + if ( !$dedupe ) { + $dedupe = new NullDedupeBag(); + } + if ( $flavor & RdfProducer::PRODUCE_FULL_VALUES ) { $valueWriter = $writer->sub(); @@ -66,12 +73,8 @@ $statementValueBuilder = new SimpleValueRdfBuilder( $vocabulary, $this->getTestData()->getMockRepository() ); } - $referenceSeenCallback = function( $hash ) use ( $referencesSeen ) { - return in_array( $hash, $referencesSeen ); - }; - $statementBuilder = new FullStatementRdfBuilder( $vocabulary, $writer, $statementValueBuilder ); - $statementBuilder->setReferenceSeenCallback( $referenceSeenCallback ); + $statementBuilder->setDedupeBag( $dedupe ); if ( $flavor & RdfProducer::PRODUCE_PROPERTIES ) { $statementBuilder->setPropertyMentionCallback( $entityMentioned ); @@ -154,8 +157,14 @@ public function testAddEntity_seen( $entityName, $dataSetName, array $referencesSeen ) { $entity = $this->getTestData()->getEntity( $entityName ); + $dedupe = new HashDedupeBag(); + + foreach ( $referencesSeen as $hash ) { + $dedupe->alreadySeen( $hash, 'R' ); + } + $mentioned = array(); - $builder = $this->newBuilder( RdfProducer::PRODUCE_ALL, $mentioned, $referencesSeen ); + $builder = $this->newBuilder( RdfProducer::PRODUCE_ALL, $mentioned, $dedupe ); $builder->addEntity( $entity ); $this->assertOrCreateNTriples( $dataSetName, $builder ); diff --git a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php index b0e6105..34d2fb6 100644 --- a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php +++ b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php @@ -3,6 +3,8 @@ namespace Wikibase\Test; use Wikibase\DataModel\Entity\Entity; +use Wikibase\DedupeBag; +use Wikibase\HashDedupeBag; use Wikimedia\Purtle\NTriplesRdfWriter; use Wikibase\Rdf\Test\RdfBuilderTestData; use Wikibase\RdfBuilder; @@ -40,10 +42,11 @@ /** * @return RdfBuilder */ - private function newRdfBuilder( $produce, \BagOStuff $dedup = null ) { + private function newRdfBuilder( $produce, DedupeBag $dedup = null ) { if( !$dedup ) { - $dedup = new \HashBagOStuff(); + $dedup = new HashDedupeBag(); } + $emitter = new NTriplesRdfWriter(); $builder = new RdfBuilder( $this->getTestData()->getSiteList(), @@ -53,6 +56,7 @@ $emitter, $dedup ); + $builder->startDocument(); return $builder; } @@ -165,7 +169,8 @@ } public function testDeduplication() { - $bag = new \HashBagOStuff(); + $bag = new HashDedupeBag(); + $builder = $this->newRdfBuilder( RdfProducer::PRODUCE_ALL, $bag ); $builder->addEntity( $this->getEntityData( 'Q7' ) ); $data1 = $this->getDataFromBuilder( $builder ); -- To view, visit https://gerrit.wikimedia.org/r/204486 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie6d4c1527ef00200bc278eac604915bebf285778 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits