Daniel Kinzler has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/204486

Change subject: Introduce DedupeBag.
......................................................................

Introduce DedupeBag.

This introduces a dedicated interface for the dedupe logic
previously implemented using a BagOStuff made accessible via
a callback.

Change-Id: Ie6d4c1527ef00200bc278eac604915bebf285778
---
M repo/includes/rdf/ComplexValueRdfBuilder.php
A repo/includes/rdf/DedupeBag.php
M repo/includes/rdf/FullStatementRdfBuilder.php
A repo/includes/rdf/HashDedupeBag.php
A repo/includes/rdf/NullDedupeBag.php
M repo/includes/rdf/RdfBuilder.php
M repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
M repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
M repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
9 files changed, 189 insertions(+), 88 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/86/204486/1

diff --git a/repo/includes/rdf/ComplexValueRdfBuilder.php 
b/repo/includes/rdf/ComplexValueRdfBuilder.php
index 2da3d87..a872e3e 100644
--- a/repo/includes/rdf/ComplexValueRdfBuilder.php
+++ b/repo/includes/rdf/ComplexValueRdfBuilder.php
@@ -19,15 +19,14 @@
 class ComplexValueRdfBuilder extends SimpleValueRdfBuilder {
 
        /**
-        * @var callable
+        * @var DedupeBag
         */
-       private $valueSeenCallback = null;
+       private $dedupeBag;
 
        /**
         * @var RdfWriter
         */
        private $valueWriter;
-
 
        /**
         * @param RdfVocabulary $vocabulary
@@ -36,21 +35,23 @@
         */
        public function __construct( RdfVocabulary $vocabulary, RdfWriter 
$valueWriter, PropertyDataTypeLookup $propertyLookup ) {
                parent::__construct( $vocabulary, $propertyLookup );
+
+               $this->dedupeBag = new NullDedupeBag();
                $this->valueWriter = $valueWriter;
        }
 
        /**
-        * @return callable
+        * @return DedupeBag
         */
-       public function getValueSeenCallback() {
-               return $this->valueSeenCallback;
+       public function getDedupeBag() {
+               return $this->dedupeBag;
        }
 
        /**
-        * @param callable $valueSeenCallback
+        * @param DedupeBag $dedupeBag
         */
-       public function setValueSeenCallback( $valueSeenCallback ) {
-               $this->valueSeenCallback = $valueSeenCallback;
+       public function setDedupeBag( DedupeBag $dedupeBag ) {
+               $this->dedupeBag = $dedupeBag;
        }
 
        /**
@@ -119,10 +120,8 @@
        private function addExpandedValue( DataValue $value, $prefix, array 
$props ) {
                $valueLName = $value->getHash();
 
-               if ( $this->valueSeenCallback ) {
-                       if ( call_user_func( $this->valueSeenCallback, 
$valueLName ) !== false ) {
+               if ( $this->dedupeBag->alreadySeen( $valueLName, 'V' ) !== 
false ) {
                                return $valueLName;
-                       }
                }
 
                $this->valueWriter->about( RdfVocabulary::NS_VALUE, $valueLName 
)->a( RdfVocabulary::NS_ONTOLOGY, 'Value' );
diff --git a/repo/includes/rdf/DedupeBag.php b/repo/includes/rdf/DedupeBag.php
new file mode 100644
index 0000000..5d3fe57
--- /dev/null
+++ b/repo/includes/rdf/DedupeBag.php
@@ -0,0 +1,32 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * Interface for a facility that avoids duplicates based on value hashes.
+ *
+ * @since 0.5
+ * @internal
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+interface DedupeBag {
+
+       /**
+        * Did we already see this value? If yes, we may need to skip it
+        *
+        * @note False negatives are acceptable, while false positives are not.
+        * This means that implementations are free to return false if it is not
+        * sure whether the hash was seen before, but should never return true
+        * if it is not certain that the hash was need before.
+        *
+        * @param string $hash Hash to check
+        * @param string $namespace Optional namespace to allow a 
compartmentalized bag,
+        *        tracking hashes from multiple value sets.
+        *
+        * @return bool
+        */
+       public function alreadySeen( $hash, $namespace = '' );
+
+}
diff --git a/repo/includes/rdf/FullStatementRdfBuilder.php 
b/repo/includes/rdf/FullStatementRdfBuilder.php
index 9e91760..580ebe1 100644
--- a/repo/includes/rdf/FullStatementRdfBuilder.php
+++ b/repo/includes/rdf/FullStatementRdfBuilder.php
@@ -30,9 +30,9 @@
        private $propertyMentionCallback = null;
 
        /**
-        * @var callable
+        * @var DedupeBag
         */
-       private $referenceSeenCallback = null;
+       private $dedupeBag;
 
        /**
         * @var bool
@@ -78,6 +78,8 @@
                $this->referenceWriter = $writer;
 
                $this->valueBuilder = $valueBuilder;
+
+               $this->dedupeBag = new NullDedupeBag();
        }
 
        /**
@@ -95,10 +97,17 @@
        }
 
        /**
-        * @return callable
+        * @return DedupeBag
         */
-       public function getReferenceSeenCallback() {
-               return $this->referenceSeenCallback;
+       public function getDedupeBag() {
+               return $this->dedupeBag;
+       }
+
+       /**
+        * @param DedupeBag $dedupeBag
+        */
+       public function setDedupeBag( DedupeBag $dedupeBag ) {
+               $this->dedupeBag = $dedupeBag;
        }
 
        /**
@@ -196,7 +205,7 @@
 
                                $this->statementWriter->about( 
RdfVocabulary::NS_STATEMENT, $statementLName )
                                        ->say( RdfVocabulary::NS_PROV, 
'wasDerivedFrom' )->is( RdfVocabulary::NS_REFERENCE, $refLName );
-                               if ( $this->referenceSeen( $hash ) !== false ) {
+                               if ( $this->dedupeBag->alreadySeen( $hash, 'R' 
) !== false ) {
                                        continue;
                                }
 
@@ -242,7 +251,6 @@
                } else {
                        wfLogWarning( "Unknown rank $rank encountered for 
$entityId:{$statement->getGuid()}" );
                }
-
        }
 
        /**
@@ -284,21 +292,6 @@
                if ( $this->propertyMentionCallback ) {
                        call_user_func( $this->propertyMentionCallback, 
$propertyId );
                }
-       }
-
-       /**
-        * @param string $hash
-        *
-        * @return bool
-        */
-       private function referenceSeen( $hash ) {
-               if ( $this->referenceSeenCallback ) {
-                       if ( call_user_func( $this->referenceSeenCallback, 
$hash ) ) {
-                               return $hash;
-                       }
-               }
-
-               return false;
        }
 
        /**
diff --git a/repo/includes/rdf/HashDedupeBag.php 
b/repo/includes/rdf/HashDedupeBag.php
new file mode 100644
index 0000000..8a68451
--- /dev/null
+++ b/repo/includes/rdf/HashDedupeBag.php
@@ -0,0 +1,58 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * Null implementation of DedupeBag.
+ *
+ * @since 0.5
+ * @internal
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+class HashDedupeBag implements DedupeBag {
+
+       /**
+        * @var array
+        */
+       private $bag;
+
+       /**
+        * @var int
+        */
+       private $cutoff;
+
+       /**
+        * @param int $cutoff The number of hash characters to use. A larger 
number means
+        *        less collisions, but a larger bag. The number can be read as 
an exponent
+        *        to the size of the hash's alphabet, so with a hex hash and 
$cutoff = 5,
+        *        you'd get a max bag size of 16^5, and a collision probability 
of 16^-5.
+        */
+       public function __construct( $cutoff = 5 ) {
+               $this->cutoff = $cutoff;
+
+               $this->bag = array();
+       }
+
+       /**
+        * @see DedupeBag::alreadySeen
+        *
+        * Always returns false.
+        *
+        * @param string $hash
+        * @param string $namespace
+        *
+        * @return bool
+        */
+       public function alreadySeen( $hash, $namespace = '' ) {
+               $key = $namespace . substr( $hash, 0, $this->cutoff );
+               if ( !isset( $this->bag[$key] ) || $this->bag[$key] !== $hash ) 
{
+                       $this->bag[$key] = $hash;
+                       return false;
+               }
+
+               return true;
+       }
+
+}
diff --git a/repo/includes/rdf/NullDedupeBag.php 
b/repo/includes/rdf/NullDedupeBag.php
new file mode 100644
index 0000000..15dbb05
--- /dev/null
+++ b/repo/includes/rdf/NullDedupeBag.php
@@ -0,0 +1,30 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * Null implementation of DedupeBag.
+ *
+ * @since 0.5
+ * @internal
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+class NullDedupeBag implements DedupeBag {
+
+       /**
+        * @see DedupeBag::alreadySeen
+        *
+        * Always returns false.
+        *
+        * @param string $hash
+        * @param string $namespace
+        *
+        * @return bool
+        */
+       public function alreadySeen( $hash, $namespace = '' ) {
+               return false;
+       }
+
+}
diff --git a/repo/includes/rdf/RdfBuilder.php b/repo/includes/rdf/RdfBuilder.php
index a453578..886ea13 100644
--- a/repo/includes/rdf/RdfBuilder.php
+++ b/repo/includes/rdf/RdfBuilder.php
@@ -8,8 +8,6 @@
 use Wikibase\DataModel\Entity\Entity;
 use Wikibase\DataModel\Entity\EntityId;
 use Wikibase\DataModel\Entity\Property;
-use Wikibase\DataModel\SiteLink;
-use Wikibase\DataModel\Snak\PropertyValueSnak;
 use Wikibase\DataModel\Entity\PropertyDataTypeLookup;
 use Wikibase\Lib\Store\EntityLookup;
 use Wikimedia\Purtle\RdfWriter;
@@ -49,8 +47,7 @@
        private $writer;
 
        /**
-        * Hash to store seen references/values for deduplication
-        * @var BagOStuff
+        * @var DedupeBag
         */
        private $dedupBag;
 
@@ -73,12 +70,12 @@
         * @param PropertyDataTypeLookup $propertyLookup
         * @param integer $flavor
         * @param RdfWriter $writer
-        * @param BagOStuff|null $dedupBag Container used for deduplication of 
refs/values
+        * @param DedupeBag $dedupBag
         */
        public function __construct( SiteList $sites, RdfVocabulary $vocabulary,
                        PropertyDataTypeLookup $propertyLookup, $flavor,
                        RdfWriter $writer,
-                       BagOStuff $dedupBag = null
+                       DedupeBag $dedupBag
        ) {
                $this->vocabulary = $vocabulary;
                $this->dedupBag = $dedupBag;
@@ -130,15 +127,12 @@
 
                if ( $this->shouldProduce( RdfProducer::PRODUCE_FULL_VALUES ) ) 
{
                        $self = $this; // PHP 5.3 compat
-                       $valueSeen = function( $hash ) use ( $self ) {
-                               return $self->alreadySeen( $hash, 'V' );
-                       };
 
                        // NOTE: us sub-writers for nested structures
                        $valueWriter = $writer->sub();
 
                        $statementValueBuilder = new ComplexValueRdfBuilder( 
$vocabulary, $valueWriter, $propertyLookup );
-                       $statementValueBuilder->setValueSeenCallback( 
$valueSeen );
+                       $statementValueBuilder->setDedupeBag( $this->dedupBag );
 
                        if ( $this->shouldProduce( 
RdfProducer::PRODUCE_RESOLVED_ENTITIES ) ) {
                                
$statementValueBuilder->setEntityMentionCallback( array( $this, 
'entityMentioned' ) );
@@ -177,12 +171,9 @@
                $statementValueBuilder = $this->newSnakValueBuilder( 
$vocabulary, $propertyLookup, $writer );
 
                $self = $this; // PHP 5.3 compat
-               $referenceSeen = function ( $hash ) use ( $self ) {
-                       return $self->alreadySeen( $hash, 'R' );
-               };
 
                $statementBuilder = new FullStatementRdfBuilder( $vocabulary, 
$writer, $statementValueBuilder );
-               $statementBuilder->setReferenceSeenCallback( $referenceSeen );
+               $statementBuilder->setDedupeBag( $this->dedupBag );
 
                if ( $this->shouldProduce( RdfProducer::PRODUCE_PROPERTIES ) ) {
                        $statementBuilder->setPropertyMentionCallback( array( 
$this, 'entityMentioned' ) );
@@ -268,28 +259,6 @@
        private function entityResolved( EntityId $entityId ) {
                $prefixedId = $entityId->getSerialization();
                $this->entitiesResolved[$prefixedId] = true;
-       }
-
-       /**
-        * Did we already see this value? If yes, we may need to skip it
-        *
-        * @todo Make callback private once we drop PHP 5.3 compat.
-        *
-        * @param string $hash hash value to check
-        * @param string $namespace
-        *
-        * @return bool
-        */
-       public function alreadySeen( $hash, $namespace ) {
-               if ( !$this->dedupBag ) {
-                       return false;
-               }
-               $key = $namespace . substr($hash, 0, 5);
-               if ( $this->dedupBag->get( $key ) !== $hash ) {
-                       $this->dedupBag->set( $key, $hash );
-                       return false;
-               }
-               return true;
        }
 
        /**
diff --git a/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php 
b/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
index 0b5f26b..affa007 100644
--- a/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
@@ -14,6 +14,9 @@
 use Wikibase\DataModel\Entity\EntityIdValue;
 use Wikibase\DataModel\Entity\ItemId;
 use Wikibase\DataModel\Entity\PropertyId;
+use Wikibase\DedupeBag;
+use Wikibase\HashDedupeBag;
+use Wikibase\NullDedupeBag;
 use Wikimedia\Purtle\RdfWriter;
 use Wikibase\Rdf\Test\RdfBuilderTestData;
 use Wikibase\RdfVocabulary;
@@ -56,22 +59,22 @@
         *
         * @return ComplexValueRdfBuilder
         */
-       private function newBuilder( array &$mentioned = array(), array 
$valuesSeen = array() ) {
+       private function newBuilder( array &$mentioned = array(), DedupeBag 
$bag = null ) {
                $entityMentioned = function( EntityId $id ) use ( &$mentioned ) 
{
                        $key = $id->getSerialization();
                        $mentioned[$key] = $id;
                };
 
+               if ( !$bag ) {
+                       $bag = new NullDedupeBag();
+               }
+
                $vocabulary = $this->getTestData()->getVocabulary();
                $valueWriter = $this->getTestData()->getNTriplesWriter();
 
-               $valueSeenCallback = function( $hash ) use ( $valuesSeen ) {
-                       return in_array( $hash, $valuesSeen );
-               };
-
                $builder = new ComplexValueRdfBuilder( $vocabulary, 
$valueWriter, $this->getTestData()->getMockRepository() );
                $builder->setEntityMentionCallback( $entityMentioned );
-               $builder->setValueSeenCallback( $valueSeenCallback );
+               $builder->setDedupeBag( $bag );
 
                // HACK: glue on the value writer as a public field, so we can 
evaluate it later.
                $builder->test_value_writer = $valueWriter;
@@ -338,7 +341,10 @@
                $writer->about( RdfVocabulary::NS_ENTITY, 'Q11' );
 
                $mentioned = array();
-               $seen = array( $value->getHash() );
+
+               $seen = new HashDedupeBag();
+               $seen->alreadySeen( $value->getHash(), 'V' );
+
                $builder = $this->newBuilder( $mentioned, $seen );
                $builder->addSnakValue( $writer, $propertyId, $value, 
RdfVocabulary::NS_DIRECT_CLAIM );
 
diff --git a/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php 
b/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
index ce5f0b3..96520fc 100644
--- a/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
@@ -4,6 +4,9 @@
 
 use Wikibase\ComplexValueRdfBuilder;
 use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\DedupeBag;
+use Wikibase\HashDedupeBag;
+use Wikibase\NullDedupeBag;
 use Wikimedia\Purtle\NTriplesRdfWriter;
 use Wikibase\Rdf\Test\RdfBuilderTestData;
 use Wikibase\RdfProducer;
@@ -44,11 +47,11 @@
        /**
         * @param int $flavor Bitmap for the output flavor, use 
RdfProducer::PRODUCE_XXX constants.
         * @param EntityId[] &$mentioned Receives any entity IDs being 
mentioned.
-        * @param string[] $referencesSeen A list of reference hashes that 
should be considered "already seen".
+        * @param DedupeBag $dedupe A bag of reference hashes that should be 
considered "already seen".
         *
         * @return FullStatementRdfBuilder
         */
-       private function newBuilder( $flavor, array &$mentioned = array(), 
array $referencesSeen = array() ) {
+       private function newBuilder( $flavor, array &$mentioned = array(), 
DedupeBag $dedupe = null ) {
                $vocabulary = $this->getTestData()->getVocabulary();
 
                $writer = $this->getTestData()->getNTriplesWriter();
@@ -58,6 +61,10 @@
                        $mentioned[$key] = $id;
                };
 
+               if ( !$dedupe ) {
+                       $dedupe = new NullDedupeBag();
+               }
+
                if ( $flavor & RdfProducer::PRODUCE_FULL_VALUES ) {
                        $valueWriter = $writer->sub();
 
@@ -66,12 +73,8 @@
                        $statementValueBuilder = new SimpleValueRdfBuilder( 
$vocabulary, $this->getTestData()->getMockRepository() );
                }
 
-               $referenceSeenCallback = function( $hash ) use ( 
$referencesSeen ) {
-                       return in_array( $hash, $referencesSeen );
-               };
-
                $statementBuilder = new FullStatementRdfBuilder( $vocabulary, 
$writer, $statementValueBuilder );
-               $statementBuilder->setReferenceSeenCallback( 
$referenceSeenCallback );
+               $statementBuilder->setDedupeBag( $dedupe );
 
                if ( $flavor & RdfProducer::PRODUCE_PROPERTIES  ) {
                        $statementBuilder->setPropertyMentionCallback( 
$entityMentioned );
@@ -154,8 +157,14 @@
        public function testAddEntity_seen( $entityName, $dataSetName, array 
$referencesSeen ) {
                $entity = $this->getTestData()->getEntity( $entityName );
 
+               $dedupe = new HashDedupeBag();
+
+               foreach ( $referencesSeen as $hash ) {
+                       $dedupe->alreadySeen( $hash, 'R' );
+               }
+
                $mentioned = array();
-               $builder = $this->newBuilder( RdfProducer::PRODUCE_ALL, 
$mentioned, $referencesSeen );
+               $builder = $this->newBuilder( RdfProducer::PRODUCE_ALL, 
$mentioned, $dedupe );
                $builder->addEntity( $entity );
 
                $this->assertOrCreateNTriples( $dataSetName, $builder );
diff --git a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php 
b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
index b0e6105..34d2fb6 100644
--- a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
@@ -3,6 +3,8 @@
 namespace Wikibase\Test;
 
 use Wikibase\DataModel\Entity\Entity;
+use Wikibase\DedupeBag;
+use Wikibase\HashDedupeBag;
 use Wikimedia\Purtle\NTriplesRdfWriter;
 use Wikibase\Rdf\Test\RdfBuilderTestData;
 use Wikibase\RdfBuilder;
@@ -40,10 +42,11 @@
        /**
         * @return RdfBuilder
         */
-       private function newRdfBuilder( $produce, \BagOStuff $dedup = null ) {
+       private function newRdfBuilder( $produce, DedupeBag $dedup = null ) {
                if( !$dedup ) {
-                       $dedup = new \HashBagOStuff();
+                       $dedup = new HashDedupeBag();
                }
+
                $emitter = new NTriplesRdfWriter();
                $builder = new RdfBuilder(
                        $this->getTestData()->getSiteList(),
@@ -53,6 +56,7 @@
                        $emitter,
                        $dedup
                );
+
                $builder->startDocument();
                return $builder;
        }
@@ -165,7 +169,8 @@
        }
 
        public function testDeduplication() {
-               $bag = new \HashBagOStuff();
+               $bag = new HashDedupeBag();
+
                $builder = $this->newRdfBuilder( RdfProducer::PRODUCE_ALL, $bag 
);
                $builder->addEntity( $this->getEntityData( 'Q7' ) );
                $data1 = $this->getDataFromBuilder( $builder );

-- 
To view, visit https://gerrit.wikimedia.org/r/204486
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie6d4c1527ef00200bc278eac604915bebf285778
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to