jenkins-bot has submitted this change and it was merged.

Change subject: Check URIs in data values against vocabulary base URI.
......................................................................


Check URIs in data values against vocabulary base URI.

We use URIs as identifiers for calendars, reference globes, and units
of measurement. The vocabulary these URIs come from should be configurable.

Currently, calendars and globes are hardcoded to use Wikidata URIs,
while units use the local repo concepts as the vocabulary.

This change checks incoming data values against the appropriate URI
prefixes.

Bug: T111171
Change-Id: Ib91b1c0a297fd9ab54b0dabf446eb8850e46ac0d
---
M repo/includes/ValidatorBuilders.php
M repo/includes/WikibaseRepo.php
M repo/tests/phpunit/includes/ValidatorBuildersTest.php
3 files changed, 99 insertions(+), 26 deletions(-)

Approvals:
  Thiemo Mättig (WMDE): Looks good to me, approved
  JanZerebecki: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/repo/includes/ValidatorBuilders.php b/repo/includes/ValidatorBuilders.php
index 7475572..60d01ea 100644
--- a/repo/includes/ValidatorBuilders.php
+++ b/repo/includes/ValidatorBuilders.php
@@ -37,19 +37,35 @@
 class ValidatorBuilders {
 
        /**
-        * @var EntityIdParser
-        */
-       private $entityIdParser;
-
-       /**
         * @var EntityLookup
         */
        private $entityLookup;
 
        /**
+        * @var EntityIdParser
+        */
+       private $entityIdParser;
+
+       /**
         * @var string[]
         */
        private $urlSchemes;
+
+       /**
+        * @var string The base URI for the vocabulary to use for units (and in the
+        * future, globes and calendars).
+        */
+       private $vocabularyBaseUri;
+
+       /**
+        * @var string The base URI wikibase concepts, for use with the validators for time and globe
+        * values. Our parsers for these data types currently have Wikidata URIs hardcoded, so we need
+        * to hardcode the URI to check them against for now.
+        *
+        * @todo: use a configurable vocabulary for calendars and reference globes, instead of
+        * hardcoding wikidata. Then replace usages of $wikidataBaseUri with $vocabularyBaseUri.
+        */
+       private $wikidataBaseUri = 'http://www.wikidata.org/entity/';
 
        /**
         *
@@ -62,17 +78,20 @@
         * @param EntityIdParser $idParser
         * @param string[] $urlSchemes
         * @param ContentLanguages $contentLanguages
+        * @param string $vocabularyBaseUri The base URI for vocabulary concepts.
         */
        public function __construct(
                EntityLookup $lookup,
                EntityIdParser $idParser,
                array $urlSchemes,
+               $vocabularyBaseUri,
                ContentLanguages $contentLanguages
        ) {
-               $this->contentLanguages = $contentLanguages;
-               $this->entityIdParser = $idParser;
                $this->entityLookup = $lookup;
+               $this->entityIdParser = $idParser;
                $this->urlSchemes = $urlSchemes;
+               $this->vocabularyBaseUri = $vocabularyBaseUri;
+               $this->contentLanguages = $contentLanguages;
        }
 
        /**
@@ -187,7 +206,7 @@
                $validators[] = new TypeValidator( 'array' );
 
                // Expected to be a short IRI, see TimeFormatter and TimeParser.
-               $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), 255 );
+               $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), $this->wikidataBaseUri, 255 );
                //TODO: enforce well known calendar models from config
 
                $validators[] = new DataFieldValidator( 'calendarmodel', $urlValidator );
@@ -235,7 +254,7 @@
                $validators[] = new TypeValidator( 'array' );
 
                // Expected to be a short IRI, see GlobeCoordinateValue and GlobeCoordinateParser.
-               $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), 255 );
+               $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), $this->wikidataBaseUri, 255 );
                //TODO: enforce well known reference globes from config
 
                $validators[] = new DataFieldValidator( 'precision', new NumberValidator() );
@@ -251,13 +270,14 @@
 
        /**
         * @param string[] $urlSchemes List of URL schemes, e.g. 'http'
+        * @param string|null $prefix a required prefix
         * @param int $maxLength Defaults to 500 characters. Even if URLs are unlimited in theory they
         * should be limited to about 2000. About 500 is a reasonable compromise.
         * @see http://stackoverflow.com/a/417184
         *
         * @return CompositeValidator
         */
-       private function getUrlValidator( $urlSchemes, $maxLength = 500 ) {
+       private function getUrlValidator( array $urlSchemes, $prefix = null, $maxLength = 500 ) {
                $validators = array();
                $validators[] = new TypeValidator( 'string' );
                $validators[] = new StringLengthValidator( 2, $maxLength );
@@ -266,7 +286,23 @@
                $urlSchemeValidators = $urlValidators->getValidators( $urlSchemes );
                $validators[] = new UrlValidator( $urlSchemeValidators );
 
+               if ( $prefix !== null ) {
+                       //XXX: we may want to allow http AND https.
+                       $validators[] = $this->getPrefixValidator( $prefix, 'bad-prefix' );
+               }
+
                return new CompositeValidator( $validators ); //Note: each validator is fatal
+       }
+
+       /**
+        * @param string $prefix
+        * @param string $errorCode
+        *
+        * @return RegexValidator
+        */
+       private function getPrefixValidator( $prefix, $errorCode ) {
+               $regex = '!^' . preg_quote( $prefix, '!' ) . '!';
+               return new RegexValidator( $regex, false, $errorCode );
        }
 
        /**
@@ -295,9 +331,9 @@
                $unitValidators = new AlternativeValidator( array(
                        // NOTE: "1" is always considered legal for historical reasons,
                        // since we use it to represent "unitless" quantities. We could also use
-                       // http://qudt.org/vocab/unit#Unitless or https://www.wikidata.org/entity/Q199
+                       // http://qudt.org/vocab/unit#Unitless or http://www.wikidata.org/entity/Q199
                        new MembershipValidator( array( '1' ) ),
-                       $this->getUrlValidator( array( 'http', 'https' ), 255 ),
+                       $this->getUrlValidator( array( 'http', 'https' ), $this->vocabularyBaseUri, 255 ),
                ) );
                $validators[] = new DataFieldValidator( 'unit', $unitValidators );
 
diff --git a/repo/includes/WikibaseRepo.php b/repo/includes/WikibaseRepo.php
index 5ce712c..3b5bab8 100644
--- a/repo/includes/WikibaseRepo.php
+++ b/repo/includes/WikibaseRepo.php
@@ -265,6 +265,7 @@
                        $this->getEntityLookup(),
                        $this->getEntityIdParser(),
                        $urlSchemes,
+                       $this->getVocabularyBaseUri(),
                        $this->getMonolingualTextLanguages()
                );
        }
@@ -722,6 +723,15 @@
        }
 
        /**
+        * @return string
+        */
+       private function getVocabularyBaseUri() {
+               //@todo: We currently use the local repo concept URI here. This should be configurable,
+               // to e.g. allow 3rd parties to use Wikidata as their vocabulary repo.
+               return $this->getSettings()->getSetting( 'conceptBaseUri' );
+       }
+
+       /**
         * @return OutputFormatSnakFormatterFactory
         */
        protected function newSnakFormatterFactory() {
diff --git a/repo/tests/phpunit/includes/ValidatorBuildersTest.php b/repo/tests/phpunit/includes/ValidatorBuildersTest.php
index 09b4dff..2233c01 100644
--- a/repo/tests/phpunit/includes/ValidatorBuildersTest.php
+++ b/repo/tests/phpunit/includes/ValidatorBuildersTest.php
@@ -54,6 +54,7 @@
                        $entityLookup,
                        $entityIdParser,
                        $urlSchemes,
+                       'http://qudt.org/vocab/',
                        $contentLanguages
                );
 
@@ -62,6 +63,7 @@
 
        public function provideDataTypeValidation() {
                $latLonValue = new LatLongValue( 0, 0 );
+               $wikidataUri = 'http://www.wikidata.org/entity/';
 
                $cases = array(
                        //wikibase-item
@@ -114,21 +116,21 @@
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_DAY,
-                                       'http://' . str_repeat( 'x', 256 ) ),
+                                       $wikidataUri . 'Q' . str_repeat( '6', 256 ) ),
                                false,
                                'calendar: too long'
                        ),
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_DAY,
-                                       'http://acme.com/calendar' ),
+                                       $wikidataUri . 'Q1985727' ),
                                true,
                                'calendar: URL'
                        ),
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_DAY,
-                                       ' http://acme.com/calendar ' ),
+                                       ' ' . $wikidataUri . 'Q1985727 ' ),
                                false,
                                'calendar: untrimmed'
                        ),
@@ -144,14 +146,14 @@
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T11:22:33Z', 0, 0, 0, TimeValue::PRECISION_DAY,
-                                       'http://acme.com/calendar' ),
+                                       $wikidataUri . 'Q1985727' ),
                                false,
                                'time given to the second'
                        ),
                        array(
                                'time',
                                new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_SECOND,
-                                       'http://acme.com/calendar' ),
+                                       $wikidataUri . 'Q1985727' ),
                                false,
                                'precision: second'
                        ),
@@ -171,30 +173,55 @@
                        //globe-coordinate[precision]
                        array(
                                'globe-coordinate',
-                               new GlobeCoordinateValue( $latLonValue, 1, 'http://www.wikidata.org/entity/Q2' ),
+                               new GlobeCoordinateValue( $latLonValue, 1, $wikidataUri . 'Q2' ),
                                true,
                                'integer precision is valid'
                        ),
                        array(
                                'globe-coordinate',
-                               new GlobeCoordinateValue( $latLonValue, 0.2, 'http://www.wikidata.org/entity/Q2' ),
+                               new GlobeCoordinateValue( $latLonValue, 0.2, $wikidataUri . 'Q2' ),
                                true,
                                'float precision is valid'
                        ),
                        array(
                                'globe-coordinate',
-                               new GlobeCoordinateValue( $latLonValue, null, 'http://www.wikdiata.org/entity/Q2' ),
+                               new GlobeCoordinateValue( $latLonValue, null, $wikidataUri . 'Q2' ),
                                false,
                                'null precision is invalid'
                        ),
 
                        //globe-coordinate[globe]
                        // FIXME: this is testing unimplemented behaviour? Probably broken...
-                       array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, '' ), false, 'globe: empty string should be invalid' ),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, 'http://' . str_repeat( 'x', 256 ) ), false, 'globe: too long' ),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, 'http://acme.com/globe' ), true, 'globe: URL' ),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, ' http://acme.com/globe ' ), false, 'globe: untrimmed' ),
-                       array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, ' javascript:alert(1) ' ), false, 'globe: bad URL scheme' ),
+                       array(
+                               'globe-coordinate',
+                               new GlobeCoordinateValue( $latLonValue, 1, '' ),
+                               false,
+                               'globe: empty string should be invalid'
+                       ),
+                       array(
+                               'globe-coordinate',
+                               new GlobeCoordinateValue( $latLonValue, 1, $wikidataUri . 'Q' . str_repeat( '6', 256 ) ),
+                               false,
+                               'globe: too long'
+                       ),
+                       array(
+                               'globe-coordinate',
+                               new GlobeCoordinateValue( $latLonValue, 1, $wikidataUri . 'Q2' ),
+                               true,
+                               'globe: URL'
+                       ),
+                       array(
+                               'globe-coordinate',
+                               new GlobeCoordinateValue( $latLonValue, 1, ' ' . $wikidataUri . 'Q2 ' ),
+                               false,
+                               'globe: untrimmed'
+                       ),
+                       array(
+                               'globe-coordinate',
+                               new GlobeCoordinateValue( $latLonValue, 1, ' javascript:alert(1) ' ),
+                               false,
+                               'globe: bad URL scheme'
+                       ),
                        //TODO: globe must be an item reference
                        //TODO: globe must be from a list of configured values
 
@@ -221,7 +248,7 @@
                        //quantity
                        array( 'quantity', QuantityValue::newFromNumber( 5 ), true, 'Simple integer' ),
                        array( 'quantity', QuantityValue::newFromNumber( 5, 'http://qudt.org/vocab/unit#Meter' ), true, 'Vocabulary URI' ),
-                       array( 'quantity', QuantityValue::newFromNumber( 5, 'https://www.wikidata.org/entity/Q11573' ), true, 'Wikidata URI' ),
+                       array( 'quantity', QuantityValue::newFromNumber( 5, $wikidataUri . 'Q11573' ), false, 'Wikidata URI' ),
                        array( 'quantity', QuantityValue::newFromNumber( 5, '1' ), true, '1 means unitless' ),
                        array( 'quantity', QuantityValue::newFromNumber( 5, 'kittens' ), false, 'Bad unit URI' ),
                        array( 'quantity', QuantityValue::newFromNumber( '-11.234', '1', '-10', '-12' ), true, 'decimal strings' ),

-- 
To view, visit https://gerrit.wikimedia.org/r/235512
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib91b1c0a297fd9ab54b0dabf446eb8850e46ac0d
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: Addshore <addshorew...@gmail.com>
Gerrit-Reviewer: Aude <aude.w...@gmail.com>
Gerrit-Reviewer: Bene <benestar.wikime...@gmail.com>
Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de>
Gerrit-Reviewer: JanZerebecki <jan.wikime...@zerebecki.de>
Gerrit-Reviewer: Jonas Kress (WMDE) <jonas.kr...@wikimedia.de>
Gerrit-Reviewer: Lydia Pintscher <lydia.pintsc...@wikimedia.de>
Gerrit-Reviewer: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to