jenkins-bot has submitted this change and it was merged. Change subject: Check URIs in data values against vocabulary base URI. ......................................................................
Check URIs in data values against vocabulary base URI. We use URIs as identifiers for calendars, reference globes, and units of measurement. The vocabulary these URIs come from should be configurable. Currently, calendars and globes are hardcoded to use Wikidata URIs, while units use the local repo concepts as the vocabulary. This change checks incoming data values against the appropriate URI prefixes. Bug: T111171 Change-Id: Ib91b1c0a297fd9ab54b0dabf446eb8850e46ac0d --- M repo/includes/ValidatorBuilders.php M repo/includes/WikibaseRepo.php M repo/tests/phpunit/includes/ValidatorBuildersTest.php 3 files changed, 99 insertions(+), 26 deletions(-) Approvals: Thiemo Mättig (WMDE): Looks good to me, approved JanZerebecki: Looks good to me, approved jenkins-bot: Verified diff --git a/repo/includes/ValidatorBuilders.php b/repo/includes/ValidatorBuilders.php index 7475572..60d01ea 100644 --- a/repo/includes/ValidatorBuilders.php +++ b/repo/includes/ValidatorBuilders.php @@ -37,19 +37,35 @@ class ValidatorBuilders { /** - * @var EntityIdParser - */ - private $entityIdParser; - - /** * @var EntityLookup */ private $entityLookup; /** + * @var EntityIdParser + */ + private $entityIdParser; + + /** * @var string[] */ private $urlSchemes; + + /** + * @var string The base URI for the vocabulary to use for units (and in the + * future, globes and calendars). + */ + private $vocabularyBaseUri; + + /** + * @var string The base URI wikibase concepts, for use with the validators for time and globe + * values. Our parsers for these data types currently have Wikidata URIs hardcoded, so we need + * to hardcode the URI to check them against for now. + * + * @todo: use a configurable vocabulary for calendars and reference globes, instead of + * hardcoding wikidata. Then replace usages of $wikidataBaseUri with $vocabularyBaseUri. + */ + private $wikidataBaseUri = 'http://www.wikidata.org/entity/'; /** * @@ -62,17 +78,20 @@ * @param EntityIdParser $idParser * @param string[] $urlSchemes * @param ContentLanguages $contentLanguages + * @param string $vocabularyBaseUri The base URI for vocabulary concepts. */ public function __construct( EntityLookup $lookup, EntityIdParser $idParser, array $urlSchemes, + $vocabularyBaseUri, ContentLanguages $contentLanguages ) { - $this->contentLanguages = $contentLanguages; - $this->entityIdParser = $idParser; $this->entityLookup = $lookup; + $this->entityIdParser = $idParser; $this->urlSchemes = $urlSchemes; + $this->vocabularyBaseUri = $vocabularyBaseUri; + $this->contentLanguages = $contentLanguages; } /** @@ -187,7 +206,7 @@ $validators[] = new TypeValidator( 'array' ); // Expected to be a short IRI, see TimeFormatter and TimeParser. - $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), 255 ); + $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), $this->wikidataBaseUri, 255 ); //TODO: enforce well known calendar models from config $validators[] = new DataFieldValidator( 'calendarmodel', $urlValidator ); @@ -235,7 +254,7 @@ $validators[] = new TypeValidator( 'array' ); // Expected to be a short IRI, see GlobeCoordinateValue and GlobeCoordinateParser. - $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), 255 ); + $urlValidator = $this->getUrlValidator( array( 'http', 'https' ), $this->wikidataBaseUri, 255 ); //TODO: enforce well known reference globes from config $validators[] = new DataFieldValidator( 'precision', new NumberValidator() ); @@ -251,13 +270,14 @@ /** * @param string[] $urlSchemes List of URL schemes, e.g. 'http' + * @param string|null $prefix a required prefix * @param int $maxLength Defaults to 500 characters. Even if URLs are unlimited in theory they * should be limited to about 2000. About 500 is a reasonable compromise. * @see http://stackoverflow.com/a/417184 * * @return CompositeValidator */ - private function getUrlValidator( $urlSchemes, $maxLength = 500 ) { + private function getUrlValidator( array $urlSchemes, $prefix = null, $maxLength = 500 ) { $validators = array(); $validators[] = new TypeValidator( 'string' ); $validators[] = new StringLengthValidator( 2, $maxLength ); @@ -266,7 +286,23 @@ $urlSchemeValidators = $urlValidators->getValidators( $urlSchemes ); $validators[] = new UrlValidator( $urlSchemeValidators ); + if ( $prefix !== null ) { + //XXX: we may want to allow http AND https. + $validators[] = $this->getPrefixValidator( $prefix, 'bad-prefix' ); + } + return new CompositeValidator( $validators ); //Note: each validator is fatal + } + + /** + * @param string $prefix + * @param string $errorCode + * + * @return RegexValidator + */ + private function getPrefixValidator( $prefix, $errorCode ) { + $regex = '!^' . preg_quote( $prefix, '!' ) . '!'; + return new RegexValidator( $regex, false, $errorCode ); } /** @@ -295,9 +331,9 @@ $unitValidators = new AlternativeValidator( array( // NOTE: "1" is always considered legal for historical reasons, // since we use it to represent "unitless" quantities. We could also use - // http://qudt.org/vocab/unit#Unitless or https://www.wikidata.org/entity/Q199 + // http://qudt.org/vocab/unit#Unitless or http://www.wikidata.org/entity/Q199 new MembershipValidator( array( '1' ) ), - $this->getUrlValidator( array( 'http', 'https' ), 255 ), + $this->getUrlValidator( array( 'http', 'https' ), $this->vocabularyBaseUri, 255 ), ) ); $validators[] = new DataFieldValidator( 'unit', $unitValidators ); diff --git a/repo/includes/WikibaseRepo.php b/repo/includes/WikibaseRepo.php index 5ce712c..3b5bab8 100644 --- a/repo/includes/WikibaseRepo.php +++ b/repo/includes/WikibaseRepo.php @@ -265,6 +265,7 @@ $this->getEntityLookup(), $this->getEntityIdParser(), $urlSchemes, + $this->getVocabularyBaseUri(), $this->getMonolingualTextLanguages() ); } @@ -722,6 +723,15 @@ } /** + * @return string + */ + private function getVocabularyBaseUri() { + //@todo: We currently use the local repo concept URI here. This should be configurable, + // to e.g. allow 3rd parties to use Wikidata as their vocabulary repo. + return $this->getSettings()->getSetting( 'conceptBaseUri' ); + } + + /** * @return OutputFormatSnakFormatterFactory */ protected function newSnakFormatterFactory() { diff --git a/repo/tests/phpunit/includes/ValidatorBuildersTest.php b/repo/tests/phpunit/includes/ValidatorBuildersTest.php index 09b4dff..2233c01 100644 --- a/repo/tests/phpunit/includes/ValidatorBuildersTest.php +++ b/repo/tests/phpunit/includes/ValidatorBuildersTest.php @@ -54,6 +54,7 @@ $entityLookup, $entityIdParser, $urlSchemes, + 'http://qudt.org/vocab/', $contentLanguages ); @@ -62,6 +63,7 @@ public function provideDataTypeValidation() { $latLonValue = new LatLongValue( 0, 0 ); + $wikidataUri = 'http://www.wikidata.org/entity/'; $cases = array( //wikibase-item @@ -114,21 +116,21 @@ array( 'time', new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_DAY, - 'http://' . str_repeat( 'x', 256 ) ), + $wikidataUri . 'Q' . str_repeat( '6', 256 ) ), false, 'calendar: too long' ), array( 'time', new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_DAY, - 'http://acme.com/calendar' ), + $wikidataUri . 'Q1985727' ), true, 'calendar: URL' ), array( 'time', new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_DAY, - ' http://acme.com/calendar ' ), + ' ' . $wikidataUri . 'Q1985727 ' ), false, 'calendar: untrimmed' ), @@ -144,14 +146,14 @@ array( 'time', new TimeValue( '+2013-06-06T11:22:33Z', 0, 0, 0, TimeValue::PRECISION_DAY, - 'http://acme.com/calendar' ), + $wikidataUri . 'Q1985727' ), false, 'time given to the second' ), array( 'time', new TimeValue( '+2013-06-06T00:00:00Z', 0, 0, 0, TimeValue::PRECISION_SECOND, - 'http://acme.com/calendar' ), + $wikidataUri . 'Q1985727' ), false, 'precision: second' ), @@ -171,30 +173,55 @@ //globe-coordinate[precision] array( 'globe-coordinate', - new GlobeCoordinateValue( $latLonValue, 1, 'http://www.wikidata.org/entity/Q2' ), + new GlobeCoordinateValue( $latLonValue, 1, $wikidataUri . 'Q2' ), true, 'integer precision is valid' ), array( 'globe-coordinate', - new GlobeCoordinateValue( $latLonValue, 0.2, 'http://www.wikidata.org/entity/Q2' ), + new GlobeCoordinateValue( $latLonValue, 0.2, $wikidataUri . 'Q2' ), true, 'float precision is valid' ), array( 'globe-coordinate', - new GlobeCoordinateValue( $latLonValue, null, 'http://www.wikdiata.org/entity/Q2' ), + new GlobeCoordinateValue( $latLonValue, null, $wikidataUri . 'Q2' ), false, 'null precision is invalid' ), //globe-coordinate[globe] // FIXME: this is testing unimplemented behaviour? Probably broken... - array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, '' ), false, 'globe: empty string should be invalid' ), - array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, 'http://' . str_repeat( 'x', 256 ) ), false, 'globe: too long' ), - array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, 'http://acme.com/globe' ), true, 'globe: URL' ), - array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, ' http://acme.com/globe ' ), false, 'globe: untrimmed' ), - array( 'globe-coordinate', new GlobeCoordinateValue( $latLonValue, 1, ' javascript:alert(1) ' ), false, 'globe: bad URL scheme' ), + array( + 'globe-coordinate', + new GlobeCoordinateValue( $latLonValue, 1, '' ), + false, + 'globe: empty string should be invalid' + ), + array( + 'globe-coordinate', + new GlobeCoordinateValue( $latLonValue, 1, $wikidataUri . 'Q' . str_repeat( '6', 256 ) ), + false, + 'globe: too long' + ), + array( + 'globe-coordinate', + new GlobeCoordinateValue( $latLonValue, 1, $wikidataUri . 'Q2' ), + true, + 'globe: URL' + ), + array( + 'globe-coordinate', + new GlobeCoordinateValue( $latLonValue, 1, ' ' . $wikidataUri . 'Q2 ' ), + false, + 'globe: untrimmed' + ), + array( + 'globe-coordinate', + new GlobeCoordinateValue( $latLonValue, 1, ' javascript:alert(1) ' ), + false, + 'globe: bad URL scheme' + ), //TODO: globe must be an item reference //TODO: globe must be from a list of configured values @@ -221,7 +248,7 @@ //quantity array( 'quantity', QuantityValue::newFromNumber( 5 ), true, 'Simple integer' ), array( 'quantity', QuantityValue::newFromNumber( 5, 'http://qudt.org/vocab/unit#Meter' ), true, 'Vocabulary URI' ), - array( 'quantity', QuantityValue::newFromNumber( 5, 'https://www.wikidata.org/entity/Q11573' ), true, 'Wikidata URI' ), + array( 'quantity', QuantityValue::newFromNumber( 5, $wikidataUri . 'Q11573' ), false, 'Wikidata URI' ), array( 'quantity', QuantityValue::newFromNumber( 5, '1' ), true, '1 means unitless' ), array( 'quantity', QuantityValue::newFromNumber( 5, 'kittens' ), false, 'Bad unit URI' ), array( 'quantity', QuantityValue::newFromNumber( '-11.234', '1', '-10', '-12' ), true, 'decimal strings' ), -- To view, visit https://gerrit.wikimedia.org/r/235512 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ib91b1c0a297fd9ab54b0dabf446eb8850e46ac0d Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Addshore <addshorew...@gmail.com> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Bene <benestar.wikime...@gmail.com> Gerrit-Reviewer: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: JanZerebecki <jan.wikime...@zerebecki.de> Gerrit-Reviewer: Jonas Kress (WMDE) <jonas.kr...@wikimedia.de> Gerrit-Reviewer: Lydia Pintscher <lydia.pintsc...@wikimedia.de> Gerrit-Reviewer: Thiemo Mättig (WMDE) <thiemo.maet...@wikimedia.de> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits