Hjfocs has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/360376 )

Change subject: T168270: Validate the RDF syntax of a third-party dataset
......................................................................

T168270: Validate the RDF syntax of a third-party dataset

The first step towards the validation of the Wikidata
RDF data model.
Use case: the Wikidata primary sources tool allows
third-party providers to release their datasets, which will
undergo curation by Wikidata editors before the actual
inclusion.
Hence, full compliance with the Wikidata data model and
vocabularies is an essential requirement for a dataset to be
uploaded into the primary sources tool.
The Wikidata query service seems a good fit for the tool
back-end.

Change-Id: I0dba9bb036058123a5e44bb16b30f1fcfa6a37ec
---
A tools/src/main/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidator.java
A tools/src/test/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidatorUnitTest.java
A tools/src/test/resources/chuck_berry.ttl
3 files changed, 155 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/76/360376/1

diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidator.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidator.java
new file mode 100644
index 0000000..6275493
--- /dev/null
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidator.java
@@ -0,0 +1,46 @@
+package org.wikidata.query.rdf.tool.rdf;
+
+import org.openrdf.model.Model;
+import org.openrdf.rio.RDFFormat;
+import org.openrdf.rio.RDFParseException;
+import org.openrdf.rio.Rio;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * Validator for third-party RDF datasets.
+ * Only the RDF syntax check is implemented so far, see
+ * {@link #checkSyntax(InputStream, String, RDFFormat)}.
+ * Created on Jun 19, 2017.
+ *
+ * @author Marco Fossati - User:Hjfocs
+ * @since 0.2.4
+ */
+public class WikibaseDataModelValidator {
+
+    private static final Logger log =
+        LoggerFactory.getLogger(WikibaseDataModelValidator.class);
+
+    /**
+     * Check the RDF syntax correctness of a given dataset.
+     * Note that parsing is done in memory over the whole dataset.
+     * Failures are logged and then reported to the caller as unchecked
+     * exceptions carrying the original cause; this method never
+     * terminates the JVM and never returns {@code null}.
+     *
+     * @param dataset - the input stream of the dataset to check
+     * @param baseURI - the base URI
+     * @param format  - the RDF format used to serialize the input dataset
+     * @return the successfully parsed RDF {@link Model}
+     * @throws IllegalStateException    if the dataset stream cannot be read
+     * @throws IllegalArgumentException if the dataset is not valid RDF
+     */
+    public Model checkSyntax(InputStream dataset, String baseURI,
+                             RDFFormat format) {
+        try {
+            return Rio.parse(dataset, baseURI, format);
+        } catch (IOException ioe) {
+            log.error("Couldn't read your dataset: {}", ioe.getMessage());
+            // Propagate instead of System.exit(1): a library method must
+            // not shut down the JVM of whoever is calling it.
+            throw new IllegalStateException("Couldn't read the dataset", ioe);
+        } catch (RDFParseException rpe) {
+            log.error("Your dataset is not valid RDF. Please fix it: {}",
+                rpe.getMessage());
+            throw new IllegalArgumentException(
+                "The dataset is not valid RDF", rpe);
+        }
+    }
+
+}
diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidatorUnitTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidatorUnitTest.java
new file mode 100644
index 0000000..b868dd4
--- /dev/null
+++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/WikibaseDataModelValidatorUnitTest.java
@@ -0,0 +1,90 @@
+package org.wikidata.query.rdf.tool.rdf;
+
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import com.google.common.io.Resources;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.openrdf.model.Model;
+import org.openrdf.rio.RDFFormat;
+import org.openrdf.rio.RDFParseException;
+import org.openrdf.rio.Rio;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+/**
+ * Unit test for {@link WikibaseDataModelValidator}.
+ * Created on Jun 19, 2017.
+ *
+ * @author Marco Fossati - User:Hjfocs
+ * @since 0.2.4
+ */
+@RunWith(RandomizedRunner.class)
+public class WikibaseDataModelValidatorUnitTest {
+
+    private static final String TEST_DATASET_FILE_NAME = "chuck_berry.ttl";
+    private static final String TEST_BASE_URI = "http://test.dataset";
+    private static WikibaseDataModelValidator validator;
+    private static Model parsedDataset;
+
+    /**
+     * Build the validator under test and the reference model, parsed
+     * directly with {@link Rio} from the test resource.
+     * Any failure is propagated to JUnit, so a broken fixture is reported
+     * as a test error instead of terminating the test runner.
+     */
+    @BeforeClass
+    public static void setUpOnce() throws IOException, RDFParseException {
+        validator = new WikibaseDataModelValidator();
+        // A dedicated stream per use: it is closed as soon as parsing ends.
+        try (InputStream dataset = openTestDatasetStream()) {
+            parsedDataset = Rio.parse(dataset, TEST_BASE_URI, RDFFormat.TURTLE);
+        }
+    }
+
+    /**
+     * Drop the static fixtures once all the tests have run.
+     */
+    @AfterClass
+    public static void tearDownOnce() {
+        validator = null;
+        parsedDataset = null;
+    }
+
+    /**
+     * Open a fresh buffered stream over the test dataset resource.
+     * The caller is responsible for closing it.
+     */
+    private static InputStream openTestDatasetStream() throws IOException {
+        return Resources.asByteSource(
+            Resources.getResource(TEST_DATASET_FILE_NAME))
+            .openBufferedStream();
+    }
+
+    @Test
+    public void testCheckSyntax() throws Exception {
+        Model checked;
+        try (InputStream dataset = openTestDatasetStream()) {
+            checked = validator.checkSyntax(dataset, TEST_BASE_URI,
+                RDFFormat.TURTLE);
+        }
+        // The test resource has a valid syntax
+        assertNotNull(checked);
+        assertEquals(parsedDataset, checked);
+    }
+
+}
diff --git a/tools/src/test/resources/chuck_berry.ttl b/tools/src/test/resources/chuck_berry.ttl
new file mode 100644
index 0000000..af91677
--- /dev/null
+++ b/tools/src/test/resources/chuck_berry.ttl
@@ -0,0 +1,19 @@
+@prefix wikibase: <http://wikiba.se/ontology-beta#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 a wikibase:Statement,
+               wikibase:BestRank ;
+       wikibase:rank wikibase:NormalRank ;
+       ps:P18 <http://commons.wikimedia.org/wiki/Special:FilePath/Chuck-berry-2007-07-18.jpg> ;
+       pq:P2096 "Chuck Berry (2007)"@ca ;
+       prov:wasDerivedFrom wdref:288ab581e7d2d02995a26dfa8b091d96e78457fc .
+
+wdref:288ab581e7d2d02995a26dfa8b091d96e78457fc a wikibase:Reference ;
+       pr:P143 wd:Q206855 .

-- 
To view, visit https://gerrit.wikimedia.org/r/360376
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0dba9bb036058123a5e44bb16b30f1fcfa6a37ec
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Hjfocs <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to