jenkins-bot has submitted this change and it was merged. Change subject: Handle 404s/deletes ......................................................................
Handle 404s/deletes When Wikibase returns a 404 for a page, we make sure that the page doesn't have any entries in the RDF store. This should cover deletes. Change-Id: I938d23fab7dd636198f2b0fb3ffde25a266f5815 --- M tools/src/main/java/org/wikidata/query/rdf/tool/Update.java M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java M tools/src/main/java/org/wikidata/query/rdf/tool/wikibase/WikibaseRepository.java M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java 6 files changed, 69 insertions(+), 21 deletions(-) Approvals: Manybubbles: Looks good to me, approved jenkins-bot: Verified diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java index 1338146..e249f4e 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java @@ -278,13 +278,13 @@ */ private void handleChange(Change change) throws RetryableException, ContainedException { log.debug("Received revision information {}", change); - // TODO deletes if (change.revision() >= 0 && rdfRepository.hasRevision(change.entityId(), change.revision())) { log.debug("RDF repostiroy already has this revision, skipping."); return; } Munger munger = new Munger(entityDataUris, entityUris); - rdfRepository.sync(change.entityId(), munger.munge(wikibase.fetchRdfForEntity(change.entityId()))); + rdfRepository.sync(change.entityId(), + munger.munge(change.entityId(), wikibase.fetchRdfForEntity(change.entityId()))); updateMeter.mark(); } diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java index 9f71496..858c7c2 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java +++ 
b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java @@ -38,11 +38,14 @@ * @param statements statements to munge * @return a reference to statements */ - public Collection<Statement> munge(Collection<Statement> statements) { - /* - * Filters and adds RDF based in a single pass. - */ + public Collection<Statement> munge(String entityId, Collection<Statement> statements) { + if (statements.isEmpty()) { + // Empty collection is a delete. + return statements; + } + // Filters and adds RDF based in a single pass. Iterator<Statement> itr = statements.iterator(); + String entityUri = entityUris.namespace() + entityId; Value revisionId = null; Value lastModified = null; Resource entity = null; @@ -71,7 +74,14 @@ } if (subject.startsWith(entityUris.namespace())) { entity = s.getSubject(); - if (predicate.equals(RDF.TYPE) && s.getObject().stringValue().equals(Ontology.ITEM)) { + if (!subject.equals(entityUri)) { + /* + * Some flavors of rdf dump information about other entities + * along side the main entity. We can't handle that properly + * and it doesn't make a ton of sense anyway. 
+ */ + itr.remove(); + } else if (predicate.equals(RDF.TYPE) && s.getObject().stringValue().equals(Ontology.ITEM)) { // We don't need wd:Q1 a wdo:item itr.remove(); } else if (predicate.equals(SchemaDotOrg.NAME)) { diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java index 47d6384..6709aa0 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java @@ -63,26 +63,32 @@ * @param statements all known statements about the entity */ public void sync(String entityId, Collection<Statement> statements) { + StringBuilder command = new StringBuilder(); UpdateBuilder siteLinksBuilder = updateBuilder(); siteLinksBuilder.delete("?s", "?p", "?o"); siteLinksBuilder.where("?s", "schema:about", "entity:" + entityId); siteLinksBuilder.where("?s", "?p", "?o"); - siteLinksBuilder.where().notExists().values(statements, "?s", "?p", "?o"); + if (!statements.isEmpty()) { + siteLinksBuilder.where().notExists().values(statements, "?s", "?p", "?o"); + } + command.append(siteLinksBuilder).append(";\n"); UpdateBuilder generalBuilder = updateBuilder(); generalBuilder.delete("entity:" + entityId, "?p", "?o"); generalBuilder.where("entity:" + entityId, "?p", "?o"); - generalBuilder.where().notExists().values(statements, "?s", "?p", "?o"); + if (!statements.isEmpty()) { + generalBuilder.where().notExists().values(statements, "?s", "?p", "?o"); + } + command.append(generalBuilder).append(";\n"); - UpdateBuilder insertBuilder = updateBuilder(); - for (Statement statement : statements) { - insertBuilder.insert(statement.getSubject(), statement.getPredicate(), statement.getObject()); + if (!statements.isEmpty()) { + UpdateBuilder insertBuilder = updateBuilder(); + for (Statement statement : statements) { + insertBuilder.insert(statement.getSubject(), statement.getPredicate(), statement.getObject()); 
+ } + command.append(insertBuilder).append(";\n"); } long start = System.currentTimeMillis(); - StringBuilder command = new StringBuilder(); - command.append(siteLinksBuilder).append(";\n"); - command.append(generalBuilder).append(";\n"); - command.append(insertBuilder).append(";\n"); execute("update", IGNORE_RESPONSE, command.toString()); log.debug("Updating {} took {} millis", entityId, System.currentTimeMillis() - start); } diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/wikibase/WikibaseRepository.java b/tools/src/main/java/org/wikidata/query/rdf/tool/wikibase/WikibaseRepository.java index 5a65658..82b8750 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/wikibase/WikibaseRepository.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/wikibase/WikibaseRepository.java @@ -8,6 +8,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; +import java.util.Collections; import java.util.Date; import java.util.List; import java.util.Locale; @@ -87,6 +88,14 @@ parser.setRDFHandler(collector); try { try (CloseableHttpResponse response = client.execute(new HttpGet(uri))) { + if (response.getStatusLine().getStatusCode() == 404) { + // A delete/nonexistent page + return Collections.emptyList(); + } + if (response.getStatusLine().getStatusCode() >= 300) { + throw new ContainedException("Unexpected status code fetching RDF for " + uri + ": " + + response.getStatusLine().getStatusCode()); + } parser.parse(new InputStreamReader(response.getEntity().getContent(), Charsets.UTF_8), uri.toString()); } } catch (IOException e) { @@ -204,6 +213,7 @@ */ builder.setPath(String.format(Locale.ROOT, "/wiki/Special:EntityData/%s.ttl", title)); builder.addParameter("nocache", ""); + builder.addParameter("flavor", "dump"); return build(builder); } diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java index 
6ba5cf5..a6525ff 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java @@ -38,7 +38,7 @@ public void mungesEntityDataOntoEntity() { List<Statement> statements = basicEntity("Q23"); - munger.munge(statements); + munger.munge("Q23", statements); // This Matcher is so hard to build...... ImmutableList.Builder<Matcher<? super Statement>> matchers = ImmutableList.builder(); matchers.add(equalTo(statement("Q23", SchemaDotOrg.VERSION, new LiteralImpl("a revision number I promise")))); @@ -50,7 +50,7 @@ public void extraDataIsntModified() { List<Statement> statements = basicEntity("Q23"); statements.add(statement("Q23", "P509", "Q6")); - munger.munge(statements); + munger.munge("Q23", statements); assertThat(statements, hasItem(equalTo(statement("Q23", "P509", "Q6")))); } @@ -58,7 +58,7 @@ public void complainsAboutExtraSubjects() { List<Statement> statements = basicEntity("Q23"); statements.add(statement("http://example.com/bogus", "Q23", "Q23")); - munger.munge(statements); + munger.munge("Q23", statements); } @Test @@ -75,7 +75,7 @@ statements.add(metaDecl); statements.add(articleDecl); } - munger.munge(statements); + munger.munge("Q23", statements); assertThat(statements, both(hasItem(equalTo(articleDecl))).and(hasItem(equalTo(metaDecl)))); } @@ -87,12 +87,25 @@ List<Statement> statements = basicEntity("Q23"); statements.addAll(ImmutableList.of(rdfsDecl, skosDecl, schemaDecl)); - munger.munge(statements); + munger.munge("Q23", statements); assertThat(statements, hasItem(equalTo(rdfsDecl))); assertThat(statements, not(hasItem(equalTo(skosDecl)))); assertThat(statements, not(hasItem(equalTo(schemaDecl)))); } + @Test + public void labelsOnOthersRemoved() { + Statement georgeDecl = statement("Q23", RDFS.LABEL, new LiteralImpl("george", "en")); + Statement marthaDecl = statement("Q191789", RDFS.LABEL, new LiteralImpl("martha", "en")); + + List<Statement> statements = 
basicEntity("Q23"); + statements.add(georgeDecl); + statements.add(marthaDecl); + munger.munge("Q23", statements); + assertThat(statements, hasItem(equalTo(georgeDecl))); + assertThat(statements, not(hasItem(equalTo(marthaDecl)))); + } + private List<Statement> basicEntity(String entityId) { List<Statement> statements = new ArrayList<>(); String entityData = EntityData.WIKIDATA.namespace() + entityId; diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java index 619e1eb..268717e 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java @@ -11,6 +11,7 @@ import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Locale; @@ -193,6 +194,14 @@ assertFalse(r.hasNext()); } + @Test + public void delete() throws QueryEvaluationException { + newSiteLink(); + repository.sync("Q23", Collections.<Statement> emptyList()); + TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}"); + assertFalse(r.hasNext()); + } + private void syncJustVersion(String entityId, int version) { Statement statement = statement(entityId, SchemaDotOrg.VERSION, new IntegerLiteralImpl(new BigInteger(Integer.toString(version)))); -- To view, visit https://gerrit.wikimedia.org/r/200774 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I938d23fab7dd636198f2b0fb3ffde25a266f5815 Gerrit-PatchSet: 1 Gerrit-Project: wikidata/query/rdf Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list 
MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits