Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/204074
Change subject: Switch update from java built to string replaced ...................................................................... Switch update from java built to string replaced This should make them easier to read and modify. Change-Id: I2a50b6b7a17a5cb6bdb5f62829076a4d3c67131b --- M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java A tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.sync.sparql M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java 4 files changed, 214 insertions(+), 246 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/74/204074/1 diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java index 2c6f75b..ab85823 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.java @@ -1,9 +1,12 @@ package org.wikidata.query.rdf.tool.rdf; +import static com.google.common.io.Resources.getResource; + import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.net.URI; +import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.Date; @@ -43,9 +46,11 @@ import org.wikidata.query.rdf.common.uri.SchemaDotOrg; import org.wikidata.query.rdf.common.uri.WikibaseUris; import org.wikidata.query.rdf.tool.exception.ContainedException; +import org.wikidata.query.rdf.tool.exception.FatalException; import com.google.common.base.Charsets; import com.google.common.io.CharStreams; +import com.google.common.io.Resources; public class RdfRepository { private static final Logger log = LoggerFactory.getLogger(RdfRepository.class); @@ -54,10 +59,17 @@ .build(); private final URI uri; private final WikibaseUris uris; + 
private final String syncBody; public RdfRepository(URI uri, WikibaseUris uris) { this.uri = uri; this.uris = uris; + URL syncBodyUrl = getResource(RdfRepository.class, "RdfRepository.sync.sparql"); + try { + syncBody = Resources.toString(syncBodyUrl, Charsets.UTF_8); + } catch (IOException e) { + throw new FatalException("Can't load " + syncBodyUrl); + } } /** @@ -76,101 +88,17 @@ public int sync(String entityId, Collection<Statement> statements) { // TODO this is becoming a mess too log.debug("Updating data for {}", entityId); - String entity = "entity:" + entityId; - UpdateBuilder siteLinksBuilder = updateBuilder(); - siteLinksBuilder.prefix("schema", SchemaDotOrg.NAMESPACE); - siteLinksBuilder.delete("?s", "?p", "?o"); - siteLinksBuilder.where("?s", "schema:about", "entity:" + entityId); - siteLinksBuilder.where("?s", "?p", "?o"); - if (!statements.isEmpty()) { - siteLinksBuilder.where().notExists().values(statements, "?s", "?p", "?o"); - } + UpdateBuilder b = new UpdateBuilder(syncBody); + b.bindUri("entity:id", uris.entity() + entityId); + b.bindUri("schema:about", SchemaDotOrg.ABOUT); + b.bindUri("prov:wasDerivedFrom", Provenance.WAS_DERIVED_FROM); + b.bind("uris.value", uris.value()); + b.bind("uris.statement", uris.statement()); + b.bindStatements("insertStatements", statements); + b.bindValues("valueStatements", statements); - UpdateBuilder valuesOnReferencesBuilder = updateBuilder(); - valuesOnReferencesBuilder.prefix("prov", Provenance.NAMESPACE); - valuesOnReferencesBuilder.delete("?s", "?p", "?o"); - valuesOnReferencesBuilder.where(entity, "?statementPred", "?statement"); - valuesOnReferencesBuilder.where().add(startsWith("?statement", uris.statement())); - valuesOnReferencesBuilder.where("?statement", "prov:wasDerivedFrom", "?ref"); - valuesOnReferencesBuilder.where("?ref", "?expandedValuePred", "?s"); - valuesOnReferencesBuilder.where().add(startsWith("?s", uris.value())); - valuesOnReferencesBuilder.where("?s", "?p", "?o"); - // We can't clear 
references that are still used elsewhere - valuesOnReferencesBuilder.where().notExists()// - .add("?otherStatement", "prov:wasDerivedFrom", "?ref")// - .add("?otherEntity", "?otherStatementPred", "?otherStatement")// - .add("FILTER ( " + entity + " != ?otherEntity ) ."); - if (!statements.isEmpty()) { - valuesOnReferencesBuilder.where().notExists().values(statements, "?s", "?p", "?o"); - } - - UpdateBuilder referencesBuilder = updateBuilder(); - referencesBuilder.prefix("prov", Provenance.NAMESPACE); - referencesBuilder.delete("?s", "?p", "?o"); - referencesBuilder.where(entity, "?statementPred", "?statement"); - referencesBuilder.where().add(startsWith("?statement", uris.statement())); - referencesBuilder.where("?statement", "prov:wasDerivedFrom", "?s"); - referencesBuilder.where("?s", "?p", "?o"); - // We can't clear references that are still used elsewhere - referencesBuilder.where().notExists()// - .add("?otherStatement", "prov:wasDerivedFrom", "?s")// - .add("?otherEntity", "?otherStatementPred", "?otherStatement")// - .add("FILTER ( " + entity + " != ?otherEntity ) ."); - if (!statements.isEmpty()) { - referencesBuilder.where().notExists().values(statements, "?s", "?p", "?o"); - } - - UpdateBuilder valuesOnExpandedStatementsBuilder = updateBuilder(); - valuesOnExpandedStatementsBuilder.delete("?s", "?p", "?o"); - valuesOnExpandedStatementsBuilder.where(entity, "?statementPred", "?statement"); - valuesOnExpandedStatementsBuilder.where().add(startsWith("?statement", uris.statement())); - valuesOnExpandedStatementsBuilder.where("?statement", "?expandedValuePred", "?s"); - valuesOnExpandedStatementsBuilder.where().add(startsWith("?s", uris.value())); - valuesOnExpandedStatementsBuilder.where("?s", "?p", "?o"); - if (!statements.isEmpty()) { - valuesOnExpandedStatementsBuilder.where().notExists().values(statements, "?s", "?p", "?o"); - } - - UpdateBuilder expandedStatementsBuilder = updateBuilder(); - expandedStatementsBuilder.delete("?s", "?p", "?o"); - 
expandedStatementsBuilder.where(entity, "?statementPred", "?s"); - expandedStatementsBuilder.where().add(startsWith("?s", uris.statement())); - expandedStatementsBuilder.where("?s", "?p", "?o"); - if (!statements.isEmpty()) { - expandedStatementsBuilder.where().notExists().values(statements, "?s", "?p", "?o"); - } - - UpdateBuilder generalBuilder = updateBuilder(); - generalBuilder.delete(entity, "?p", "?o"); - generalBuilder.where(entity, "?p", "?o"); - if (!statements.isEmpty()) { - // TODO should this be statements, entity, ?p, ?o ? - generalBuilder.where().notExists().values(statements, "?s", "?p", "?o"); - } - - /* - * The order in which these are executed is important: if you think of - * the triples that must be managed by this action as a tree then you - * must start with the leaves first or when you clear out the trunk the - * leaves will be orphaned. - */ - StringBuilder command = new StringBuilder(); - command.append(siteLinksBuilder).append(";\n"); - command.append(valuesOnReferencesBuilder).append(";\n"); - command.append(referencesBuilder).append(";\n"); - command.append(valuesOnExpandedStatementsBuilder).append(";\n"); - command.append(expandedStatementsBuilder).append(";\n"); - command.append(generalBuilder).append(";\n"); - - if (!statements.isEmpty()) { - UpdateBuilder insertBuilder = updateBuilder(); - for (Statement statement : statements) { - insertBuilder.insert(statement.getSubject(), statement.getPredicate(), statement.getObject()); - } - command.append(insertBuilder).append(";\n"); - } long start = System.currentTimeMillis(); - int modified = execute("update", UPDATE_COUNT_RESPONSE, command.toString()); + int modified = execute("update", UPDATE_COUNT_RESPONSE, b.toString()); log.debug("Updating {} took {} millis and modified {} statements", entityId, System.currentTimeMillis() - start, modified); return modified; @@ -265,21 +193,6 @@ private String responseBodyAsString(CloseableHttpResponse response) throws IOException { return 
CharStreams.toString(new InputStreamReader(response.getEntity().getContent(), "UTF-8")); - } - - private UpdateBuilder updateBuilder() { - UpdateBuilder b = new UpdateBuilder(); - b.prefix("entity", uris.entity()); - return b; - } - - private String startsWith(String name, String prefix) { - StringBuilder filter = new StringBuilder(); - filter.append("FILTER( STRSTARTS(STR("); - filter.append(name).append("), \""); - filter.append(prefix); - filter.append("\") ) ."); - return filter.toString(); } /** diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.sync.sparql b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.sync.sparql new file mode 100644 index 0000000..c30cb0b --- /dev/null +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/RdfRepository.sync.sparql @@ -0,0 +1,112 @@ +# Clear out of date site links +DELETE { + ?s ?p ?o . +} +WHERE { + ?s %schema:about% %entity:id% . + ?s ?p ?o . + # This construct is constantly reused throughout the updates. Its job is to not delete statements + # that are still in use. + FILTER NOT EXISTS { + VALUES ( ?s ?p ?o ) { + %valueStatements% + } + } +}; +# Clear statements on expanded values in references that are no longer used +DELETE { + ?s ?p ?o . +} +WHERE { + %entity:id% ?statementPred ?statement . + FILTER( STRSTARTS(STR(?statement), "%uris.statement%") ) . + ?statement %prov:wasDerivedFrom% ?ref . + # Since references are shared we can only clear the values on them when they are no longer used + # anywhere else. + FILTER NOT EXISTS { + ?otherStatement %prov:wasDerivedFrom% ?ref . + ?otherEntity ?otherStatementPred ?otherStatement . + FILTER ( %entity:id% != ?otherEntity ) . + } + ?ref ?expandedValuePred ?s . + # Without this filter we'd try to delete stuff from entities. For example that pattern above matches + # ref:_ v:P143 entity:Q328 + # so we'd try to clear everything from Q328 (enwiki). So we filter where ?s is in the value prefix. 
+ FILTER( STRSTARTS(STR(?s), "%uris.value%") ) . + ?s ?p ?o . + FILTER NOT EXISTS { + VALUES ( ?s ?p ?o ) { + %valueStatements% + } + } +}; +# Clear statements about references that are no longer used +DELETE { + ?s ?p ?o . +} +WHERE { + %entity:id% ?statementPred ?statement . + FILTER( STRSTARTS(STR(?statement), "%uris.statement%") ) . + ?statement %prov:wasDerivedFrom% ?s . + # Since references are shared we can only clear the values on them when they are no longer used + # anywhere else. + FILTER NOT EXISTS { + ?otherStatement %prov:wasDerivedFrom% ?s . + ?otherEntity ?otherStatementPred ?otherStatement . + FILTER ( %entity:id% != ?otherEntity ) . + } + ?s ?p ?o . + FILTER NOT EXISTS { + VALUES ( ?s ?p ?o ) { + %valueStatements% + } + } +}; +# Clear out of date expanded values on statements about the entity +DELETE { + ?s ?p ?o . +} +WHERE { + %entity:id% ?statementPred ?statement . + FILTER( STRSTARTS(STR(?statement), "%uris.statement%") ) . + ?statement ?expandedValuePred ?s . + # Without this filter we'd clear all kinds of things. Only try and clear value nodes. + FILTER( STRSTARTS(STR(?s), "%uris.value%") ) . + ?s ?p ?o . + FILTER NOT EXISTS { + VALUES ( ?s ?p ?o ) { + %valueStatements% + } + } +}; +# Clear out of date statements about statements +DELETE { + ?s ?p ?o . +} +WHERE { + %entity:id% ?statementPred ?s . + FILTER( STRSTARTS(STR(?s), "%uris.statement%") ) . + ?s ?p ?o . + FILTER NOT EXISTS { + VALUES ( ?s ?p ?o ) { + %valueStatements% + } + } +}; +# Clear out of date statements about the entity +DELETE { + %entity:id% ?p ?o . +} +WHERE { + %entity:id% ?p ?o . 
+ FILTER NOT EXISTS { + VALUES ( ?s ?p ?o ) { + %valueStatements% + } + } +}; +INSERT { + %insertStatements% +} +WHERE {}; + diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java index 1a803e8..345c211 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java @@ -1,177 +1,92 @@ package org.wikidata.query.rdf.tool.rdf; -import java.util.ArrayList; import java.util.Collection; -import java.util.List; import org.openrdf.model.Literal; import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.vocabulary.XMLSchema; -import com.google.common.base.Joiner; - /** * Quick and dirty update builder. */ public class UpdateBuilder { - private final StringBuilder prefixes = new StringBuilder(); - private final BasicPart delete = new BasicPart(); - private final BasicPart insert = new BasicPart(); - private final BasicPart where = new BasicPart(); + private String template; - public UpdateBuilder prefix(String prefix, String expandedForm) { - prefixes.append("PREFIX ").append(prefix).append(": <").append(expandedForm).append(">\n"); + public UpdateBuilder(String template) { + this.template = template; + } + + public UpdateBuilder bind(String from, String to) { + template = template.replace('%' + from + '%', to); return this; } - public UpdateBuilder delete(Object s, Object p, Object o) { - delete.add(s, p, o); + public UpdateBuilder bindUri(String from, String to) { + bind(from, '<' + to + '>'); return this; } - public UpdateBuilder insert(Object s, Object p, Object o) { - insert.add(s, p, o); + public UpdateBuilder bindStatements(String from, Collection<Statement> statements) { + StringBuilder b = new StringBuilder(statements.size() * 30); + for (Statement s : statements) { + b.append(str(s.getSubject())).append(' '); + 
b.append(str(s.getPredicate())).append(' '); + b.append(str(s.getObject())).append(" .\n"); + } + bind(from, b.toString().trim()); return this; } - public UpdateBuilder where(Object s, Object p, Object o) { - where.add(s, p, o); + public UpdateBuilder bindValues(String from, Collection<Statement> statements) { + StringBuilder b = new StringBuilder(statements.size() * 30); + for (Statement s : statements) { + b.append("( ").append(str(s.getSubject())).append(' '); + b.append(str(s.getPredicate())).append(' '); + b.append(str(s.getObject())).append(" )\n"); + } + bind(from, b.toString().trim()); return this; - } - - public BasicPart where() { - return where; } @Override public String toString() { - StringBuilder b = new StringBuilder(); - b.append(prefixes); - if (!delete.parts.isEmpty()) { - b.append("DELETE {\n").append(delete).append("}\n"); - } - if (!insert.parts.isEmpty()) { - b.append("INSERT {\n").append(insert).append("}\n"); - } - b.append("WHERE {\n").append(where).append("}"); - return b.toString(); + return template; } - private static class AbstractPart<Self extends AbstractPart<Self>> { - protected final List<Object> parts = new ArrayList<>(); - protected final int indent; - - private AbstractPart(int indent) { - this.indent = indent; + /** + * Properly stringify a subject, predicate, or object so it fits in the + * update query. + */ + private String str(Object o) { + if (o instanceof String) { + // Got to escape those quotes + return o.toString().replace("\"", "\\\""); } - - @SuppressWarnings("unchecked") - public Self add(String s) { - parts.add(indent().append(s)); - return (Self) this; + if (o instanceof URI) { + return '<' + o.toString() + '>'; } + if (o instanceof Literal) { + Literal l = (Literal) o; + // This is very similar to LiteralImpl's toString but with label + // escaping. 
+ StringBuilder sb = new StringBuilder(l.getLabel().length() * 2); - @SuppressWarnings("unchecked") - public Self add(Object s, Object p, Object o) { - parts.add(indent().append(str(s)).append(' ').append(str(p)).append(' ').append(str(o)).append(" .")); - return (Self) this; - } + sb.append('"'); + sb.append(l.getLabel().replace("\\", "\\\\").replace("\"", "\\\"")); + sb.append('"'); - public NotExists notExists() { - NotExists ne = new NotExists(indent + 1); - parts.add(ne); - return ne; - } - - @Override - public String toString() { - return Joiner.on('\n').join(parts) + "\n"; - } - - /** - * Properly stringify a subject, predicate, or object so it fits in the - * update query. - */ - protected String str(Object o) { - if (o instanceof String) { - // Got to escape those quotes - return o.toString().replace("\"", "\\\""); + if (l.getLanguage() != null) { + sb.append('@'); + sb.append(l.getLanguage()); + } else if (!l.getDatatype().equals(XMLSchema.STRING)) { + sb.append("^^<"); + sb.append(l.getDatatype()); + sb.append(">"); } - if (o instanceof URI) { - return "<" + o + ">"; - } - if (o instanceof Literal) { - Literal l = (Literal) o; - // This is very similar to LiteralImpl's toString but with label - // escaping. 
- StringBuilder sb = new StringBuilder(l.getLabel().length() * 2); - sb.append('"'); - sb.append(l.getLabel().replace("\\", "\\\\").replace("\"", "\\\"")); - sb.append('"'); - - if (l.getLanguage() != null) { - sb.append('@'); - sb.append(l.getLanguage()); - } else if (!l.getDatatype().equals(XMLSchema.STRING)) { - sb.append("^^<"); - sb.append(l.getDatatype()); - sb.append(">"); - } - - return sb.toString(); - } - throw new RuntimeException("I have no idea what do to with a " + o.getClass()); + return sb.toString(); } - - protected StringBuilder indent() { - return indent(new StringBuilder(), indent); - } - - protected StringBuilder indent(StringBuilder b, int indentation) { - for (int i = 0; i < indentation; i++) { - b.append(" "); - } - return b; - } - } - - public static class BasicPart extends AbstractPart<BasicPart> { - private BasicPart() { - super(1); - } - } - - public static class NotExists extends AbstractPart<NotExists> { - private NotExists(int indent) { - super(indent); - } - - @Override - public String toString() { - StringBuilder b = indent(new StringBuilder(), indent - 1); - b.append("FILTER NOT EXISTS {\n"); - b.append(super.toString()); - indent(b, indent - 1).append("}"); - return b.toString(); - } - - public NotExists values(Collection<Statement> statements, String... 
names) { - StringBuilder b = indent().append("VALUES ("); - for (String name : names) { - b.append(name).append(" "); - } - b.append(") {\n"); - for (Statement statement : statements) { - indent(b, indent + 1).append("( "); - b.append(str(statement.getSubject())).append(' '); - b.append(str(statement.getPredicate())).append(' '); - b.append(str(statement.getObject())).append(" )\n"); - } - indent(b, indent).append("}"); - parts.add(b); - return this; - } + throw new RuntimeException("I have no idea what do to with a " + o.getClass()); } } diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java index d036f63..37055f1 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java @@ -425,6 +425,34 @@ assertFalse(r.hasNext()); } + /** + * Tests that if a reference is used multiple times on the same entity it + * isn't cleared if it's removed from the entity just once. That'd be wrong + * because it's still on the entity. 
+ */ + @Test + public void sharedReferenceOnSameEntity() { + String referenceUri = uris.reference() + "e36b7373814a0b74caa84a5fc2b1e3297060ab0f"; + List<Statement> george = expandedStatement("9D3713FF-7BCC-489F-9386-C7322C0AC284", "Q23", "P19", "Q494413", + Ontology.NORMAL_RANK, referenceUri); + statement(george, referenceUri, uris.value() + "P854", "http://www.anb.org/articles/02/02-00332.html"); + List<Statement> georgeWithoutSecondReference = new ArrayList<>(george); + String otherStatementUri = uris.statement() + "ASDFasdf"; + statement(george, "Q23", uris.value() + "P129", otherStatementUri); + statement(george, otherStatementUri, uris.value() + "P129", new LiteralImpl("cat")); + statement(george, otherStatementUri, Provenance.WAS_DERIVED_FROM, referenceUri); + rdfRepository.sync("Q23", george); + assertTrue(rdfRepository.ask(uris.prefixes(new StringBuilder()) + .append("ASK {ref:e36b7373814a0b74caa84a5fc2b1e3297060ab0f v:P854 ?o }").toString())); + + rdfRepository.sync("Q23", georgeWithoutSecondReference); + assertTrue(rdfRepository.ask(uris.prefixes(new StringBuilder()) + .append("ASK {ref:e36b7373814a0b74caa84a5fc2b1e3297060ab0f v:P854 ?o }").toString())); + + } + + // TODO values on shared references change when they change + private List<Statement> expandedStatement(String statementId, String subject, String predicate, String value, String rank, String referenceUri) { List<Statement> statements = expandedStatement(statementId, subject, predicate, value, rank); -- To view, visit https://gerrit.wikimedia.org/r/204074 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2a50b6b7a17a5cb6bdb5f62829076a4d3c67131b Gerrit-PatchSet: 1 Gerrit-Project: wikidata/query/rdf Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org 
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits