Osma Suominen created JENA-1128:
-----------------------------------
Summary: sdbquery doesn't work with MINUS
Key: JENA-1128
URL: https://issues.apache.org/jira/browse/JENA-1128
Project: Apache Jena
Issue Type: Bug
Components: SDB
Affects Versions: Jena 3.0.1
Environment: jena-sdb-3.1.0-SNAPSHOT dated 2 Feb 2016
apache-jena-3.1.0-SNAPSHOT dated 2 Feb 2016
Reporter: Osma Suominen
I'm running a SPARQL query against an SDB store loaded with SKOS data. The intent of
the query is to check for broken links, i.e. skos:closeMatch relationships that
point to nonexistent concepts in another SKOS dataset. I have simplified my
query to a rather minimal test case below. In this case, the remote data is
also included in the same graph for simplicity.
Here is my test data:
{noformat}
@prefix skos: <http://www.w3.org/2004/02/skos/core#>.
@prefix local: <http://example.com/local/>.
@prefix remote: <http://example.com/remote/>.
local:conceptA a skos:Concept ;
skos:prefLabel "Local concept A"@en ;
skos:note "has a valid link to an existing remote concept" ;
skos:closeMatch remote:conceptC .
local:conceptB a skos:Concept ;
skos:prefLabel "Local concept B"@en ;
skos:note "has a broken link to a nonexistent remote concept" ;
skos:closeMatch remote:conceptD .
remote:conceptC a skos:Concept ;
skos:prefLabel "Remote concept C"@en .
{noformat}
This is my SPARQL query:
{noformat}
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT * WHERE {
?local skos:closeMatch ?remote .
MINUS { ?remote a skos:Concept }
}
{noformat}
If I run the query using the command line tool "sparql" from the apache-jena
distribution, it returns the correct result, i.e. the one concept with the
broken link:
{noformat}
------------------------------------------------------------------------------
| local | remote |
==============================================================================
| <http://example.com/local/conceptB> | <http://example.com/remote/conceptD> |
------------------------------------------------------------------------------
{noformat}
But when I load the above data into an SDB database (MySQL) and use sdbquery
with the same SPARQL query, I get a different result (here run with the --debug
option), which has an extra row:
{noformat}
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT *
WHERE
{ ?local skos:closeMatch ?remote
MINUS
{ ?remote a skos:Concept }
}
- - - - - - - - - - - - - -
SELECT -- V_3=?remote
R_3.lex AS V_3_lex, R_3.datatype AS V_3_datatype, R_3.lang AS V_3_lang,
R_3.type AS V_3_type
FROM
( SELECT -- ?remote:(T_2.s=>T_2.X_1)
T_2.s AS X_1
FROM Triples AS T_2 -- ?remote
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> skos:Concept
WHERE ( T_2.p = -6430697865200335348 -- Const:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
AND T_2.o = 1728757386496985884 -- Const: skos:Concept
)
) AS T_2 -- ?remote:(T_2.s=>T_2.X_1)
LEFT OUTER JOIN
Nodes AS R_3 -- Var: ?remote
ON ( T_2.X_1 = R_3.hash )
(minus
(SQL '''SqlSelectBlock/S_1 -- V_1=?local V_2=?remote
R_1.lex/V_1_lex R_1.datatype/V_1_datatype R_1.lang/V_1_lang
R_1.type/V_1_type
R_2.lex/V_2_lex R_2.datatype/V_2_datatype R_2.lang/V_2_lang
R_2.type/V_2_type
Join/left outer
Join/left outer
SqlSelectBlock/T_1
T_1.p = 2699241716664962559
Table T_1 -- ?local skos:closeMatch ?remote
Table R_1 -- Var: ?local
Condition T_1.s = R_1.hash
Table R_2 -- Var: ?remote
Condition T_1.o = R_2.hash''')
(SQL '''SqlSelectBlock/S_2 -- V_3=?remote
R_3.lex/V_3_lex R_3.datatype/V_3_datatype R_3.lang/V_3_lang
R_3.type/V_3_type
Join/left outer
SqlSelectBlock/T_2 -- ?remote:(T_2.s=>T_2.X_1)
T_2.s/X_1
T_2.p = -6430697865200335348
T_2.o = 1728757386496985884
Table T_2 -- ?remote
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> skos:Concept
Table R_3 -- Var: ?remote
Condition T_2.X_1 = R_3.hash''')
)
SELECT -- V_1=?local V_2=?remote
R_1.lex AS V_1_lex, R_1.datatype AS V_1_datatype, R_1.lang AS V_1_lang,
R_1.type AS V_1_type,
R_2.lex AS V_2_lex, R_2.datatype AS V_2_datatype, R_2.lang AS V_2_lang,
R_2.type AS V_2_type
FROM
( SELECT *
FROM Triples AS T_1 -- ?local skos:closeMatch ?remote
WHERE ( T_1.p = 2699241716664962559 -- Const: skos:closeMatch
)
) AS T_1
LEFT OUTER JOIN
Nodes AS R_1 -- Var: ?local
ON ( T_1.s = R_1.hash )
LEFT OUTER JOIN
Nodes AS R_2 -- Var: ?remote
ON ( T_1.o = R_2.hash )
------------------------------------------------------------------------------
| local | remote |
==============================================================================
| <http://example.com/local/conceptA> | <http://example.com/remote/conceptC> |
| <http://example.com/local/conceptB> | <http://example.com/remote/conceptD> |
------------------------------------------------------------------------------
{noformat}
If I change the query to use FILTER NOT EXISTS instead of MINUS, then I get the
correct result also with sdbquery:
{noformat}
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
SELECT *
WHERE
{ ?local skos:closeMatch ?remote
FILTER NOT EXISTS { ?remote a skos:Concept }
}
- - - - - - - - - - - - - -
(filter (notexists
(SQL '''SqlSelectBlock/T_2 -- ?remote:(T_2.s=>T_2.X_1)
T_2.s/X_1
T_2.p = -6430697865200335348
T_2.o = 1728757386496985884
Table T_2 -- ?remote
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> skos:Concept''')
)
(SQL '''SqlSelectBlock/S_1 -- V_1=?local V_2=?remote
R_1.lex/V_1_lex R_1.datatype/V_1_datatype R_1.lang/V_1_lang
R_1.type/V_1_type
R_2.lex/V_2_lex R_2.datatype/V_2_datatype R_2.lang/V_2_lang
R_2.type/V_2_type
Join/left outer
Join/left outer
SqlSelectBlock/T_1
T_1.p = 2699241716664962559
Table T_1 -- ?local skos:closeMatch ?remote
Table R_1 -- Var: ?local
Condition T_1.s = R_1.hash
Table R_2 -- Var: ?remote
Condition T_1.o = R_2.hash''')
)
SELECT -- V_1=?local V_2=?remote
R_1.lex AS V_1_lex, R_1.datatype AS V_1_datatype, R_1.lang AS V_1_lang,
R_1.type AS V_1_type,
R_2.lex AS V_2_lex, R_2.datatype AS V_2_datatype, R_2.lang AS V_2_lang,
R_2.type AS V_2_type
FROM
( SELECT *
FROM Triples AS T_1 -- ?local skos:closeMatch ?remote
WHERE ( T_1.p = 2699241716664962559 -- Const: skos:closeMatch
)
) AS T_1
LEFT OUTER JOIN
Nodes AS R_1 -- Var: ?local
ON ( T_1.s = R_1.hash )
LEFT OUTER JOIN
Nodes AS R_2 -- Var: ?remote
ON ( T_1.o = R_2.hash )
SELECT *
FROM Triples AS T_3 --
<http://example.com/remote/conceptC>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> skos:Concept
WHERE ( T_3.s = 5972767169237582230 -- Const:
<http://example.com/remote/conceptC>
AND T_3.p = -6430697865200335348 -- Const:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
AND T_3.o = 1728757386496985884 -- Const: skos:Concept
)
SELECT *
FROM Triples AS T_4 --
<http://example.com/remote/conceptD>
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> skos:Concept
WHERE ( T_4.s = 8175828786801660008 -- Const:
<http://example.com/remote/conceptD>
AND T_4.p = -6430697865200335348 -- Const:
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>
AND T_4.o = 1728757386496985884 -- Const: skos:Concept
)
------------------------------------------------------------------------------
| local | remote |
==============================================================================
| <http://example.com/local/conceptB> | <http://example.com/remote/conceptD> |
------------------------------------------------------------------------------
{noformat}
However, in my actual query that this example is based on
(https://github.com/NatLibFi/Finto-data/blob/master/tools/yso-updater-sparql/5-ysa-removed-concepts.rq)
using FILTER NOT EXISTS is not an efficient solution, because the subtracted
part uses a federated query, so almost 30000 queries are sent to the remote
endpoint instead of just one.
I'm using the most recent snapshots available from repository.apache.org.
--
This message was sent by Atlassian JIRA
(v6.3.4#6332)