On 10/4/07, [EMAIL PROTECTED] <[EMAIL PROTECTED]> wrote:
> I'm a noob to the semantic web though not to python, and am looking to
> experiment with the dbpedia data (now that's a possible killer app :-),
> which is sizable, like a few million triples it seems: http://dbpedia.org/
Hey there.
> I'm trying to figure out how to get the dbpedia data loaded and usable with
> rdflib, i.e. accessible via a SPARQL interface on my local machine, and
> could really use some advice - from the API docs I don't understand what
> needs to be done with store(), except that it's needed, and I probably don't
> want to load several hundred megabytes into memory. :-)
So, the naive approach would be to use the generic (store-agnostic)
parsers to parse the .nt files into a store of your choice (see below:
where 'config' is the connection string for the store you want to use,
'storeName' corresponds to the non-qualified name of any of the
modules in rdflib.store - MySQL, IOMemory, Sleepycat, etc..):
---------------------------------------------------------------
from rdflib.Graph import Graph,ConjunctiveGraph
from rdflib import URIRef, store, plugin
from rdflib.store import Store, VALID_STORE, NO_STORE, CORRUPTED_STORE
store = plugin.get(storeName,Store)()
rt=store.open(config,create=False)
if rt == NO_STORE:
#No store, create it using configuration
store.open(config,create=True)
elif rt == CORRUPTED_STORE:
#Store is corrupted, destroy and recreate
store.destroy(config)
store.open(config,create=True)
else:
#Store exists and is valid, do nothing
pass
g=Graph(store,identifier=URIRef('.. graph name ..'))
#this will take a looooong time
g.parse('.. path to NTriples file ..',format='ntriples')
store.commit()
---------------------------------------------------------------
However, I wouldn't suggest this for the MySQL store, for that I have
attached a mass-load script that I use for large datasets (in the
millions of triples) which parses an RDF graph serialization into a
set of delimited files which can be loaded (very efficiently) into
MySQL via:
LOAD DATA LOCAL INFILE '%s' IGNORE INTO TABLE %s FIELDS TERMINATED
BY '|' ENCLOSED BY '"'
If you go the MySQL route, let me know, I can give you further instructions.
> Could anyone give
> some high-level advice on how I can load big n-triples files (that would be
> the dbpedia .nt files) using rdflib, then have them all accessible via
> SPARQL?
This probably goes for all the rdflib stores, but for large RDF
serializations you probably don't want to go the
graph.parse(..input..,format='...') route, since that involves a
massive memory hit (parsing the entire graph into memory) as well as
the inefficiency of inserting the triples directly into the store as they are parsed.
As for making a fully loaded RDFLib store available over SPARQL you
simply need a web framework for answering to SPARQL queries and
routing them through RDFLib. Offhand I know of a few:
- Triclops (http://python-dlp.googlecode.com/svn/trunk/triclops/)
- sparqlhttp (http://projects.bigasterisk.com/sparqlhttp/)
There may be others..
from __future__ import generators
from rdflib import BNode
from rdflib.store import Store
from rdflib.Literal import Literal
from pprint import pprint
import sys, re
from rdflib.term_utils import *
from rdflib.Graph import QuotedGraph
from rdflib.store.REGEXMatching import REGEXTerm, NATIVE_REGEX, PYTHON_REGEX
from rdflib.store.AbstractSQLStore import *
from rdflib.store.MySQL import MySQL
from cStringIO import StringIO
from rdflib.store.FOPLRelationalModel.QuadSlot import *
# Wildcard placeholder used throughout the rdflib store APIs.
Any = None

# Extract the "VALUES (...)" payload from a generated INSERT statement.
# Raw strings: '\(' and '\S' are regex escapes, not string escapes, and
# non-raw patterns raise invalid-escape warnings on modern Python.
VALUES_EXPR = re.compile(r'.*VALUES (\(.*\))')
# Extract the target table name from a generated INSERT statement.
TABLE_NAME_EXPR = re.compile(r'INSERT INTO (\S*)\s+VALUES')

# Row/column separators for the MySQL "LOAD DATA INFILE" dump files.
# NOTE: names keep the original "DELIMETER" (sic) spelling so existing
# references elsewhere keep working.
ROW_DELIMETER = '\n'
COL_DELIMETER = '|'
class MySQLLoader(MySQL):
    """
    A MySQL store specialised for bulk loading.

    Instead of INSERTing statements as they are parsed, pending
    insertions accumulate on the partition objects and dumpRDF() then
    writes them out as pipe-delimited text files suitable for MySQL's
    LOAD DATA INFILE.  The transactional and namespace methods are
    therefore deliberate no-ops.
    """

    def __init__(self, identifier=None, configuration=None):
        super(MySQLLoader, self).__init__(identifier, configuration)
        # Scratch buffers kept for API compatibility; the dump files
        # themselves are produced from the partitions' pending queues.
        self.assertedNonTypeValues = StringIO()
        self.assertedTypeValues = StringIO()
        self.assertedLiteralValues = StringIO()

    def open(self, configuration, create=True):
        """No-op: the loader never opens a live connection itself."""
        pass

    def add(self, triple, context=None, quoted=False):
        """Queue a single (subject, predicate, object) triple via addN()."""
        # The original used Python-2-only tuple-parameter unpacking
        # (``def add(self, (s, p, o), ...)``); unpacking in the body is
        # call-compatible and also valid on Python 3.
        subject, predicate, obj = triple
        self.addN([(subject, predicate, obj, context)])

    def addN(self, quads):
        """
        Queue each (s, p, o, context) quad on the appropriate partition:
        ABOX assertions for rdf:type statements, literal properties for
        Literal objects, binary relations otherwise.  Nothing is flushed
        to MySQL here; dumpRDF() writes the queues out as dump files.
        """
        for s, p, o, c in quads:
            assert c is not None, \
                "Context associated with %s %s %s is None!" % (s, p, o)
            qSlots = genQuadSlots([s, p, o, c.identifier])
            if p == RDF.type:
                kb = self.aboxAssertions
            elif isinstance(o, Literal):
                kb = self.literalProperties
            else:
                kb = self.binaryRelations
            kb.insertRelations([qSlots])

    def _dumpPartition(self, dumpFileName, rows):
        """Write rows to dumpFileName, pipe-delimited, every field quoted."""
        f = open(dumpFileName, 'w')
        try:
            for vals in rows:
                f.write(COL_DELIMETER.join(
                    [u'"%s"' % item for item in vals]) + ROW_DELIMETER)
        finally:
            f.close()

    def _dumpLiterals(self, dumpFileName):
        """
        Write the literal-properties partition.  Missing datatype and/or
        language columns are padded with unquoted NULL tokens so MySQL
        loads SQL NULL rather than the string "NULL".
        """
        f = open(dumpFileName, 'w')
        try:
            for (dType, lang), valList in \
                    self.literalProperties.pendingInsertions.items():
                for vals in valList:
                    # The three cases are mutually exclusive; the original
                    # if/if/elif chain behaved identically to this elif chain.
                    if not dType and not lang:
                        vals = tuple(vals) + ('NULL', 'NULL')
                    elif dType and not lang:
                        vals = tuple(vals) + ('NULL',)
                    elif not dType and lang:
                        # language column is last; slot the NULL datatype
                        # in before it
                        vals = tuple(vals[:-1]) + ('NULL', vals[-1])
                    f.write(COL_DELIMETER.join(
                        [item != 'NULL' and u'"%s"' % item or item
                         for item in vals]) + ROW_DELIMETER)
        finally:
            f.close()

    def dumpRDF(self, suffix):
        """
        Dump every pending partition / hash queue to '<table>-<suffix>.dump'
        files ready for LOAD DATA INFILE.
        """
        self._dumpPartition('%s-%s.dump' % (self.binaryRelations, suffix),
                            self.binaryRelations.pendingInsertions)
        self._dumpPartition('%s-%s.dump' % (self.aboxAssertions, suffix),
                            self.aboxAssertions.pendingInsertions)
        self._dumpLiterals('%s-%s.dump' % (self.literalProperties, suffix))
        self._dumpPartition(
            '%s-%s.dump' % (self.idHash, suffix),
            [[md5Int, termType, lexical] for md5Int, (termType, lexical)
             in self.idHash.hashUpdateQueue.items()])
        self._dumpPartition(
            '%s-%s.dump' % (self.valueHash, suffix),
            [[md5Int, lexical] for md5Int, lexical
             in self.valueHash.hashUpdateQueue.items()])

    # -- Transactional interfaces: intentional no-ops for bulk loading --

    def commit(self):
        """No-op: data reaches MySQL via LOAD DATA INFILE, not here."""
        pass

    def rollback(self):
        """No-op: there is no live transaction to roll back."""
        pass

    # -- Namespace interfaces: not supported by the loader --

    def bind(self, prefix, namespace):
        """No-op: namespace bindings are not persisted by the loader."""
        pass

    def prefix(self, namespace):
        """No-op: always returns None."""
        pass

    def namespace(self, prefix):
        """No-op: always returns None."""
        pass

    def namespaces(self):
        """No-op: returns None (no namespaces are tracked)."""
        pass
_______________________________________________
Dev mailing list
[email protected]
http://rdflib.net/mailman/listinfo/dev