Hi,
I found that the RDFXMLHandler.py code does not check for the excluded
characters listed in section 2.4.3 of RFC 2396
(http://www.isi.edu/in-notes/rfc2396.txt), which must be escaped before
they may appear in a URI. To remedy this, I wrote a small patch
(attached) that checks the URI when it is absolutized.
Cheers,
Mikael
Index: rdflib/syntax/parsers/RDFXMLHandler.py
===================================================================
--- rdflib/syntax/parsers/RDFXMLHandler.py (revision 866)
+++ rdflib/syntax/parsers/RDFXMLHandler.py (working copy)
@@ -38,6 +38,7 @@
from xml.sax.saxutils import handler, quoteattr, escape
from urlparse import urljoin, urldefrag
+import re
RDFNS = RDF.RDFNS
@@ -206,11 +207,19 @@
# element handler
parent = property(get_parent)
+ def is_escaped_uri(self, uri):
+ esc_uri = re.compile(r'[\[\]\{}\|\^`<>" \\]')
+ find_esc = esc_uri.search(uri)
+ if find_esc != None:
+ self.error("URI character " + str(find_esc.end()) + " is not properly escaped")
+ return True
+
def absolutize(self, uri):
- result = urljoin(self.current.base, uri, allow_fragments=1)
- if uri and uri[-1]=="#" and result[-1]!="#":
- result = "%s#" % result
- return URIRef(result)
+ if not self.is_escaped_uri(uri):
+ result = urljoin(self.current.base, uri, allow_fragments=1)
+ if uri and uri[-1]=="#" and result[-1]!="#":
+ result = "%s#" % result
+ return URIRef(result)
def convert(self, name, qname, attrs):
if name[0] is None:
_______________________________________________
Dev mailing list
[email protected]
http://rdflib.net/mailman/listinfo/dev