Hi,

I found that the code in RDFXMLHandler.py does not check for the excluded
(unescaped) characters listed in section 2.4.3 of RFC 2396
(http://www.isi.edu/in-notes/rfc2396.txt). To remedy this, I wrote a
small patch (attached) that checks the URI when it is
absolutized.

Cheers,
Mikael
Index: rdflib/syntax/parsers/RDFXMLHandler.py
===================================================================
--- rdflib/syntax/parsers/RDFXMLHandler.py	(revision 866)
+++ rdflib/syntax/parsers/RDFXMLHandler.py	(working copy)
@@ -38,6 +38,7 @@
 
 from xml.sax.saxutils import handler, quoteattr, escape
 from urlparse import urljoin, urldefrag
+import re
 
 RDFNS = RDF.RDFNS
 
@@ -206,11 +207,19 @@
     # element handler
     parent = property(get_parent)
 
+    def is_escaped_uri(self, uri):
+        esc_uri = re.compile(r'[\[\]\{}\|\^`<>" \\]')
+        find_esc = esc_uri.search(uri)
+        if find_esc != None:
+            self.error("URI character " + str(find_esc.end()) + " is not properly escaped")
+        return True
+        
     def absolutize(self, uri):
-        result = urljoin(self.current.base, uri, allow_fragments=1)
-        if uri and uri[-1]=="#" and result[-1]!="#":
-            result = "%s#" % result
-        return URIRef(result)
+        if not self.is_escaped_uri(uri):
+            result = urljoin(self.current.base, uri, allow_fragments=1)
+            if uri and uri[-1]=="#" and result[-1]!="#":
+                result = "%s#" % result
+            return URIRef(result)
 
     def convert(self, name, qname, attrs):
         if name[0] is None:
_______________________________________________
Dev mailing list
[email protected]
http://rdflib.net/mailman/listinfo/dev

Reply via email to