Oops. This was sent to me some (long) time ago. Does anybody want to handle it? I really don't have time anymore to look into the parser code unless it is to scratch an itch of my own...
Holger
---------- Forwarded message ---------- Date: Fri, 01 Aug 2003 18:00:36 -0500 From: Dario Morales Lopez <[EMAIL PROTECTED]> To: [EMAIL PROTECTED] X-Spambayes-Classification: ham; 0.00 Subject: Url.py replacing spaces in the url
Hi, I found your e-mail address on the authors page of the Plucker user's guide. Since I don't want to subscribe to the mailing lists, I think you are the person who should get this.
Attached is a file with a small modification that replaces space characters with URL-encoded spaces (%20). Since I'm new to Plucker and to Python, I don't really know whether this is the right place to do the replacement, or whether it is the best way to do it, but it's a quick hack and it works for me.
I've done this because I'm switching from AvantGo to Plucker and was adding my local cinema's showtimes page to Plucker, but while parsing the first page Plucker exited with an awful "invalid request" error. I realized that the error occurred because the page contains "hrefs" with spaces in them. Since the page is intended to be used with AvantGo and not with Plucker, asking the web page maintainer to change their invalid URLs was out of the question.
I hope it's useful in some way, and thank you for your very good efforts in bringing Plucker to the world.
-- Darío Morales López [EMAIL PROTECTED] ------------------- "UNIX was not designed to stop you from doing stupid things, because that would also stop you from doing clever things." - Doug Gwyn
#!/usr/bin/env python

"""
Url.py   $Id: Url.py,v 1.17 2003/03/27 02:55:19 chrish Exp $

Utility class to encapsulate information about an URL and the useful
operations thereon.

Copyright 1999, 2000 by Holger Duerer <[EMAIL PROTECTED]>

Distributable under the GNU General Public License Version 2 or newer.
"""

try:
    import urlparse
except ImportError:
    # Newer Python versions provide the same functions as urllib.parse.
    import urllib.parse as urlparse
import urllib
import string
import sys
import os

# Register the private 'plucker' scheme with urlparse's scheme tables so
# that plucker: URLs are joined and split like the other listed schemes.
urlparse.uses_relative.append ('plucker')
urlparse.uses_netloc.append ('plucker')
urlparse.uses_params.append ('plucker')
urlparse.uses_query.append ('plucker')
urlparse.uses_fragment.append ('plucker')


######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
######################################################################
def windows_file_url_parse (url):
    """Split a Windows file URL into a urlparse-style 6-tuple.

    Returns (protocol, host, path, params, query, fragment) where
    protocol is always 'file' and host, params and query are always
    empty strings.  The fragment is whatever follows the last '#'.
    A leading 'file://' or 'file:' (any case) is removed from the path
    and a leading drive letter followed by ':' is upper-cased.
    """
    prot = 'file'
    fragment = ''
    i = url.rfind ('#')
    if i >= 0:
        fragment = url[i+1:]
        url = url[:i]
    path = url
    if path[0:7].lower () == 'file://':
        path = path[7:]
    if path[0:5].lower () == 'file:':
        path = path[5:]
    # Normalize 'c:...' to 'C:...'; an empty path simply fails the test.
    drive = path[0:1].upper ()
    if (drive >= 'A') and (drive <= 'Z') and (path[1:2] == ':'):
        path = drive + path[1:]
    host = ''
    params = ''
    query = ''
    return prot, host, path, params, query, fragment


######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
######################################################################
def windows_file_urljoin(base, url):
    """Join 'url' onto the file URL 'base' and return the resulting string.

    'url' is returned untouched when its first ':' sits at index 3..10
    (i.e. it looks like it carries its own scheme such as http: or
    file:).  Otherwise it is treated as relative to 'base'.
    """
    def add_fragment(path, frag):
        # Re-attach a non-empty fragment to a path.
        if frag != '':
            res = path + '#' + frag
        else:
            res = path
        return res

    i = url.find (':')
    # a new http:// file:// not based to source is _not_ used
    if (i >= 3) and (i <= 10):
        return url

    (prot, host, path, params, query, fragment) = windows_file_url_parse (url)
    if path == '':
        # Fragment-only (or empty) reference: stay on the base document.
        return base + '#' + fragment

    ######################################
    # FIX ME!!!!                         #
    # path like .\test\..\images\        #
    # are not work yet!                  #
    ######################################
    base_dir = os.path.dirname (str (base))

    # .\file.ext == file.ext
    if path[0:2] in ('.\\', './'):
        return os.path.join (base_dir, add_fragment (path[2:], fragment))

    # one dir up
    if path[0:3] in ('..\\', '../'):
        return os.path.join (os.path.dirname (base_dir),
                             add_fragment (path[3:], fragment))

    # two dir up
    if path[0:4] in ('...\\', '.../'):
        return os.path.join (os.path.dirname (os.path.dirname (base_dir)),
                             add_fragment (path[4:], fragment))

    # Root dir
    if path[0:1] in ('\\', '/'):
        str_base = str (base)
        # The character right after the 5-character 'file:' prefix is
        # taken as the drive letter.
        # NOTE(review): a base written as 'file://C:/...' would yield '/'
        # here, and a base of exactly 'file:' raises IndexError -- confirm
        # which base forms callers actually pass.
        return os.path.join ('file:' + str_base[5] + ':',
                             add_fragment (path[1:], fragment))

    # normal case
    return os.path.join (base_dir, add_fragment (path, fragment))


######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
# And its behavior changed in Python 2.2.2  CRH                      #
######################################################################
def plucker_file_urlunparse(protocol, host, path, params, query, fragment):
    """Build 'protocol:path#fragment' from the parts of a parsed URL.

    host, params and query are accepted for signature compatibility with
    urlparse.urlunparse but are not used.

    NOTE(review): when 'protocol' is empty the path is not emitted at all,
    only an optional '#fragment'.  URL.get_full_path calls this with an
    empty protocol on win32 -- confirm whether dropping the path there is
    intended.
    """
    text = ''
    if protocol != '':
        text = text + protocol + ':' + path
    if fragment != '':
        text = text + '#' + fragment
    return text


class URL:
    """Encapsulate some useful things from urllib and urlparse"""

    def __init__ (self, url, base = None):
        """Create an URL from 'url' (a string or another URL).

        If 'base' is given, 'url' is first resolved relative to it.  The
        host part is stored lower-cased.
        """
        if isinstance (url, URL) and base is None:
            # Simple copy constructor: make it more efficient
            self._protocol = url._protocol
            self._host = url._host
            self._path = url._path
            self._params = url._params
            self._query = url._query
            self._fragment = url._fragment
        else:
            url = str (url)
            if base is not None:
                # File URLs on Windows go through the local replacement
                # join; everything else uses urlparse.
                if sys.platform == 'win32' and str (base)[0:5].lower () == 'file:':
                    url = windows_file_urljoin (str (base), url)
                else:
                    url = urlparse.urljoin (str (base), url)
            # according to RFC 2396, this 'unquote' is inappropriate
            # according to the HTML 4.01 spec, this 'unquote' is unnecessary
            # url = urllib.unquote (url)
            if sys.platform == 'win32' and url[0:5].lower () == 'file:':
                (prot, host, path, params, query, fragment) = windows_file_url_parse (url)
            else:
                (prot, host, path, params, query, fragment) = urlparse.urlparse (url)
            host = host.lower ()
            self._protocol = prot
            self._host = host
            self._path = path
            self._params = params
            self._query = query
            self._fragment = fragment

    def as_string (self, with_fragment):
        """Return the URL as a string, optionally including the fragment."""
        if with_fragment:
            fragment = self._fragment
        else:
            fragment = ""
        if self._protocol == 'plucker' or self._protocol == 'file':
            text = plucker_file_urlunparse (self._protocol,
                                            self._host,
                                            self._path,
                                            self._params,
                                            self._query,
                                            fragment)
        else:
            text = urlparse.urlunparse ((self._protocol,
                                         self._host,
                                         self._path,
                                         self._params,
                                         self._query,
                                         fragment))
        return text

    def __str__ (self):
        return self.as_string (with_fragment=1)

    def __repr__ (self):
        return "URL (%s)" % repr (self.as_string (with_fragment=1))

    def get_protocol (self):
        """Return the protocol (scheme) part."""
        return self._protocol

    def get_host (self):
        """Return the (lower-cased) host part."""
        return self._host

    def get_path (self):
        """Return the path part."""
        return self._path

    def get_fragment (self):
        """Return the fragment part (without the '#')."""
        return self._fragment

    def get_full_path (self, with_fragment):
        """Return the URL without protocol and host.

        The result is built from path, params and query, plus the fragment
        if 'with_fragment' is true.  On win32 'file' URLs are assembled by
        plucker_file_urlunparse instead of urlparse.urlunparse.
        """
        if with_fragment:
            fragment = self._fragment
        else:
            fragment = ""
        if sys.platform == 'win32' and self._protocol == 'file':
            text = plucker_file_urlunparse ("",
                                            "",
                                            self._path,
                                            self._params,
                                            self._query,
                                            fragment)
        else:
            text = urlparse.urlunparse (("",
                                         "",
                                         self._path,
                                         self._params,
                                         self._query,
                                         fragment))
        return text

    def remove_fragment (self):
        """Drop the fragment part of this URL in place."""
        self._fragment = ""


def CleanURL (url, base=None):
    """Remove leading and trailing white space and generally clean up this URL

    'url' may be a string or an URL instance; 'base', if given, is used to
    resolve a relative 'url'.  Returns the cleaned URL as a string
    (including its fragment).  For string input, embedded spaces are
    replaced by '%20'.
    """
    if isinstance (url, URL):
        # This branch is currently never taken, we get always called
        # with a string as 'url'
        if base is not None:
            # FIXME!!  Does this make sense at all?  URLs should always be
            # absolute, so giving a base is moot...
            # (The class is named URL; the former spelling 'Url' raised a
            # NameError as soon as this branch was reached.)
            result = URL (url, base).as_string (with_fragment=1)
        else:
            result = url.as_string (with_fragment=1)
    else:
        url = str (url).strip ()
        # If we don't want an invalid request error then
        # replace spaces chars " " with "%20" (url encoded space)
        url = url.replace (" ", "%20")
        url = URL (url, base)
        result = url.as_string (with_fragment=1)
    return result