Url.py replacing spaces in the url (fwd)

mailinglists Tue, 02 Sep 2003 03:41:41 -0700

Oops.  This was sent to me some (long) time ago.
Anybody want to handle it?  I really don't have time
anymore to look into the parser code unless it is
to scratch an itch of my own...

Holger

---------- Forwarded message ----------
Date: Fri, 01 Aug 2003 18:00:36 -0500
From: Dario Morales Lopez <[EMAIL PROTECTED]>
To: [EMAIL PROTECTED]
X-Spambayes-Classification: ham; 0.00
Subject: Url.py replacing spaces in the url

Hi, I found your e-mail address on the authors page of the Plucker user's
guide; since I don't want to subscribe to the mailing lists, I think you
are the person who should get this.

Attached is a file with a small modification that replaces space characters
with URL-encoded spaces (%20). Since I'm new to Plucker and to Python, I
don't really know whether this is the right place to do the replacing, or
whether it is the best way to do it, but it's a quick hack and it works for me.

I've done this because I'm switching from AvantGo to Plucker and was
adding my local cinema's showtimes page to Plucker, but while parsing the
first page Plucker exited with an awful "invalid request" error. I
realized that the error occurred because the page contains "hrefs" with
spaces in them. Since the page is intended to be used with AvantGo and
not with Plucker, asking the web page maintainer to change their invalid
URLs was out of the question.

I hope it's useful in some way, and thank you for your very good
efforts in bringing Plucker to the world.

--
Darío Morales López
[EMAIL PROTECTED]
-------------------
"UNIX was not designed to stop you from doing stupid things, because that
would also stop you from doing clever things." - Doug Gwyn

#!/usr/bin/env python


"""
Url.py   $Id: Url.py,v 1.17 2003/03/27 02:55:19 chrish Exp $

Utility class to encapsulate information about an URL and the useful
operations thereon.


Copyright 1999, 2000 by Holger Duerer <[EMAIL PROTECTED]>

Distributable under the GNU General Public License Version 2 or newer.

"""

import urlparse, urllib, string, sys, os

# Register the custom 'plucker' scheme in each of urlparse's per-scheme
# capability lists (relative references, netloc, params, query, fragment).
# This runs once at import time and mutates the shared urlparse module.
urlparse.uses_relative.append ('plucker')
urlparse.uses_netloc.append ('plucker')
urlparse.uses_params.append ('plucker')
urlparse.uses_query.append ('plucker')
urlparse.uses_fragment.append ('plucker')



######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
######################################################################
def windows_file_url_parse (url):
    """Split a Windows ``file:`` URL into urlparse-style components.

    Everything after the last '#' becomes the fragment.  A leading
    'file://' and/or 'file:' prefix (matched case-insensitively) is removed
    from the remainder, and a leading drive letter ('c:') is upper-cased.

    Returns the 6-tuple (protocol, host, path, params, query, fragment).
    The protocol is always 'file'; host, params and query are always ''.
    """
    fragment = ''
    # Split on the LAST '#': anything behind it is the fragment.
    hash_pos = url.rfind('#')
    if hash_pos >= 0:
        fragment = url[hash_pos+1:]
        url = url[:hash_pos]

    path = url
    # These are two independent tests (not if/elif): once 'file://' has
    # been removed, a remaining 'file:' prefix is removed as well.
    if path[0:7].lower() == 'file://':
        path = path[7:]
    if path[0:5].lower() == 'file:':
        path = path[5:]

    # Normalise a leading drive specification such as 'c:' to 'C:'.
    drive = path[0:1].upper()
    if 'A' <= drive <= 'Z' and path[1:2] == ':':
        path = drive + path[1:]

    return 'file', '', path, '', '', fragment



######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
######################################################################
def windows_file_urljoin(base, url):
    """Resolve `url` against the Windows ``file:`` URL `base`.

    If `url` contains a ':' at index 3..10 it is treated as already carrying
    its own scheme ('ftp:', 'http:', 'file:', ...) and is returned unchanged.
    Otherwise it is parsed with windows_file_url_parse and joined onto the
    directory of `base` with os.path.join.  Leading './', '../' and '.../'
    (or their backslash forms) select the base directory, its parent and its
    grandparent respectively; a leading '/' or '\\' selects the root of the
    drive taken from `base`.

    An empty path resolves to `base` itself, with '#fragment' appended only
    when `url` actually carries a fragment.

    Returns the joined URL as a string.
    """
    def add_fragment(path, frag):
        # Append '#frag' only when there is a fragment to append.
        if frag != '':
            return path + '#' + frag
        return path

    colon_pos = url.find(':')
    # A colon at index 3..10 is taken as the end of a scheme name, so the
    # reference is absolute and is not based on `base`.  A colon at index 1
    # (drive letter) or no colon at all (-1) is handled as relative below.
    if 3 <= colon_pos <= 10:
        return url

    (_prot, _host, path, _params, _query, fragment) = windows_file_url_parse (url)
    base = str (base)

    if path == '':
        # Reference consisting of a fragment only (or nothing at all).
        return add_fragment(base, fragment)

    ######################################
    # FIX ME!!!!                         #
    # Paths like .\test\..\images\       #
    # do not work yet: only a single     #
    # leading prefix is interpreted.     #
    ######################################
    base_dir = os.path.dirname(base)

    # .\file.ext == file.ext
    if path[0:2] in ('.\\', './'):
        return os.path.join (base_dir, add_fragment(path[2:], fragment))

    # one dir up
    if path[0:3] in ('..\\', '../'):
        parent = os.path.dirname(base_dir)
        return os.path.join (parent, add_fragment(path[3:], fragment))

    # two dirs up
    if path[0:4] in ('...\\', '.../'):
        grandparent = os.path.dirname(os.path.dirname(base_dir))
        return os.path.join (grandparent, add_fragment(path[4:], fragment))

    # Root dir
    if path[0:1] in ('\\', '/'):
        # NOTE(review): base[5] is used as the drive letter, which assumes
        # `base` looks like 'file:X:...' -- confirm for 'file://X:...' bases.
        drive_root = 'file:' + base[5] + ':'
        return os.path.join (drive_root, add_fragment(path[1:], fragment))

    # normal case: plain relative path
    return os.path.join (base_dir, add_fragment(path, fragment))



######################################################################
# Replacement for the urlparse lib, because this is buggy on Windows #
# And its behavior changed in Python 2.2.2 CRH
######################################################################
def plucker_file_urlunparse(protocol, host, path, params, query, fragment):
    """Build the string form of a 'plucker:' or 'file:' URL.

    Only `protocol`, `path` and `fragment` contribute to the result; `host`,
    `params` and `query` are accepted for signature compatibility with
    urlparse.urlunparse but are ignored.

    The result is 'protocol:path' when `protocol` is non-empty, followed by
    '#fragment' when `fragment` is non-empty.

    NOTE(review): with an empty `protocol` the path is not emitted at all,
    only the optional '#fragment' -- confirm that callers expect this.
    """
    pieces = []
    if protocol != '':
        pieces.append(protocol + ':' + path)
    if fragment != '':
        pieces.append('#' + fragment)
    return ''.join(pieces)



class URL:
    """Encapsulate some useful things from urllib and urlparse.

    An instance stores the six urlparse components of one URL (protocol,
    host, path, params, query, fragment) and can render them back into a
    string.  The host is stored lower-cased.
    """

    def __init__ (self, url, base = None):
        """Create a URL from `url`, optionally resolved against `base`.

        `url` may be another URL instance or anything str() can convert.
        When `base` is given, `url` is first joined onto it; on win32 a
        'file:' base is joined with windows_file_urljoin, everything else
        with urlparse.urljoin.
        """
        if isinstance (url, URL) and base is None:
            # Simple copy constructor: copy the parsed parts directly
            # instead of rendering and re-parsing the string.
            self._protocol = url._protocol
            self._host = url._host
            self._path = url._path
            self._params = url._params
            self._query = url._query
            self._fragment = url._fragment
            return

        url = str (url)
        if base is not None:
            base = str (base)
            if sys.platform == 'win32' and base[0:5].lower() == 'file:':
                url = windows_file_urljoin (base, url)
            else:
                url = urlparse.urljoin (base, url)

        # The URL is deliberately NOT passed through urllib.unquote here:
        # according to RFC 2396 that would be inappropriate, and according
        # to the HTML 4.01 spec it is unnecessary.
        if sys.platform == 'win32' and url[0:5].lower() == 'file:':
            parts = windows_file_url_parse (url)
        else:
            parts = urlparse.urlparse (url)
        (prot, host, path, params, query, fragment) = parts

        self._protocol = prot
        self._host = host.lower()
        self._path = path
        self._params = params
        self._query = query
        self._fragment = fragment

    def _fragment_if (self, with_fragment):
        """Return the stored fragment if `with_fragment` is true, else ''."""
        if with_fragment:
            return self._fragment
        return ""

    def as_string (self, with_fragment):
        """Return the complete URL as a string.

        The fragment is included only when `with_fragment` is true.
        'plucker' and 'file' URLs are rendered with plucker_file_urlunparse,
        all other protocols with urlparse.urlunparse.
        """
        fragment = self._fragment_if (with_fragment)
        if self._protocol == 'plucker' or self._protocol == 'file':
            return plucker_file_urlunparse (self._protocol,
                                            self._host,
                                            self._path,
                                            self._params,
                                            self._query,
                                            fragment)
        return urlparse.urlunparse ((self._protocol,
                                     self._host,
                                     self._path,
                                     self._params,
                                     self._query,
                                     fragment))

    def __str__ (self):
        """Return the complete URL including its fragment."""
        return self.as_string (with_fragment=1)

    def __repr__ (self):
        """Return a debugging representation of the form URL ('...')."""
        return "URL (%s)" % repr (self.as_string (with_fragment=1))

    def get_protocol (self):
        """Return the protocol (scheme) part of the URL."""
        return self._protocol

    def get_host (self):
        """Return the lower-cased host part of the URL."""
        return self._host

    def get_path (self):
        """Return the path part of the URL."""
        return self._path

    def get_fragment (self):
        """Return the fragment part of the URL ('' if there is none)."""
        return self._fragment

    def get_full_path (self, with_fragment):
        """Return the URL rendered with empty protocol and host.

        The fragment is included only when `with_fragment` is true.  On
        win32 'file' URLs are rendered with plucker_file_urlunparse, all
        other cases with urlparse.urlunparse.
        """
        fragment = self._fragment_if (with_fragment)
        if sys.platform == 'win32' and self._protocol == 'file':
            return plucker_file_urlunparse ("",
                                            "",
                                            self._path,
                                            self._params,
                                            self._query,
                                            fragment)
        return urlparse.urlunparse (("",
                                     "",
                                     self._path,
                                     self._params,
                                     self._query,
                                     fragment))

    def remove_fragment (self):
        """Discard the fragment part of this URL in place."""
        self._fragment = ""



def CleanURL (url, base=None):
    """Remove leading and trailing white space and generally clean up
    this URL.

    `url` may be a URL instance or anything str() can convert; `base`, if
    given, is the URL that `url` is resolved against.  For non-URL input,
    surrounding white space is stripped and every remaining space is
    replaced by its URL-encoded form '%20' before parsing.

    Returns the resulting URL, including its fragment, as a string.
    """
    if isinstance (url, URL):
        # This branch is currently never taken, we always get called
        # with a string as 'url'
        if base is not None:
            # FIXME!!  Does this make sense at all?  URLs should always be
            # absolute, so giving a base is moot...
            return URL (url, base).as_string (with_fragment=1)
        return url.as_string (with_fragment=1)

    url = str (url).strip ()
    # A literal space in a URL leads to an invalid request error when the
    # page is fetched, so replace each " " with "%20" (URL-encoded space).
    url = url.replace (" ", "%20")
    return URL (url, base).as_string (with_fragment=1)

Url.py replacing spaces in the url (fwd)

Reply via email to