Re: Unicode Decode Error

Andrew Mitchell Wed, 05 Sep 2007 10:14:28 -0700

I fixed this issue with a patch a maintainer sent me.

Replace the code in your Unicode.py with the code below and recompile.

Andrew

#!/usr/bin/env python
#
# Copyright (C) 2002 Gre7g Luterman <[EMAIL PROTECTED]>
#
# This file is part of TMDA.
#
# TMDA is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version. A copy of this license should
# be included in the file COPYING.
#
# TMDA is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# for more details.
#
# You should have received a copy of the GNU General Public License
# along with TMDA; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

"Unicode utilities for tmda-cgi."

import codecs
import re

import Template

# Module-wide helpers.
# AltChar matches one character in the 0x80-0xFF range.
AltChar = re.compile("[\x80-\xFF]")
# UTF8 is the stateless UTF-8 encoder; calling it returns a tuple whose
# first item is the encoded string.
UTF8 = codecs.getencoder("utf-8")

# Windows-1252 punctuation and symbols found in the 0x80-0x9F range, keyed
# by byte value.  Keys are numeric so the table does not depend on how the
# source file itself is encoded.  Values in that range without an entry are
# rendered as DEL by Xlate().
_Cp1252 = {
    0x80: u"\u20AC",  # euro sign
    0x82: u"\u201A",  # single low-9 quotation mark
    0x83: u"\u0192",  # latin small letter f with hook
    0x84: u"\u201E",  # double low-9 quotation mark
    0x85: u"\u2026",  # horizontal ellipsis
    0x86: u"\u2020",  # dagger
    0x87: u"\u2021",  # double dagger
    0x89: u"\u2030",  # per mille sign
    0x8B: u"\u2039",  # single left-pointing angle quotation mark
    0x91: u"\u2018",  # left single quotation mark
    0x92: u"\u2019",  # right single quotation mark
    0x93: u"\u201C",  # left double quotation mark
    0x94: u"\u201D",  # right double quotation mark
    0x95: u"\u2022",  # bullet
    0x96: u"\u2013",  # en dash
    0x97: u"\u2014",  # em dash
    0x99: u"\u2122",  # trade mark sign
}


def Xlate(Chr):
    """Translate one high (non-ASCII) character to its Unicode equivalent.

    Chr is a one-character string.  Values of 160 and above map to the
    Unicode character with the same ordinal; values listed in _Cp1252 map
    to the matching Windows-1252 character; anything else yields DEL
    (U+007F).
    """
    Code = ord(Chr)
    if Code >= 160:
        # %c formatting builds the one-character unicode string for Code
        # without depending on the unichr builtin.
        return u"%c" % Code
    return _Cp1252.get(Code, u"\u007F")

def Iso8859(Str, errors = "strict"):
    """Decode Str to Unicode, translating every high character via Xlate().

    The call shape follows the codec decoder convention so this function can
    stand in for a decoder obtained from codecs.lookup(): it takes the input
    plus an optional error policy and returns a tuple whose first item is
    the decoded text.  The errors argument is accepted only for that
    compatibility and is ignored, because every character AltChar matches
    gets a replacement from Xlate().

    Returns a 1-tuple (decoded_text,).
    """
    Pieces = []
    Pos = 0
    # Single pass over the matches instead of repeatedly re-searching and
    # re-slicing the remainder of the string.
    for Match in AltChar.finditer(Str):
        Pieces.append(Str[Pos:Match.start()])
        Pieces.append(Xlate(Match.group(0)))
        Pos = Match.end()
    Pieces.append(Str[Pos:])
    return (u"".join(Pieces),)

def TranslateToUTF8(CharSet, Str, Errors):
    """Represent a string in UTF-8.

    CharSet is the character set name the text claims to use (empty or None
    means iso-8859-1), Str is the raw text and Errors is the codec error
    policy (for example "ignore") handed to the decoder.

    Returns the text encoded as UTF-8.  Special cases:
      * character set unknown: "(charset) " followed by the text translated
        with the fallback character set from PVars;
      * the fallback character set itself is unknown: Str unchanged;
      * character set known but its codec library is not installed:
        "(charset) " followed by the untouched Str;
      * nothing manages to decode the text: "?".
    """
    import email.Charset

    # Callers have passed names carrying a stray quote (e.g. 'us-ascii"');
    # trim quotes and whitespace so such names still resolve.
    if CharSet:
        CharSet = CharSet.strip(" \t\r\n\"'")
    if not CharSet:
        CharSet = "iso-8859-1"
    CS = email.Charset.Charset(CharSet)
    CharSet = CS.input_charset

    # Find appropriate decoder
    if CharSet in ("iso-8859-1", "us-ascii", "us_ascii"):
        Decoder = Iso8859
    else:
        try:
            Decoder = codecs.lookup(CharSet)[1]
        except LookupError:
            try:
                # Is it GB2312?
                if CharSet == "gb2312":
                    import chinese.gb2312
                    Lib = chinese.gb2312
                # Is it GBK?
                elif CharSet == "gbk":
                    import chinese.gbk
                    Lib = chinese.gbk
                # Is it Big5?
                elif CharSet == "big5":
                    import chinese.big5
                    Lib = chinese.big5
                # Is it iso-2022-jp?
                elif CharSet == "iso-2022-jp":
                    import japanese.iso_2022_jp_ext
                    Lib = japanese.iso_2022_jp_ext
                # Don't recognize it.  Was it our fallback?
                # NOTE(review): PVars is not defined or imported in the
                # visible part of this module; presumably the application
                # provides it -- confirm.
                elif CharSet == PVars[("General", "CSEncoding")]:
                    # It was our fallback!  Give up now!
                    return Str
                # Mark it and use the fallback
                else:
                    return "(%s) %s" % (CharSet,
                        TranslateToUTF8(PVars[("General", "CSEncoding")],
                                        Str, Errors))
                Decoder = Lib.Codec().decode
            except ImportError:
                # We know what it was, but we don't have the library
                # installed.
                return "(%s) %s" % (CharSet, Str)

    # Decode string to Unicode
    try:
        # The error policy is passed positionally: the built-in codec
        # functions reject keyword arguments, which would silently skip
        # this attempt and lose the requested policy.
        Uni = Decoder(Str, Errors)[0]
    except Exception:
        try:
            # Decoder that takes no error policy at all.
            Uni = Decoder(Str)[0]
        except Exception:
            try:
                # what?  it claimed to be this character set but won't
                # decode?  try iso-8859 as a last resort
                Uni = Iso8859(Str)[0]
            except Exception:
                # total failure - we were lied to and can't figure out the
                # character set.
                return "?"
    # Encode for UTF-8
    return UTF8(Uni)[0]

Andreas Plachy wrote:
> Hi all!
>
> We often have problems with spam mails that have special characters in
> the header.
> tmda-cgi then stays unusable until I delete the pending file or
> remove the special chars in it.
>
> What do we have to change to bypass this Unicode error?
>
> In this example the "From"-field looks like this: "From: Viagra.com Inc
> ® <[EMAIL PROTECTED]>"
>
> Hope someone has an answer...
>
> so long,
> Andreas
>
>
>
>
> UnicodeDecodeError Python 2.3.4: /usr/bin/python
> Wed Sep 5 07:17:50 2007 
>
> A problem occurred in a Python script. Here is the sequence of function
> calls leading up to the error, in the order they occurred.
>
>  /usr/src/tmda-cgi-0.13/tmda-cgi.py  
>   188   elif Cmd == "pending":
>  
>   189     import PendList
>  
>   190     Call(PendList)
>  
>   191   elif Cmd == "restore":
>  
>   192     pass
>  
> Call = <function Call>, PendList = <module 'PendList' from
> '/usr/src/tmda-cgi-0.13/PendList.pyc'> 
>
>
>  /usr/src/tmda-cgi-0.13/tmda-cgi.py in Call(Library=<module 'PendList'
> from '/usr/src/tmda-cgi-0.13/PendList.pyc'>, Str=None) 
>    86     Library.Show(Str)
>  
>    87   else:
>  
>    88     Library.Show()
>  
>    89 
>  
>    90 # Capture WebUID
>  
> Library = <module 'PendList' from
> '/usr/src/tmda-cgi-0.13/PendList.pyc'>, Library.Show = <function Show> 
>
>
>  /usr/src/tmda-cgi-0.13/PendList.py in Show() 
>   498               value += Unicode.TranslateToUTF8(CharSet,
> decoded[0], "ignore")
>  
>   499           else:
>  
>   500             value += Unicode.TranslateToUTF8(CharSet, decoded[0],
> "ignore")
>  
>   501         From = value
>  
>   502         Temp = Address.search(From)
>  
> value = '', global Unicode = <module 'Unicode' from
> '/usr/src/tmda-cgi-0.13/Unicode.pyc'>, Unicode.TranslateToUTF8 =
> <function TranslateToUTF8>, CharSet = 'us-ascii"', decoded =
> ('Viagra.com Inc \xae <[EMAIL PROTECTED]>', None) 
>
>
>  /usr/src/tmda-cgi-0.13/Unicode.py in
> TranslateToUTF8(CharSet='us-ascii"', Str='Viagra.com Inc \xae
> <[EMAIL PROTECTED]>', Errors='ignore') 
>   113     Uni = Decoder(Str, errors = Errors)[0]
>  
>   114   except:
>  
>   115     Uni = Decoder(Str)[0]
>  
>   116 
>  
>   117   # Encode for UTF-8
>  
> Uni undefined, Decoder = <built-in function ascii_decode>, Str =
> 'Viagra.com Inc \xae <[EMAIL PROTECTED]>' 
>
>
> UnicodeDecodeError: 'ascii' codec can't decode byte 0xae in position 15:
> ordinal not in range(128) 
>       args = ('ascii', 'Viagra.com Inc \xae <[EMAIL PROTECTED]>', 15, 16,
> 'ordinal not in range(128)') 
>       encoding = 'ascii' 
>       end = 16 
>       object = 'Viagra.com Inc \xae <[EMAIL PROTECTED]>' 
>       reason = 'ordinal not in range(128)' 
>       start = 15 
>
>   
> ------------------------------------------------------------------------
>
> _____________________________________________
> tmda-users mailing list (tmda-users@tmda.net)
> http://tmda.net/lists/listinfo/tmda-users

-- 
This message has been scanned for viruses and
dangerous content by MailScanner.

_____________________________________________
tmda-users mailing list (tmda-users@tmda.net)
http://tmda.net/lists/listinfo/tmda-users

Re: Unicode Decode Error

Reply via email to