Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5654/spambayes
Modified Files:
storage.py
Log Message:
Fix [ 1187208 ] import into CDB chokes on 8-bit chars
Index: storage.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/storage.py,v
retrieving revision 1.50
retrieving revision 1.51
diff -C2 -d -r1.50 -r1.51
*** storage.py 21 Apr 2005 07:16:48 -0000 1.50
--- storage.py 22 Apr 2005 04:08:25 -0000 1.51
***************
*** 65,68 ****
--- 65,69 ----
import os
import sys
+ import types
from spambayes import classifier
from spambayes.Options import options, get_pathname_option
***************
*** 620,623 ****
--- 621,636 ----
return wi
+ # Stolen from sb_dbexpimp.py
+ # Heaven only knows what encoding non-ASCII stuff will be in
+ # Try a few common western encodings and punt if they all fail
+ def uunquote(self, s):
+ for encoding in ("utf-8", "cp1252", "iso-8859-1"):
+ try:
+ return unicode(s, encoding)
+ except UnicodeDecodeError:
+ pass
+ # punt
+ return s
+
def load(self):
if os.path.exists(self.db_name):
***************
*** 627,631 ****
self.nham, self.nspam = [int(i) for i in \
data[self.statekey].split(',')]
! self.wordinfo = dict([(k, self._WordInfoFactory(v)) \
for k, v in data.iteritems() \
if k != self.statekey])
--- 640,645 ----
self.nham, self.nspam = [int(i) for i in \
data[self.statekey].split(',')]
! self.wordinfo = dict([(self.uunquote(k),
! self._WordInfoFactory(v)) \
for k, v in data.iteritems() \
if k != self.statekey])
***************
*** 645,648 ****
--- 659,664 ----
items = [(self.statekey, "%d,%d" % (self.nham, self.nspam))]
for word, wi in self.wordinfo.iteritems():
+ if isinstance(word, types.UnicodeType):
+ word = word.encode("utf-8")
items.append((word, "%d,%d" % (wi.hamcount, wi.spamcount)))
db = open(self.db_name, "wb")
_______________________________________________
Spambayes-checkins mailing list
Spambayes-checkins@python.org
http://mail.python.org/mailman/listinfo/spambayes-checkins