Revision: 3196
          http://spambayes.svn.sourceforge.net/spambayes/?rev=3196&view=rev
Author:   montanaro
Date:     2008-11-25 02:02:34 +0000 (Tue, 25 Nov 2008)

Log Message:
-----------
use hashlib.md5 if possible
use safepickle functions
raise SystemError if ocr engine croaks

Modified Paths:
--------------
    trunk/spambayes/spambayes/ImageStripper.py

Modified: trunk/spambayes/spambayes/ImageStripper.py
===================================================================
--- trunk/spambayes/spambayes/ImageStripper.py  2008-11-25 02:00:09 UTC (rev 3195)
+++ trunk/spambayes/spambayes/ImageStripper.py  2008-11-25 02:02:34 UTC (rev 3196)
@@ -8,13 +8,11 @@
 import os
 import tempfile
 import math
-import time
-import md5
-import atexit
 try:
-    import cPickle as pickle
+    from hashlib import md5
 except ImportError:
-    import pickle
+    from md5 import new as md5
+import atexit
 try:
     import cStringIO as StringIO
 except ImportError:
@@ -25,6 +23,8 @@
 except ImportError:
     Image = None
 
+from spambayes.safepickle import pickle_read, pickle_write
+
 # The email mime object carrying the image data can have a special attribute
 # which indicates that a message had an image, but it was large (ie, larger
 # than the 'max_image_size' option.)  This allows the provider of the email
@@ -72,7 +72,7 @@
             #    C:/Program Files/SpamBayes/bin
             # so add that directory to the path and make sure we
             # look for a file ending in ".exe".
-            if sys.frozen=="dll":
+            if sys.frozen == "dll":
                 import win32api
                 sentinal = win32api.GetModuleFileName(sys.frozendllhandle)
             else:
@@ -81,7 +81,7 @@
             # So just use the short version.
             # For the sake of safety, in a binary build we *only* look in
             # our bin dir.
-            path=[win32api.GetShortPathName(os.path.dirname(sentinal))]
+            path = [win32api.GetShortPathName(os.path.dirname(sentinal))]
         else:
             # a source build - for testing, allow it in SB package dir.
             import spambayes
@@ -255,8 +255,8 @@
         ret = ocr.read()
         exit_code = ocr.close()
         if exit_code:
-            print "warning:", self.engine_name, "failed with exit code", exit_code
-            print "command line was:", repr(cmdline)
+            raise SystemError, ("%s failed with exit code %s" %
+                                (self.engine_name, exit_code))
         return ret
 
 class OCREngineOCRAD(OCRExecutableEngine):
@@ -269,7 +269,7 @@
                 (self.program, scale, charset, pnmfile, os.path.devnull)
 
 class OCREngineGOCR(OCRExecutableEngine):
-    engine_name="gocr"
+    engine_name = "gocr"
 
     def get_command_line(self, pnmfile):
         return '%s "%s" 2>%s' % (self.program, pnmfile, os.path.devnull)
@@ -302,7 +302,7 @@
     def __init__(self, cachefile=""):
         self.cachefile = os.path.expanduser(cachefile)
         if os.path.exists(self.cachefile):
-            self.cache = pickle.load(open(self.cachefile))
+            self.cache = pickle_read(self.cachefile)
         else:
             self.cache = {}
         self.misses = self.hits = 0
@@ -315,14 +315,20 @@
         textbits = []
         tokens = Set()
         for pnmfile in pnmfiles:
-            fhash = md5.new(open(pnmfile).read()).hexdigest()
+            preserve = False
+            fhash = md5(open(pnmfile).read()).hexdigest()
             if fhash in self.cache:
                 self.hits += 1
                 ctext, ctokens = self.cache[fhash]
             else:
                 self.misses += 1
                 if self.engine.program:
-                    ctext = self.engine.extract_text(pnmfile).lower()
+                    try:
+                        ctext = self.engine.extract_text(pnmfile).lower()
+                    except SystemError, msg:
+                        print >> sys.stderr, msg
+                        preserve = True
+                        ctext = ""
                 else:
                     # We should not get here if no OCR is enabled.  If it
                     # is enabled and we have no program, its OK to spew lots
@@ -345,13 +351,14 @@
                 self.cache[fhash] = (ctext, ctokens)
             textbits.append(ctext)
             tokens |= ctokens
-            os.unlink(pnmfile)
+            if not preserve:
+                os.unlink(pnmfile)
 
         return "\n".join(textbits), tokens
 
     def analyze(self, engine_name, parts):
         # check engine hasn't changed...
-        if self.engine is not None and self.engine.engine_name!=engine_name:
+        if self.engine is not None and self.engine.engine_name != engine_name:
             self.engine = None
         # check engine exists and is valid
         if self.engine is None:
@@ -385,7 +392,7 @@
                 print >> sys.stderr, "%.2f%% hit rate" % \
                       (100 * self.hits / (self.hits + self.misses)),
             print >> sys.stderr
-        pickle.dump(self.cache, open(self.cachefile, "wb"))
+        pickle_write(self.cachefile, self.cache)
 
 _cachefile = options["Tokenizer", "crack_image_cache"]
 crack_images = ImageStripper(_cachefile).analyze


This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to