Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv3625

Modified Files:
        ImageStripper.py Options.py tokenizer.py 
Log Message:
Add gocr support


Index: ImageStripper.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v
retrieving revision 1.11
retrieving revision 1.12
diff -C2 -d -r1.11 -r1.12
*** ImageStripper.py    2 Dec 2006 22:09:25 -0000       1.11
--- ImageStripper.py    12 Feb 2007 11:24:59 -0000      1.12
***************
*** 26,29 ****
--- 26,41 ----
      Image = None
  
+ # The email mime object carrying the image data can have a special attribute
+ # which indicates that a message had an image, but it was large (ie, larger
+ # than the 'max_image_size' option.)  This allows the provider of the email
+ # object to avoid loading huge images into memory just to have this image
+ # stripper ignore it.
+ # If the attribute exists, it should be the size of the image (we assert it
+ # is > max_image_size).  The image payload is ignored.
+ # A 'cleaner' option would be to look at a header - but an attribute was
+ # chosen to avoid spammers getting wise and 'injecting' the header into the
+ # message body of a mime section.
+ image_large_size_attribute = "spambayes_image_large_size"
+ 
  try:
      # We have three possibilities for Set:
***************
*** 55,64 ****
      path = os.environ.get("PATH", "").split(os.pathsep)
      if sys.platform == "win32":
-         # Outlook plugin puts executables in (for example):
-         #    C:/Program Files/SpamBayes/bin
-         # so add that directory to the path and make sure we
-         # look for a file ending in ".exe".
-         path.append(os.path.dirname(sys.executable))
          prog = "%s.exe" % prog
      for directory in path:
          program = os.path.join(directory, prog)
--- 67,84 ----
      path = os.environ.get("PATH", "").split(os.pathsep)
      if sys.platform == "win32":
          prog = "%s.exe" % prog
+         if hasattr(sys, "frozen"): # a binary (py2exe) build..
+             # Outlook plugin puts executables in (for example):
+             #    C:/Program Files/SpamBayes/bin
+             # so add that directory to the path and make sure we
+             # look for a file ending in ".exe".
+             # Put it at the *start* of the paths we search - who knows
+             # what else we may encounter in the wild!
+             path.insert(0, os.path.dirname(sys.executable))
+         else:
+             # a source build - for testing, allow it in SB package dir.
+             import spambayes
+             path.insert(0, os.path.abspath(spambayes.__path__[0]))
+ 
      for directory in path:
          program = os.path.join(directory, prog)
***************
*** 89,100 ****
      tokens = Set()
      rows = []
      for part in parts:
!         try:
!             bytes = part.get_payload(decode=True)
!         except:
!             tokens.add("invalid-image:%s" % part.get_content_type())
!             continue
  
!         if len(bytes) > options["Tokenizer", "max_image_size"]:
              tokens.add("image:big")
              continue                # assume it's just a picture for now
--- 109,130 ----
      tokens = Set()
      rows = []
+     max_image_size = options["Tokenizer", "max_image_size"]
      for part in parts:
!         # See 'image_large_size_attribute' above - the provider may have seen
!         # an image, but optimized the fact we don't bother processing large
!         # images.
!         nbytes = getattr(part, image_large_size_attribute, None)
!         if nbytes is None: # no optimization - process normally...
!             try:
!                 bytes = part.get_payload(decode=True)
!                 nbytes = len(bytes)
!             except:
!                 tokens.add("invalid-image:%s" % part.get_content_type())
!                 continue
!         else:
!             # optimization should not have removed images smaller than our max
!             assert nbytes > max_image_size, (len(bytes), max_image_size)
  
!         if nbytes > max_image_size:
              tokens.add("image:big")
              continue                # assume it's just a picture for now
***************
*** 157,161 ****
          full_image = imconcattb(full_image, image)
  
!     fd, pnmfile = tempfile.mkstemp()
      os.close(fd)
      full_image.save(open(pnmfile, "wb"), "PPM")
--- 187,191 ----
          full_image = imconcattb(full_image, image)
  
!     fd, pnmfile = tempfile.mkstemp('-spambayes-image')
      os.close(fd)
      full_image.save(open(pnmfile, "wb"), "PPM")
***************
*** 163,166 ****
--- 193,288 ----
      return [pnmfile], tokens
  
+ class OCREngine(object):
+     """Base class for an OCR "engine" that extracts text.  Ideally would
+        also deal with image format (as different engines will have different
+        requirements), but all currently supported ones deal with the PNM
+        formats (ppm/pgm/pbm)
+     """
+     engine_name = None # sub-classes should override.
+     def __init__(self):
+         pass
+ 
+     def is_enabled(self):
+         """Return true if this engine is able to be used.  Note that
+            returning true only means it is *capable* of being used - not that
+            it is enabled.  eg, it should check the program it needs to use
+            is installed, etc.
+         """
+         raise NotImplementedError
+ 
+     def extract_text(self, pnmfiles):
+         """Extract the text as an unprocessed stream (but as a string).
+            Typically this will be the raw output from the OCR engine.
+         """
+         raise NotImplementedError
+ 
+ class OCRExecutableEngine(OCREngine):
+     """Uses a simple executable that writes to stdout to extract the text"""
+     program_name = None
+     def __init__(self):
+         # we go looking for the program on first use and cache its location
+         self._program = None
+         OCREngine.__init__(self)
+ 
+     def is_enabled(self):
+         return self.program is not None
+ 
+     def get_program(self):
+         # by default, executable is same as engine name
+         if not self._program:
+             self._program = find_program(self.engine_name)
+         return self._program
+     
+     program = property(get_program)
+ 
+ class OCREngineOCRAD(OCRExecutableEngine):
+     engine_name = "ocrad"
+ 
+     def extract_text(self, pnmfile):
+         assert self.is_enabled(), "I'm not working!"
+         scale = options["Tokenizer", "ocrad_scale"] or 1
+         charset = options["Tokenizer", "ocrad_charset"]
+         ocr = os.popen('%s -s %s -c %s -f "%s" 2>%s' %
+                        (self.program, scale, charset,
+                         pnmfile, os.path.devnull))
+         ret = ocr.read()
+         ocr.close()
+         return ret
+ 
+ class OCREngineGOCR(OCRExecutableEngine):
+     engine_name="gocr"
+ 
+     def extract_text(self, pnmfile):
+         assert self.is_enabled(), "I'm not working!"
+         ocr = os.popen('%s "%s" 2>%s' %
+                        (self.program, pnmfile, os.path.devnull))
+         ret = ocr.read()
+         ocr.close()
+         return ret
+ 
+ # This lists all engines, with the first listed that is enabled winning.
+ # Matched with the engine name, as specified in Options.py, via the
+ # 'engine_name' attribute on the class.
+ _ocr_engines = [
+     OCREngineGOCR,
+     OCREngineOCRAD,
+ ]
+ 
+ def get_engine(engine_name):
+     if not engine_name:
+         candidates = _ocr_engines
+     else:
+         for e in _ocr_engines:
+             if e.engine_name == engine_name:
+                 candidates = [e]
+                 break
+         else:
+             candidates = []
+     for candidate in candidates:
+         engine = candidate()
+         if engine.is_enabled():
+             return engine
+     return None
+ 
  class ImageStripper:
      def __init__(self, cachefile=""):
***************
*** 173,182 ****
          if self.cachefile:
              atexit.register(self.close)
! 
      def extract_ocr_info(self, pnmfiles):
          textbits = []
          tokens = Set()
-         scale = options["Tokenizer", "ocrad_scale"] or 1
-         charset = options["Tokenizer", "ocrad_charset"]
          for pnmfile in pnmfiles:
              fhash = md5.new(open(pnmfile).read()).hexdigest()
--- 295,304 ----
          if self.cachefile:
              atexit.register(self.close)
!         self.engine = None
!     
      def extract_ocr_info(self, pnmfiles):
+         assert self.engine, "must have an engine!"
          textbits = []
          tokens = Set()
          for pnmfile in pnmfiles:
              fhash = md5.new(open(pnmfile).read()).hexdigest()
***************
*** 186,194 ****
              else:
                  self.misses += 1
!                 ocr = os.popen('%s -s %s -c %s -f "%s" 2>%s' %
!                                (find_program("ocrad"), scale, charset,
!                                 pnmfile, os.path.devnull))
!                 ctext = ocr.read().lower()
!                 ocr.close()
                  ctokens = set()
                  if not ctext.strip():
--- 308,322 ----
              else:
                  self.misses += 1
!                 if self.engine.program:
!                     ctext = self.engine.extract_text(pnmfile).lower()
!                 else:
!                     # We should not get here if no OCR is enabled.  If it
!                     # is enabled and we have no program, its OK to spew lots
!                     # of warnings - they should either disable OCR (it is by
!                     # default), or fix their config.
!                     print >> sys.stderr, \
!                           "No OCR program '%s' available - can't get text!" \
!                           % (self.engine.program_name,)
!                     ctext = ""
                  ctokens = set()
                  if not ctext.strip():
***************
*** 208,217 ****
          return "\n".join(textbits), tokens
  
!     def analyze(self, parts):
!         if not parts:
              return "", Set()
  
!         # need ocrad
!         if not find_program("ocrad"):
              return "", Set()
  
--- 336,353 ----
          return "\n".join(textbits), tokens
  
!     def analyze(self, engine_name, parts):
!         # check engine hasn't changed...
!         if self.engine is not None and self.engine.engine_name!=engine_name:
!             self.engine = None
!         # check engine exists and is valid
!         if self.engine is None:
!             self.engine = get_engine(engine_name)
!         if self.engine is None:
!             # We only get here if explicitly enabled - spewing msgs is ok.
!             print >> sys.stderr, "invalid engine name '%s' - OCR disabled" \
!                                  % (engine_name,)
              return "", Set()
  
!         if not parts:
              return "", Set()
  

Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.139
retrieving revision 1.140
diff -C2 -d -r1.139 -r1.140
*** Options.py  9 Sep 2006 23:02:20 -0000       1.139
--- Options.py  12 Feb 2007 11:24:59 -0000      1.140
***************
*** 129,137 ****
       (hopefully) text content contained in any images in each message.
       The current support is minimal, relies on the installation of
!      ocrad (http://www.gnu.org/software/ocrad/ocrad.html) and PIL.
!      It is almost certainly only useful in its current form on Unix-like
!      machines."""),
       BOOLEAN, RESTORE),
  
      ("crack_image_cache", _("Cache to speed up ocr."), "",
       _("""If non-empty, names a file from which to read cached ocr info
--- 129,145 ----
       (hopefully) text content contained in any images in each message.
       The current support is minimal, relies on the installation of
!      an OCR 'engine' (see x-ocr_engine.)"""),
       BOOLEAN, RESTORE),
  
+     ("x-ocr_engine", _("OCR engine to use"), "",
+      _("""(EXPERIMENTAL) The name of the OCR engine to use.  If empty, all
+      supported engines will be checked to see if they are installed.
+      Engines currently supported include ocrad
+      (http://www.gnu.org/software/ocrad/ocrad.html) and gocr
+      (http://jocr.sourceforge.net/download.html) and they require the
+      appropriate executable be installed in either your PATH, or in the
+      main spambayes directory."""),
+      HEADER_VALUE, RESTORE),
+ 
      ("crack_image_cache", _("Cache to speed up ocr."), "",
       _("""If non-empty, names a file from which to read cached ocr info

Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.46
retrieving revision 1.47
diff -C2 -d -r1.46 -r1.47
*** tokenizer.py        13 Dec 2006 14:44:49 -0000      1.46
--- tokenizer.py        12 Feb 2007 11:25:00 -0000      1.47
***************
*** 1619,1623 ****
              # Find image/* parts of the body, calculating the log(size) of
              # each image.
!             
              total_len = 0
              for part in parts:
--- 1619,1623 ----
              # Find image/* parts of the body, calculating the log(size) of
              # each image.
! 
              total_len = 0
              for part in parts:
***************
*** 1636,1641 ****
  
          if options["Tokenizer", "x-crack_images"]:
              from spambayes.ImageStripper import crack_images
!             text, tokens = crack_images(parts)
              for t in tokens:
                  yield t
--- 1636,1642 ----
  
          if options["Tokenizer", "x-crack_images"]:
+             engine_name = options["Tokenizer", 'x-ocr_engine']
              from spambayes.ImageStripper import crack_images
!             text, tokens = crack_images(engine_name, parts)
              for t in tokens:
                  yield t

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to