Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv21273/spambayes
Modified Files:
ImageStripper.py Options.py tokenizer.py
Log Message:
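Use PIL to decode input images if available (faster, much more robust, and
platform-independent). Add a token cache for the OCR output to speed up
that operation. Slight API change for the OCR code: a single ImageStripper
instance is now created when the module is imported and is used for all
analysis; callers import and call crack_images(parts) instead of
instantiating ImageStripper themselves.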
Index: ImageStripper.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/ImageStripper.py,v
retrieving revision 1.1
retrieving revision 1.2
diff -C2 -d -r1.1 -r1.2
*** ImageStripper.py 6 Aug 2006 17:09:04 -0000 1.1
--- ImageStripper.py 10 Aug 2006 04:07:59 -0000 1.2
***************
*** 3,10 ****
--- 3,28 ----
"""
+ from __future__ import division
+
+ import sys
import os
import tempfile
import math
import time
+ import md5
+ import atexit
+ try:
+ import cPickle as pickle
+ except ImportError:
+ import pickle
+ try:
+ import cStringIO as StringIO
+ except ImportError:
+ import StringIO
+
+ try:
+ from PIL import Image
+ except ImportError:
+ Image = None
try:
***************
*** 65,128 ****
return decoders
! def decode_parts(parts, decoders):
! pnmfiles = []
! for part in parts:
! decoder = decoders.get(part.get_content_type())
! if decoder is None:
! continue
! try:
! bytes = part.get_payload(decode=True)
! except:
! continue
! if len(bytes) > options["Tokenizer", "max_image_size"]:
! continue # assume it's just a picture for now
! fd, imgfile = tempfile.mkstemp()
! os.write(fd, bytes)
! os.close(fd)
! fd, pnmfile = tempfile.mkstemp()
! os.close(fd)
! os.system("%s <%s >%s 2>dev.null" % (decoder, imgfile, pnmfile))
! pnmfiles.append(pnmfile)
! if not pnmfiles:
! return
- if len(pnmfiles) > 1:
- if find_program("pnmcat"):
fd, pnmfile = tempfile.mkstemp()
os.close(fd)
! os.system("pnmcat -lr %s > %s 2>/dev/null" %
! (" ".join(pnmfiles), pnmfile))
! for f in pnmfiles:
! os.unlink(f)
! pnmfiles = [pnmfile]
! return pnmfiles
! def extract_ocr_info(pnmfiles):
! fd, orf = tempfile.mkstemp()
! os.close(fd)
! textbits = []
! tokens = Set()
! for pnmfile in pnmfiles:
! ocr = os.popen("ocrad -x %s < %s 2>/dev/null" % (orf, pnmfile))
! textbits.append(ocr.read())
! ocr.close()
! for line in open(orf):
! if line.startswith("lines"):
! nlines = int(line.split()[1])
! if nlines:
! tokens.add("image-text-lines:%d" % int(log2(nlines)))
! os.unlink(pnmfile)
! os.unlink(orf)
! return "\n".join(textbits), tokens
- class ImageStripper:
def analyze(self, parts):
if not parts:
--- 83,211 ----
return decoders
! def imconcat(im1, im2):
! # concatenate im1 and im2 left-to-right
! w1, h1 = im1.size
! w2, h2 = im2.size
! im3 = Image.new("RGB", (w1+w2, max(h1, h2)))
! im3.paste(im1, (0, 0))
! im3.paste(im2, (0, w1))
! return im3
! class ImageStripper:
! def __init__(self, cachefile=""):
! self.cachefile = os.path.expanduser(cachefile)
! if os.path.exists(self.cachefile):
! self.cache = pickle.load(open(self.cachefile))
! else:
! self.cache = {}
! self.misses = self.hits = 0
! if self.cachefile:
! atexit.register(self.close)
! def NetPBM_decode_parts(self, parts, decoders):
! pnmfiles = []
! for part in parts:
! decoder = decoders.get(part.get_content_type())
! if decoder is None:
! continue
! try:
! bytes = part.get_payload(decode=True)
! except:
! continue
! if len(bytes) > options["Tokenizer", "max_image_size"]:
! continue # assume it's just a picture for now
! fd, imgfile = tempfile.mkstemp()
! os.write(fd, bytes)
! os.close(fd)
fd, pnmfile = tempfile.mkstemp()
os.close(fd)
! os.system("%s <%s >%s 2>dev.null" % (decoder, imgfile, pnmfile))
! pnmfiles.append(pnmfile)
! os.unlink(imgfile)
! if not pnmfiles:
! return
! if len(pnmfiles) > 1:
! if find_program("pnmcat"):
! fd, pnmfile = tempfile.mkstemp()
! os.close(fd)
! os.system("pnmcat -lr %s > %s 2>/dev/null" %
! (" ".join(pnmfiles), pnmfile))
! for f in pnmfiles:
! os.unlink(f)
! pnmfiles = [pnmfile]
! return pnmfiles
! def PIL_decode_parts(self, parts):
! full_image = None
! for part in parts:
! try:
! bytes = part.get_payload(decode=True)
! except:
! continue
! if len(bytes) > options["Tokenizer", "max_image_size"]:
! continue # assume it's just a picture for now
!
! # We're dealing with spammers here - who knows what garbage they
! # will call a GIF image to entice you to open it?
! try:
! image = Image.open(StringIO.StringIO(bytes))
! image.load()
! except IOError:
! continue
! else:
! image = image.convert("RGB")
!
! if full_image is None:
! full_image = image
! else:
! full_image = imconcat(full_image, image)
!
! if not full_image:
! return
!
! fd, pnmfile = tempfile.mkstemp()
! os.close(fd)
! full_image.save(open(pnmfile, "wb"), "PPM")
!
! return [pnmfile]
!
! def extract_ocr_info(self, pnmfiles):
! fd, orf = tempfile.mkstemp()
! os.close(fd)
!
! textbits = []
! tokens = Set()
! for pnmfile in pnmfiles:
! fhash = md5.new(open(pnmfile).read()).hexdigest()
! if fhash in self.cache:
! self.hits += 1
! ctext, ctokens = self.cache[fhash]
! else:
! self.misses += 1
! ocr = os.popen("ocrad -x %s < %s 2>/dev/null" % (orf,
pnmfile))
! ctext = ocr.read().lower()
! ocr.close()
! ctokens = set()
! for line in open(orf):
! if line.startswith("lines"):
! nlines = int(line.split()[1])
! if nlines:
! ctokens.add("image-text-lines:%d" %
! int(log2(nlines)))
! self.cache[fhash] = (ctext, ctokens)
! textbits.append(ctext)
! tokens |= ctokens
! os.unlink(pnmfile)
! os.unlink(orf)
!
! return "\n".join(textbits), tokens
def analyze(self, parts):
if not parts:
***************
*** 133,143 ****
return "", Set()
! decoders = find_decoders()
! pnmfiles = decode_parts(parts, decoders)
! if not pnmfiles:
! return "", Set()
! return extract_ocr_info(pnmfiles)
!
--- 216,240 ----
return "", Set()
! if Image is not None:
! pnmfiles = self.PIL_decode_parts(parts)
! else:
! pnmfiles = self.NetPBM_decode_parts(parts, find_decoders())
! if pnmfiles:
! return self.extract_ocr_info(pnmfiles)
! return "", Set()
!
! def close(self):
! if options["globals", "verbose"]:
! print >> sys.stderr, "saving", len(self.cache),
! print >> sys.stderr, "items to", self.cachefile,
! if self.hits + self.misses:
! print >> sys.stderr, "%.2f%% hit rate" % \
! (100 * self.hits / (self.hits + self.misses)),
! print >> sys.stderr
! pickle.dump(self.cache, open(self.cachefile, "wb"))
!
! _cachefile = options["Tokenizer", "crack_image_cache"]
! crack_images = ImageStripper(_cachefile).analyze
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.136
retrieving revision 1.137
diff -C2 -d -r1.136 -r1.137
*** Options.py 6 Aug 2006 17:09:05 -0000 1.136
--- Options.py 10 Aug 2006 04:07:59 -0000 1.137
***************
*** 118,122 ****
token store (only dbm and zodb supported so far, zodb has problems,
dbm is untested, hence the default)."""),
! FILE, RESTORE),
("x-image_size", _("Generate image size tokens"), False,
--- 118,122 ----
token store (only dbm and zodb supported so far, zodb has problems,
dbm is untested, hence the default)."""),
! PATH, RESTORE),
("x-image_size", _("Generate image size tokens"), False,
***************
*** 134,137 ****
--- 134,142 ----
BOOLEAN, RESTORE),
+ ("crack_image_cache", _("Cache to speed up ocr."), "",
+ _("""If non-empty, names a file from which to read cached ocr info
+ at start and to which to save that info at exit."""),
+ PATH, RESTORE),
+
("max_image_size", _("Max image size to try OCR-ing"), 100000,
_("""When crack_images is enabled, this specifies the largest
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.44
retrieving revision 1.45
diff -C2 -d -r1.44 -r1.45
*** tokenizer.py 7 Aug 2006 02:47:10 -0000 1.44
--- tokenizer.py 10 Aug 2006 04:07:59 -0000 1.45
***************
*** 1636,1641 ****
if options["Tokenizer", "x-crack_images"]:
! from spambayes.ImageStripper import ImageStripper
! text, tokens = ImageStripper().analyze(parts)
for t in tokens:
yield t
--- 1636,1641 ----
if options["Tokenizer", "x-crack_images"]:
! from spambayes.ImageStripper import crack_images
! text, tokens = crack_images(parts)
for t in tokens:
yield t
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins