Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv11708/spambayes
Modified Files:
Options.py tokenizer.py
Added Files:
ImageStripper.py
Log Message:
Crude OCR capability based on the ocrad program and netpbm. As bad as
ocrad's text extraction is, this gimmick seems to work pretty well at
catching the current crop of pump-n-dump spams. Unix only until someone
implements similar functionality for Windows.
--- NEW FILE: ImageStripper.py ---
"""
This is the place where we try and discover information buried in images.
"""
import os
import tempfile
import math
import time
try:
    # We have three possibilities for Set:
    # (a) With Python 2.2 and earlier, we use our compatsets class
    # (b) With Python 2.3, we use the sets.Set class
    # (c) With Python 2.4 and later, we use the builtin set class
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set
from spambayes.Options import options
# copied from tokenizer.py - maybe we should split it into pieces...
def log2(n, log=math.log, c=math.log(2)):
    return log(n)/c
# I'm sure this is all wrong for Windows. Someone else can fix it. ;-)
def is_executable(prog):
    # executable by us: owner-execute if we own it, group-execute if our
    # group owns it, or other-execute
    info = os.stat(prog)
    return (info.st_uid == os.getuid() and (info.st_mode & 0100) or
            info.st_gid == os.getgid() and (info.st_mode & 0010) or
            info.st_mode & 0001)

def find_program(prog):
    # return the full path of the first executable named prog on PATH,
    # or "" if none is found
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        program = os.path.join(directory, prog)
        if os.path.exists(program) and is_executable(program):
            return program
    return ""
def find_decoders():
    # check for filters to convert to netpbm
    for decode_jpeg in ["jpegtopnm", "djpeg"]:
        if find_program(decode_jpeg):
            break
    else:
        decode_jpeg = None
    for decode_png in ["pngtopnm"]:
        if find_program(decode_png):
            break
    else:
        decode_png = None
    for decode_gif in ["giftopnm"]:
        if find_program(decode_gif):
            break
    else:
        decode_gif = None
    decoders = {
        "image/jpeg": decode_jpeg,
        "image/gif": decode_gif,
        "image/png": decode_png,
        }
    return decoders
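
# Illustrative only (not used by the code): on a box with netpbm's
# jpegtopnm and giftopnm installed but no PNG converter, find_decoders()
# returns a mapping along the lines of
#     {"image/jpeg": "jpegtopnm",
#      "image/gif": "giftopnm",
#      "image/png": None}
# and decode_parts() below simply skips any content type mapped to None.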
def decode_parts(parts, decoders):
    # decode each image part to a temporary netpbm file, returning the
    # list of pnm file names (concatenated into one image if pnmcat is
    # available)
    pnmfiles = []
    for part in parts:
        decoder = decoders.get(part.get_content_type())
        if decoder is None:
            continue
        try:
            bytes = part.get_payload(decode=True)
        except:
            # malformed base64/quoted-printable data - skip this part
            continue
        if len(bytes) > options["Tokenizer", "max_image_size"]:
            continue            # assume it's just a picture for now
        fd, imgfile = tempfile.mkstemp()
        os.write(fd, bytes)
        os.close(fd)
        fd, pnmfile = tempfile.mkstemp()
        os.close(fd)
        os.system("%s <%s >%s 2>/dev/null" % (decoder, imgfile, pnmfile))
        os.unlink(imgfile)      # the converted pnm is all we need now
        pnmfiles.append(pnmfile)

    if not pnmfiles:
        return

    if len(pnmfiles) > 1:
        if find_program("pnmcat"):
            # stitch the images into one strip so ocrad only runs once
            fd, pnmfile = tempfile.mkstemp()
            os.close(fd)
            os.system("pnmcat -lr %s > %s 2>/dev/null" %
                      (" ".join(pnmfiles), pnmfile))
            for f in pnmfiles:
                os.unlink(f)
            pnmfiles = [pnmfile]

    return pnmfiles
def extract_ocr_info(pnmfiles):
    fd, orf = tempfile.mkstemp()
    os.close(fd)
    textbits = []
    tokens = Set()
    for pnmfile in pnmfiles:
        # ocrad writes its guess at the text to stdout; -x puts summary
        # results (including a "lines N" count) in the orf file
        ocr = os.popen("ocrad -x %s < %s 2>/dev/null" % (orf, pnmfile))
        textbits.append(ocr.read())
        ocr.close()
        for line in open(orf):
            if line.startswith("lines"):
                nlines = int(line.split()[1])
                if nlines:
                    tokens.add("image-text-lines:%d" % int(log2(nlines)))
        os.unlink(pnmfile)
    os.unlink(orf)
    return "\n".join(textbits), tokens
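
# Worked example of the bucketing above (illustrative only): if the
# results file reports "lines 5", then int(log2(5)) == 2, so the token
# generated is "image-text-lines:2".  Bucketing by powers of two keeps
# the classifier from seeing a distinct token for every possible count.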
class ImageStripper:
    def analyze(self, parts):
        if not parts:
            return "", Set()
        # need ocrad
        if not find_program("ocrad"):
            return "", Set()
        decoders = find_decoders()
        pnmfiles = decode_parts(parts, decoders)
        if not pnmfiles:
            return "", Set()
        return extract_ocr_info(pnmfiles)
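
if __name__ == "__main__":
    # Minimal demo sketch, not part of the real processing path: read a
    # message from a file named on the command line, pull out its image
    # parts with the standard email package, and dump whatever ocrad
    # managed to recover.  tokenizer.py builds its own parts list before
    # calling analyze(); this is just for poking at the module by hand.
    import sys
    import email
    msg = email.message_from_file(open(sys.argv[1]))
    parts = [part for part in msg.walk()
             if part.get_content_maintype() == "image"]
    text, tokens = ImageStripper().analyze(parts)
    print tokens
    print text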
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.135
retrieving revision 1.136
diff -C2 -d -r1.135 -r1.136
*** Options.py 6 Aug 2006 16:58:31 -0000 1.135
--- Options.py 6 Aug 2006 17:09:05 -0000 1.136
***************
*** 125,128 ****
--- 125,142 ----
BOOLEAN, RESTORE),
+ ("x-crack_images", _("Look inside images for text"), False,
+ _("""(EXPERIMENTAL) If true, generate tokens based on the
+ (hopefully) text content contained in any images in each message.
+ The current support is minimal, relies on the installation of
+ ocrad (http://www.gnu.org/software/ocrad/ocrad.html) and netpbm.
+ It is almost certainly only useful in its current form on Unix-like
+ machines."""),
+ BOOLEAN, RESTORE),
+
+ ("max_image_size", _("Max image size to try OCR-ing"), 100000,
+ _("""When crack_images is enabled, this specifies the largest
+ image to try OCR on."""),
+ INTEGER, RESTORE),
+
("count_all_header_lines", _("Count all header lines"), False,
_("""Generate tokens just counting the number of instances of each kind
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** tokenizer.py 6 Aug 2006 16:58:31 -0000 1.41
--- tokenizer.py 6 Aug 2006 17:09:05 -0000 1.42
***************
*** 1618,1621 ****
--- 1618,1629 ----
yield "image-size:2**%d" % round(log2(len(text)))
+ if options["Tokenizer", "x-crack_images"]:
+ from spambayes.ImageStripper import ImageStripper
+ text, tokens = ImageStripper().analyze(parts)
+ for t in tokens:
+ yield t
+ for t in self.tokenize_text(text):
+ yield t
+
# Find, decode (base64, qp), and tokenize textual parts of the body.
for part in textparts(msg):