Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv11708/spambayes

Modified Files:
        Options.py tokenizer.py 
Added Files:
        ImageStripper.py 
Log Message:
Crude OCR capability based on the ocrad program and netpbm.  As bad as
ocrad's text extraction is, this gimmick seems to work pretty well at
catching the current crop of pump-n-dump spams.  Unix only until someone
implements similar functionality for Windows.


--- NEW FILE: ImageStripper.py ---
"""
This is the place where we try and discover information buried in images.
"""

import os
import tempfile
import math
import time

try:
    # We have three possibilities for Set:
    #  (a) With Python 2.2 and earlier, we use our compatsets class
    #  (b) With Python 2.3, we use the sets.Set class
    #  (c) With Python 2.4 and later, we use the builtin set class
    Set = set
except NameError:
    try:
        from sets import Set
    except ImportError:
        from spambayes.compatsets import Set

from spambayes.Options import options

# copied from tokenizer.py - maybe we should split it into pieces...
def log2(n, log=math.log, c=math.log(2)):
    """Return the base-2 logarithm of n.

    The log and c defaults pre-bind math.log and ln(2) at definition
    time so the hot tokenizing path avoids repeated global lookups.
    """
    return log(n) / c

def is_executable(prog):
    """Return true if prog can be executed by the current process.

    Uses os.access(X_OK), which asks the OS directly and so honours
    supplementary groups and ACLs, works on Windows, and evaluates
    exactly one permission class.  The previous hand-rolled st_mode
    check incorrectly fell through from the owner class to the group
    and other bits when the owner lacked execute permission.
    """
    return os.access(prog, os.X_OK)

def find_program(prog):
    """Search the PATH environment variable for an executable named prog.

    Returns the full path of the first match, or "" when no directory
    on the PATH holds an executable by that name.
    """
    search_path = os.environ.get("PATH", "")
    for directory in search_path.split(os.pathsep):
        candidate = os.path.join(directory, prog)
        if os.path.exists(candidate) and is_executable(candidate):
            return candidate
    return ""

def find_decoders():
    """Map image MIME types to available netpbm converter programs.

    For each supported content type, pick the first converter from its
    candidate list that is present on the PATH; the value is None when
    none is installed.  Note that the bare program name is stored, not
    its full path.
    """
    def first_available(candidates):
        # Return the first candidate found on the PATH, else None.
        for name in candidates:
            if find_program(name):
                return name
        return None

    return {
        "image/jpeg": first_available(["jpegtopnm", "djpeg"]),
        "image/gif": first_available(["giftopnm"]),
        "image/png": first_available(["pngtopnm"]),
        }

def decode_parts(parts, decoders):
    """Convert decodable image message parts to temporary netpbm files.

    parts is a sequence of email message image parts; decoders maps
    MIME content types to netpbm converter program names (as built by
    find_decoders).  Parts with no available decoder, parts whose
    payload fails to decode, and parts larger than the
    [Tokenizer] max_image_size option are silently skipped.  When more
    than one image converts successfully and pnmcat is available, the
    images are concatenated left-to-right into a single file.

    Returns a list of temporary pnm filenames (the caller is
    responsible for unlinking them), or None if nothing converted.
    """
    pnmfiles = []
    for part in parts:
        decoder = decoders.get(part.get_content_type())
        if decoder is None:
            continue
        try:
            bytes = part.get_payload(decode=True)
        except Exception:
            # Best effort - a malformed part simply contributes nothing.
            continue

        if len(bytes) > options["Tokenizer", "max_image_size"]:
            continue                # assume it's just a picture for now

        # Dump the raw image bytes to a temp file for the converter.
        fd, imgfile = tempfile.mkstemp()
        os.write(fd, bytes)
        os.close(fd)

        fd, pnmfile = tempfile.mkstemp()
        os.close(fd)
        # Bug fix: stderr used to be redirected to "dev.null", which
        # littered the current directory with a file of that name.
        os.system("%s <%s >%s 2>/dev/null" % (decoder, imgfile, pnmfile))
        # Bug fix: the temporary raw image file was never removed.
        os.unlink(imgfile)
        pnmfiles.append(pnmfile)

    if not pnmfiles:
        return

    if len(pnmfiles) > 1:
        if find_program("pnmcat"):
            # Stitch all the images into one so ocrad makes one pass.
            fd, pnmfile = tempfile.mkstemp()
            os.close(fd)
            os.system("pnmcat -lr %s > %s 2>/dev/null" %
                      (" ".join(pnmfiles), pnmfile))
            for f in pnmfiles:
                os.unlink(f)
            pnmfiles = [pnmfile]

    return pnmfiles

def extract_ocr_info(pnmfiles):
    """Run ocrad over each pnm file and collect what it found.

    Each input file is removed after processing.  Returns a two-tuple:
    the recognized text from all images joined with newlines, and a Set
    of synthetic tokens (currently just a log-scaled count of the text
    lines ocrad reported via its -x results file).
    """
    fd, results_file = tempfile.mkstemp()
    os.close(fd)

    text_chunks = []
    tokens = Set()
    for pnmfile in pnmfiles:
        # ocrad writes recognized text to stdout and extra statistics
        # (including a "lines N" record) to the -x results file.
        pipe = os.popen("ocrad -x %s < %s 2>/dev/null" % (results_file, pnmfile))
        text_chunks.append(pipe.read())
        pipe.close()
        for record in open(results_file):
            if record.startswith("lines"):
                nlines = int(record.split()[1])
                if nlines:
                    tokens.add("image-text-lines:%d" % int(log2(nlines)))

        os.unlink(pnmfile)
    os.unlink(results_file)

    return "\n".join(text_chunks), tokens

class ImageStripper:
    """Mine text out of image attachments using ocrad and netpbm."""

    def analyze(self, parts):
        """Return (text, tokens) extracted from the given image parts.

        parts is a sequence of image message parts.  Both results are
        empty when there is nothing to do: no parts, no ocrad program
        on the PATH, or no part that could be converted to pnm.
        """
        nothing = "", Set()

        if not parts:
            return nothing

        # ocrad performs the actual character recognition; without it
        # there is no point converting anything.
        if not find_program("ocrad"):
            return nothing

        pnmfiles = decode_parts(parts, find_decoders())
        if not pnmfiles:
            return nothing

        return extract_ocr_info(pnmfiles)

        

Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.135
retrieving revision 1.136
diff -C2 -d -r1.135 -r1.136
*** Options.py  6 Aug 2006 16:58:31 -0000       1.135
--- Options.py  6 Aug 2006 17:09:05 -0000       1.136
***************
*** 125,128 ****
--- 125,142 ----
       BOOLEAN, RESTORE),
  
+     ("x-crack_images", _("Look inside images for text"), False,
+      _("""(EXPERIMENTAL) If true, generate tokens based on the
+      (hopefully) text content contained in any images in each message.
+      The current support is minimal, relies on the installation of
+      ocrad (http://www.gnu.org/software/ocrad/ocrad.html) and netpbm.
+      It is almost certainly only useful in its current form on Unix-like
+      machines."""),
+      BOOLEAN, RESTORE),
+ 
+     ("max_image_size", _("Max image size to try OCR-ing"), 100000,
+      _("""When crack_images is enabled, this specifies the largest
+      image to try OCR on."""),
+      INTEGER, RESTORE),
+ 
      ("count_all_header_lines", _("Count all header lines"), False,
       _("""Generate tokens just counting the number of instances of each kind

Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.41
retrieving revision 1.42
diff -C2 -d -r1.41 -r1.42
*** tokenizer.py        6 Aug 2006 16:58:31 -0000       1.41
--- tokenizer.py        6 Aug 2006 17:09:05 -0000       1.42
***************
*** 1618,1621 ****
--- 1618,1629 ----
                  yield "image-size:2**%d" % round(log2(len(text)))
  
+         if options["Tokenizer", "x-crack_images"]:
+             from spambayes.ImageStripper import ImageStripper
+             text, tokens = ImageStripper().analyze(parts)
+             for t in tokens:
+                 yield t
+             for t in self.tokenize_text(text):
+                 yield t
+ 
          # Find, decode (base64, qp), and tokenize textual parts of the body.
          for part in textparts(msg):

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to