Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv8064/spambayes

Modified Files:
        Options.py tokenizer.py 
Log Message:
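Add an image-size token, enabled with the x-image_size option.  Uses the
usual log2() gimmick: each image's decoded payload length is reported as a
rounded power-of-two bucket (e.g. "image-size:2**12").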


Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.134
retrieving revision 1.135
diff -C2 -d -r1.134 -r1.135
*** Options.py  6 Aug 2006 16:52:54 -0000       1.134
--- Options.py  6 Aug 2006 16:58:31 -0000       1.135
***************
*** 120,123 ****
--- 120,128 ----
       FILE, RESTORE),
  
+     ("x-image_size", _("Generate image size tokens"), False,
+      _("""(EXPERIMENTAL) If true, generate tokens based on the sizes of
+      embedded images."""),
+      BOOLEAN, RESTORE),
+ 
      ("count_all_header_lines", _("Count all header lines"), False,
       _("""Generate tokens just counting the number of instances of each kind

Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** tokenizer.py        6 Aug 2006 16:52:54 -0000       1.40
--- tokenizer.py        6 Aug 2006 16:58:31 -0000       1.41
***************
*** 636,639 ****
--- 636,647 ----
                        msg.walk()))
  
+ def imageparts(msg):
+     """Return a list of all msg parts with type 'image/*'."""
+     # Don't want a set here because we want to be able to process them in
+     # order.
+     return filter(lambda part:
+                   part.get_content_type().startswith('image/'),
+                   msg.walk())
+ 
  has_highbit_char = re.compile(r"[\x80-\xff]").search
  
***************
*** 1592,1595 ****
--- 1600,1621 ----
                                                   "octet_prefix_size"]]
  
+         parts = imageparts(msg)
+         if options["Tokenizer", "x-image_size"]:
+             # Find image/* parts of the body, calculating the log(size) of
+             # each image.
+             
+             for part in parts:
+                 try:
+                     text = part.get_payload(decode=True)
+                 except:
+                     yield "control: couldn't decode image"
+                     text = part.get_payload(decode=False)
+ 
+                 if text is None:
+                     yield "control: image payload is None"
+                     continue
+ 
+                 yield "image-size:2**%d" % round(log2(len(text)))
+ 
          # Find, decode (base64, qp), and tokenize textual parts of the body.
          for part in textparts(msg):

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to