Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv8064/spambayes
Modified Files:
Options.py tokenizer.py
Log Message:
Add an image-size token. Enabled with the x-image_size option. Uses the
usual log2() gimmick.
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.134
retrieving revision 1.135
diff -C2 -d -r1.134 -r1.135
*** Options.py 6 Aug 2006 16:52:54 -0000 1.134
--- Options.py 6 Aug 2006 16:58:31 -0000 1.135
***************
*** 120,123 ****
--- 120,128 ----
FILE, RESTORE),
+ ("x-image_size", _("Generate image size tokens"), False,
+ _("""(EXPERIMENTAL) If true, generate tokens based on the sizes of
+ embedded images."""),
+ BOOLEAN, RESTORE),
+
("count_all_header_lines", _("Count all header lines"), False,
_("""Generate tokens just counting the number of instances of each kind
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.40
retrieving revision 1.41
diff -C2 -d -r1.40 -r1.41
*** tokenizer.py 6 Aug 2006 16:52:54 -0000 1.40
--- tokenizer.py 6 Aug 2006 16:58:31 -0000 1.41
***************
*** 636,639 ****
--- 636,647 ----
msg.walk()))
+ def imageparts(msg):
+ """Return a list of all msg parts with type 'image/*'."""
+ # Don't want a set here because we want to be able to process them in
+ # order.
+ return filter(lambda part:
+ part.get_content_type().startswith('image/'),
+ msg.walk())
+
has_highbit_char = re.compile(r"[\x80-\xff]").search
***************
*** 1592,1595 ****
--- 1600,1621 ----
"octet_prefix_size"]]
+ parts = imageparts(msg)
+ if options["Tokenizer", "x-image_size"]:
+ # Find image/* parts of the body, calculating the log(size) of
+ # each image.
+
+ for part in parts:
+ try:
+ text = part.get_payload(decode=True)
+ except:
+ yield "control: couldn't decode image"
+ text = part.get_payload(decode=False)
+
+ if text is None:
+ yield "control: image payload is None"
+ continue
+
+ yield "image-size:2**%d" % round(log2(len(text)))
+
# Find, decode (base64, qp), and tokenize textual parts of the body.
for part in textparts(msg):
_______________________________________________
Spambayes-checkins mailing list
spambayes-checkins@python.org
http://mail.python.org/mailman/listinfo/spambayes-checkins