Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv30712/spambayes
Modified Files:
Options.py tokenizer.py
Log Message:
Add an x-short_runs option. When enabled, runs of short words are counted
instead of being skipped outright, and the longest run generates a token
using the usual log2() bucketing technique. See the comment in tokenizer.py
and the doc string in Options.py for examples of the sort of thing it
attempts to catch.
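
As a rough illustration of the technique (not part of the checkin - the
count_short_runs and short_run_token names below are made up for this
example; the committed code does the counting inline in
Tokenizer.tokenize_text()):

import math

def count_short_runs(text, min_len=3):
    """Return the length of each run of consecutive short 'words'.

    A word is 'short' if it has fewer than min_len characters, the same
    cutoff the tokenizer already uses when deciding what to skip.
    """
    runs = []
    current = 0
    for word in text.split():
        if len(word) < min_len:
            current += 1
        else:
            if current:
                runs.append(current)
            current = 0
    if current:
        # don't lose a run that ends the text
        runs.append(current)
    return runs

def short_run_token(text):
    """Bucket the longest run with log2() so that, for example, runs of
    8 through 15 short words all collapse into the same coarse token."""
    runs = count_short_runs(text)
    if not runs:
        return None
    return "short:%d" % int(math.log(max(runs), 2))
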
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.132
retrieving revision 1.133
diff -C2 -d -r1.132 -r1.133
*** Options.py 6 Aug 2006 16:14:17 -0000 1.132
--- Options.py 6 Aug 2006 16:34:37 -0000 1.133
***************
*** 98,101 ****
--- 98,109 ----
       INTEGER, RESTORE),
  
+     ("x-short_runs", _("Count runs of short 'words'"), False,
+      _("""(EXPERIMENTAL) If true, generate a token based on the length of
+      the longest run of short 'words'. Short words are anything shorter
+      than 3 characters. Normally they are skipped outright, but one
+      common spam technique spells out words like 'V I A G RA'.
+      """),
+      BOOLEAN, RESTORE),
+ 
      ("count_all_header_lines", _("Count all header lines"), False,
       _("""Generate tokens just counting the number of instances of each kind
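
To try the option, turn it on in the [Tokenizer] section of your options
file (typically bayescustomize.ini; adjust the file name to your setup):

[Tokenizer]
x-short_runs = True
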
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.38
retrieving revision 1.39
diff -C2 -d -r1.38 -r1.39
*** tokenizer.py 6 Aug 2006 16:19:19 -0000 1.38
--- tokenizer.py 6 Aug 2006 16:34:37 -0000 1.39
***************
*** 1531,1543 ****
                                                   "skip_max_word_size"]):
          """Tokenize everything in the chunk of text we were handed."""
          for w in text.split():
              n = len(w)
!             # Make sure this range matches in tokenize_word().
!             if 3 <= n <= maxword:
!                 yield w
!             elif n >= 3:
!                 for t in tokenize_word(w):
!                     yield t
  
      def tokenize_body(self, msg):
--- 1531,1558 ----
                                                   "skip_max_word_size"]):
          """Tokenize everything in the chunk of text we were handed."""
+         short_runs = Set()
+         short_count = 0
          for w in text.split():
              n = len(w)
!             if n < 3:
!                 # count how many short words we see in a row - meant to
!                 # latch onto crap like this:
!                 #     X j A m N j A d X h
!                 #     M k E z R d I p D u I m A c
!                 #     C o I d A t L j I v S j
!                 short_count += 1
!             else:
!                 if short_count:
!                     short_runs.add(short_count)
!                 short_count = 0
!                 # Make sure this range matches in tokenize_word().
!                 if 3 <= n <= maxword:
!                     yield w
!                 elif n >= 3:
!                     for t in tokenize_word(w):
!                         yield t
!         if short_runs and options["Tokenizer", "x-short_runs"]:
!             yield "short:%d" % int(log2(max(short_runs)))
  
      def tokenize_body(self, msg):
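
Using the illustrative short_run_token() sketch above (again, not part of
this checkin), the obfuscated lines quoted in the comment all collapse to
the same coarse token, which is the point of the log2() bucketing:

>>> short_run_token("X j A m N j A d X h")           # run of 10 short words
'short:3'
>>> short_run_token("M k E z R d I p D u I m A c")   # run of 14
'short:3'
>>> short_run_token("C o I d A t L j I v S j")       # run of 12
'short:3'
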
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins