Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv25513
Modified Files:
tokenizer.py
Log Message:
Break basic text tokenizing out into its own method in preparation for some
other changes.
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** tokenizer.py 15 Nov 2005 00:16:20 -0000 1.37
--- tokenizer.py 6 Aug 2006 16:19:19 -0000 1.38
***************
*** 1528,1533 ****
yield "noheader:" + k
! def tokenize_body(self, msg, maxword=options["Tokenizer",
! "skip_max_word_size"]):
"""Generate a stream of tokens from an email Message.
--- 1528,1545 ----
yield "noheader:" + k
! def tokenize_text(self, text, maxword=options["Tokenizer",
! "skip_max_word_size"]):
! """Tokenize everything in the chunk of text we were handed."""
! for w in text.split():
! n = len(w)
! # Make sure this range matches in tokenize_word().
! if 3 <= n <= maxword:
! yield w
!
! elif n >= 3:
! for t in tokenize_word(w):
! yield t
!
! def tokenize_body(self, msg):
"""Generate a stream of tokens from an email Message.
***************
*** 1606,1619 ****
              text = html_re.sub('', text)
  
!             # Tokenize everything in the body.
!             for w in text.split():
!                 n = len(w)
!                 # Make sure this range matches in tokenize_word().
!                 if 3 <= n <= maxword:
!                     yield w
! 
!                 elif n >= 3:
!                     for t in tokenize_word(w):
!                         yield t
  
  global_tokenizer = Tokenizer()
--- 1618,1623 ----
              text = html_re.sub('', text)
  
!             for t in self.tokenize_text(text):
!                 yield t
  
  global_tokenizer = Tokenizer()
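
For readers following along, here is a rough, self-contained sketch of what the
newly split-out Tokenizer.tokenize_text() does. Note that tokenize_word_stub()
below is a hypothetical stand-in for the real module-level tokenize_word() in
tokenizer.py (which does considerably more work), and the maxword default of 12
is merely an assumed value for the skip_max_word_size option:

    # Standalone sketch of the word-length filtering that tokenize_text() now
    # encapsulates.  Simplified for illustration; not the actual spambayes code.

    def tokenize_word_stub(word):
        # Hypothetical stand-in for tokenizer.tokenize_word(): summarize long
        # words instead of emitting them verbatim.
        yield "skip:%c %d" % (word[0], len(word) // 10 * 10)

    def tokenize_text(text, maxword=12):   # 12 assumed for skip_max_word_size
        """Yield tokens from a chunk of text, one word at a time."""
        for w in text.split():
            n = len(w)
            if 3 <= n <= maxword:
                yield w                    # short words pass through unchanged
            elif n >= 3:
                for t in tokenize_word_stub(w):
                    yield t                # long words get summarized instead
            # words shorter than 3 characters are dropped entirely

    print(list(tokenize_text("win a free absolutely-unbelievable-prize now")))

Pulling this loop into its own generator means other callers can reuse the same
word-size policy without duplicating it, which is presumably what the "other
changes" mentioned in the log message will build on.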
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins