Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv25513

Modified Files:
        tokenizer.py 
Log Message:
Break basic text tokenizing out into its own method in preparation for some
other changes.



Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.37
retrieving revision 1.38
diff -C2 -d -r1.37 -r1.38
*** tokenizer.py        15 Nov 2005 00:16:20 -0000      1.37
--- tokenizer.py        6 Aug 2006 16:19:19 -0000       1.38
***************
*** 1528,1533 ****
                      yield "noheader:" + k
  
!     def tokenize_body(self, msg, maxword=options["Tokenizer",
!                                                  "skip_max_word_size"]):
          """Generate a stream of tokens from an email Message.
  
--- 1528,1545 ----
                      yield "noheader:" + k
  
!     def tokenize_text(self, text, maxword=options["Tokenizer",
!                                                   "skip_max_word_size"]):
!         """Tokenize everything in the chunk of text we were handed."""
!         for w in text.split():
!             n = len(w)
!             # Make sure this range matches in tokenize_word().
!             if 3 <= n <= maxword:
!                 yield w
! 
!             elif n >= 3:
!                 for t in tokenize_word(w):
!                     yield t
! 
!     def tokenize_body(self, msg):
          """Generate a stream of tokens from an email Message.
  
***************
*** 1606,1619 ****
              text = html_re.sub('', text)
  
!             # Tokenize everything in the body.
!             for w in text.split():
!                 n = len(w)
!                 # Make sure this range matches in tokenize_word().
!                 if 3 <= n <= maxword:
!                     yield w
! 
!                 elif n >= 3:
!                     for t in tokenize_word(w):
!                         yield t
  
  global_tokenizer = Tokenizer()
--- 1618,1623 ----
              text = html_re.sub('', text)
  
!             for t in self.tokenize_text(text):
!                 yield t
  
  global_tokenizer = Tokenizer()
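
For anyone following along, here is a minimal usage sketch of the newly
extracted method. It is an illustration only: Tokenizer and tokenize_text
are the names taken from the diff above, while the sample string is made
up for demonstration.

    # Sketch: with tokenize_text() split out of tokenize_body(), a plain
    # chunk of text can be tokenized directly, without constructing an
    # email.Message first. The sample text below is hypothetical.
    from spambayes.tokenizer import Tokenizer

    tok = Tokenizer()
    for token in tok.tokenize_text("Win a FREE vacation now!!!"):
        print(token)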

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins
