Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16139/spambayes

Modified Files:
        FileCorpus.py 
Log Message:
Move test code out into the unit test scripts.

Change FileMessage from being a subclass of message.SBHeaderMessage to a
wrapper around one, so load doesn't have to use the deprecated setPayload
function.

Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** FileCorpus.py       21 Dec 2004 23:10:21 -0000      1.17
--- FileCorpus.py       16 Mar 2005 03:26:23 -0000      1.18
***************
*** 32,80 ****
      See Corpus.__doc__ for more information.
  
- Test harness:
-     FileCorpus [options]
- 
-         options:
-             -h : show this message
-             -v : execute in verbose mode, useful for general understanding
-                  and debugging purposes
-             -g : use GzipFileMessage and GzipFileMessageFactory
-             -s : setup self test, useful for seeing what is going into the
-                  test
-             -t : setup and execute a self test.
-             -c : clean up file system after self test
- 
-     Please note that running with -s or -t will create file system artifacts
-     in the current directory.  Be sure this doesn't stomp something of
-     yours...  The artifacts created are:
- 
-         fctestmisc.bayes
-         fctestclass.bayes
-         fctestspamcorpus/MSG00001
-         fctestspamcorpus/MSG00002
-         fctestunsurecorpus/MSG00003
-         fctestunsurecorpus/MSG00004
-         fctestunsurecorpus/MSG00005
-         fctestunsurecorpus/MSG00006
-         fctesthamcorpus/
- 
-     After the test has executed, the following file system artifacts
-     (should) will exist:
- 
-         fctestmisc.bayes
-         fctestclass.bayes
-         fctestspamcorpus/MSG00001
-         fctestspamcorpus/MSG00004
-         fctesthamcorpus/MSG00002
-         fctesthamcorpus/MSG00005
-         fctesthamcorpus/MSG00006
-         fctestunsurecorpus/
- 
  To Do:
      o Suggestions?
- 
  """
  
! # This module is part of the spambayes project, which is Copyright 2002
  # The Python Software Foundation and is covered by the Python Software
  # Foundation license.
--- 32,40 ----
      See Corpus.__doc__ for more information.
  
  To Do:
      o Suggestions?
  """
  
! # This module is part of the spambayes project, which is Copyright 2002-5
  # The Python Software Foundation and is covered by the Python Software
  # Foundation license.
***************
*** 185,201 ****
  
  
! class FileMessage(message.SBHeaderMessage):
      '''Message that persists as a file system artifact.'''
  
      def __init__(self, file_name=None, directory=None):
          '''Constructor(message file name, corpus directory name)'''
-         message.SBHeaderMessage.__init__(self)
          self.file_name = file_name
          self.directory = directory
          self.loaded = False
  
!     def as_string(self):
          self.load() # ensure that the substance is loaded
!         return message.SBHeaderMessage.as_string(self)
  
      def pathname(self):
--- 145,180 ----
  
  
! class FileMessage(object):
      '''Message that persists as a file system artifact.'''
  
      def __init__(self, file_name=None, directory=None):
          '''Constructor(message file name, corpus directory name)'''
          self.file_name = file_name
          self.directory = directory
          self.loaded = False
+         self._msg = message.SBHeaderMessage()
  
!     def __getattr__(self, att):
!         """Pretend we are a subclass of message.SBHeaderMessage."""
!         if hasattr(self, "_msg") and hasattr(self._msg, att):
!             return getattr(self._msg, att)
!         raise AttributeError()
! 
!     def __getitem__(self, k):
!         """Pretend we are a subclass of message.SBHeaderMessage."""
!         if hasattr(self, "_msg"):
!             return self._msg[k]
!         raise TypeError()
! 
!     def __setitem__(self, k, v):
!         """Pretend we are a subclass of message.SBHeaderMessage."""
!         if hasattr(self, "_msg"):
!             self._msg[k] = v
!             return
!         raise TypeError()
! 
!     def as_string(self, unixfrom=False):
          self.load() # ensure that the substance is loaded
!         return self._msg.as_string(unixfrom)
  
      def pathname(self):
***************
*** 230,234 ****
          fp = gzip.open(pn, 'rb')
          try:
!             self.setPayload(fp.read())
          except IOError, e:
              if str(e) == 'Not a gzipped file':
--- 209,214 ----
          fp = gzip.open(pn, 'rb')
          try:
!             self._msg = email.message_from_string(\
!                 fp.read(), _class = message.SBHeaderMessage)
          except IOError, e:
              if str(e) == 'Not a gzipped file':
***************
*** 237,241 ****
                  fp.close()
                  fp = open(self.pathname(), 'rb')
!                 self.setPayload(fp.read())
                  fp.close()
          else:
--- 217,222 ----
                  fp.close()
                  fp = open(self.pathname(), 'rb')
!                 self._msg = email.message_from_string(\
!                     fp.read(), _class = message.SBHeaderMessage)
                  fp.close()
          else:
***************
*** 256,286 ****
          fp.close()
  
-     def setPayload(self, payload):
-         # This is a less-than-ideal method.  The Python email package
-         # has a clear distinction between parsing an email message and
-         # creating an email message object.  Here, we don't share that
-         # distinction, because our message object is trying to do its
-         # own parsing.  A better system would be to have the factory
-         # that creates these messages do the load from file bit (this
-         # does mean we lose the current load-on-demand feature, but
-         # I'm not sure that's ever used).  Alternatively, we could have
-         # a third type of FileMessage - PickledFileMessage - that stored
-         # the parsed form of the message.  This might also remove the
-         # need for some of the message database (although that would then
-         # expire along with the messages...).  This is something to
-         # consider before 1.1, however.
-         self.loaded = True
- 
-         # We parse the content into a generic email.Message object.
-         msg = email.message_from_string(payload)
- 
-         # And then we set ourselves to be equal to it.
-         self.set_payload(msg.get_payload())
-         self.set_unixfrom(msg.get_unixfrom())
-         self.set_charset(msg.get_charset())
-         for name, value in msg.items():
-             del self[name]
-             self[name] = value
- 
      def remove(self):
          '''Message hara-kiri'''
--- 237,240 ----
***************
*** 341,356 ****
  
  
! class FileMessageFactory(Corpus.MessageFactory):
!     '''MessageFactory for FileMessage objects'''
! 
      def create(self, key, directory, content=None):
          '''Create a message object from a filename in a directory'''
          if content:
!             msg = email.message_from_string(content, _class=FileMessage)
              msg.file_name = key
              msg.directory = directory
              msg.loaded = True
              return msg
!         return FileMessage(key, directory)
  
  
--- 295,316 ----
  
  
! class MessageFactory(Corpus.MessageFactory):
!     # Subclass must define a concrete message klass.
!     klass = None
      def create(self, key, directory, content=None):
          '''Create a message object from a filename in a directory'''
          if content:
!             msg = email.message_from_string(content,
!                                             _class=self.klass)
              msg.file_name = key
              msg.directory = directory
              msg.loaded = True
              return msg
!         return self.klass(key, directory)
!     
! 
! class FileMessageFactory(MessageFactory):
!     '''MessageFactory for FileMessage objects'''
!     klass = FileMessage
  
  
***************
*** 372,761 ****
  
  
! class GzipFileMessageFactory(FileMessageFactory):
      '''MessageFactory for FileMessage objects'''
! 
!     def create(self, key, directory, content=None):
!         '''Create a message object from a filename in a directory'''
!         if content:
!             msg = email.message_from_string(content,
!                                             _class=GzipFileMessage)
!             msg.file_name = key
!             msg.directory = directory
!             msg.loaded = True
!             return msg
!         return GzipFileMessage(key, directory)
! 
! 
! def runTest(useGzip):
! 
!     print 'Executing Test'
! 
!     if useGzip:
!         fmFact = GzipFileMessageFactory()
!         print 'Executing with Gzipped files'
!     else:
!         fmFact =  FileMessageFactory()
!         print 'Executing with uncompressed files'
! 
!     print '\n\nCreating two Classifier databases'
!     miscbayes = storage.PickledClassifier('fctestmisc.bayes')
!     classbayes = storage.DBDictClassifier('fctestclass.bayes')
! 
!     print '\n\nSetting up spam corpus'
!     spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
!     spamtrainer = storage.SpamTrainer(miscbayes)
!     spamcorpus.addObserver(spamtrainer)
!     anotherspamtrainer = storage.SpamTrainer(classbayes, storage.UPDATEPROBS)
!     spamcorpus.addObserver(anotherspamtrainer)
! 
!     keys = spamcorpus.keys()
!     keys.sort()
!     for key in keys:                          # iterate the list of keys
!         msg = spamcorpus[key]                 # corpus is a dictionary
!         spamtrainer.train(msg)
!         anotherspamtrainer.train(msg)
! 
! 
!     print '\n\nSetting up ham corpus'
!     hamcorpus = FileCorpus(fmFact, \
!                           'fctesthamcorpus', \
!                           'MSG*')
!     hamtrainer = storage.HamTrainer(miscbayes)
!     hamcorpus.addObserver(hamtrainer)
!     hamtrainer.trainAll(hamcorpus)
! 
!     print '\n\nA couple of message related tests'
!     if useGzip:
!         fmFactory = GzipFileMessageFactory()
!     else:
!         fmFactory = FileMessageFactory()
! 
!     m1 = fmFactory.create('XMG00001', 'fctestspamcorpus', testmsg2())
! 
!     print '\n\nAdd a message to hamcorpus that does not match the filter'
! 
!     try:
!         hamcorpus.addMessage(m1)
!     except ValueError:
!         print 'Add failed, test passed'
!     else:
!         print 'Add passed, test failed'
! 
! 
!     print '\n\nThis is the hamcorpus'
!     print hamcorpus
! 
! 
!     print '\n\nThis is the spamcorpus'
!     print spamcorpus
! 
! 
!     print '\n\nSetting up unsure corpus'
!     # the unsure corpus is an expiry corpus with five second expiry
!     # and a cache size of 2 (for testing purposes only...), and
!     # no trainers, since there's no such thing as 'unsure training'
!     unsurecorpus = ExpiryFileCorpus(5, fmFact, \
!                                     'fctestunsurecorpus', 'MSG*', 2)
!     unsurecorpus.removeExpiredMessages()
! 
! 
!     print '\n\nIterate the unsure corpus twice, to make sure cache size \
! is managed correctly, and to make sure iteration is repeatable. \
! We should not see MSG00003 in this iteration.'
!     for msg in unsurecorpus:
!         print msg.key()    # don't print msg, too much information
!     print '...and again'
!     for msg in unsurecorpus:
!         print msg.key()    # don't print msg, too much information
! 
! 
!     print '\n\nRemoving expired messages from unsure corpus.'
!     unsurecorpus.removeExpiredMessages()
! 
! 
!     print '\n\nTrain with an individual message'
!     anotherhamtrainer = storage.HamTrainer(classbayes)
!     anotherhamtrainer.train(unsurecorpus['MSG00005'])
! 
! 
!     print '\n\nMoving msg00002 from spamcorpus to hamcorpus'
!     hamcorpus.takeMessage('MSG00002', spamcorpus)   # Oops. made a mistake...
! 
! 
!     print "\n\nLet's test printing a message"
!     msg = spamcorpus['MSG00001']
!     print msg
! 
! 
!     print '\n\nClassifying messages in unsure corpus'
! 
!     for msg in unsurecorpus:
!         prob = classbayes.spamprob(msg.tokenize())
! 
!         print 'Message %s spam probability is %f' % (msg.key(), prob)
! 
!         if prob < options["Categorization", "ham_cutoff"]:
!             print 'Moving %s from unsurecorpus to hamcorpus, \
! based on prob of %f' % (msg.key(), prob)
!             hamcorpus.takeMessage(msg.key(), unsurecorpus)
!         elif prob > options["Categorization", "spam_cutoff"]:
!             print 'Moving %s from unsurecorpus to spamcorpus, \
! based on prob of %f' % (msg.key(), prob)
!             spamcorpus.takeMessage(msg.key(), unsurecorpus)
! 
! 
!     print '\n\nThis is the new hamcorpus'
!     print hamcorpus
! 
! 
!     print '\n\nThis is the new spamcorpus'
!     print spamcorpus
! 
! 
!     print '\n\nThis is the new unsurecorpus'
!     print unsurecorpus
!     print 'unsurecorpus cache contains', unsurecorpus.keysInMemory
!     print 'unsurecorpus msgs dict contains', unsurecorpus.msgs
! 
! 
!     print '\n\nStoring bayes databases'
!     miscbayes.store()
!     classbayes.store()
! 
! def cleanupTest():
! 
!     print 'Cleaning up'
! 
!     cleanupDirectory('fctestspamcorpus')
!     cleanupDirectory('fctesthamcorpus')
!     cleanupDirectory('fctestunsurecorpus')
! 
!     if not useExistingDB:
!         try:
!             os.unlink('fctestmisc.bayes')
!         except OSError, e:
!             if e.errno != 2:     # errno.<WHAT>
!                 raise
! 
!         try:
!             os.unlink('fctestclass.bayes')
!         except OSError, e:
!             if e.errno != 2:     # errno.<WHAT>
!                 raise
! 
! def cleanupDirectory(dirname):
! 
!     try:
!         flist = os.listdir(dirname)
!     except OSError, e:
!         if e.errno != 3:     # errno.<WHAT>
!             raise
!     else:
!         for filename in flist:
!             fn = os.path.join(dirname, filename)
!             os.unlink(fn)
!     try:
!         os.rmdir(dirname)
!     except OSError, e:
!         if e.errno != 2:     # errno.<WHAT>
!             raise
! 
! def setupTest(useGzip):
! 
!     cleanupTest()
! 
!     print 'Setting up test'
! 
!     # no try blocks here, because if any of this dies, the test
!     # cannot proceed
! 
!     os.mkdir('fctestspamcorpus')
!     os.mkdir('fctesthamcorpus')
!     os.mkdir('fctestunsurecorpus')
! 
!     tm1 = testmsg1()
!     tm2 = testmsg2()
! 
!     if useGzip:
!         fmFactory = GzipFileMessageFactory()
!     else:
!         fmFactory = FileMessageFactory()
! 
!     m1 = fmFactory.create('MSG00001', 'fctestspamcorpus', tm1)
!     m1.store()
! 
!     m2 = fmFactory.create('MSG00002', 'fctestspamcorpus', tm2)
!     m2.store()
! 
!     m3 = fmFactory.create('MSG00003', 'fctestunsurecorpus', tm1)
!     m3.store()
! 
!     for x in range(11):
!         time.sleep(1)    # make sure MSG00003 has expired
!         if 10-x == 1:
!             s = ''
!         else:
!             s = 's'
!         print 'wait',10-x,'more second%s' % (s)
! 
!     m4 = fmFactory.create('MSG00004', 'fctestunsurecorpus', tm1)
!     m4.store()
! 
!     m5 = fmFactory.create('MSG00005', 'fctestunsurecorpus', tm2)
!     m5.store()
! 
!     m6 = fmFactory.create('MSG00006', 'fctestunsurecorpus', tm2)
!     m6.store()
! 
! 
! def testmsg1():
! 
!     return """
! X-Hd:[EMAIL PROTECTED] Mon Nov 04 10:50:49 2002
! Received:by mail.powweb.com (mbox timstone) (with Cubic Circle's cucipop (v1.31
! 1998/05/13) Mon Nov 4 08:50:58 2002)
! X-From_:[EMAIL PROTECTED] Mon Nov 4 08:49:03 2002
! Return-Path:<[EMAIL PROTECTED]>
! Delivered-To:[EMAIL PROTECTED]
! Received:from manatee.mojam.com (manatee.mojam.com [199.249.165.175]) by
! mail.powweb.com (Postfix) with ESMTP id DC95A1BB1D0 for
! <[EMAIL PROTECTED]>; Mon, 4 Nov 2002 08:49:02 -0800 (PST)
! Received:from montanaro.dyndns.org (12-248-11-90.client.attbi.com
! [12.248.11.90]) by manatee.mojam.com (8.12.1/8.12.1) with ESMTP id
! gA4Gn0oY029655 for <[EMAIL PROTECTED]>; Mon, 4 Nov 2002 10:49:00
! -0600
! Received:from montanaro.dyndns.org (localhost [127.0.0.1]) by
! montanaro.dyndns.org (8.12.2/8.12.2) with ESMTP id gA4Gn3cP015572 for
! <[EMAIL PROTECTED]>; Mon, 4 Nov 2002 10:49:03 -0600 (CST)
! Received:(from [EMAIL PROTECTED]) by montanaro.dyndns.org (8.12.2/8.12.2/Submit)
! id gA4Gn37l015569; Mon, 4 Nov 2002 10:49:03 -0600 (CST)
! From:Skip Montanaro <[EMAIL PROTECTED]>
! MIME-Version:1.0
! Content-Type:text/plain; charset=us-ascii
! Content- Transfer- Encoding:7bit
! Message-ID:<[EMAIL PROTECTED]>
! Date:Mon, 4 Nov 2002 10:49:02 -0600
! To:Four Stones Expressions <[EMAIL PROTECTED]>
! Subject:Reformat mail to 80 columns?
! In-Reply-To:<[EMAIL PROTECTED]>
! References:<[EMAIL PROTECTED]>
! <[EMAIL PROTECTED]>
! X-Mailer:VM 7.07 under 21.5 (beta9) "brussels sprouts" XEmacs Lucid
! Reply-To:[EMAIL PROTECTED]
! X-Hammie- Disposition:Unsure
! 
! 
! 11/4/2002 10:49:02 AM, Skip Montanaro <[EMAIL PROTECTED]> wrote:
! 
! >(off-list)
! >
! >Tim,
! >
! >Any chance you can easily generate messages to the spambayes list which wrap
! >at something between 70 and 78 columns?  I find I have to always edit your
! >messages to read them easily.
! >
! >Thanks,
! >
! >--
! >Skip Montanaro - [EMAIL PROTECTED]
! >http://www.mojam.com/
! >http://www.musi-cal.com/
! >
! >
! - Tim
! www.fourstonesExpressions.com """
! 
! def testmsg2():
!     return """
! X-Hd:[EMAIL PROTECTED] Wed Nov 06 12:05:41 2002
! Received:by mail.powweb.com (mbox timstone) (with Cubic Circle's cucipop (v1.31
! 1998/05/13) Wed Nov 6 10:05:45 2002)
! X-From_:[EMAIL PROTECTED] Wed Nov 6 10:05:33 2002
! Return-Path:<[EMAIL PROTECTED]>
! Delivered-To:[EMAIL PROTECTED]
! Received:from anchor-post-31.mail.demon.net (anchor-post-31.mail.demon.net
! [194.217.242.89]) by mail.powweb.com (Postfix) with ESMTP id 3DC431BB06A for
! <[EMAIL PROTECTED]>; Wed, 6 Nov 2002 10:05:33 -0800 (PST)
! Received:from sundog.demon.co.uk ([158.152.226.183]) by
! anchor-post-31.mail.demon.net with smtp (Exim 3.35 #1) id 189UYP-000IAw-0V for
! [EMAIL PROTECTED]; Wed, 06 Nov 2002 18:05:25 +0000
! From:Richie Hindle <[EMAIL PROTECTED]>
! To:[EMAIL PROTECTED]
! Subject:Re: What to call this training stuff
! Date:Wed, 06 Nov 2002 18:05:56 +0000
! Organization:entrian.com
! Reply-To:[EMAIL PROTECTED]
! Message-ID:<[EMAIL PROTECTED]>
! References:<[EMAIL PROTECTED]>
! In-Reply-To:<[EMAIL PROTECTED]>
! X-Mailer:Forte Agent 1.7/32.534
! MIME-Version:1.0
! Content-Type:text/plain; charset=us-ascii
! Content- Transfer- Encoding:7bit
! X-Hammie- Disposition:Unsure
! 
! 
! Hi Tim,
! 
! > Richie, I think we should package these classes I've been writing as
! > 'corpusManagement.py'  What we're really doing here is creating a set of
! tools
! > that can be used to manage corpi (?) corpusses (?)  corpae (?)  whatever...
! of
! > messages.
! 
! Good plan.  Minor point of style: mixed-case module names (like class
! names) tend to have an initial capital: CorpusManagement.py
! 
! On the name... sorry to disagree about names again, but what does the word
! 'management' add?  This is a module for manipulating corpuses, so I reckon
! it should be called Corpus.  Like Cookie, gzip, zipfile, locale, mailbox...
! see what I mean?
! 
! --
! Richie Hindle
! [EMAIL PROTECTED]"""
! 
! if __name__ == '__main__':
!     try:
!         opts, args = getopt.getopt(sys.argv[1:], 'estgvhcu')
!     except getopt.error, msg:
!         print >>sys.stderr, str(msg) + '\n\n' + __doc__
!         sys.exit()
! 
!     options["globals", "verbose"] = False
!     runTestServer = False
!     setupTestServer = False
!     cleanupTestServer = False
!     useGzip = False
!     useExistingDB = False
! 
!     for opt, arg in opts:
!         if opt == '-h':
!             print >>sys.stderr, __doc__
!             sys.exit()
!         elif opt == '-s':
!             setupTestServer = True
!         elif opt == '-e':
!             runTestServer = True
!         elif opt == '-t':
!             setupTestServer = True
!             runTestServer = True
!         elif opt == '-c':
!             cleanupTestServer = True
!         elif opt == '-v':
!             options["globals", "verbose"] = True
!         elif opt == '-g':
!             useGzip = True
!         elif opt == '-u':
!             useExistingDB = True
! 
!     if setupTestServer:
!         setupTest(useGzip)
!     if runTestServer:
!         runTest(useGzip)
!     elif cleanupTestServer:
!         cleanupTest()
!     else:
!         print >>sys.stderr, __doc__
--- 332,336 ----
  
  
! class GzipFileMessageFactory(MessageFactory):
      '''MessageFactory for FileMessage objects'''
!     klass = GzipFileMessage

_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to