Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16139/spambayes
Modified Files:
FileCorpus.py
Log Message:
Move test code out into the unit test scripts.
Change FileMessage from being a subclass of message.SBHeaderMessage to a wrapper
around one, so that load doesn't have to use the deprecated setPayload function.
Index: FileCorpus.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/FileCorpus.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -C2 -d -r1.17 -r1.18
*** FileCorpus.py 21 Dec 2004 23:10:21 -0000 1.17
--- FileCorpus.py 16 Mar 2005 03:26:23 -0000 1.18
***************
*** 32,80 ****
See Corpus.__doc__ for more information.
- Test harness:
- FileCorpus [options]
-
- options:
- -h : show this message
- -v : execute in verbose mode, useful for general understanding
- and debugging purposes
- -g : use GzipFileMessage and GzipFileMessageFactory
- -s : setup self test, useful for seeing what is going into the
- test
- -t : setup and execute a self test.
- -c : clean up file system after self test
-
- Please note that running with -s or -t will create file system artifacts
- in the current directory. Be sure this doesn't stomp something of
- yours... The artifacts created are:
-
- fctestmisc.bayes
- fctestclass.bayes
- fctestspamcorpus/MSG00001
- fctestspamcorpus/MSG00002
- fctestunsurecorpus/MSG00003
- fctestunsurecorpus/MSG00004
- fctestunsurecorpus/MSG00005
- fctestunsurecorpus/MSG00006
- fctesthamcorpus/
-
- After the test has executed, the following file system artifacts
- (should) will exist:
-
- fctestmisc.bayes
- fctestclass.bayes
- fctestspamcorpus/MSG00001
- fctestspamcorpus/MSG00004
- fctesthamcorpus/MSG00002
- fctesthamcorpus/MSG00005
- fctesthamcorpus/MSG00006
- fctestunsurecorpus/
-
To Do:
o Suggestions?
-
"""
! # This module is part of the spambayes project, which is Copyright 2002
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
--- 32,40 ----
See Corpus.__doc__ for more information.
To Do:
o Suggestions?
"""
! # This module is part of the spambayes project, which is Copyright 2002-5
# The Python Software Foundation and is covered by the Python Software
# Foundation license.
***************
*** 185,201 ****
! class FileMessage(message.SBHeaderMessage):
'''Message that persists as a file system artifact.'''
def __init__(self, file_name=None, directory=None):
'''Constructor(message file name, corpus directory name)'''
- message.SBHeaderMessage.__init__(self)
self.file_name = file_name
self.directory = directory
self.loaded = False
! def as_string(self):
self.load() # ensure that the substance is loaded
! return message.SBHeaderMessage.as_string(self)
def pathname(self):
--- 145,180 ----
! class FileMessage(object):
'''Message that persists as a file system artifact.'''
def __init__(self, file_name=None, directory=None):
'''Constructor(message file name, corpus directory name)'''
self.file_name = file_name
self.directory = directory
self.loaded = False
+ self._msg = message.SBHeaderMessage()
! def __getattr__(self, att):
! """Pretend we are a subclass of message.SBHeaderMessage."""
! if hasattr(self, "_msg") and hasattr(self._msg, att):
! return getattr(self._msg, att)
! raise AttributeError()
!
! def __getitem__(self, k):
! """Pretend we are a subclass of message.SBHeaderMessage."""
! if hasattr(self, "_msg"):
! return self._msg[k]
! raise TypeError()
!
! def __setitem__(self, k, v):
! """Pretend we are a subclass of message.SBHeaderMessage."""
! if hasattr(self, "_msg"):
! self._msg[k] = v
! return
! raise TypeError()
!
! def as_string(self, unixfrom=False):
self.load() # ensure that the substance is loaded
! return self._msg.as_string(unixfrom)
def pathname(self):
***************
*** 230,234 ****
fp = gzip.open(pn, 'rb')
try:
! self.setPayload(fp.read())
except IOError, e:
if str(e) == 'Not a gzipped file':
--- 209,214 ----
fp = gzip.open(pn, 'rb')
try:
! self._msg = email.message_from_string(\
! fp.read(), _class = message.SBHeaderMessage)
except IOError, e:
if str(e) == 'Not a gzipped file':
***************
*** 237,241 ****
fp.close()
fp = open(self.pathname(), 'rb')
! self.setPayload(fp.read())
fp.close()
else:
--- 217,222 ----
fp.close()
fp = open(self.pathname(), 'rb')
! self._msg = email.message_from_string(\
! fp.read(), _class = message.SBHeaderMessage)
fp.close()
else:
***************
*** 256,286 ****
fp.close()
- def setPayload(self, payload):
- # This is a less-than-ideal method. The Python email package
- # has a clear distinction between parsing an email message and
- # creating an email message object. Here, we don't share that
- # distinction, because our message object is trying to do its
- # own parsing. A better system would be to have the factory
- # that creates these messages do the load from file bit (this
- # does mean we lose the current load-on-demand feature, but
- # I'm not sure that's ever used). Alternatively, we could have
- # a third type of FileMessage - PickledFileMessage - that stored
- # the parsed form of the message. This might also remove the
- # need for some of the message database (although that would then
- # expire along with the messages...). This is something to
- # consider before 1.1, however.
- self.loaded = True
-
- # We parse the content into a generic email.Message object.
- msg = email.message_from_string(payload)
-
- # And then we set ourselves to be equal to it.
- self.set_payload(msg.get_payload())
- self.set_unixfrom(msg.get_unixfrom())
- self.set_charset(msg.get_charset())
- for name, value in msg.items():
- del self[name]
- self[name] = value
-
def remove(self):
'''Message hara-kiri'''
--- 237,240 ----
***************
*** 341,356 ****
! class FileMessageFactory(Corpus.MessageFactory):
! '''MessageFactory for FileMessage objects'''
!
def create(self, key, directory, content=None):
'''Create a message object from a filename in a directory'''
if content:
! msg = email.message_from_string(content, _class=FileMessage)
msg.file_name = key
msg.directory = directory
msg.loaded = True
return msg
! return FileMessage(key, directory)
--- 295,316 ----
! class MessageFactory(Corpus.MessageFactory):
! # Subclass must define a concrete message klass.
! klass = None
def create(self, key, directory, content=None):
'''Create a message object from a filename in a directory'''
if content:
! msg = email.message_from_string(content,
! _class=self.klass)
msg.file_name = key
msg.directory = directory
msg.loaded = True
return msg
! return self.klass(key, directory)
!
!
! class FileMessageFactory(MessageFactory):
! '''MessageFactory for FileMessage objects'''
! klass = FileMessage
***************
*** 372,761 ****
! class GzipFileMessageFactory(FileMessageFactory):
'''MessageFactory for FileMessage objects'''
!
! def create(self, key, directory, content=None):
! '''Create a message object from a filename in a directory'''
! if content:
! msg = email.message_from_string(content,
! _class=GzipFileMessage)
! msg.file_name = key
! msg.directory = directory
! msg.loaded = True
! return msg
! return GzipFileMessage(key, directory)
!
!
! def runTest(useGzip):
!
! print 'Executing Test'
!
! if useGzip:
! fmFact = GzipFileMessageFactory()
! print 'Executing with Gzipped files'
! else:
! fmFact = FileMessageFactory()
! print 'Executing with uncompressed files'
!
! print '\n\nCreating two Classifier databases'
! miscbayes = storage.PickledClassifier('fctestmisc.bayes')
! classbayes = storage.DBDictClassifier('fctestclass.bayes')
!
! print '\n\nSetting up spam corpus'
! spamcorpus = FileCorpus(fmFact, 'fctestspamcorpus')
! spamtrainer = storage.SpamTrainer(miscbayes)
! spamcorpus.addObserver(spamtrainer)
! anotherspamtrainer = storage.SpamTrainer(classbayes, storage.UPDATEPROBS)
! spamcorpus.addObserver(anotherspamtrainer)
!
! keys = spamcorpus.keys()
! keys.sort()
! for key in keys: # iterate the list of keys
! msg = spamcorpus[key] # corpus is a dictionary
! spamtrainer.train(msg)
! anotherspamtrainer.train(msg)
!
!
! print '\n\nSetting up ham corpus'
! hamcorpus = FileCorpus(fmFact, \
! 'fctesthamcorpus', \
! 'MSG*')
! hamtrainer = storage.HamTrainer(miscbayes)
! hamcorpus.addObserver(hamtrainer)
! hamtrainer.trainAll(hamcorpus)
!
! print '\n\nA couple of message related tests'
! if useGzip:
! fmFactory = GzipFileMessageFactory()
! else:
! fmFactory = FileMessageFactory()
!
! m1 = fmFactory.create('XMG00001', 'fctestspamcorpus', testmsg2())
!
! print '\n\nAdd a message to hamcorpus that does not match the filter'
!
! try:
! hamcorpus.addMessage(m1)
! except ValueError:
! print 'Add failed, test passed'
! else:
! print 'Add passed, test failed'
!
!
! print '\n\nThis is the hamcorpus'
! print hamcorpus
!
!
! print '\n\nThis is the spamcorpus'
! print spamcorpus
!
!
! print '\n\nSetting up unsure corpus'
! # the unsure corpus is an expiry corpus with five second expiry
! # and a cache size of 2 (for testing purposes only...), and
! # no trainers, since there's no such thing as 'unsure training'
! unsurecorpus = ExpiryFileCorpus(5, fmFact, \
! 'fctestunsurecorpus', 'MSG*', 2)
! unsurecorpus.removeExpiredMessages()
!
!
! print '\n\nIterate the unsure corpus twice, to make sure cache size \
! is managed correctly, and to make sure iteration is repeatable. \
! We should not see MSG00003 in this iteration.'
! for msg in unsurecorpus:
! print msg.key() # don't print msg, too much information
! print '...and again'
! for msg in unsurecorpus:
! print msg.key() # don't print msg, too much information
!
!
! print '\n\nRemoving expired messages from unsure corpus.'
! unsurecorpus.removeExpiredMessages()
!
!
! print '\n\nTrain with an individual message'
! anotherhamtrainer = storage.HamTrainer(classbayes)
! anotherhamtrainer.train(unsurecorpus['MSG00005'])
!
!
! print '\n\nMoving msg00002 from spamcorpus to hamcorpus'
! hamcorpus.takeMessage('MSG00002', spamcorpus) # Oops. made a mistake...
!
!
! print "\n\nLet's test printing a message"
! msg = spamcorpus['MSG00001']
! print msg
!
!
! print '\n\nClassifying messages in unsure corpus'
!
! for msg in unsurecorpus:
! prob = classbayes.spamprob(msg.tokenize())
!
! print 'Message %s spam probability is %f' % (msg.key(), prob)
!
! if prob < options["Categorization", "ham_cutoff"]:
! print 'Moving %s from unsurecorpus to hamcorpus, \
! based on prob of %f' % (msg.key(), prob)
! hamcorpus.takeMessage(msg.key(), unsurecorpus)
! elif prob > options["Categorization", "spam_cutoff"]:
! print 'Moving %s from unsurecorpus to spamcorpus, \
! based on prob of %f' % (msg.key(), prob)
! spamcorpus.takeMessage(msg.key(), unsurecorpus)
!
!
! print '\n\nThis is the new hamcorpus'
! print hamcorpus
!
!
! print '\n\nThis is the new spamcorpus'
! print spamcorpus
!
!
! print '\n\nThis is the new unsurecorpus'
! print unsurecorpus
! print 'unsurecorpus cache contains', unsurecorpus.keysInMemory
! print 'unsurecorpus msgs dict contains', unsurecorpus.msgs
!
!
! print '\n\nStoring bayes databases'
! miscbayes.store()
! classbayes.store()
!
! def cleanupTest():
!
! print 'Cleaning up'
!
! cleanupDirectory('fctestspamcorpus')
! cleanupDirectory('fctesthamcorpus')
! cleanupDirectory('fctestunsurecorpus')
!
! if not useExistingDB:
! try:
! os.unlink('fctestmisc.bayes')
! except OSError, e:
! if e.errno != 2: # errno.<WHAT>
! raise
!
! try:
! os.unlink('fctestclass.bayes')
! except OSError, e:
! if e.errno != 2: # errno.<WHAT>
! raise
!
! def cleanupDirectory(dirname):
!
! try:
! flist = os.listdir(dirname)
! except OSError, e:
! if e.errno != 3: # errno.<WHAT>
! raise
! else:
! for filename in flist:
! fn = os.path.join(dirname, filename)
! os.unlink(fn)
! try:
! os.rmdir(dirname)
! except OSError, e:
! if e.errno != 2: # errno.<WHAT>
! raise
!
! def setupTest(useGzip):
!
! cleanupTest()
!
! print 'Setting up test'
!
! # no try blocks here, because if any of this dies, the test
! # cannot proceed
!
! os.mkdir('fctestspamcorpus')
! os.mkdir('fctesthamcorpus')
! os.mkdir('fctestunsurecorpus')
!
! tm1 = testmsg1()
! tm2 = testmsg2()
!
! if useGzip:
! fmFactory = GzipFileMessageFactory()
! else:
! fmFactory = FileMessageFactory()
!
! m1 = fmFactory.create('MSG00001', 'fctestspamcorpus', tm1)
! m1.store()
!
! m2 = fmFactory.create('MSG00002', 'fctestspamcorpus', tm2)
! m2.store()
!
! m3 = fmFactory.create('MSG00003', 'fctestunsurecorpus', tm1)
! m3.store()
!
! for x in range(11):
! time.sleep(1) # make sure MSG00003 has expired
! if 10-x == 1:
! s = ''
! else:
! s = 's'
! print 'wait',10-x,'more second%s' % (s)
!
! m4 = fmFactory.create('MSG00004', 'fctestunsurecorpus', tm1)
! m4.store()
!
! m5 = fmFactory.create('MSG00005', 'fctestunsurecorpus', tm2)
! m5.store()
!
! m6 = fmFactory.create('MSG00006', 'fctestunsurecorpus', tm2)
! m6.store()
!
!
! def testmsg1():
!
! return """
! X-Hd:[EMAIL PROTECTED] Mon Nov 04 10:50:49 2002
! Received:by mail.powweb.com (mbox timstone) (with Cubic Circle's cucipop
(v1.31
! 1998/05/13) Mon Nov 4 08:50:58 2002)
! X-From_:[EMAIL PROTECTED] Mon Nov 4 08:49:03 2002
! Return-Path:<[EMAIL PROTECTED]>
! Delivered-To:[EMAIL PROTECTED]
! Received:from manatee.mojam.com (manatee.mojam.com [199.249.165.175]) by
! mail.powweb.com (Postfix) with ESMTP id DC95A1BB1D0 for
! <[EMAIL PROTECTED]>; Mon, 4 Nov 2002 08:49:02 -0800 (PST)
! Received:from montanaro.dyndns.org (12-248-11-90.client.attbi.com
! [12.248.11.90]) by manatee.mojam.com (8.12.1/8.12.1) with ESMTP id
! gA4Gn0oY029655 for <[EMAIL PROTECTED]>; Mon, 4 Nov 2002 10:49:00
! -0600
! Received:from montanaro.dyndns.org (localhost [127.0.0.1]) by
! montanaro.dyndns.org (8.12.2/8.12.2) with ESMTP id gA4Gn3cP015572 for
! <[EMAIL PROTECTED]>; Mon, 4 Nov 2002 10:49:03 -0600 (CST)
! Received:(from [EMAIL PROTECTED]) by montanaro.dyndns.org
(8.12.2/8.12.2/Submit)
! id gA4Gn37l015569; Mon, 4 Nov 2002 10:49:03 -0600 (CST)
! From:Skip Montanaro <[EMAIL PROTECTED]>
! MIME-Version:1.0
! Content-Type:text/plain; charset=us-ascii
! Content- Transfer- Encoding:7bit
! Message-ID:<[EMAIL PROTECTED]>
! Date:Mon, 4 Nov 2002 10:49:02 -0600
! To:Four Stones Expressions <[EMAIL PROTECTED]>
! Subject:Reformat mail to 80 columns?
! In-Reply-To:<[EMAIL PROTECTED]>
! References:<[EMAIL PROTECTED]>
! <[EMAIL PROTECTED]>
! X-Mailer:VM 7.07 under 21.5 (beta9) "brussels sprouts" XEmacs Lucid
! Reply-To:[EMAIL PROTECTED]
! X-Hammie- Disposition:Unsure
!
!
! 11/4/2002 10:49:02 AM, Skip Montanaro <[EMAIL PROTECTED]> wrote:
!
! >(off-list)
! >
! >Tim,
! >
! >Any chance you can easily generate messages to the spambayes list which wrap
! >at something between 70 and 78 columns? I find I have to always edit your
! >messages to read them easily.
! >
! >Thanks,
! >
! >--
! >Skip Montanaro - [EMAIL PROTECTED]
! >http://www.mojam.com/
! >http://www.musi-cal.com/
! >
! >
! - Tim
! www.fourstonesExpressions.com """
!
! def testmsg2():
! return """
! X-Hd:[EMAIL PROTECTED] Wed Nov 06 12:05:41 2002
! Received:by mail.powweb.com (mbox timstone) (with Cubic Circle's cucipop
(v1.31
! 1998/05/13) Wed Nov 6 10:05:45 2002)
! X-From_:[EMAIL PROTECTED] Wed Nov 6 10:05:33 2002
! Return-Path:<[EMAIL PROTECTED]>
! Delivered-To:[EMAIL PROTECTED]
! Received:from anchor-post-31.mail.demon.net (anchor-post-31.mail.demon.net
! [194.217.242.89]) by mail.powweb.com (Postfix) with ESMTP id 3DC431BB06A for
! <[EMAIL PROTECTED]>; Wed, 6 Nov 2002 10:05:33 -0800 (PST)
! Received:from sundog.demon.co.uk ([158.152.226.183]) by
! anchor-post-31.mail.demon.net with smtp (Exim 3.35 #1) id 189UYP-000IAw-0V for
! [EMAIL PROTECTED]; Wed, 06 Nov 2002 18:05:25 +0000
! From:Richie Hindle <[EMAIL PROTECTED]>
! To:[EMAIL PROTECTED]
! Subject:Re: What to call this training stuff
! Date:Wed, 06 Nov 2002 18:05:56 +0000
! Organization:entrian.com
! Reply-To:[EMAIL PROTECTED]
! Message-ID:<[EMAIL PROTECTED]>
! References:<[EMAIL PROTECTED]>
! In-Reply-To:<[EMAIL PROTECTED]>
! X-Mailer:Forte Agent 1.7/32.534
! MIME-Version:1.0
! Content-Type:text/plain; charset=us-ascii
! Content- Transfer- Encoding:7bit
! X-Hammie- Disposition:Unsure
!
!
! Hi Tim,
!
! > Richie, I think we should package these classes I've been writing as
! > 'corpusManagement.py' What we're really doing here is creating a set of
! tools
! > that can be used to manage corpi (?) corpusses (?) corpae (?) whatever...
! of
! > messages.
!
! Good plan. Minor point of style: mixed-case module names (like class
! names) tend to have an initial capital: CorpusManagement.py
!
! On the name... sorry to disagree about names again, but what does the word
! 'management' add? This is a module for manipulating corpuses, so I reckon
! it should be called Corpus. Like Cookie, gzip, zipfile, locale, mailbox...
! see what I mean?
!
! --
! Richie Hindle
! [EMAIL PROTECTED]"""
!
! if __name__ == '__main__':
! try:
! opts, args = getopt.getopt(sys.argv[1:], 'estgvhcu')
! except getopt.error, msg:
! print >>sys.stderr, str(msg) + '\n\n' + __doc__
! sys.exit()
!
! options["globals", "verbose"] = False
! runTestServer = False
! setupTestServer = False
! cleanupTestServer = False
! useGzip = False
! useExistingDB = False
!
! for opt, arg in opts:
! if opt == '-h':
! print >>sys.stderr, __doc__
! sys.exit()
! elif opt == '-s':
! setupTestServer = True
! elif opt == '-e':
! runTestServer = True
! elif opt == '-t':
! setupTestServer = True
! runTestServer = True
! elif opt == '-c':
! cleanupTestServer = True
! elif opt == '-v':
! options["globals", "verbose"] = True
! elif opt == '-g':
! useGzip = True
! elif opt == '-u':
! useExistingDB = True
!
! if setupTestServer:
! setupTest(useGzip)
! if runTestServer:
! runTest(useGzip)
! elif cleanupTestServer:
! cleanupTest()
! else:
! print >>sys.stderr, __doc__
--- 332,336 ----
! class GzipFileMessageFactory(MessageFactory):
'''MessageFactory for FileMessage objects'''
! klass = GzipFileMessage
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins