Update of /cvsroot/spambayes/spambayes/spambayes
In directory sc8-pr-cvs8.sourceforge.net:/tmp/cvs-serv5725/spambayes
Modified Files:
Options.py tokenizer.py
Added Files:
dnscache.py
Log Message:
Add Matt Cowles' dnscache module and the x-lookup_ip option. The module
underwent some substantial changes; most importantly, I got most of the
way toward adding support for persisting the cache to either dbm or zodb
stores. Also ran reindent over dnscache.py.
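
For anyone who wants to try it, the new knobs would go in a
bayescustomize.ini along these lines (a minimal sketch; the cache file
location is illustrative):

    [Tokenizer]
    x-lookup_ip: True
    lookup_ip_cache: ~/.dnscache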
--- NEW FILE: dnscache.py ---
# Copyright 2004, Matthew Dixon Cowles <[EMAIL PROTECTED]>.
# Distributable under the same terms as the Python programming language.
# Inspired by KevinL's cache included with PyDNS.
# Provided with NO WARRANTY.
# Version 0.1 2004 06 27
# Version 0.11 2004 07 06 Fixed zero division error in __del__
import DNS # From http://sourceforge.net/projects/pydns/
import sys
import os
import operator
import time
import types
import shelve
import socket
from spambayes.Options import options
kCheckForPruneEvery=20
kMaxTTL=60 * 60 * 24 * 7 # One week
kPruneThreshold=1500 # May go over slightly; numbers chosen at random
kPruneDownTo=1000
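# That is: once the A and PTR caches together hold more than
# kPruneThreshold questions, prune() first drops expired answers and
# then, if still necessary, discards the least-recently-used answers
# until only kPruneDownTo remain (see prune() below).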
class lookupResult(object):
#__slots__=("qType","answer","question","expiresAt","lastUsed")
def __init__(self,qType,answer,question,expiresAt,now):
self.qType=qType
self.answer=answer
self.question=question
self.expiresAt=expiresAt
self.lastUsed=now
return None
# From ActiveState's Python cookbook
# Yakov Markovitch, Fast sort the list of objects by object's attribute
# http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52230
def sort_by_attr(seq, attr):
"""Sort the sequence of objects by object's attribute
Arguments:
seq - the list or any sequence (including immutable one) of objects to
sort.
attr - the name of attribute to sort by
Returns:
the sorted list of objects.
"""
#import operator
# Use the "Schwartzian transform"
# Create the auxiliary list of tuples where every i-th tuple has form
# (seq[i].attr, i, seq[i]) and sort it. The second item of tuple is needed
not
# only to provide stable sorting, but mainly to eliminate comparison of
objects
# (which can be expensive or prohibited) in case of equal attribute values.
intermed = map(None, map(getattr, seq, (attr,)*len(seq)), xrange(len(seq)),
seq)
intermed.sort()
return map(operator.getitem, intermed, (-1,) * len(intermed))
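# For example, prune() below uses this to order cached answers:
#   allAnswers = sort_by_attr(allAnswers, "expiresAt")
# followed by a reverse(), so the soonest-to-expire answers sit at the
# end of the list where they can be popped off cheaply.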
class cache:
def __init__(self,dnsServer=None,cachefile=None):
# These attributes intended for user setting
self.printStatsAtEnd=False
# As far as I can tell from the standards,
# it's legal to have more than one PTR record
# for an address. That is, it's legal to get
# more than one name back when you do a
# reverse lookup on an IP address. I don't
# know of a use for that and I've never seen
# it done. And I don't think that most
# people would expect it. So forward ("A")
# lookups always return a list. Reverse
# ("PTR") lookups return a single name unless
# this attribute is set to False.
self.returnSinglePTR=True
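        # An illustrative session (addresses and names made up):
        #   c = cache()
        #   c.lookup("mail.example.com")        # -> ["192.0.2.10"]
        #   c.lookup("192.0.2.10", qType="PTR") # -> "mail.example.com"
        # With returnSinglePTR=False the PTR lookup returns a list too.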
# How long to cache an error as no data
self.cacheErrorSecs=5*60
# How long to wait for the server
self.dnsTimeout=10
# Some servers always return a TTL of zero.
# In those cases, turning this up a bit is
# probably reasonable.
self.minTTL=0
# end of user-settable attributes
self.cachefile = cachefile
if cachefile:
self.open_cachefile(cachefile)
else:
self.caches={ "A": {}, "PTR": {} }
self.hits=0 # These two for statistics
self.misses=0
self.pruneTicker=0
if dnsServer==None:
DNS.DiscoverNameServers()
self.queryObj=DNS.DnsRequest()
else:
self.queryObj=DNS.DnsRequest(server=dnsServer)
return None
def open_cachefile(self, cachefile):
filetype = options["Storage", "persistent_use_database"]
cachefile = os.path.expanduser(cachefile)
if filetype == "dbm":
self.caches=shelve.open(cachefile)
if not self.caches.has_key("A"):
self.caches["A"] = {}
if not self.caches.has_key("PTR"):
self.caches["PTR"] = {}
elif filetype == "zodb":
from ZODB import DB
from ZODB.FileStorage import FileStorage
self._zodb_storage = FileStorage(cachefile, read_only=False)
self._DB = DB(self._zodb_storage, cache_size=10000)
self._conn = self._DB.open()
root = self._conn.root()
self.caches = root.get("dnscache")
if self.caches is None:
                # There is no cache yet, so create one.
from BTrees.OOBTree import OOBTree
self.caches = root["dnscache"] = OOBTree()
self.caches["A"] = {}
self.caches["PTR"] = {}
print "opened new cache"
else:
print "opened existing cache with", len(self.caches["A"]), "A
records",
print "and", len(self.caches["PTR"]), "PTR records"
def close(self):
if not self.cachefile:
return
filetype = options["Storage", "persistent_use_database"]
if filetype == "dbm":
self.caches.close()
elif filetype == "zodb":
self._zodb_close()
def _zodb_store(self):
import transaction
from ZODB.POSException import ConflictError
from ZODB.POSException import TransactionFailedError
try:
transaction.commit()
except ConflictError, msg:
# We'll save it next time, or on close. It'll be lost if we
# hard-crash, but that's unlikely, and not a particularly big
# deal.
if options["globals", "verbose"]:
print >> sys.stderr, "Conflict on commit.", msg
transaction.abort()
except TransactionFailedError, msg:
# Saving isn't working. Try to abort, but chances are that
# restarting is needed.
if options["globals", "verbose"]:
print >> sys.stderr, "Store failed. Need to restart.", msg
transaction.abort()
def _zodb_close(self):
# Ensure that the db is saved before closing. Alternatively, we
# could abort any waiting transaction. We need to do *something*
# with it, though, or it will be still around after the db is
# closed and cause problems. For now, saving seems to make sense
# (and we can always add abort methods if they are ever needed).
self._zodb_store()
# Do the closing.
self._DB.close()
# We don't make any use of the 'undo' capabilities of the
# FileStorage at the moment, so might as well pack the database
# each time it is closed, to save as much disk space as possible.
# Pack it up to where it was 'yesterday'.
# XXX What is the 'referencesf' parameter for pack()? It doesn't
# XXX seem to do anything according to the source.
## self._zodb_storage.pack(time.time()-60*60*24, None)
self._zodb_storage.close()
self._zodb_closed = True
if options["globals", "verbose"]:
print >> sys.stderr, 'Closed dnscache database'
def __del__(self):
if self.printStatsAtEnd:
self.printStats()
def printStats(self):
for key,val in self.caches.items():
totAnswers=0
for item in val.values():
totAnswers+=len(item)
print "cache %s has %i question(s) and %i answer(s)" %
(key,len(self.caches[key]),totAnswers)
if self.hits+self.misses==0:
print "No queries"
else:
print "%i hits, %i misses (%.1f%% hits)" % (self.hits, self.misses,
self.hits/float(self.hits+self.misses)*100)
def prune(self,now):
# I want this to be as fast as reasonably possible.
# If I didn't, I'd probably do various things differently
# Is there a faster way to do this?
allAnswers=[]
for cache in self.caches.values():
for val in cache.values():
allAnswers += val
allAnswers=sort_by_attr(allAnswers,"expiresAt")
allAnswers.reverse()
        # Guard against running off the end if everything has expired.
        while allAnswers:
            if allAnswers[-1].expiresAt>now:
                break
answer=allAnswers.pop()
            c=self.caches[answer.qType]
c[answer.question].remove(answer)
if len(c[answer.question])==0:
del c[answer.question]
self.printStats()
if len(allAnswers)<=kPruneDownTo:
return None
# Expiring didn't get us down to the size we want, so delete
# some entries least-recently-used-wise. I'm not by any means
# sure that this is the best strategy, but as yet I don't have
# data to test different strategies.
allAnswers=sort_by_attr(allAnswers,"lastUsed")
allAnswers.reverse()
numToDelete=len(allAnswers)-kPruneDownTo
for count in range(numToDelete):
answer=allAnswers.pop()
            c=self.caches[answer.qType]
c[answer.question].remove(answer)
if len(c[answer.question])==0:
del c[answer.question]
return None
def formatForReturn(self,listOfObjs):
if len(listOfObjs)==1 and listOfObjs[0].answer==None:
return []
if listOfObjs[0].qType=="PTR" and self.returnSinglePTR:
return listOfObjs[0].answer
return [ obj.answer for obj in listOfObjs ]
def lookup(self,question,qType="A"):
qType=qType.upper()
if qType not in ("A","PTR"):
raise ValueError,"Query type must be one of A, PTR"
now=int(time.time())
# Finding the len() of a dictionary isn't an expensive operation
# but doing it twice for every lookup isn't necessary.
self.pruneTicker+=1
if self.pruneTicker==kCheckForPruneEvery:
self.pruneTicker=0
if len(self.caches["A"])+len(self.caches["PTR"])>kPruneThreshold:
self.prune(now)
cacheToLookIn=self.caches[qType]
try:
answers=cacheToLookIn[question]
except KeyError:
pass
else:
assert len(answers)>0
ind=0
# No guarantee that expire has already been done
while ind<len(answers):
thisAnswer=answers[ind]
if thisAnswer.expiresAt<now:
del answers[ind]
else:
thisAnswer.lastUsed=now
ind+=1
if len(answers)==0:
del cacheToLookIn[question]
else:
self.hits+=1
return self.formatForReturn(answers)
# Not in cache or we just expired it
self.misses+=1
if qType=="PTR":
qList=question.split(".")
qList.reverse()
queryQuestion=".".join(qList)+".in-addr.arpa"
else:
queryQuestion=question
# where do we get NXDOMAIN?
try:
reply=self.queryObj.req(queryQuestion,qtype=qType,timeout=self.dnsTimeout)
except DNS.Base.DNSError,detail:
if detail.args[0]<>"Timeout":
print "Error, fixme",detail
print "Question was",queryQuestion
print "Origianal question was",question
print "Type was",qType
objs=[
lookupResult(qType,None,question,self.cacheErrorSecs+now,now) ]
cacheToLookIn[question]=objs # Add to format for return?
return self.formatForReturn(objs)
        except socket.gaierror,detail:
            print "DNS connection failure:", self.queryObj.ns, detail
            print "Defaults:", DNS.defaults
            # We have no reply to parse; falling through would hit the
            # undefined 'reply' below.  Cache the failure briefly, the
            # same way the timeout case above does.
            objs=[ lookupResult(qType,None,question,
                                self.cacheErrorSecs+now,now) ]
            cacheToLookIn[question]=objs
            return self.formatForReturn(objs)
objs=[]
for answer in reply.answers:
if answer["typename"]==qType:
# PyDNS returns TTLs as longs but RFC 1035 says that the
# TTL value is a signed 32-bit value and must be positive,
# so it should be safe to coerce it to a Python integer.
# And anyone who sets a time to live of more than 2^31-1
# seconds (68 years and change) is drunk.
                # Arguably, I ought to impose a maximum rather than
                # continuing with longs (int(long) returns long in recent
                # versions of Python).
ttl=max(min(int(answer["ttl"]),kMaxTTL),self.minTTL)
# RFC 2308 says that you should cache an NXDOMAIN for the
# minimum of the minimum field of the SOA record and the TTL
# of the SOA.
if ttl>0:
item=lookupResult(qType,answer["data"],question,ttl+now,now)
objs.append(item)
if len(objs)>0:
cacheToLookIn[question]=objs
return self.formatForReturn(objs)
# Probably SERVFAIL or the like
if len(reply.authority)==0:
objs=[
lookupResult(qType,None,question,self.cacheErrorSecs+now,now) ]
cacheToLookIn[question]=objs
return self.formatForReturn(objs)
# No such host
#
# I don't know in what circumstances you'd have more than one authority,
# so I'll just assume that the first is what we want.
#
    # RFC 2308 specifies that this is how to decide how long to cache
    # an NXDOMAIN.
auth=reply.authority[0]
auTTL=int(auth["ttl"])
for item in auth["data"]:
if type(item)==types.TupleType and item[0]=="minimum":
auMin=int(item[1])
cacheNeg=min(auMin,auTTL)
break
else:
cacheNeg=auTTL
objs=[ lookupResult(qType,None,question,cacheNeg+now,now) ]
cacheToLookIn[question]=objs
return self.formatForReturn(objs)
def main():
import transaction
c=cache(cachefile=os.path.expanduser("~skip/.dnscache"))
c.printStatsAtEnd=True
for host in ["www.python.org", "www.timsbloggers.com",
"www.seeputofor.com", "www.completegarbage.tv",
"www.tradelinkllc.com"]:
print "checking", host
now=time.time()
ips=c.lookup(host)
print ips,time.time()-now
now=time.time()
ips=c.lookup(host)
print ips,time.time()-now
if ips:
ip=ips[0]
now=time.time()
name=c.lookup(ip,qType="PTR")
print name,time.time()-now
now=time.time()
name=c.lookup(ip,qType="PTR")
print name,time.time()-now
else:
print "unknown"
c.close()
return None
if __name__=="__main__":
main()
Index: Options.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/Options.py,v
retrieving revision 1.133
retrieving revision 1.134
diff -C2 -d -r1.133 -r1.134
*** Options.py 6 Aug 2006 16:34:37 -0000 1.133
--- Options.py 6 Aug 2006 16:52:54 -0000 1.134
***************
*** 106,109 ****
--- 106,123 ----
BOOLEAN, RESTORE),
+ ("x-lookup_ip", _("Generate IP address tokens from hostnames"), False,
+ _("""(EXPERIMENTAL) Generate IP address tokens from hostnames.
+ Requires PyDNS (http://pydns.sourceforge.net/)."""),
+ BOOLEAN, RESTORE),
+
+ ("lookup_ip_cache", _("x-lookup_ip cache file location"), "",
+ _("""Tell SpamBayes where to cache IP address lookup information.
+ Only comes into play if x-lookup_ip is enabled. The default
+ (empty string) disables the file cache. When caching is enabled,
+ the cache file is stored using the same database type as the main
+ token store (only dbm and zodb are supported so far; zodb has
+ problems and dbm is untested, hence the default)."""),
+ FILE, RESTORE),
+
("count_all_header_lines", _("Count all header lines"), False,
_("""Generate tokens just counting the number of instances of each kind
Index: tokenizer.py
===================================================================
RCS file: /cvsroot/spambayes/spambayes/spambayes/tokenizer.py,v
retrieving revision 1.39
retrieving revision 1.40
diff -C2 -d -r1.39 -r1.40
*** tokenizer.py 6 Aug 2006 16:34:37 -0000 1.39
--- tokenizer.py 6 Aug 2006 16:52:54 -0000 1.40
***************
*** 40,43 ****
--- 40,54 ----
+ try:
+ import dnscache
+ cache = dnscache.cache(cachefile=options["Tokenizer", "lookup_ip_cache"])
+ cache.printStatsAtEnd = True
+ except (IOError, ImportError):
+ cache = None
+ else:
+ import atexit
+ atexit.register(cache.close)
+
+
# Patch encodings.aliases to recognize 'ansi_x3_4_1968'
from encodings.aliases import aliases # The aliases dictionary
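
For the curious, here is a rough sketch of how a tokenizer hook could use
the new cache to turn hostnames into IP tokens. Illustrative only -- the
helper name and token format are made up, not what tokenizer.py actually
emits:

    def ip_address_tokens(hostname):
        # 'cache' is the module-level dnscache.cache created above; it is
        # None when PyDNS is missing or the cache file can't be opened.
        if cache is None:
            return
        for ip in cache.lookup(hostname):   # "A" lookups return a list
            yield "url-ip:%s" % ip          # hypothetical token format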
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins