On Sun, 2003-08-31 at 15:16, mpierce wrote: > Is there a way to find duplicate messages in a specific folder? > > > _______________________________________________ > evolution maillist - [EMAIL PROTECTED] > http://lists.ximian.com/mailman/listinfo/evolution
You could import the messages from the folder into an MH folder, and then run "classify" on them. You may have to grep out some headers first though. If classify is hard to find, I'm attaching my equivs program, which does much the same thing. classify has more bells and whistles, but equivs tends to be fast on medium-sized collections of files. classify is fine for small collections. I keep thinking I should add an algorithm to equivs to make it work well on large collections, but I keep not finding an actual need for it. :) This was written back before I knew Python very well, so it's probably not code to study and emulate. -- Dan Stromberg DCS/NACS/UCI <[EMAIL PROTECTED]>
#!/dcs/bin/python
# def revcmp(a, b): return cmp(b, a)
# a.sort(revcmp)
import os
import posix
import stat
import string
import sys
import zlib
class file_bucket:
    """One equivalence class of identical files.

    A bucket is seeded with a file_attributes object; the names of any
    further files proven equal to it are collected via add_name().
    """

    def __init__(self, attributes):
        # attributes: a file_attributes-like object exposing a .name field.
        self.attributes = attributes
        self.names = [attributes.name]

    def add_name(self, name):
        """Record another filename whose content matches this bucket."""
        # append() instead of the original quadratic `names + [name]` rebuild.
        self.names.append(name)

    def __str__(self):
        # Space-separated list of equivalent filenames.  str.join replaces
        # string.join, which was removed in Python 3.
        return ' '.join(self.names)
class file_attributes:
    """Lazily-checksummed identity of one file on disk.

    Equality testing proceeds in increasingly expensive stages:

    1. same (device, inode)  -> trivially the same file (hard link)
    2. different sizes       -> trivially different
    3. incremental checksums over growing prefixes of both files
    4. full byte-for-byte comparison as the final arbiter
    """

    # Checksum the file in 32 KiB chunks; the modulus keeps the running
    # checksum a small int (value preserved from the original code).
    _MAX_CHUNK = 32768
    _MODULUS = 366829

    def __init__(self, name):
        self.name = name
        st = os.stat(name)
        self.dev = st.st_dev
        self.ino = st.st_ino
        self.len = st.st_size   # total file size in bytes
        self.len_summed = 0     # how many leading bytes are checksummed so far
        self.checksum = 0       # rolling checksum of the first len_summed bytes

    def extend_sum(self):
        """Fold the next chunk (up to 32 KiB) of the file into the checksum."""
        length = min(self.len - self.len_summed, self._MAX_CHUNK)
        if length == 0:
            return
        # Context manager replaces raw posix.open/close, which leaked the
        # file descriptor if the read raised.
        with open(self.name, 'rb') as fp:
            fp.seek(self.len_summed)
            buf = fp.read(length)
        # zlib.crc32 is deterministic across runs, unlike hash(bytes) under
        # Python 3 hash randomization (the original used hash()).
        self.checksum = (self.checksum + zlib.crc32(buf)) % self._MODULUS
        self.len_summed += len(buf)

    def do_cmp(self, other):
        """Byte-for-byte comparison of the two files; 1 if identical, else 0.

        Derived from the old Lib/cmp.py; used only as the final tie-breaker
        once lengths and checksums already agree.
        """
        bufsize = 8096  # could be tuned
        with open(self.name, 'rb') as self_fp, open(other.name, 'rb') as other_fp:
            while True:
                self_buf = self_fp.read(bufsize)
                other_buf = other_fp.read(bufsize)
                if self_buf != other_buf:
                    return 0
                if not self_buf:
                    # Simultaneous EOF with no mismatch: identical.
                    return 1

    def are_equal(self, other):
        """Return 1 if both names refer to identical content, else 0."""
        # Hard links / the same file: equal by definition.
        if self.dev == other.dev and self.ino == other.ino:
            return 1
        if self.len != other.len:
            return 0
        if self.len_summed == 0:
            self.extend_sum()
        if other.len_summed == 0:
            other.extend_sum()
        # Bring the shorter summed prefix up to the longer one; the files
        # have equal length, so both can always reach the same len_summed.
        while self.len_summed < other.len_summed:
            self.extend_sum()
        while other.len_summed < self.len_summed:
            other.extend_sum()
        # Extend both in lockstep while the checksums still agree and we
        # have not consumed the whole file.
        while self.len_summed < self.len and self.checksum == other.checksum:
            self.extend_sum()
            other.extend_sum()
        if self.checksum != other.checksum:
            return 0
        # Identical lengths and identical checksums: settle it the hard way.
        return self.do_cmp(other)

    def __repr__(self):
        return self.name
def main():
    """Group the files named on the command line into equivalence classes.

    Usage: equivs [-v] file ...

    Prints one space-separated line of filenames per class of identical
    files.  With -v, progress is reported on stderr.
    """
    # This is currently O(n^2), and operates much like a worst-case
    # insertion sort.  It -could- be done O(n log n), but not obviously
    # without giving up the incremental-checksum heuristic.
    verbose = 0
    # Guard len(sys.argv): the original indexed sys.argv[1] unconditionally
    # and crashed with IndexError when run with no arguments.
    if len(sys.argv) > 1 and sys.argv[1] == '-v':
        del sys.argv[1]
        verbose = 1
    equivs = []
    fileno = 0
    for filename in sys.argv[1:]:
        if verbose:
            fileno += 1
            sys.stderr.write('adding %s %d %d %d\n'
                             % (filename, fileno,
                                len(sys.argv) - fileno, len(equivs)))
        nextfile = file_attributes(filename)
        for possible_match in equivs:
            if file_attributes.are_equal(nextfile, possible_match.attributes):
                # add_name mutates the bucket in place and returns None; the
                # original's reassignment of its result was a no-op.
                possible_match.add_name(filename)
                break
        else:
            # No existing bucket matched: start a new equivalence class.
            equivs.append(file_bucket(nextfile))
    for bucket in equivs:
        print(' '.join(bucket.names))


if __name__ == '__main__':
    main()
#filelist=[[]]
#for file in sys.argv[1:]:
# filelist[0]=filelist[0] + [file]
#
#print filelist
#
#
#stat1=posix.stat(filelist[0][0])
##print stat
#for filenum in range(len(filelist)-1):
# stat2=posix.stat(filelist[filenum+1][0])
# print filelist[filenum], filelist[filenum+1]
# print stat1[stat.ST_SIZE],stat2[stat.ST_SIZE]
# stat1=stat2
#
#
##stat=posix.stat(file)
#
signature.asc
Description: This is a digitally signed message part
