On Sun, 2003-08-31 at 15:16, mpierce wrote:
> Is there a way to find duplicate messages in a specific folder?
> 
> 
> _______________________________________________
> evolution maillist  -  [EMAIL PROTECTED]
> http://lists.ximian.com/mailman/listinfo/evolution

You could import the messages from the folder into an MH folder, and
then run "classify" on them.  You may have to grep out some headers
first though.

If classify is hard to find, I'm attaching my equivs program, which does
much the same thing.  classify has more bells and whistles, but equivs
tends to be fast on medium-sized collections of files.  classify is fine
for small collections.  I keep thinking I should add an algorithm to
equivs to make it work well on large collections, but I keep not finding
an actual need for it.  :)

This was written back before I knew python very well.  So it's probably
not code to study and emulate.

-- 
Dan Stromberg DCS/NACS/UCI <[EMAIL PROTECTED]>

#!/dcs/bin/python

# def revcmp(a, b): return cmp(b, a)
# a.sort(revcmp)

import posix
import os
import stat
import sys
import string

class file_bucket:
        """One equivalence class: the names of files found to have
        identical content, plus the attributes of the first member."""

        def __init__(self, attributes):
                # attributes: a file_attributes instance for the first
                # file placed in this bucket.
                self.attributes = attributes
                self.names = [attributes.name]

        def add_name(self, name):
                # Append in place.  The original rebuilt the whole list
                # (self.names = self.names + [name]) on every call, which
                # is O(n) per add and O(n^2) overall.
                self.names.append(name)

        def __str__(self):
                # ' '.join replaces string.join (removed in Python 3);
                # output is the same space-separated name list.
                return ' '.join(self.names)

class file_attributes:
        """Identity, size, and incrementally-computed checksum of a file.

        Supports deciding cheaply whether two files can possibly be
        equal (same length, same checksum over the prefix summed so
        far) before falling back to a full byte-by-byte comparison.
        """

        def __init__(self, name):
                self.name = name
                statbuf = os.stat(name)
                # device + inode together identify hard links to the
                # same underlying file
                self.dev = statbuf[stat.ST_DEV]
                self.ino = statbuf[stat.ST_INO]
                self.len = statbuf[stat.ST_SIZE]
                # number of leading bytes folded into self.checksum so far
                self.len_summed = 0
                self.checksum = 0

        def extend_sum(self):
                """Fold the next chunk (up to 32 KiB) of the file into
                the running checksum.  No-op once fully summed."""
                max_chunk = 32768
                # 'nbytes' instead of the original 'len', which shadowed
                # the builtin
                nbytes = min(self.len - self.len_summed, max_chunk)
                if nbytes == 0:
                        return
                # binary mode + context manager: no fd leak on error
                # (the original used raw posix.open/close with no cleanup
                # path if read raised)
                with open(self.name, 'rb') as fp:
                        fp.seek(self.len_summed)
                        buf = fp.read(nbytes)
                # hash() of bytes is salted per process in Python 3, but
                # checksums are only ever compared within a single run,
                # so that is fine.  Modulus kept from the original.
                self.checksum = (self.checksum + hash(buf)) % 366829
                self.len_summed = self.len_summed + len(buf)

        # derived from Lib/cmp.py
        def do_cmp(self, other):
                """Compare the two files byte by byte, for real.
                Return 1 if identical, 0 otherwise."""
                bufsize = 8096  # Could be tuned
                with open(self.name, 'rb') as self_fp, \
                     open(other.name, 'rb') as other_fp:
                        while 1:
                                self_buf = self_fp.read(bufsize)
                                other_buf = other_fp.read(bufsize)
                                if self_buf != other_buf:
                                        return 0
                                if not self_buf:
                                        # both hit EOF together: equal
                                        return 1

        # it might be helpful to make this idempotent, someday
        def are_equal(self, other):
                """Return 1 if self and other name files with identical
                content, else 0.  Cheap tests first, full compare last."""
                # hard links (same device and inode) are trivially equal
                if self.dev == other.dev and self.ino == other.ino:
                        return 1
                if self.len != other.len:
                        return 0
                if self.len_summed == 0:
                        self.extend_sum()
                if other.len_summed == 0:
                        other.extend_sum()
                # extend the len_summed's as needed, to make them equal,
                # so the checksums are comparable.  don't forget that the
                # files must have equal length to reach this point
                while self.len_summed < other.len_summed:
                        self.extend_sum()
                while other.len_summed < self.len_summed:
                        other.extend_sum()
                # while the sums are equal, and we haven't hit the end of
                # the files, extend both in lockstep
                while self.len_summed < self.len and self.checksum == other.checksum:
                        self.extend_sum()
                        other.extend_sum()
                if self.checksum != other.checksum:
                        return 0
                # These two files have identical lengths and identical
                # checksums.  It's time to do it the hard way.
                return self.do_cmp(other)

        def __repr__(self):
                return self.name


def main():
        """Group the filenames on the command line into buckets of
        files with identical content and print one bucket per line.

        Usage: equivs [-v] file [file ...]

        NOTE: the attached copy of this function had its inner 'if'
        statement broken across two lines by mail wrapping (a syntax
        error); rejoined here.
        """
        equivs = []
        # this is currently O(n^2), and operates much like a worst-case
        # insertion sort.  It -could- be done O(nlogn), but it's not
        # clear how to do so without giving up the incremental checksum
        # heuristic.  It might be worth trying without the checksumming
        # someday - and I suppose either could be allowed with a
        # command-line switch
        verbose = 0
        # guard the '-v' check so an empty argv can't raise IndexError
        if len(sys.argv) > 1 and sys.argv[1] == '-v':
                del sys.argv[1]
                verbose = 1
        fileno = 0
        for filename in sys.argv[1:]:
                if verbose:
                        fileno = fileno + 1
                        # progress: name, index, files left, bucket count
                        sys.stderr.write('adding ' + filename + ' ' + repr(fileno) + ' ')
                        sys.stderr.write(repr(len(sys.argv) - fileno) + ' ' + repr(len(equivs)) + '\n')
                nextfile = file_attributes(filename)
                for possible_match in equivs:
                        if nextfile.are_equal(possible_match.attributes):
                                # add_name mutates in place and returns
                                # None; don't rebind possible_match to it
                                possible_match.add_name(filename)
                                break
                else:
                        # no existing bucket matched: start a new one
                        equivs.append(file_bucket(nextfile))
        for bucket in equivs:
                print(' '.join(bucket.names))

main()

#filelist=[[]]
#for file in sys.argv[1:]:
#       filelist[0]=filelist[0] + [file]
#
#print filelist
#
#
#stat1=posix.stat(filelist[0][0])
##print stat
#for filenum in range(len(filelist)-1):
#       stat2=posix.stat(filelist[filenum+1][0])
#       print filelist[filenum], filelist[filenum+1]
#       print stat1[stat.ST_SIZE],stat2[stat.ST_SIZE]
#       stat1=stat2
#       
#
##stat=posix.stat(file)
#

Attachment: signature.asc
Description: This is a digitally signed message part

Reply via email to