Revision: 3222
          http://spambayes.svn.sourceforge.net/spambayes/?rev=3222&view=rev
Author:   montanaro
Date:     2008-11-28 15:45:43 +0000 (Fri, 28 Nov 2008)

Log Message:
-----------
Add -d flag (skip duplicate messages).

Modified Paths:
--------------
    trunk/spambayes/utilities/splitndirs.py

Modified: trunk/spambayes/utilities/splitndirs.py
===================================================================
--- trunk/spambayes/utilities/splitndirs.py     2008-11-25 15:34:18 UTC (rev 3221)
+++ trunk/spambayes/utilities/splitndirs.py     2008-11-28 15:45:43 UTC (rev 3222)
@@ -24,6 +24,8 @@
     -n N
         The number of output mboxes desired.  This is required.
 
+    -d  Eliminate duplicates.
+
 Arguments:
     sourcembox
         The mbox or path to an mbox to split.
@@ -49,6 +51,10 @@
 import random
 import getopt
 import glob
+try:
+    from hashlib import md5
+except ImportError:
+    from md5 import new as md5
 
 from spambayes import mboxutils
 
@@ -69,13 +75,14 @@
 
 def main():
     try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hgn:s:v', ['help'])
+        opts, args = getopt.getopt(sys.argv[1:], 'dhgn:s:v', ['help'])
     except getopt.error, msg:
         usage(1, msg)
 
     doglob = False
     n = None
     verbose = False
+    delete_dups = False
     for opt, arg in opts:
         if opt in ('-h', '--help'):
             usage(0)
@@ -87,6 +94,8 @@
             n = int(arg)
         elif opt == '-v':
             verbose = True
+        elif opt == '-d':
+            delete_dups = True
 
     if n is None or n <= 1:
         usage(1, "an -n value > 1 is required")
@@ -101,6 +110,8 @@
             os.makedirs(dir)
 
     counter = 0
+    cksums = set()
+    skipped = 0
     for inputpath in inputpaths:
         if doglob:
             inpaths = glob.glob(inputpath)
@@ -110,8 +121,13 @@
         for inpath in inpaths:
             mbox = mboxutils.getmbox(inpath)
             for msg in mbox:
+                astext = str(msg)
+                cksum = md5(astext).hexdigest()
+                if delete_dups and cksum in cksums:
+                    skipped += 1
+                    continue
+                cksums.add(cksum)
                 i = random.randrange(n)
-                astext = str(msg)
                 #assert astext.endswith('\n')
                 counter += 1
                 msgfile = open('%s/%d' % (outdirs[i], counter), 'wb')
@@ -125,6 +141,8 @@
     if verbose:
         print
         print counter, "messages split into", n, "directories"
+        if skipped:
+            print "skipped", skipped, "duplicate messages"
 
 if __name__ == '__main__':
     main()


This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.
_______________________________________________
Spambayes-checkins mailing list
[email protected]
http://mail.python.org/mailman/listinfo/spambayes-checkins

Reply via email to