Hello again,

 Here is another, less trivial, patch to a dspam tool. It makes it
possible to use dspam_train not only with (maildir-like) directories but
also with traditional Unix MBOX folders. It also refactors the code of
this script quite significantly, notably reducing the duplication of code
between the TestNonspam() and TestSpam() functions, which were almost
exactly the same.

 I've tested the patch by converting an MBOX file to maildir and verifying
that training dspam with the same messages in either format results in the
same tokens (as per dspam_dump output), except that the MBOX run also had
the extra tokens from the "From " line, which disappeared after conversion
to maildir. I also verified that the results of training with maildir were
the same before and after applying the patch, so AFAICS it should be safe
to apply.

 Finally, note that this patch is cumulative with my previous patch fixing
dspam_train command-line handling, so it should be applied after the other
one. As before, if you have any questions about it, I'd be glad to answer
them; please just post them to the list.

 Thanks,
VZ

diff -r 7c185b8bcdda man/dspam_train.1
--- a/man/dspam_train.1 Tue Apr 15 16:38:03 2008 +0200
+++ b/man/dspam_train.1 Wed Apr 16 02:37:33 2008 +0200
@@ -22,8 +22,8 @@ dspam_train - train a corpus of mail
 [\c
 .BI \ --client \fR
 ]
-.BI spam_dir \fR
-.BI nonspam_dir \fR
+.BI spam_corpus \fR
+.BI nonspam_corpus \fR
 
 .ad
 .SH DESCRIPTION 
@@ -55,15 +55,17 @@ Specifies the user to train, if omitted 
 
 .n3 3
 .TP
-.BI spam_dir\c
-Specifies the pathname to the directory containing the corpus of spam. Each
-message should be separate in its own file.
+.BI spam_corpus\c
+Specifies either the pathname to the directory containing the corpus of spam,
+with each message in a separate file (e.g. maildir format) or a path to the mailbox in
+the traditional Unix MBOX format.
 
 .n3 3
 .TP
-.BI nonspam_dir\c
-Specifies the pathname to the directory containing the corpus of nonspam. Each 
-message should be separate in its own file.
+.BI nonspam_corpus\c
+Specifies either the pathname to the directory containing the corpus of
+nonspam with each message in a separate file or a path to the mailbox in the
+traditional Unix MBOX format.
 
 .SH EXIT VALUE
 .LP
diff -r 7c185b8bcdda src/tools/dspam_train.in
--- a/src/tools/dspam_train.in  Tue Apr 15 16:38:03 2008 +0200
+++ b/src/tools/dspam_train.in  Wed Apr 16 02:37:33 2008 +0200
@@ -3,7 +3,7 @@ use strict;
 use strict;
 use vars qw { $USER %CONFIG $SPAM_CORPUS $NONSPAM_CORPUS };
 
-$CONFIG{'SHOW_SUBJECTS'} = 1;
+$CONFIG{'SHOW_SUBJECTS'} = 0;
 $CONFIG{'DSPAM_BINARY'}  = '@bindir@/@dspam_transformed@'; 
 $CONFIG{'BINDIR'}        = '@bindir@';
 
@@ -24,19 +24,40 @@ if ($NONSPAM_CORPUS eq "") {
 }
 
 sub usage {
-    print STDERR "Usage: $0 [username] [--client] [[-i index]|[spam_dir] [nonspam_dir]]\n";
+    print STDERR "Usage: $0 [username] [--client] [[-i index]|[spam_corpus] [nonspam_corpus]]\n";
     exit(-1);
-}
-
-if ($SPAM_CORPUS ne "-i" && (! -d $SPAM_CORPUS || ! -d $NONSPAM_CORPUS)) {
-    print STDERR "ERROR: " . ((-d $SPAM_CORPUS) ? "nonspam" : "spam" ) . "corpus must be path to maildir directory\n";
-    usage(); 
 }
 
 print "Taking Snapshot...\n";
 system("$CONFIG{'BINDIR'}/dspam_stats -r $USER");
 
-&Train("$NONSPAM_CORPUS", "$SPAM_CORPUS");
+if ($SPAM_CORPUS eq "-i") {
+    TrainOnIndex($NONSPAM_CORPUS);
+}
+else {
+    &Train("$NONSPAM_CORPUS", "$SPAM_CORPUS");
+}
+
+FinishTraining();
+
+
+sub TrainOnIndex {
+  my($index) = @_;
+  open(INDEX, "<$index") or die "Failed to open index file \"$index\": $!\n";
+
+  print "Training on $index index...\n";
+  while(<INDEX>) {
+    chomp;
+    my($class, $filename) = split(/\s+/);
+    if ($class eq "ham" || $class eq "nonspam") {
+        TestNonspam($filename);
+    } elsif ($class eq "spam") {
+        TestSpam($filename);
+    } else {
+        die "ERROR: Unknown class '$class'. Test Broken.";
+    }
+  }
+}
 
 sub Train {
     my($nonspam, $spam) = @_;
@@ -45,25 +66,8 @@ sub Train {
 
     print "Training $nonspam / $spam corpora...\n";
 
-    # Train on index
-    if ($SPAM_CORPUS eq "-i") {
-      open(INDEX, "<$NONSPAM_CORPUS");
-      while(<INDEX>) {
-        chomp;
-        my($class, $filename) = split(/\s+/);
-        if ($class eq "ham" || $class eq "nonspam") {
-            TestNonspam(".", $filename);
-        } elsif ($class eq "spam") {
-            TestSpam(".", $filename);
-        } else {
-            die "ERROR: Unknown class '$class'. Test Broken.";
-        }
-      }
-      return FinishTraining();
-    }
-
-    @nonspam_corpus = GetFiles($nonspam);
-    @spam_corpus = GetFiles($spam); 
+    @nonspam_corpus = GetFilesOrMessages($nonspam);
+    @spam_corpus = GetFilesOrMessages($spam); 
 
     while($#nonspam_corpus > -1 || $#spam_corpus > -1) {
         if ($#nonspam_corpus > -1) {
@@ -71,14 +75,14 @@ sub Train {
 
             # Process nonspam until balanced
             $msg = shift(@nonspam_corpus);
-            TestNonspam($nonspam, $msg);
+            TestNonspam($msg);
             if ($#spam_corpus > -1) {
                 $count = ($#nonspam_corpus+1) / ($#spam_corpus+1);
             }
             for(1..$count-1)
             { 
                 $msg = shift(@nonspam_corpus);
-                TestNonspam($nonspam, $msg);
+                TestNonspam($msg);
             }
         }
 
@@ -86,7 +90,7 @@ sub Train {
             my($count) = 0;
             # Process spam until balanced
             $msg = shift(@spam_corpus);
-            TestSpam($spam, $msg);
+            TestSpam($msg);
             if ($#nonspam_corpus > -1) {
                 $count = ($#spam_corpus+1) / ($#nonspam_corpus+1);
             }
@@ -94,13 +98,10 @@ sub Train {
 
             {
                 $msg = shift(@spam_corpus);
-                TestSpam($spam, $msg);
-            }
-        }
-    }
-
-    FinishTraining();
-
+                TestSpam($msg);
+            }
+        }
+    }
 }
 
 sub FinishTraining() {
@@ -111,6 +112,19 @@ sub FinishTraining() {
     system("$CONFIG{'BINDIR'}/dspam_stats -S $USER");
 }
 
+sub GetFilesOrMessages {
+    my ($corpus) = @_;
+    if (-d $corpus) {
+        return GetFiles($corpus);
+    }
+    elsif (-f $corpus) {
+        return GetMessages($corpus);
+    }
+    else {
+        die "Corpus \"$corpus\" must be either a MBOX file or a maildir directory.\n"
+    }
+}
+
 sub GetFiles {
   my($corpus) = @_;
   my(@files);
@@ -118,17 +132,50 @@ sub GetFiles {
   opendir(DIR, "$corpus") || die "$corpus: $!";
   @files = grep(!/^\.\.?$/, readdir(DIR));
   closedir(DIR);
-  return @files;
+  return map { $_ = "$corpus/" . $_ } @files;
 } 
 
+sub GetMessages {
+    my ($mbox) = @_;
+
+    die "Please install Mail::MboxParser module if you want to be able to use " .
+        "MBOX files for training.\n" unless eval { require Mail::MboxParser; };
+
+    # filter out special pseudo messages used by Pine/UW-IMAPd
+    return grep { $_->header->{subject} ne "DON'T DELETE THIS MESSAGE -- FOLDER INTERNAL DATA" }
+            Mail::MboxParser->new($mbox)->get_messages();
+}
+
 sub TestNonspam {
-    my($code, $cmd, $response);
-    my($dir, $msg) = @_;
-    print "[test: nonspam] " . substr($msg . " " x 32, 0, 32) .  " result: ";
-    $cmd = "$CONFIG{'DSPAM_BINARY'} --user $USER --deliver=summary --stdout < '$dir/$msg'";
-    $response = `$cmd`;
-
-    $code = "UNKNOWN";
+    my($msg) = @_;
+    TestAny($msg, "nonspam", "Innocent", "Whitelisted", "innocent", "fp")
+}
+
+sub TestSpam {
+    my($msg) = @_;
+    TestAny($msg, "spam   ", "Spam", "Blacklisted", "spam", "fn")
+}
+
+sub TestAny {
+    my($msg, $testname, $ok1, $ok2, $dspam_class, $short_class) = @_;
+    my $response;
+    print "[test: $testname] " . substr($msg . " " x 32, 0, 32) .  " result: ";
+    my $cmd = "$CONFIG{'DSPAM_BINARY'} --user $USER --deliver=summary --stdout";
+    if ( -f $msg ) {
+        $response = `$cmd < '$msg'`;
+    }
+    else {
+        use FileHandle;
+        use IPC::Open2;
+        my ($dspam_in, $dspam_out);
+        my $pid = open2($dspam_out, $dspam_in, $cmd);
+        print $dspam_in $msg->as_string();
+        close $dspam_in;
+        $response = join('', <$dspam_out>);
+        waitpid $pid, 0
+    }
+
+    my $code = "UNKNOWN";
     if ($response =~ /class="(\S+)"/i) {
         $code = $1;
     }
@@ -141,7 +188,7 @@ sub TestNonspam {
         return;
     }
 
-    if ($code eq "Innocent" || $code eq "Whitelisted") {
+    if ($code eq $ok1 || $code eq $ok2) {
         print "PASS";
     } else {
         my($class) = "UNKNOWN";
@@ -167,87 +214,26 @@ sub TestNonspam {
 
         print "FAIL ($class)";
         if ($CONFIG{'SHOW_SUBJECTS'} == 1) {
-            print "\n\t[fp] ";
-            open(FILE, "<$dir/$msg");
-            while(<FILE>) {
-                if (/^Subject:/i) {
-                    chomp;
-                    print $_;
-                    close(FILE);
+            print "\n\t[$short_class] ";
+            if ( -f $msg ) {
+                open(FILE, "<$msg");
+                while(<FILE>) {
+                    if (/^Subject:/i) {
+                        chomp;
+                        print $_;
+                        close(FILE);
+                    }
                 }
-            }
-            close(FILE);
-        }
-        open(TRAIN, "|$CONFIG{'DSPAM_BINARY'} --user $USER --class=innocent " .
+                close(FILE);
+            }
+            else {
+                print $msg->header->{subject}
+            }
+        }
+        open(TRAIN, "|$CONFIG{'DSPAM_BINARY'} --user $USER --class=$dspam_class " .
                     "--source=error --signature=$signature");
         close(TRAIN);
     }
     print "\n";
-    return;
-}
-
-sub TestSpam {
-    my($code, $cmd, $response);
-    my($dir, $msg) = @_;
-
-    print "[test: spam   ] " . substr($msg . " " x 32, 0, 32) . " result: ";
-    $cmd = "$CONFIG{'DSPAM_BINARY'} --user $USER --deliver=summary --stdout < '$dir/$msg'";
-    $response = `$cmd`;
-
-    $code = "UNKNOWN";
-    if ($response =~ /class="(\S+)"/i) {
-        $code = $1;
-    }
-    if ($code eq "UNKNOWN") {
-        # print "\n===== WOAH THERE =====\n";
-        # print "I was unable to parse the result. Test Broken.\n";
-        # print "======================\n";
-        # exit(0);
-        print "BROKEN result!!\n";
-        return;
-    }
-
-    if ($code eq "Spam" || $code eq "Blacklisted") {
-        print "PASS";
-    } else {
-        my($class) = "UNKNOWN";
-        my($signature) = "UNKNOWN";
-        if ($response =~ /class="(\S+)"/i) {
-            $class = $1;
-        } else {
-            print "BROKEN class!!\n";
-            return;
-        }
-
-        if ($response =~ /signature=(\S+)/i) {
-            $signature = $1;
-        } else {
-            # print "\n===== WOAH THERE =====\n";
-            # print "I was unable to find the DSPAM signature. Test Broken.\n";
-            # print "======================\n";
-            # print "\n$response\n";
-            # exit(0);
-            print "BROKEN signature!!\n";
-            return;
-        }
-        print "FAIL ($class)";
-
-        if ($CONFIG{'SHOW_SUBJECTS'} == 1) {
-            print "\n\t[fn] ";
-            open(FILE, "<$dir/$msg");
-            while(<FILE>) {
-                if (/^Subject:/i) {
-                    chomp;
-                    print $_;
-                    close(FILE);
-                }
-            }
-            close(FILE);
-        }
-        open(TRAIN, "|$CONFIG{'DSPAM_BINARY'} --user $USER --class=spam ".
-                    "--source=error --signature=$signature");
-        close(TRAIN);
-    }
-    print "\n";
-    return;
-}
+}
+

Reply via email to