FYI, mass-check's ham/spam logs now use the compact RULE(hitcount) hit logging by default, to save space with those pesky __LOWER_E(157)'s...
In case you use your own log parser or such. On Tue, Apr 20, 2021 at 07:35:52AM -0000, [email protected] wrote: > Author: hege > Date: Tue Apr 20 07:35:51 2021 > New Revision: 1888999 > > URL: http://svn.apache.org/viewvc?rev=1888999&view=rev > Log: > Support compacted/deduplicated RULE(hitcount) format for mass-check logs > > Modified: > spamassassin/trunk/masses/evolve_metarule/preproc.pl > spamassassin/trunk/masses/fp-fn-statistics > spamassassin/trunk/masses/freqdiff > spamassassin/trunk/masses/hit-frequencies > spamassassin/trunk/masses/logdiff > spamassassin/trunk/masses/logs-to-c > spamassassin/trunk/masses/mass-check > spamassassin/trunk/masses/mk-roc-graphs > spamassassin/trunk/masses/overlap > spamassassin/trunk/masses/post-ga-analysis.pl > spamassassin/trunk/masses/tenpass/compute-current-tcr > > Modified: spamassassin/trunk/masses/evolve_metarule/preproc.pl > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/evolve_metarule/preproc.pl?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/evolve_metarule/preproc.pl (original) > +++ spamassassin/trunk/masses/evolve_metarule/preproc.pl Tue Apr 20 07:35:51 > 2021 > @@ -67,7 +67,17 @@ while (<HAM>) { > my (undef,undef,undef, $test_str, undef) = split /\s/; > > # Extract the relevant rule hits and sort them by column number. > - my @hits = sort map { $rules{$_} } grep { exists $rules{$_} } split > /,/, $test_str; > + my @tests; > + foreach my $r (split(/,/, $test_str)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + next unless exists $rules{$r}; > + push @tests, $r for (1 .. $hits); > + } > + my @hits = sort map { $rules{$_} } @tests; > > # Count the number of occurrences and size of this pattern. 
> $ham_patterns{join (' ', @hits)}++; > @@ -95,7 +105,17 @@ while (<SPAM>) { > my (undef,undef,undef, $test_str, undef) = split /\s/; > > # Extract the relevant rule hits and sort them by column number. > - my @hits = sort map { $rules{$_} } grep { exists $rules{$_} } split > /,/, $test_str; > + my @tests; > + foreach my $r (split(/,/, $test_str)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + next unless exists $rules{$r}; > + push @tests, $r for (1 .. $hits); > + } > + my @hits = sort map { $rules{$_} } @tests; > > # Count the number of occurrences and size of this pattern. > $spam_patterns{join (' ', @hits)}++; > > Modified: spamassassin/trunk/masses/fp-fn-statistics > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/fp-fn-statistics?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/fp-fn-statistics (original) > +++ spamassassin/trunk/masses/fp-fn-statistics Tue Apr 20 07:35:51 2021 > @@ -173,8 +173,16 @@ sub readlogs { > next unless ($caught eq 'Y' || $caught eq '.') && $rules; > > # get tests, but ignore unknown tests and subrules > - my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} } > - split(/,/, $rules); > + my @tests; > + foreach my $r (split(/,/, $rules)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + next unless (defined $scores{$r} && !$allrules{$r}->{issubrule}); > + push @tests, $r for (1 .. 
$hits); > + } > > # run handler > log_line_count($isspam, $count, \@tests, $msgline); > > Modified: spamassassin/trunk/masses/freqdiff > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/freqdiff?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/freqdiff (original) > +++ spamassassin/trunk/masses/freqdiff Tue Apr 20 07:35:51 2021 > @@ -157,9 +157,15 @@ sub read_argv { > } > # "mass-check" format > elsif (/^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) { > + my $test_str = $1; > $type = 2; > - foreach (split(/,/, $1)) { > - $freq{$_}++; > + foreach my $r (split(/,/, $test_str)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + $freq{$r} += $hits; > } > } > # "scores" format > > Modified: spamassassin/trunk/masses/hit-frequencies > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/hit-frequencies?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/hit-frequencies (original) > +++ spamassassin/trunk/masses/hit-frequencies Tue Apr 20 07:35:51 2021 > @@ -730,6 +730,18 @@ sub readlogs { > '; > } > > + $evalstr .= ' > + my @rules; > + foreach my $r (split(/,/, $rules)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @rules, $r for (1 .. $hits); > + } > + '; > + > my $hmapstr = ''; > my $smapstr = ''; > if ($isspam) { > @@ -747,7 +759,7 @@ sub readlogs { > } > > $evalstr .= ' > - foreach my $r (split(/,/, $rules)) { > + foreach my $r (@rules) { > $freq_spam{$r}++ unless $freq_mesg{$r}++; > '.$hmapstr.$smapstr.' 
> } > @@ -768,7 +780,7 @@ sub readlogs { > } > > $evalstr .= ' > - foreach my $r (split(/,/, $rules)) { > + foreach my $r (@rules) { > $freq_ham{$r}++ unless $freq_mesg{$r}++; > '.$hmapstr.$smapstr.' > } > > Modified: spamassassin/trunk/masses/logdiff > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/logdiff?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/logdiff (original) > +++ spamassassin/trunk/masses/logdiff Tue Apr 20 07:35:51 2021 > @@ -36,7 +36,15 @@ sub fixfile { > } > > my ($scorepath, $rules, $meta) = ($1,$2,$3); > - my @rules = split(/,/, $rules); > + my @rules; > + foreach my $r (split(/,/, $rules)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @rules, $r for (1 .. $hits); > + } > @rules = sort grep { > $_ !~ /^AWL$/ > } @rules; > > Modified: spamassassin/trunk/masses/logs-to-c > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/logs-to-c?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/logs-to-c (original) > +++ spamassassin/trunk/masses/logs-to-c Tue Apr 20 07:35:51 2021 > @@ -144,8 +144,16 @@ sub readlogs { > (undef, $rules) = split(/ /, $restofline, 3); > > # get tests, but ignore unknown tests and subrules > - my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} } > - split(/,/, $rules); > + my @tests; > + foreach my $r (split(/,/, $rules)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + next unless (defined $scores{$r} && !$allrules{$r}->{issubrule}); > + push @tests, $r for (1 .. 
$hits); > + } > > if ($isspam) { > $num_spam++; > > Modified: spamassassin/trunk/masses/mass-check > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/mass-check?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/mass-check (original) > +++ spamassassin/trunk/masses/mass-check Tue Apr 20 07:35:51 2021 > @@ -698,7 +698,8 @@ sub wanted { > # Amavis X-Spam-Status rules include score and are enclosed in [] > # Amavis: [RULENAME=0.01,RULENAME_2=0.01] > # Spamassassin: RULENAME,RULENAME_2 > - s/[\[\]]//, s/=.*// foreach (@previous); > + # .. also support compact RULE(hits), no need to count hits here > + s/[\[\]]//, s/=.*//, s/\(\d+\)$// foreach (@previous); > $ma->{metadata}->{reuse_tests_hit} = { map {$_ => 1} @previous }; > $reusing = 1; > } > @@ -826,11 +827,19 @@ sub wanted { > # don't bother adjusting scores for reuse > $score = $status->get_score(); > # list of tests hit > + my %tests; > + foreach (( > + split(/,/, $status->get_names_of_tests_hit()), > + split(/,/, $status->get_names_of_subtests_hit()) > + )) { > + $tests{$_}++; > + } > my @tests; > - push @tests, split(/,/, $status->get_names_of_tests_hit()); > - push @tests, split(/,/, $status->get_names_of_subtests_hit()); > - > - $tests = join(",", sort(@tests)); > + foreach (sort keys %tests) { > + # Use compact RULE(hitcount) format > + push @tests, $tests{$_} > 1 ? 
"$_($tests{$_})" : $_; > + } > + $tests = join(",", @tests); > $extra = join(",", @extra); > } > > > Modified: spamassassin/trunk/masses/mk-roc-graphs > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/mk-roc-graphs?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/mk-roc-graphs (original) > +++ spamassassin/trunk/masses/mk-roc-graphs Tue Apr 20 07:35:51 2021 > @@ -106,8 +106,16 @@ sub readlogs { > next unless ($caught eq 'Y' || $caught eq '.') && $rules; > > # get tests, but ignore unknown tests and subrules > - my @tests = grep { defined $scores{$_} && !$allrules{$_}->{issubrule} } > - split(/,/, $rules); > + my @tests; > + foreach my $r (split(/,/, $rules)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + next unless (defined $scores{$r} && !$allrules{$r}->{issubrule}); > + push @tests, $r for (1 .. $hits); > + } > > # run handler > $log_line->($isspam, $count, \@tests); > > Modified: spamassassin/trunk/masses/overlap > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/overlap?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/overlap (original) > +++ spamassassin/trunk/masses/overlap Tue Apr 20 07:35:51 2021 > @@ -105,8 +105,17 @@ sub read_file { > while(<FILE>) { > next if /^#/; > if (/^[Y.]\s+-?\d+\s+\S+\s+(\S+)/) { > - my @tests = split(/,/, $1); > - @tests = grep { !/^T_/ } @tests if $opt_t; > + my $test_str = $1; > + my @tests; > + foreach my $r (split(/,/, $test_str)) { > + next if ($opt_t && $r =~ /^T_/); # skip test rules > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @tests, $r for (1 .. 
$hits); > + } > my $i = 0; > for my $a (@tests) { > $solo{$a}++; > > Modified: spamassassin/trunk/masses/post-ga-analysis.pl > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/post-ga-analysis.pl?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/post-ga-analysis.pl (original) > +++ spamassassin/trunk/masses/post-ga-analysis.pl Tue Apr 20 07:35:51 2021 > @@ -26,7 +26,16 @@ while(<SPAM>) > { > next if /^#/; > /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)(\s+?:(?:bayes|time)=\S+)\s*?$/; > - my @rules=split /,/,$1; > + my $test_str = $1; > + my @rules; > + foreach my $r (split(/,/, $test_str)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @rules, $r for (1 .. $hits); > + } > my $score = 0.0; > foreach $rule (@rules) > { > @@ -53,8 +62,17 @@ while(<NONSPAM>) > next if /^#/; > /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/; > next unless defined($1); > + my $test_str = $1; > + my @rules; > + foreach my $r (split(/,/, $test_str)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @rules, $r for (1 .. 
$hits); > + } > > - my @rules=split /,/,$1; > my $score = 0.0; > foreach $rule (@rules) > { > > Modified: spamassassin/trunk/masses/tenpass/compute-current-tcr > URL: > http://svn.apache.org/viewvc/spamassassin/trunk/masses/tenpass/compute-current-tcr?rev=1888999&r1=1888998&r2=1888999&view=diff > ============================================================================== > --- spamassassin/trunk/masses/tenpass/compute-current-tcr (original) > +++ spamassassin/trunk/masses/tenpass/compute-current-tcr Tue Apr 20 07:35:51 > 2021 > @@ -24,7 +24,16 @@ while(<SPAM>) > next if /^\#/; > /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/; > my $testshit = $1; $testshit ||= ''; > - my @rules=split /,/,$testshit; > + > + my @rules; > + foreach my $r (split(/,/, $testshit)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @rules, $r for (1 .. $hits); > + } > > my $score = 0.0; > foreach $rule (@rules) { > @@ -46,7 +55,16 @@ while(<HAM>) > next if /^\#/; > /.\s+[-0-9]*\s+[^\s]+\s+([^\s]*)\s*$/; > my $testshit = $1; $testshit ||= ''; > - my @rules=split /,/,$testshit; > + > + my @rules; > + foreach my $r (split(/,/, $testshit)) { > + my $hits = 1; > + # Support compacted RULE(hitcount) format > + if ($r =~ s/\((\d+)\)$//) { > + $hits = $1; > + } > + push @rules, $r for (1 .. $hits); > + } > > my $score = 0.0; > foreach $rule (@rules) { >
