On 1/11/19 12:04 AM, William Taylor wrote:
> We just enabled Mail::SpamAssassin::Plugin::Phishing and realized it was 
> causing our spam processing to slow way down.
> 
> Looks like the slowdown is caused by matching urls with grep against an 
> array. Here is some sample code that shows the difference in speed.
> 
> Output should be similar to this below. Notice the performance difference 
> between the
> two? I didn't check memory usage but I can't imagine it being that bad plus
> ram is cheap and I'd rather see processing as fast as possible.
> 
> I can provide a proposed patch if needed but it's pretty straightforward to 
> switch
> to hash-based instead of an array.
> 
thanks, I committed some improvements.
 Cheers
  Giovanni Bechis


> This shows worst case scenario of not finding a match.
>                   Rate originalCode      newCode
> originalCode    7877/s           --        -100%
> newCode      4139773/s       52456%           --
> 
> This shows best case scenario of first element matching
>                   Rate originalCode      newCode
> originalCode    7907/s           --        -100%
> newCode      3665454/s       46260%           --
> 
> This shows element matching about halfway through the list
>                   Rate originalCode      newCode
> originalCode    7780/s           --        -100%
> newCode      3668842/s       47060%           --
> 
> 
> 
> #!/usr/bin/perl
> use strict;
> use warnings;
> use Benchmark qw(:all);
> use File::Slurp;
> use LWP::Simple;
> use POSIX qw(floor);
> 
> my $feed = 'feed.txt';
> 
> # Download feed file if needed
> if ( !-f $feed )
> {
>     unless ( getstore( 'https://openphish.com/feed.txt', $feed ) )
>     {
>         die "Could not download feed\n";
>     }
> }
> 
> my $pms = {};
> $pms->{PHISHING} = {};
> $pms->{PHISHING}->{phishurl} = [];
> my $info = {};
> $info->{cleaned} = [
>     qw(
>         http://www.example.com/phishurl
>         )
> ];
> 
> my @urls = read_file($feed);
> chomp @urls;
> my $halfCount = floor( scalar(@urls) / 2 );
> 
> $pms->{PHISHING}{phishurl} = \@urls;
> my %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
> 
> ### Worst Case
> print "This shows worst case scenario of not finding a match.\n";
> cmpthese(
>     -10,
>     {   originalCode => \&originalCode,
>         newCode      => \&newCode
>     }
> );
> 
> ### Best Case
> unshift @{ $pms->{PHISHING}{phishurl} }, "http://www.example.com/phishurl";
> %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
> 
> print "\nThis shows best case scenario of first element matching\n";
> cmpthese(
>     -10,
>     {   originalCode => \&originalCode,
>         newCode      => \&newCode
>     }
> );
> 
> #### Halfway match
> @urls = read_file($feed);
> chomp @urls;
> $pms->{PHISHING}{phishurl} = \@urls;
> splice( @{ $pms->{PHISHING}{phishurl} }, $halfCount, 0, 
> "http://www.example.com/phishurl" );
> %hash = map { $_ => undef } @{ $pms->{PHISHING}->{phishurl} };
> 
> print "\nThis shows element matching about halfway through the list\n";
> cmpthese(
>     -10,
>     {   originalCode => \&originalCode,
>         newCode      => \&newCode
>     }
> );
> 
> exit;
> 
> # Note this gets a lot worse if an email contains multiple urls to check.
> # Also original code had length check. This is really unnecessary. Benchmarks
> # weren't done with it but I'd imagine it adds some slowness. If really 
> paranoid
> # about it, do the length check when adding to the array or hash.
> sub originalCode
> {
>     foreach my $cluri ( @{ $info->{cleaned} } )
>     {
>         if ( grep { $cluri eq $_ } @{ $pms->{PHISHING}->{phishurl} } )
>         {
>             return 1;
>         }
>     }
> }
> 
> sub newCode
> {
>     foreach my $cluri ( @{ $info->{cleaned} } )
>     {
>         return 1 if exists $hash{$cluri};
>     }
> }
> 

Reply via email to