mirrors
/
rspamd
mirror of https://github.com/vstakhov/rspamd.git


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539
							#!/usr/bin/env perl

use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;
use Time::HiRes qw(gettimeofday tv_interval);
use JSON::XS;
use String::ShellQuote;
use FileHandle;
use IPC::Open2;
use Data::Dumper;

my $spam_dir;
my $ham_dir;
my $parallel            = 1;
my $classifier          = "bayes";
my $spam_symbol         = "BAYES_SPAM";
my $ham_symbol          = "BAYES_HAM";
my $timeout             = 10;
my $rspamc              = $ENV{'RSPAMC'} || "rspamc";
my $bogofilter          = $ENV{'BOGOFILTER'} || "bogofilter";
my $dspam               = $ENV{'DSPAM'} || "dspam";
my $train_fraction      = 0.5;
my $use_bogofilter      = 0;
my $use_dspam           = 0;
my $check_only          = 0;
my $rspamc_prob_trigger = 95;
my $man;
my $help;

GetOptions(
    "spam|s=s"           => \$spam_dir,
    "ham|h=s"            => \$ham_dir,
    "spam-symbol=s"      => \$spam_symbol,
    "ham-symbol=s"       => \$ham_symbol,
    "classifier|c=s"     => \$classifier,
    "timeout|t=f"        => \$timeout,
    "parallel|p=i"       => \$parallel,
    "train-fraction|t=f" => \$train_fraction,
    "bogofilter|b"       => \$use_bogofilter,
    "dspam|d"            => \$use_dspam,
    "check-only"         => \$check_only,
    "help|?"             => \$help,
    "man"                => \$man
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

sub read_dir_files {
    my ( $dir, $target ) = @_;
    opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
    while ( my $file = readdir $dh ) {
        if ( -f "$dir/$file" ) {
            push @{$target}, "$dir/$file";
        }
    }
}

sub shuffle_array {
    my ($ar) = @_;

    for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
        if ( $i > 1 ) {
            my $sel = int( rand( $i - 1 ) );
            ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
        }
    }
}

sub learn_rspamc {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    my $cmd         = $spam ? "learn_spam" : "learn_ham";
    my $args_quoted = shell_quote @{$files};
    open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" )
      or die "cannot spawn $rspamc: $!";

    while (<$p>) {
        my $res = eval('decode_json($_)');
        if ( $res && $res->{'success'} ) {
            $processed++;
        }
    }

    return $processed;
}

sub learn_bogofilter {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    foreach my $f ( @{$files} ) {
        my $args_quoted = shell_quote $f;
        my $fl          = $spam ? "-s" : "-n";
        `$bogofilter  -I $args_quoted $fl`;
        if ( $? == 0 ) {
            $processed++;
        }
    }

    return $processed;
}

sub learn_dspam {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    foreach my $f ( @{$files} ) {
        my $args_quoted = shell_quote $f;
        my $fl          = $spam ? "--class=spam" : "--class=innocent";
        open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
          or die "cannot run $dspam: $!";

        open( my $inp, "< $f" );
        while (<$inp>) {
            print $p $_;
        }
    }

    return $processed;
}

sub learn_samples {
    my ( $ar_ham, $ar_spam ) = @_;
    my $len;
    my $processed = 0;
    my $total     = 0;
    my $learn_func;

    my @files_spam;
    my @files_ham;

    if ($use_dspam) {
        $learn_func = \&learn_dspam;
    }
    elsif ($use_bogofilter) {
        $learn_func = \&learn_bogofilter;
    }
    else {
        $learn_func = \&learn_rspamc;
    }

    $len = int( scalar @{$ar_ham} * $train_fraction );
    my @cur_vec;

    # Shuffle spam and ham samples
    for ( my $i = 0 ; $i < $len ; $i++ ) {
        if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
            push @cur_vec, @{$ar_ham}[$i];
            push @files_ham, [@cur_vec];
            @cur_vec = ();
            $total++;
        }
        else {
            push @cur_vec, @{$ar_ham}[$i];
        }
    }

    $len     = int( scalar @{$ar_spam} * $train_fraction );
    @cur_vec = ();
    for ( my $i = 0 ; $i < $len ; $i++ ) {
        if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
            push @cur_vec, @{$ar_spam}[$i];
            push @files_spam, [@cur_vec];
            @cur_vec = ();
            $total++;
        }
        else {
            push @cur_vec, @{$ar_spam}[$i];
        }
    }

    for ( my $i = 0 ; $i < $total ; $i++ ) {
        my $args;
        my $spam;

        if ( $i % 2 == 0 ) {
            $args = pop @files_spam;

            if ( !$args ) {
                $args = pop @files_ham;
                $spam = 0;
            }
            else {
                $spam = 1;
            }
        }
        else {
            $args = pop @files_ham;
            if ( !$args ) {
                $args = pop @files_spam;
                $spam = 1;
            }
            else {
                $spam = 0;
            }
        }

        my $r = $learn_func->( $args, $spam );
        if ($r) {
            $processed += $r;
        }
    }

    return $processed;
}

sub check_rspamc {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;

    my $args_quoted = shell_quote @{$files};
    my $processed   = 0;

    open(
        my $p,
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
    ) or die "cannot spawn $rspamc: $!";

    while (<$p>) {
        my $res = eval('decode_json($_)');
        if ( $res && $res->{'default'} ) {
            $processed++;

            if ($spam) {
                if ( $res->{'default'}->{$ham_symbol} ) {
                    my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
                    if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
                        my $percentage = int($1);
                        if ( $percentage >= $rspamc_prob_trigger ) {
                            $$fp_cnt++;
                        }
                    }
                    else {
                        $$fp_cnt++;
                    }
                }
                elsif ( !$res->{'default'}->{$spam_symbol} ) {
                    $$fn_cnt++;
                }
                else {
                    $$detected_cnt++;
                }
            }
            else {
                if ( $res->{'default'}->{$spam_symbol} ) {
                    my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
                    if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {

                        my $percentage = int($1);
                        if ( $percentage >= $rspamc_prob_trigger ) {
                            $$fp_cnt++;
                        }
                    }
                    else {
                        $$fp_cnt++;
                    }
                }
                elsif ( !$res->{'default'}->{$ham_symbol} ) {
                    $$fn_cnt++;
                }
                else {
                    $$detected_cnt++;
                }
            }
        }
    }

    return $processed;
}

sub check_bogofilter {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
    my $processed = 0;

    foreach my $f ( @{$files} ) {
        my $args_quoted = shell_quote $f;

        open( my $p, "$bogofilter -t -I $args_quoted |" )
          or die "cannot spawn $bogofilter: $!";

        while (<$p>) {
            if ( $_ =~ /^([SHU])\s+.*$/ ) {
                $processed++;

                if ($spam) {
                    if ( $1 eq 'H' ) {
                        $$fp_cnt++;
                    }
                    elsif ( $1 eq 'U' ) {
                        $$fn_cnt++;
                    }
                    else {
                        $$detected_cnt++;
                    }
                }
                else {
                    if ( $1 eq 'S' ) {
                        $$fp_cnt++;
                    }
                    elsif ( $1 eq 'U' ) {
                        $$fn_cnt++;
                    }
                    else {
                        $$detected_cnt++;
                    }
                }
            }
        }
    }

    return $processed;
}

sub check_dspam {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
    my $processed = 0;

    foreach my $f ( @{$files} ) {
        my $args_quoted = shell_quote $f;

        my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" );
        open( my $inp, "< $f" );
        while (<$inp>) {
            print Writer $_;
        }
        close Writer;

        while (<Reader>) {
            if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) {
                $processed++;
                my $percentage = int( $2 * 100.0 );

                if ($spam) {
                    if ( $1 eq 'Innocent' ) {
                        if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
                            $$fp_cnt++;
                        }
                    }
                    elsif ( $1 ne 'Spam' ) {
                        $$fn_cnt++;
                    }
                    else {
                        $$detected_cnt++;
                    }
                }
                else {
                    if ( $1 eq 'Spam' ) {
                        if ( $percentage >= $rspamc_prob_trigger ) {
                            $$fp_cnt++;
                        }
                    }
                    elsif ( $1 ne 'Innocent' ) {
                        $$fn_cnt++;
                    }
                    else {
                        $$detected_cnt++;
                    }
                }
            }
        }
        close Reader;
        waitpid( $pid, 0 );
    }

    return $processed;
}

sub cross_validate {
    my ($hr)          = @_;
    my $args          = "";
    my $processed     = 0;
    my $fp_spam       = 0;
    my $fn_spam       = 0;
    my $fp_ham        = 0;
    my $fn_ham        = 0;
    my $total_spam    = 0;
    my $total_ham     = 0;
    my $detected_spam = 0;
    my $detected_ham  = 0;
    my $i             = 0;
    my $len           = scalar keys %{$hr};
    my @files_spam;
    my @files_ham;
    my @cur_spam;
    my @cur_ham;
    my $check_func;

    if ($use_dspam) {
        $check_func = \&check_dspam;
    }
    elsif ($use_bogofilter) {
        $check_func = \&check_bogofilter;
    }
    else {
        $check_func = \&check_rspamc;
    }

    while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
        if ($spam) {
            if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
                push @cur_spam, $fn;
                push @files_spam, [@cur_spam];
                @cur_spam = ();
            }
            else {
                push @cur_spam, $fn;
            }
        }
        else {
            if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
                push @cur_ham, $fn;
                push @files_ham, [@cur_ham];
                @cur_ham = ();
            }
            else {
                push @cur_ham, $fn;
            }
        }
    }

    shuffle_array( \@files_spam );

    foreach my $fn (@files_spam) {
        my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
        $total_spam += $r;
        $processed  += $r;
    }

    shuffle_array( \@files_ham );

    foreach my $fn (@files_ham) {
        my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
        $total_ham += $r;
        $processed += $r;
    }

    printf "Scanned %d messages
%d spam messages (%d detected)
%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;

    printf "\nHam FP rate: %.2f%% (%d messages)
Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham;

    printf "\nSpam FP rate: %.2f%% (%d messages)
Spam FN rate: %.2f%% (%d messages)\n",
      $fp_spam / $total_spam * 100.0, $fp_spam,
      $fn_spam / $total_spam * 100.0, $fn_spam;
}

if ( !$spam_dir || !$ham_dir ) {
    die "spam or/and ham directories are not specified";
}

my @spam_samples;
my @ham_samples;

read_dir_files( $spam_dir, \@spam_samples );
read_dir_files( $ham_dir,  \@ham_samples );
shuffle_array( \@spam_samples );
shuffle_array( \@ham_samples );

if ( !$check_only ) {
    my $learned = 0;
    my $t0      = [gettimeofday];
    $learned = learn_samples( \@ham_samples, \@spam_samples );
    my $t1 = [gettimeofday];

    printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 );
}

my %validation_set;
my $len = int( scalar @spam_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
    $validation_set{ $spam_samples[$i] } = 1;
}

$len = int( scalar @ham_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
    $validation_set{ $ham_samples[$i] } = 0;
}

cross_validate( \%validation_set );

__END__

=head1 NAME

classifier_test.pl - test various parameters for a classifier

=head1 SYNOPSIS

classifier_test.pl [options]

 Options:
   --spam                 Directory with spam files
   --ham                  Directory with ham files
   --spam-symbol          Symbol for spam (default: BAYES_SPAM)
   --ham-symbol           Symbol for ham (default: BAYES_HAM)
   --classifier           Classifier to test (default: bayes)
   --timeout              Timeout for rspamc (default: 10)
   --parallel             Parallel execution (default: 1)
   --help                 Brief help message
   --man                  Full documentation

=head1 OPTIONS

=over 8

=item B<--spam>

Directory with spam files.

=item B<--ham>

Directory with ham files.

=item B<--classifier>

Specifies classifier name to test.

=item B<--help>

Print a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=back

=head1 DESCRIPTION

B<classifier_test.pl> is intended to test Rspamd classifier for false positives, false negatives and other parameters.
It uses half of the corpus for training and half for cross-validation.

=cut