diff options
Diffstat (limited to 'utils/classifier_test.pl')
-rw-r--r-- | utils/classifier_test.pl | 697 |
1 files changed, 343 insertions, 354 deletions
diff --git a/utils/classifier_test.pl b/utils/classifier_test.pl index 2dbb4e903..08febe257 100644 --- a/utils/classifier_test.pl +++ b/utils/classifier_test.pl @@ -30,438 +30,428 @@ my $man; my $help; GetOptions( - "spam|s=s" => \$spam_dir, - "ham|h=s" => \$ham_dir, - "spam-symbol=s" => \$spam_symbol, - "ham-symbol=s" => \$ham_symbol, - "classifier|c=s" => \$classifier, - "timeout|t=f" => \$timeout, - "parallel|p=i" => \$parallel, - "train-fraction|t=f" => \$train_fraction, - "bogofilter|b" => \$use_bogofilter, - "dspam|d" => \$use_dspam, - "check-only" => \$check_only, - "help|?" => \$help, - "man" => \$man + "spam|s=s" => \$spam_dir, + "ham|h=s" => \$ham_dir, + "spam-symbol=s" => \$spam_symbol, + "ham-symbol=s" => \$ham_symbol, + "classifier|c=s" => \$classifier, + "timeout|t=f" => \$timeout, + "parallel|p=i" => \$parallel, + "train-fraction|t=f" => \$train_fraction, + "bogofilter|b" => \$use_bogofilter, + "dspam|d" => \$use_dspam, + "check-only" => \$check_only, + "help|?" => \$help, + "man" => \$man ) or pod2usage(2); pod2usage(1) if $help; pod2usage( -exitval => 0, -verbose => 2 ) if $man; sub read_dir_files { - my ( $dir, $target ) = @_; - opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; - while ( my $file = readdir $dh ) { - if ( -f "$dir/$file" ) { - push @{$target}, "$dir/$file"; + my ( $dir, $target ) = @_; + opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; + while ( my $file = readdir $dh ) { + if ( -f "$dir/$file" ) { + push @{$target}, "$dir/$file"; + } } - } } sub shuffle_array { - my ($ar) = @_; + my ($ar) = @_; - for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { - if ( $i > 1 ) { - my $sel = int( rand( $i - 1 ) ); - ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); + for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { + if ( $i > 1 ) { + my $sel = int( rand( $i - 1 ) ); + ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); + } } - } } sub learn_rspamc { - my ( $files, $spam ) = @_; - my $processed = 0; - - my $cmd = $spam ? "learn_spam" : "learn_ham"; - my $args_quoted = shell_quote @{$files}; - open( - my $p, -"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" - ) or die "cannot spawn $rspamc: $!"; - - while (<$p>) { - my $res = eval('decode_json($_)'); - if ( $res && $res->{'success'} ) { - $processed++; + my ( $files, $spam ) = @_; + my $processed = 0; + + my $cmd = $spam ? "learn_spam" : "learn_ham"; + my $args_quoted = shell_quote @{$files}; + open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" ) + or die "cannot spawn $rspamc: $!"; + + while (<$p>) { + my $res = eval('decode_json($_)'); + if ( $res && $res->{'success'} ) { + $processed++; + } } - } - return $processed; + return $processed; } sub learn_bogofilter { - my ( $files, $spam ) = @_; - my $processed = 0; - - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; - my $fl = $spam ? "-s" : "-n"; - `$bogofilter -I $args_quoted $fl`; - if ( $? == 0 ) { - $processed++; + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "-s" : "-n"; + `$bogofilter -I $args_quoted $fl`; + if ( $? == 0 ) { + $processed++; + } } - } - return $processed; + return $processed; } sub learn_dspam { - my ( $files, $spam ) = @_; - my $processed = 0; - - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; - my $fl = $spam ? "--class=spam" : "--class=innocent"; - open( my $p, - "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) - or die "cannot run $dspam: $!"; - - open( my $inp, "< $f" ); - while (<$inp>) { - print $p $_; + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "--class=spam" : "--class=innocent"; + open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) + or die "cannot run $dspam: $!"; + + open( my $inp, "< $f" ); + while (<$inp>) { + print $p $_; + } } - } - return $processed; + return $processed; } sub learn_samples { - my ( $ar_ham, $ar_spam ) = @_; - my $len; - my $processed = 0; - my $total = 0; - my $learn_func; - - my @files_spam; - my @files_ham; - - if ($use_dspam) { - $learn_func = \&learn_dspam; - } - elsif ($use_bogofilter) { - $learn_func = \&learn_bogofilter; - } - else { - $learn_func = \&learn_rspamc; - } - - $len = int( scalar @{$ar_ham} * $train_fraction ); - my @cur_vec; - - # Shuffle spam and ham samples - for ( my $i = 0 ; $i < $len ; $i++ ) { - if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { - push @cur_vec, @{$ar_ham}[$i]; - push @files_ham, [@cur_vec]; - @cur_vec = (); - $total++; - } - else { - push @cur_vec, @{$ar_ham}[$i]; - } - } - - $len = int( scalar @{$ar_spam} * $train_fraction ); - @cur_vec = (); - for ( my $i = 0 ; $i < $len ; $i++ ) { - if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { - push @cur_vec, @{$ar_spam}[$i]; - push @files_spam, [@cur_vec]; - @cur_vec = (); - $total++; - } - else { - push @cur_vec, @{$ar_spam}[$i]; + my ( $ar_ham, $ar_spam ) = @_; + my $len; + my $processed = 0; + my $total = 0; + my $learn_func; + + my @files_spam; + my @files_ham; + + if ($use_dspam) { + $learn_func = \&learn_dspam; } - } - - for ( my $i = 0 ; $i < $total ; $i++ ) { - my $args; - my $spam; - - if ( $i % 2 == 0 ) { - $args = pop @files_spam; - - if ( !$args ) { - $args = pop @files_ham; - $spam = 0; - } - else { - $spam = 1; - } + elsif ($use_bogofilter) { + $learn_func = \&learn_bogofilter; } else { - $args = pop @files_ham; - if ( !$args ) { - $args = pop @files_spam; - $spam = 1; - } - else { - $spam = 0; - } + $learn_func = \&learn_rspamc; } - my $r = $learn_func->( $args, $spam ); - if ($r) { - $processed += $r; + $len = int( scalar @{$ar_ham} * $train_fraction ); + my @cur_vec; + + # Shuffle spam and ham samples + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_ham}[$i]; + push @files_ham, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_ham}[$i]; + } } - } - return $processed; -} + $len = int( scalar @{$ar_spam} * $train_fraction ); + @cur_vec = (); + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_spam}[$i]; + push @files_spam, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_spam}[$i]; + } + } -sub check_rspamc { - my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + for ( my $i = 0 ; $i < $total ; $i++ ) { + my $args; + my $spam; - my $args_quoted = shell_quote @{$files}; - my $processed = 0; + if ( $i % 2 == 0 ) { + $args = pop @files_spam; - open( - my $p, -"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" - ) or die "cannot spawn $rspamc: $!"; - - while (<$p>) { - my $res = eval('decode_json($_)'); - if ( $res && $res->{'default'} ) { - $processed++; - - if ($spam) { - if ( $res->{'default'}->{$ham_symbol} ) { - my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; - if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { - my $percentage = int($1); - if ( $percentage >= $rspamc_prob_trigger ) { - $$fp_cnt++; + if ( !$args ) { + $args = pop @files_ham; + $spam = 0; + } + else { + $spam = 1; } - } - else { - $$fp_cnt++; - } - } - elsif ( !$res->{'default'}->{$spam_symbol} ) { - $$fn_cnt++; } else { - $$detected_cnt++; - } - } - else { - if ( $res->{'default'}->{$spam_symbol} ) { - my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; - if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { - - my $percentage = int($1); - if ( $percentage >= $rspamc_prob_trigger ) { - $$fp_cnt++; + $args = pop @files_ham; + if ( !$args ) { + $args = pop @files_spam; + $spam = 1; + } + else { + $spam = 0; } - } - else { - $$fp_cnt++; - } - } - elsif ( !$res->{'default'}->{$ham_symbol} ) { - $$fn_cnt++; } - else { - $$detected_cnt++; + + my $r = $learn_func->( $args, $spam ); + if ($r) { + $processed += $r; } - } } - } - return $processed; + return $processed; } -sub check_bogofilter { - my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; - my $processed = 0; +sub check_rspamc { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; + my $args_quoted = shell_quote @{$files}; + my $processed = 0; - open( my $p, "$bogofilter -t -I $args_quoted |" ) - or die "cannot spawn $bogofilter: $!"; + open( + my $p, +"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" + ) or die "cannot spawn $rspamc: $!"; while (<$p>) { - if ( $_ =~ /^([SHU])\s+.*$/ ) { - $processed++; - - if ($spam) { - if ( $1 eq 'H' ) { - $$fp_cnt++; - } - elsif ( $1 eq 'U' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } - } - else { - if ( $1 eq 'S' ) { - $$fp_cnt++; - } - elsif ( $1 eq 'U' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } + my $res = eval('decode_json($_)'); + if ( $res && $res->{'default'} ) { + $processed++; + + if ($spam) { + if ( $res->{'default'}->{$ham_symbol} ) { + my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$spam_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $res->{'default'}->{$spam_symbol} ) { + my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$ham_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } } - } } - } - return $processed; + return $processed; } -sub check_dspam { - my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; - my $processed = 0; +sub check_bogofilter { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + + open( my $p, "$bogofilter -t -I $args_quoted |" ) + or die "cannot spawn $bogofilter: $!"; + + while (<$p>) { + if ( $_ =~ /^([SHU])\s+.*$/ ) { + $processed++; + + if ($spam) { + if ( $1 eq 'H' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'S' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + } - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; + return $processed; +} - my $pid = open2( *Reader, *Writer, - "$dspam --user nobody --classify --stdout --mode=notrain" ); - open( my $inp, "< $f" ); - while (<$inp>) { - print Writer $_; - } - close Writer; +sub check_dspam { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; - while (<Reader>) { - if ( $_ =~ -qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) - ) - { - $processed++; - my $percentage = int($2 * 100.0); + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; - if ($spam) { - if ( $1 eq 'Innocent') { - if ( $percentage <= (100 - $rspamc_prob_trigger) ) { - $$fp_cnt++; - } - } - elsif ( $1 ne 'Spam' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } + my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" ); + open( my $inp, "< $f" ); + while (<$inp>) { + print Writer $_; } - else { - if ( $1 eq 'Spam' ) { - if ( $percentage >= $rspamc_prob_trigger ) { - $$fp_cnt++; + close Writer; + + while (<Reader>) { + if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) { + $processed++; + my $percentage = int( $2 * 100.0 ); + + if ($spam) { + if ( $1 eq 'Innocent' ) { + if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Spam' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'Spam' ) { + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Innocent' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } } - } - elsif ( $1 ne 'Innocent' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } } - } + close Reader; + waitpid( $pid, 0 ); } - close Reader; - waitpid( $pid, 0 ); - } - return $processed; + return $processed; } sub cross_validate { - my ($hr) = @_; - my $args = ""; - my $processed = 0; - my $fp_spam = 0; - my $fn_spam = 0; - my $fp_ham = 0; - my $fn_ham = 0; - my $total_spam = 0; - my $total_ham = 0; - my $detected_spam = 0; - my $detected_ham = 0; - my $i = 0; - my $len = scalar keys %{$hr}; - my @files_spam; - my @files_ham; - my @cur_spam; - my @cur_ham; - my $check_func; - - if ($use_dspam) { - $check_func = \&check_dspam; - } - elsif ($use_bogofilter) { - $check_func = \&check_bogofilter; - } - else { - $check_func = \&check_rspamc; - } - - while ( my ( $fn, $spam ) = each( %{$hr} ) ) { - if ($spam) { - if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { - push @cur_spam, $fn; - push @files_spam, [@cur_spam]; - @cur_spam = (); - } - else { - push @cur_spam, $fn; - } + my ($hr) = @_; + my $args = ""; + my $processed = 0; + my $fp_spam = 0; + my $fn_spam = 0; + my $fp_ham = 0; + my $fn_ham = 0; + my $total_spam = 0; + my $total_ham = 0; + my $detected_spam = 0; + my $detected_ham = 0; + my $i = 0; + my $len = scalar keys %{$hr}; + my @files_spam; + my @files_ham; + my @cur_spam; + my @cur_ham; + my $check_func; + + if ($use_dspam) { + $check_func = \&check_dspam; + } + elsif ($use_bogofilter) { + $check_func = \&check_bogofilter; } else { - if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { - push @cur_ham, $fn; - push @files_ham, [@cur_ham]; - @cur_ham = (); - } - else { - push @cur_ham, $fn; - } + $check_func = \&check_rspamc; + } + + while ( my ( $fn, $spam ) = each( %{$hr} ) ) { + if ($spam) { + if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { + push @cur_spam, $fn; + push @files_spam, [@cur_spam]; + @cur_spam = (); + } + else { + push @cur_spam, $fn; + } + } + else { + if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { + push @cur_ham, $fn; + push @files_ham, [@cur_ham]; + @cur_ham = (); + } + else { + push @cur_ham, $fn; + } + } } - } - shuffle_array( \@files_spam ); + shuffle_array( \@files_spam ); - foreach my $fn (@files_spam) { - my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); - $total_spam += $r; - $processed += $r; - } + foreach my $fn (@files_spam) { + my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); + $total_spam += $r; + $processed += $r; + } - shuffle_array( \@files_ham ); + shuffle_array( \@files_ham ); - foreach my $fn (@files_ham) { - my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); - $total_ham += $r; - $processed += $r; - } + foreach my $fn (@files_ham) { + my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); + $total_ham += $r; + $processed += $r; + } - printf "Scanned %d messages + printf "Scanned %d messages %d spam messages (%d detected) -%d ham messages (%d detected)\n", - $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; +%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; - printf "\nHam FP rate: %.2f%% (%d messages) -Ham FN rate: %.2f%% (%d messages)\n", - $fp_ham / $total_ham * 100.0, $fp_ham, - $fn_ham / $total_ham * 100.0, $fn_ham; + printf "\nHam FP rate: %.2f%% (%d messages) +Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham; - printf "\nSpam FP rate: %.2f%% (%d messages) + printf "\nSpam FP rate: %.2f%% (%d messages) Spam FN rate: %.2f%% (%d messages)\n", - $fp_spam / $total_spam * 100.0, $fp_spam, - $fn_spam / $total_spam * 100.0, $fn_spam; + $fp_spam / $total_spam * 100.0, $fp_spam, + $fn_spam / $total_spam * 100.0, $fn_spam; } if ( !$spam_dir || !$ham_dir ) { - die "spam or/and ham directories are not specified"; + die "spam or/and ham directories are not specified"; } my @spam_samples; @@ -473,24 +463,23 @@ shuffle_array( \@spam_samples ); shuffle_array( \@ham_samples ); if ( !$check_only ) { - my $learned = 0; - my $t0 = [gettimeofday]; - $learned = learn_samples( \@ham_samples, \@spam_samples ); - my $t1 = [gettimeofday]; + my $learned = 0; + my $t0 = [gettimeofday]; + $learned = learn_samples( \@ham_samples, \@spam_samples ); + my $t1 = [gettimeofday]; - printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", - $learned, tv_interval( $t0, $t1 ); + printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 ); } my %validation_set; my $len = int( scalar @spam_samples * $train_fraction ); for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { - $validation_set{ $spam_samples[$i] } = 1; + $validation_set{ $spam_samples[$i] } = 1; } $len = int( scalar @ham_samples * $train_fraction ); for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { - $validation_set{ $ham_samples[$i] } = 0; + $validation_set{ $ham_samples[$i] } = 0; } cross_validate( \%validation_set ); |