mirror of
https://github.com/rspamd/rspamd.git
synced 2024-08-07 09:05:58 +02:00
552 lines
12 KiB
Perl
552 lines
12 KiB
Perl
#!/usr/bin/env perl
|
|
|
|
use warnings;
|
|
use strict;
|
|
use Pod::Usage;
|
|
use Getopt::Long;
|
|
use Time::HiRes qw(gettimeofday tv_interval);
|
|
use JSON::XS;
|
|
use String::ShellQuote;
|
|
use FileHandle;
|
|
use IPC::Open2;
|
|
use Data::Dumper;
|
|
|
|
# ---------------------------------------------------------------------------
# Run-time configuration.  Every value below is a default that may be
# overridden from the command line (see GetOptions) or, for the external
# binaries, from the environment.
# ---------------------------------------------------------------------------
my $spam_dir;                       # --spam: directory holding spam samples
my $ham_dir;                        # --ham: directory holding ham samples
my $parallel    = 1;                # --parallel: passed to rspamc as -n, also used as batch size
my $classifier  = "bayes";          # --classifier: passed to rspamc as -c
my $spam_symbol = "BAYES_SPAM";     # --spam-symbol: symbol looked up in rspamc replies
my $ham_symbol  = "BAYES_HAM";      # --ham-symbol: symbol looked up in rspamc replies
my $timeout     = 10;               # --timeout: passed to rspamc as -t

# External tools; each can be replaced through the environment.
my $rspamc     = $ENV{'RSPAMC'}     || "rspamc";
my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter";
my $dspam      = $ENV{'DSPAM'}      || "dspam";

my $train_fraction = 0.5;           # --train-fraction: share of each corpus used for learning
my $use_bogofilter = 0;             # --bogofilter: drive bogofilter instead of rspamc
my $use_dspam      = 0;             # --dspam: drive dspam instead of rspamc
my $check_only     = 0;             # --check-only: skip the learning phase

# Probability (in percent) from which a wrong verdict is counted as an error.
my $rspamc_prob_trigger = 95;
my $man;
my $help;

GetOptions(
    "spam|s=s"       => \$spam_dir,
    "ham|h=s"        => \$ham_dir,
    "spam-symbol=s"  => \$spam_symbol,
    "ham-symbol=s"   => \$ham_symbol,
    "classifier|c=s" => \$classifier,
    "timeout|t=f"    => \$timeout,
    "parallel|p=i"   => \$parallel,

    # The short alias used to be "t" as well, which collided with --timeout
    # (Getopt::Long lets the later specification win).  It now has
    # its own letter so that -t unambiguously means --timeout.
    "train-fraction|f=f" => \$train_fraction,
    "bogofilter|b"       => \$use_bogofilter,
    "dspam|d"            => \$use_dspam,
    "check-only"         => \$check_only,
    "help|?"             => \$help,
    "man"                => \$man
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

# Reject values that the batching and corpus-splitting code cannot work with.
pod2usage( -message => "--parallel must be a positive integer", -exitval => 2 )
  if $parallel < 1;
pod2usage(
    -message => "--train-fraction must be between 0 and 1",
    -exitval => 2
) if $train_fraction < 0 || $train_fraction > 1;
|
|
|
|
# Append the path of every regular file found directly inside a directory.
#
# Arguments:
#   $dir    - directory to scan (sub-directories are not descended into)
#   $target - array reference that receives "$dir/<name>" for each entry
#             passing the -f test
#
# Dies when the directory cannot be opened.
sub read_dir_files {
    my ( $dir, $target ) = @_;

    opendir( my $handle, $dir ) or die "cannot open dir $dir: $!";

    # Build the full path first, then keep only plain files.
    push @{$target}, grep { -f $_ } map { "$dir/$_" } readdir $handle;
}
|
|
|
|
# Shuffle an array in place with the Fisher-Yates algorithm.
#
# Arguments:
#   $ar - reference to the array to permute
#
# The previous implementation picked the swap partner with rand($i - 1) and
# skipped indices 0 and 1, so an element could never stay where it was and a
# two-element array was never reordered at all.  Walking from the end and
# choosing a partner in [0, $i] avoids both problems.
sub shuffle_array {
    my ($ar) = @_;

    for ( my $i = $#{$ar} ; $i > 0 ; $i-- ) {
        my $sel = int( rand( $i + 1 ) );
        next if $sel == $i;    # element stays in place this round
        @{$ar}[ $i, $sel ] = @{$ar}[ $sel, $i ];
    }

    return;
}
|
|
|
|
# Feed one batch of messages to rspamc for learning.
#
# Arguments:
#   $files - array reference with the paths of the messages to learn
#   $spam  - true to run "learn_spam", false to run "learn_ham"
#
# Returns the number of output lines that decoded to a JSON object with a
# true "success" member.  Dies if the rspamc process cannot be started.
sub learn_rspamc {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    # Without file arguments the command line would carry no input files at
    # all, so there is nothing useful to run.
    return $processed unless $files && @{$files};

    my $cmd         = $spam ? "learn_spam" : "learn_ham";
    my $args_quoted = shell_quote @{$files};

    # A single command string is still interpreted by the shell, exactly as
    # the trailing-pipe form was; the file names are already shell-quoted.
    open( my $p, '-|',
"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted"
    ) or die "cannot spawn $rspamc: $!";

    while ( my $line = <$p> ) {

        # Block eval: lines that are not valid JSON simply yield undef.
        my $res = eval { decode_json($line) };
        if ( ref $res eq 'HASH' && $res->{'success'} ) {
            $processed++;
        }
    }

    # Reap the child; its exit status is deliberately not treated as fatal,
    # the per-message "success" flags are what gets counted.
    close($p);

    return $processed;
}
|
|
|
|
# Register a batch of messages with bogofilter, one invocation per file.
#
# Arguments:
#   $files - array reference with message paths
#   $spam  - true to pass "-s", false to pass "-n"
#
# Returns how many invocations finished with a zero wait status.
sub learn_bogofilter {
    my ( $files, $spam ) = @_;

    my $flag    = $spam ? "-s" : "-n";
    my $learned = 0;

    for my $path ( @{$files} ) {
        my $quoted = shell_quote $path;

        # Output is captured and thrown away; only the status matters.
        `$bogofilter -I $quoted $flag`;
        $learned++ if $? == 0;
    }

    return $learned;
}
|
|
|
|
# Train dspam on a batch of messages by piping each file to its stdin.
#
# Arguments:
#   $files - array reference with message paths
#   $spam  - true to train as "--class=spam", false as "--class=innocent"
#
# Returns the number of messages for which dspam exited successfully.  The
# counter used to be left at zero, so the caller always reported that
# nothing had been learned.  Dies if dspam cannot be started; a sample that
# cannot be opened is reported and skipped.
sub learn_dspam {
    my ( $files, $spam ) = @_;
    my $processed = 0;

    my $fl = $spam ? "--class=spam" : "--class=innocent";

    foreach my $f ( @{$files} ) {

        # Open the sample first so that no dspam process is started for a
        # file that cannot be read.
        my $inp;
        if ( !open( $inp, '<', $f ) ) {
            warn "cannot open $f: $!\n";
            next;
        }

        open( my $p, '|-',
            "$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
          or die "cannot run $dspam: $!";

        while ( my $line = <$inp> ) {
            print {$p} $line;
        }
        close($inp);

        # close() on a pipe waits for the child and is false when it exited
        # with a non-zero status.
        if ( close($p) ) {
            $processed++;
        }
    }

    return $processed;
}
|
|
|
|
# Train the selected classifier on the leading part of both corpora.
#
# Arguments:
#   $ar_ham  - array reference with ham message paths
#   $ar_spam - array reference with spam message paths
#
# The first int(count * $train_fraction) entries of each list are split into
# batches of at most $parallel files and handed to the learn function chosen
# by --dspam / --bogofilter (rspamc otherwise).  Spam and ham batches are
# interleaved so the classifier sees both classes alternately.
#
# Returns the sum of the values returned by the learn function.
#
# The earlier batching loop only started flushing at index 1, which made the
# first batch one file too large and silently dropped the sample when the
# training slice held exactly one file.
sub learn_samples {
    my ( $ar_ham, $ar_spam ) = @_;
    my $processed = 0;

    my $learn_func =
        $use_dspam      ? \&learn_dspam
      : $use_bogofilter ? \&learn_bogofilter
      :                   \&learn_rspamc;

    # Guard against a zero or negative batch size.
    my $batch_size = $parallel >= 1 ? int($parallel) : 1;

    my @files_ham;
    my @files_spam;

    foreach my $job ( [ $ar_ham, \@files_ham ], [ $ar_spam, \@files_spam ] ) {
        my ( $source, $batches ) = @{$job};

        # Number of samples reserved for training, clamped to the list size.
        my $len = int( scalar @{$source} * $train_fraction );
        $len = scalar @{$source} if $len > scalar @{$source};
        next if $len <= 0;

        my @train = @{$source}[ 0 .. $len - 1 ];
        while ( my @batch = splice( @train, 0, $batch_size ) ) {
            push @{$batches}, [@batch];
        }
    }

    my $total = scalar(@files_spam) + scalar(@files_ham);

    for ( my $i = 0 ; $i < $total ; $i++ ) {

        # Even rounds prefer spam, odd rounds prefer ham; when the preferred
        # class has run out, the other one is used instead.
        my $spam;
        if ( $i % 2 == 0 ) {
            $spam = @files_spam ? 1 : 0;
        }
        else {
            $spam = @files_ham ? 0 : 1;
        }

        my $args = $spam ? pop @files_spam : pop @files_ham;

        my $r = $learn_func->( $args, $spam );
        $processed += $r if $r;
    }

    return $processed;
}
|
|
|
|
# Classify one batch of messages with rspamc and update the error counters.
#
# Arguments:
#   $files        - array reference with message paths
#   $spam         - true when the batch consists of spam, false for ham
#   $fp_cnt       - scalar reference, incremented when the symbol of the
#                   opposite class is present and either carries no
#                   parsable "NN%" option or reports at least
#                   $rspamc_prob_trigger percent
#   $fn_cnt       - scalar reference, incremented when neither the opposite
#                   nor the expected symbol is present
#   $detected_cnt - scalar reference, incremented when only the expected
#                   symbol is present
#
# A message whose opposite-class symbol is present below the trigger is
# counted as processed but touches none of the three counters.
#
# Returns the number of replies that contained a "default" result object.
# Dies if rspamc cannot be started.
sub check_rspamc {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
    my $processed = 0;

    return $processed unless $files && @{$files};

    # The logic is symmetric: only the roles of the two symbols swap.
    my ( $expected_symbol, $opposite_symbol ) =
      $spam
      ? ( $spam_symbol, $ham_symbol )
      : ( $ham_symbol, $spam_symbol );

    my $args_quoted = shell_quote @{$files};

    # NOTE(review): the settings header names BAYES_SPAM literally rather
    # than the configured --spam-symbol / --ham-symbol; confirm this is
    # intended when custom symbols are used.
    open( my $p, '-|',
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted"
    ) or die "cannot spawn $rspamc: $!";

    while ( my $line = <$p> ) {

        # Block eval: lines that are not valid JSON simply yield undef.
        my $res = eval { decode_json($line) };
        next unless ref $res eq 'HASH' && ref $res->{'default'} eq 'HASH';

        $processed++;
        my $symbols = $res->{'default'};

        if ( my $wrong = $symbols->{$opposite_symbol} ) {
            my $m;
            if ( ref $wrong eq 'HASH' && ref $wrong->{'options'} eq 'ARRAY' ) {
                $m = $wrong->{'options'}->[0];
            }

            if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
                my $percentage = int($1);
                if ( $percentage >= $rspamc_prob_trigger ) {
                    $$fp_cnt++;
                }
            }
            else {
                # No usable probability attached: count the wrong verdict.
                $$fp_cnt++;
            }
        }
        elsif ( !$symbols->{$expected_symbol} ) {
            $$fn_cnt++;
        }
        else {
            $$detected_cnt++;
        }
    }

    close($p);

    return $processed;
}
|
|
|
|
# Classify a batch of messages with bogofilter, one process per file.
#
# Arguments:
#   $files        - array reference with message paths
#   $spam         - true when the batch is spam, false when it is ham
#   $fp_cnt       - scalar reference, bumped on the verdict letter of the
#                   opposite class ('H' for a spam batch, 'S' for ham)
#   $fn_cnt       - scalar reference, bumped on verdict letter 'U'
#   $detected_cnt - scalar reference, bumped on the remaining letter
#
# Returns the number of output lines that started with S, H or U followed
# by whitespace.  Dies if bogofilter cannot be started.
sub check_bogofilter {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;

    # Verdict letter that contradicts the known class of this batch.
    my $wrong_verdict = $spam ? 'H' : 'S';
    my $seen          = 0;

    for my $path ( @{$files} ) {
        my $quoted = shell_quote $path;

        open( my $out, "$bogofilter -t -I $quoted |" )
          or die "cannot spawn $bogofilter: $!";

        while ( my $line = <$out> ) {
            next unless $line =~ /^([SHU])\s+.*$/;

            my $verdict = $1;
            $seen++;

            if ( $verdict eq $wrong_verdict ) {
                $$fp_cnt++;
            }
            elsif ( $verdict eq 'U' ) {
                $$fn_cnt++;
            }
            else {
                $$detected_cnt++;
            }
        }
    }

    return $seen;
}
|
|
|
|
# Classify a batch of messages with dspam and update the error counters.
#
# Arguments:
#   $files        - array reference with message paths
#   $spam         - true when the batch is spam, false when it is ham
#   $fp_cnt       - scalar reference, incremented when the result names the
#                   opposite class and the reported probability crosses the
#                   $rspamc_prob_trigger threshold
#   $fn_cnt       - scalar reference, incremented when the result is neither
#                   "Spam" nor "Innocent"
#   $detected_cnt - scalar reference, incremented when the result names the
#                   expected class
#
# Returns the number of X-DSPAM-Result lines that were parsed.  A sample
# that cannot be opened is reported and skipped.
sub check_dspam {
    my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
    my $processed = 0;

    my $result_re =
qr{^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$};

    foreach my $f ( @{$files} ) {
        my $inp;
        if ( !open( $inp, '<', $f ) ) {
            warn "cannot open $f: $!\n";
            next;
        }

        # Lexical handles instead of the global Reader/Writer barewords.
        my ( $reader, $writer );
        my $pid = open2( $reader, $writer,
            "$dspam --user nobody --classify --stdout --mode=notrain" );

        # NOTE(review): the whole message is written before anything is
        # read back; this relies on dspam not filling its output pipe
        # while it is still consuming input.
        while ( my $line = <$inp> ) {
            print {$writer} $line;
        }
        close($inp);
        close($writer);

        while ( my $line = <$reader> ) {
            next unless $line =~ $result_re;

            my ( $result, $probability ) = ( $1, $2 );
            $processed++;
            my $percentage = int( $probability * 100.0 );

            if ($spam) {
                if ( $result eq 'Innocent' ) {
                    if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
                        $$fp_cnt++;
                    }
                }
                elsif ( $result ne 'Spam' ) {
                    $$fn_cnt++;
                }
                else {
                    $$detected_cnt++;
                }
            }
            else {
                if ( $result eq 'Spam' ) {
                    if ( $percentage >= $rspamc_prob_trigger ) {
                        $$fp_cnt++;
                    }
                }
                elsif ( $result ne 'Innocent' ) {
                    $$fn_cnt++;
                }
                else {
                    $$detected_cnt++;
                }
            }
        }

        close($reader);
        waitpid( $pid, 0 );
    }

    return $processed;
}
|
|
|
|
# Run the selected classifier over the validation set and print statistics.
#
# Arguments:
#   $hr - hash reference mapping a message path to a true value for spam
#         or a false value for ham
#
# The messages are grouped into batches of at most $parallel files per
# class, the batches are shuffled, and every batch is passed to the check
# function chosen by --dspam / --bogofilter (rspamc otherwise).  Totals and
# error rates are printed to standard output.
#
# The earlier grouping loop compared against an index that was never
# advanced, so the final partial batch of each class was never checked and
# every flushed batch held one file more than requested.
sub cross_validate {
    my ($hr) = @_;

    my $processed     = 0;
    my $fp_spam       = 0;
    my $fn_spam       = 0;
    my $fp_ham        = 0;
    my $fn_ham        = 0;
    my $total_spam    = 0;
    my $total_ham     = 0;
    my $detected_spam = 0;
    my $detected_ham  = 0;
    my @files_spam;
    my @files_ham;

    my $check_func =
        $use_dspam      ? \&check_dspam
      : $use_bogofilter ? \&check_bogofilter
      :                   \&check_rspamc;

    # Guard against a zero or negative batch size.
    my $batch_size = $parallel >= 1 ? int($parallel) : 1;

    # Separate the two classes first, then cut each list into batches.
    my @spam_names;
    my @ham_names;
    while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
        if ($spam) {
            push @spam_names, $fn;
        }
        else {
            push @ham_names, $fn;
        }
    }

    while ( my @batch = splice( @spam_names, 0, $batch_size ) ) {
        push @files_spam, [@batch];
    }
    while ( my @batch = splice( @ham_names, 0, $batch_size ) ) {
        push @files_ham, [@batch];
    }

    shuffle_array( \@files_spam );

    foreach my $fn (@files_spam) {
        my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
        $total_spam += $r;
        $processed  += $r;
    }

    shuffle_array( \@files_ham );

    foreach my $fn (@files_ham) {
        my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
        $total_ham += $r;
        $processed += $r;
    }

    # Percentage helper that reports 0 instead of dividing by zero when a
    # class produced no scanned messages.
    my $rate = sub {
        my ( $count, $total ) = @_;
        return $total ? $count / $total * 100.0 : 0;
    };

    printf "Scanned %d messages\n"
      . "%d spam messages (%d detected)\n"
      . "%d ham messages (%d detected)\n",
      $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;

    printf "\nHam FP rate: %.2f%% (%d messages)\n"
      . "Ham FN rate: %.2f%% (%d messages)\n",
      $rate->( $fp_ham, $total_ham ), $fp_ham,
      $rate->( $fn_ham, $total_ham ), $fn_ham;

    printf "\nSpam FP rate: %.2f%% (%d messages)\n"
      . "Spam FN rate: %.2f%% (%d messages)\n",
      $rate->( $fp_spam, $total_spam ), $fp_spam,
      $rate->( $fn_spam, $total_spam ), $fn_spam;
}
|
|
|
|
# ---------------------------------------------------------------------------
# Main flow: load both corpora, shuffle them, learn on the leading part of
# each (unless --check-only was given) and validate on the remainder.
# ---------------------------------------------------------------------------
if ( !$spam_dir || !$ham_dir ) {
    die "spam or/and ham directories are not specified";
}

my @spam_samples;
my @ham_samples;

read_dir_files( $spam_dir, \@spam_samples );
read_dir_files( $ham_dir,  \@ham_samples );
shuffle_array( \@spam_samples );
shuffle_array( \@ham_samples );

if ( !$check_only ) {
    my $learned = 0;
    my $t0      = [gettimeofday];
    $learned = learn_samples( \@ham_samples, \@spam_samples );
    my $t1 = [gettimeofday];

    printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
      $learned, tv_interval( $t0, $t1 );
}

# Everything after the training slice of each corpus goes into the
# validation set: path => 1 for spam, path => 0 for ham.
my %validation_set;

my $len = int( scalar @spam_samples * $train_fraction );
for my $i ( $len .. $#spam_samples ) {
    $validation_set{ $spam_samples[$i] } = 1;
}

# The ham loop must be bounded by the ham list itself; it used to reuse the
# size of the spam list, which either left ham samples unvalidated or read
# past the end of the ham list.
$len = int( scalar @ham_samples * $train_fraction );
for my $i ( $len .. $#ham_samples ) {
    $validation_set{ $ham_samples[$i] } = 0;
}

cross_validate( \%validation_set );
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
classifier_test.pl - test various parameters for a classifier
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
classifier_test.pl [options]
|
|
|
|
 Options:
   --spam                 Directory with spam files
   --ham                  Directory with ham files
   --spam-symbol          Symbol for spam (default: BAYES_SPAM)
   --ham-symbol           Symbol for ham (default: BAYES_HAM)
   --classifier           Classifier to test (default: bayes)
   --timeout              Timeout for rspamc (default: 10)
   --parallel             Parallel execution (default: 1)
   --train-fraction       Fraction of each corpus used for training (default: 0.5)
   --bogofilter           Test bogofilter instead of rspamc
   --dspam                Test dspam instead of rspamc
   --check-only           Skip training and only run the validation
   --help                 Brief help message
   --man                  Full documentation
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 8
|
|
|
|
=item B<--spam>
|
|
|
|
Directory with spam files.
|
|
|
|
=item B<--ham>
|
|
|
|
Directory with ham files.
|
|
|
|
=item B<--classifier>
|
|
|
|
Specifies classifier name to test.
|
|
|
|
=item B<--help>
|
|
|
|
Prints a brief help message and exits.
|
|
|
|
=item B<--man>
|
|
|
|
Prints the manual page and exits.
|
|
|
|
=back
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
B<classifier_test.pl> is intended to test the Rspamd classifier for false
positives, false negatives and other parameters. By default it uses half of
the corpus for training and the other half for cross-validation; the split
can be changed with B<--train-fraction>.
|
|
|
|
=cut
|