123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551 |
- #!/usr/bin/env perl
-
- use warnings;
- use strict;
- use Pod::Usage;
- use Getopt::Long;
- use Time::HiRes qw(gettimeofday tv_interval);
- use JSON::XS;
- use String::ShellQuote;
- use FileHandle;
- use IPC::Open2;
- use Data::Dumper;
-
- my $spam_dir;
- my $ham_dir;
- my $parallel = 1;
- my $classifier = "bayes";
- my $spam_symbol = "BAYES_SPAM";
- my $ham_symbol = "BAYES_HAM";
- my $timeout = 10;
- my $rspamc = $ENV{'RSPAMC'} || "rspamc";
- my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter";
- my $dspam = $ENV{'DSPAM'} || "dspam";
- my $train_fraction = 0.5;
- my $use_bogofilter = 0;
- my $use_dspam = 0;
- my $check_only = 0;
- my $rspamc_prob_trigger = 95;
- my $man;
- my $help;
-
- GetOptions(
- "spam|s=s" => \$spam_dir,
- "ham|h=s" => \$ham_dir,
- "spam-symbol=s" => \$spam_symbol,
- "ham-symbol=s" => \$ham_symbol,
- "classifier|c=s" => \$classifier,
- "timeout|t=f" => \$timeout,
- "parallel|p=i" => \$parallel,
- "train-fraction|t=f" => \$train_fraction,
- "bogofilter|b" => \$use_bogofilter,
- "dspam|d" => \$use_dspam,
- "check-only" => \$check_only,
- "help|?" => \$help,
- "man" => \$man
- ) or pod2usage(2);
-
- pod2usage(1) if $help;
- pod2usage( -exitval => 0, -verbose => 2 ) if $man;
-
- sub read_dir_files {
- my ( $dir, $target ) = @_;
- opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
- while ( my $file = readdir $dh ) {
- if ( -f "$dir/$file" ) {
- push @{$target}, "$dir/$file";
- }
- }
- }
-
- sub shuffle_array {
- my ($ar) = @_;
-
- for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
- if ( $i > 1 ) {
- my $sel = int( rand( $i - 1 ) );
- ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
- }
- }
- }
-
- sub learn_rspamc {
- my ( $files, $spam ) = @_;
- my $processed = 0;
-
- my $cmd = $spam ? "learn_spam" : "learn_ham";
- my $args_quoted = shell_quote @{$files};
- open(
- my $p,
- "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |"
- ) or die "cannot spawn $rspamc: $!";
-
- while (<$p>) {
- my $res = eval('decode_json($_)');
- if ( $res && $res->{'success'} ) {
- $processed++;
- }
- }
-
- return $processed;
- }
-
- sub learn_bogofilter {
- my ( $files, $spam ) = @_;
- my $processed = 0;
-
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
- my $fl = $spam ? "-s" : "-n";
- `$bogofilter -I $args_quoted $fl`;
- if ( $? == 0 ) {
- $processed++;
- }
- }
-
- return $processed;
- }
-
- sub learn_dspam {
- my ( $files, $spam ) = @_;
- my $processed = 0;
-
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
- my $fl = $spam ? "--class=spam" : "--class=innocent";
- open( my $p,
- "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
- or die "cannot run $dspam: $!";
-
- open( my $inp, "< $f" );
- while (<$inp>) {
- print $p $_;
- }
- }
-
- return $processed;
- }
-
- sub learn_samples {
- my ( $ar_ham, $ar_spam ) = @_;
- my $len;
- my $processed = 0;
- my $total = 0;
- my $learn_func;
-
- my @files_spam;
- my @files_ham;
-
- if ($use_dspam) {
- $learn_func = \&learn_dspam;
- }
- elsif ($use_bogofilter) {
- $learn_func = \&learn_bogofilter;
- }
- else {
- $learn_func = \&learn_rspamc;
- }
-
- $len = int( scalar @{$ar_ham} * $train_fraction );
- my @cur_vec;
-
- # Shuffle spam and ham samples
- for ( my $i = 0 ; $i < $len ; $i++ ) {
- if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
- push @cur_vec, @{$ar_ham}[$i];
- push @files_ham, [@cur_vec];
- @cur_vec = ();
- $total++;
- }
- else {
- push @cur_vec, @{$ar_ham}[$i];
- }
- }
-
- $len = int( scalar @{$ar_spam} * $train_fraction );
- @cur_vec = ();
- for ( my $i = 0 ; $i < $len ; $i++ ) {
- if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
- push @cur_vec, @{$ar_spam}[$i];
- push @files_spam, [@cur_vec];
- @cur_vec = ();
- $total++;
- }
- else {
- push @cur_vec, @{$ar_spam}[$i];
- }
- }
-
- for ( my $i = 0 ; $i < $total ; $i++ ) {
- my $args;
- my $spam;
-
- if ( $i % 2 == 0 ) {
- $args = pop @files_spam;
-
- if ( !$args ) {
- $args = pop @files_ham;
- $spam = 0;
- }
- else {
- $spam = 1;
- }
- }
- else {
- $args = pop @files_ham;
- if ( !$args ) {
- $args = pop @files_spam;
- $spam = 1;
- }
- else {
- $spam = 0;
- }
- }
-
- my $r = $learn_func->( $args, $spam );
- if ($r) {
- $processed += $r;
- }
- }
-
- return $processed;
- }
-
- sub check_rspamc {
- my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
-
- my $args_quoted = shell_quote @{$files};
- my $processed = 0;
-
- open(
- my $p,
- "$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
- ) or die "cannot spawn $rspamc: $!";
-
- while (<$p>) {
- my $res = eval('decode_json($_)');
- if ( $res && $res->{'default'} ) {
- $processed++;
-
- if ($spam) {
- if ( $res->{'default'}->{$ham_symbol} ) {
- my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
- if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
- my $percentage = int($1);
- if ( $percentage >= $rspamc_prob_trigger ) {
- $$fp_cnt++;
- }
- }
- else {
- $$fp_cnt++;
- }
- }
- elsif ( !$res->{'default'}->{$spam_symbol} ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- else {
- if ( $res->{'default'}->{$spam_symbol} ) {
- my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
- if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
-
- my $percentage = int($1);
- if ( $percentage >= $rspamc_prob_trigger ) {
- $$fp_cnt++;
- }
- }
- else {
- $$fp_cnt++;
- }
- }
- elsif ( !$res->{'default'}->{$ham_symbol} ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- }
- }
-
- return $processed;
- }
-
- sub check_bogofilter {
- my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
- my $processed = 0;
-
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
-
- open( my $p, "$bogofilter -t -I $args_quoted |" )
- or die "cannot spawn $bogofilter: $!";
-
- while (<$p>) {
- if ( $_ =~ /^([SHU])\s+.*$/ ) {
- $processed++;
-
- if ($spam) {
- if ( $1 eq 'H' ) {
- $$fp_cnt++;
- }
- elsif ( $1 eq 'U' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- else {
- if ( $1 eq 'S' ) {
- $$fp_cnt++;
- }
- elsif ( $1 eq 'U' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- }
- }
- }
-
- return $processed;
- }
-
- sub check_dspam {
- my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
- my $processed = 0;
-
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
-
- my $pid = open2( *Reader, *Writer,
- "$dspam --user nobody --classify --stdout --mode=notrain" );
- open( my $inp, "< $f" );
- while (<$inp>) {
- print Writer $_;
- }
- close Writer;
-
- while (<Reader>) {
- if ( $_ =~
- qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$)
- )
- {
- $processed++;
- my $percentage = int($2 * 100.0);
-
- if ($spam) {
- if ( $1 eq 'Innocent') {
- if ( $percentage <= (100 - $rspamc_prob_trigger) ) {
- $$fp_cnt++;
- }
- }
- elsif ( $1 ne 'Spam' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- else {
- if ( $1 eq 'Spam' ) {
- if ( $percentage >= $rspamc_prob_trigger ) {
- $$fp_cnt++;
- }
- }
- elsif ( $1 ne 'Innocent' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- }
- }
- close Reader;
- waitpid( $pid, 0 );
- }
-
- return $processed;
- }
-
- sub cross_validate {
- my ($hr) = @_;
- my $args = "";
- my $processed = 0;
- my $fp_spam = 0;
- my $fn_spam = 0;
- my $fp_ham = 0;
- my $fn_ham = 0;
- my $total_spam = 0;
- my $total_ham = 0;
- my $detected_spam = 0;
- my $detected_ham = 0;
- my $i = 0;
- my $len = scalar keys %{$hr};
- my @files_spam;
- my @files_ham;
- my @cur_spam;
- my @cur_ham;
- my $check_func;
-
- if ($use_dspam) {
- $check_func = \&check_dspam;
- }
- elsif ($use_bogofilter) {
- $check_func = \&check_bogofilter;
- }
- else {
- $check_func = \&check_rspamc;
- }
-
- while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
- if ($spam) {
- if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
- push @cur_spam, $fn;
- push @files_spam, [@cur_spam];
- @cur_spam = ();
- }
- else {
- push @cur_spam, $fn;
- }
- }
- else {
- if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
- push @cur_ham, $fn;
- push @files_ham, [@cur_ham];
- @cur_ham = ();
- }
- else {
- push @cur_ham, $fn;
- }
- }
- }
-
- shuffle_array( \@files_spam );
-
- foreach my $fn (@files_spam) {
- my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
- $total_spam += $r;
- $processed += $r;
- }
-
- shuffle_array( \@files_ham );
-
- foreach my $fn (@files_ham) {
- my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
- $total_ham += $r;
- $processed += $r;
- }
-
- printf "Scanned %d messages
- %d spam messages (%d detected)
- %d ham messages (%d detected)\n",
- $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
-
- printf "\nHam FP rate: %.2f%% (%d messages)
- Ham FN rate: %.2f%% (%d messages)\n",
- $fp_ham / $total_ham * 100.0, $fp_ham,
- $fn_ham / $total_ham * 100.0, $fn_ham;
-
- printf "\nSpam FP rate: %.2f%% (%d messages)
- Spam FN rate: %.2f%% (%d messages)\n",
- $fp_spam / $total_spam * 100.0, $fp_spam,
- $fn_spam / $total_spam * 100.0, $fn_spam;
- }
-
- if ( !$spam_dir || !$ham_dir ) {
- die "spam or/and ham directories are not specified";
- }
-
- my @spam_samples;
- my @ham_samples;
-
- read_dir_files( $spam_dir, \@spam_samples );
- read_dir_files( $ham_dir, \@ham_samples );
- shuffle_array( \@spam_samples );
- shuffle_array( \@ham_samples );
-
- if ( !$check_only ) {
- my $learned = 0;
- my $t0 = [gettimeofday];
- $learned = learn_samples( \@ham_samples, \@spam_samples );
- my $t1 = [gettimeofday];
-
- printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
- $learned, tv_interval( $t0, $t1 );
- }
-
- my %validation_set;
- my $len = int( scalar @spam_samples * $train_fraction );
- for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
- $validation_set{ $spam_samples[$i] } = 1;
- }
-
- $len = int( scalar @ham_samples * $train_fraction );
- for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
- $validation_set{ $ham_samples[$i] } = 0;
- }
-
- cross_validate( \%validation_set );
-
- __END__
-
- =head1 NAME
-
- classifier_test.pl - test various parameters for a classifier
-
- =head1 SYNOPSIS
-
- classifier_test.pl [options]
-
- Options:
- --spam Directory with spam files
- --ham Directory with ham files
- --spam-symbol Symbol for spam (default: BAYES_SPAM)
- --ham-symbol Symbol for ham (default: BAYES_HAM)
- --classifier Classifier to test (default: bayes)
- --timeout Timeout for rspamc (default: 10)
- --parallel Parallel execution (default: 1)
- --help Brief help message
- --man Full documentation
-
- =head1 OPTIONS
-
- =over 8
-
- =item B<--spam>
-
- Directory with spam files.
-
- =item B<--ham>
-
- Directory with ham files.
-
- =item B<--classifier>
-
- Specifies classifier name to test.
-
- =item B<--help>
-
- Print a brief help message and exits.
-
- =item B<--man>
-
- Prints the manual page and exits.
-
- =back
-
- =head1 DESCRIPTION
-
- B<classifier_test.pl> is intended to test Rspamd classifier for false positives,
- false negatives and other parameters. It uses half of the corpus for training
- and half for cross-validation.
-
- =cut
|