rspamd/utils/classifier_test.pl

#!/usr/bin/env perl

use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;
use Time::HiRes qw(gettimeofday tv_interval);
use JSON::XS;
use String::ShellQuote;
use FileHandle;
use IPC::Open2;
use Data::Dumper;

# Corpus locations; both are mandatory and are verified before any work starts.
my $spam_dir;
my $ham_dir;

# Tunables, each of which can be overridden from the command line.
my $parallel    = 1;               # batch size, also passed to rspamc as -n
my $classifier  = "bayes";         # passed to rspamc as -c when learning
my $spam_symbol = "BAYES_SPAM";    # symbol looked up in rspamc scan results
my $ham_symbol  = "BAYES_HAM";
my $timeout     = 10;              # passed to rspamc as -t

# External tools; the binaries can be overridden through the environment.
my $rspamc     = $ENV{'RSPAMC'}     || "rspamc";
my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter";
my $dspam      = $ENV{'DSPAM'}      || "dspam";

my $train_fraction = 0.5;          # share of each corpus used for learning
my $use_bogofilter = 0;
my $use_dspam      = 0;
my $check_only     = 0;            # skip the learning phase when set

# Percentage threshold applied by the rspamc and dspam checkers before a
# wrong-class verdict is counted as a false positive.
my $rspamc_prob_trigger = 95;
my $man;
my $help;

GetOptions(
  "spam|s=s"       => \$spam_dir,
  "ham|h=s"        => \$ham_dir,
  "spam-symbol=s"  => \$spam_symbol,
  "ham-symbol=s"   => \$ham_symbol,
  "classifier|c=s" => \$classifier,
  "timeout|t=f"    => \$timeout,
  "parallel|p=i"   => \$parallel,

  # The short alias is -f: -t belongs to --timeout. Declaring -t for both
  # options made -t silently set the training fraction instead.
  "train-fraction|f=f" => \$train_fraction,
  "bogofilter|b"       => \$use_bogofilter,
  "dspam|d"            => \$use_dspam,
  "check-only"         => \$check_only,
  "help|?"             => \$help,
  "man"                => \$man
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

# Reject values that would break the batching arithmetic further down
# (a zero batch size, or a training slice larger than the corpus).
pod2usage( -message => "--parallel must be a positive integer", -exitval => 2 )
  if $parallel < 1;
pod2usage(
  -message => "--train-fraction must be between 0 and 1",
  -exitval => 2
) if $train_fraction < 0 || $train_fraction > 1;

# Appends the path of every plain file found directly inside $dir to the
# array referenced by $target. Entries are stored as "$dir/<name>";
# subdirectories and other non-regular entries are skipped.
# Dies if the directory cannot be opened.
sub read_dir_files {
  my ( $dir, $target ) = @_;
  opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
  foreach my $entry ( readdir $dh ) {
    my $path = "$dir/$entry";
    next unless -f $path;
    push @{$target}, $path;
  }
}

# Shuffles the array referenced by $ar in place using the Fisher-Yates
# algorithm, so that every permutation is equally likely (up to the quality
# of rand). Empty and single-element arrays are left untouched.
sub shuffle_array {
  my ($ar) = @_;

  # Walk from the last index down; each position is swapped with a uniformly
  # chosen position at or below it.
  for ( my $i = $#{$ar} ; $i > 0 ; $i-- ) {
    my $sel = int( rand( $i + 1 ) );    # 0 .. $i inclusive
    next if $sel == $i;
    @{$ar}[ $i, $sel ] = @{$ar}[ $sel, $i ];
  }

  return;
}

# Feeds one batch of message files to rspamc for learning.
#   $files - array ref of message file paths
#   $spam  - true to run "learn_spam", false to run "learn_ham"
# Returns the number of JSON result lines that reported success.
# Dies if rspamc cannot be spawned.
sub learn_rspamc {
  my ( $files, $spam ) = @_;
  my $processed = 0;

  # Nothing to learn; also avoids spawning rspamc without file arguments.
  return $processed unless $files && @{$files};

  my $cmd         = $spam ? "learn_spam" : "learn_ham";
  my $args_quoted = shell_quote @{$files};

  # Option values come from the command line, so they are quoted as well.
  # $rspamc itself is left unquoted so that it may carry extra arguments.
  my $opts_quoted = shell_quote( '-t', $timeout, '-c', $classifier,
    '--compact', '-j', '-n', $parallel );

  open( my $p, '-|', "$rspamc $opts_quoted $cmd $args_quoted" )
    or die "cannot spawn $rspamc: $!";

  while ( my $line = <$p> ) {

    # Block eval: lines that are not valid JSON are skipped.
    my $res = eval { decode_json($line) };
    if ( ref $res eq 'HASH' && $res->{'success'} ) {
      $processed++;
    }
  }

  # Reap the child; its exit status is deliberately not treated as fatal.
  close($p);

  return $processed;
}

# Registers every file in the batch with bogofilter, one invocation per file.
#   $files - array ref of message file paths
#   $spam  - true selects the "-s" flag, false selects "-n"
# Returns how many invocations exited with status 0.
sub learn_bogofilter {
  my ( $files, $spam ) = @_;

  my $class_flag = $spam ? "-s" : "-n";
  my $learned    = 0;

  for my $path ( @{$files} ) {
    my $quoted = shell_quote $path;

    # Output is captured only to keep it off the terminal; just the exit
    # status matters.
    qx{$bogofilter  -I $quoted $class_flag};
    $learned++ unless $?;
  }

  return $learned;
}

# Trains dspam on every file in the batch by piping each message to a fresh
# dspam process.
#   $files - array ref of message file paths
#   $spam  - true trains as "--class=spam", false as "--class=innocent"
# Returns the number of messages for which dspam exited successfully.
# Dies if dspam cannot be spawned; unreadable files are skipped with a warning.
sub learn_dspam {
  my ( $files, $spam ) = @_;
  my $processed = 0;
  my $fl        = $spam ? "--class=spam" : "--class=innocent";

  foreach my $f ( @{$files} ) {

    # Open the message first so that no dspam process is started for a file
    # that cannot be read.
    my $inp;
    if ( !open( $inp, '<', $f ) ) {
      warn "cannot open $f: $!\n";
      next;
    }

    open( my $p, '|-',
      "$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
      or die "cannot run $dspam: $!";

    while ( my $line = <$inp> ) {
      print {$p} $line;
    }
    close($inp);

    # close() on a pipe waits for the child and is true only when it exited
    # with status 0, which is what counts as a learned message.
    $processed++ if close($p);
  }

  return $processed;
}

# Trains the selected classifier on the leading $train_fraction share of both
# corpora.
#   $ar_ham  - array ref of ham file paths
#   $ar_spam - array ref of spam file paths
# The training slice of each corpus is cut into batches of at most $parallel
# files; spam and ham batches are then fed alternately to the learner chosen
# by $use_dspam / $use_bogofilter (rspamc otherwise).
# Returns the sum of the counts reported by the learner.
sub learn_samples {
  my ( $ar_ham, $ar_spam ) = @_;
  my $processed = 0;

  my $learn_func =
      $use_dspam      ? \&learn_dspam
    : $use_bogofilter ? \&learn_bogofilter
    :                   \&learn_rspamc;

  # Guard against a zero or negative batch size.
  my $batch_size = $parallel > 0 ? int($parallel) : 1;

  # Cuts the training slice of one corpus into batches. Every file of the
  # slice ends up in exactly one batch, including the case of a single file.
  my $make_batches = sub {
    my ($samples) = @_;
    my $len = int( scalar @{$samples} * $train_fraction );
    my @batches;

    for ( my $start = 0 ; $start < $len ; $start += $batch_size ) {
      my $end = $start + $batch_size - 1;
      $end = $len - 1 if $end > $len - 1;
      push @batches, [ @{$samples}[ $start .. $end ] ];
    }

    return @batches;
  };

  my @files_ham  = $make_batches->($ar_ham);
  my @files_spam = $make_batches->($ar_spam);
  my $total      = @files_ham + @files_spam;

  for my $i ( 0 .. $total - 1 ) {

    # Even turns prefer spam, odd turns prefer ham; when the preferred class
    # has run out of batches the other class is used instead.
    my $spam = ( $i % 2 == 0 ) ? 1 : 0;
    my $args = $spam ? pop @files_spam : pop @files_ham;

    if ( !$args ) {
      $spam = $spam ? 0 : 1;
      $args = $spam ? pop @files_spam : pop @files_ham;
    }
    next unless $args;

    my $r = $learn_func->( $args, $spam );
    $processed += $r if $r;
  }

  return $processed;
}

# Scans one batch of files with rspamc and classifies each JSON result.
#   $files        - array ref of message file paths
#   $spam         - true when the batch consists of spam, false for ham
#   $fp_cnt       - scalar ref, incremented when the opposite-class symbol is
#                   present and either carries no parseable "NN%" option or
#                   its percentage is >= $rspamc_prob_trigger
#   $fn_cnt       - scalar ref, incremented when neither symbol is present
#   $detected_cnt - scalar ref, incremented when only the expected symbol is
#                   present
# A result whose opposite-class symbol is below the trigger is counted as
# processed but bumps none of the three counters.
# Returns the number of results that had a 'default' section.
# Dies if rspamc cannot be spawned.
sub check_rspamc {
  my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
  my $processed = 0;

  # Nothing to scan; also avoids spawning rspamc without file arguments.
  return $processed unless $files && @{$files};

  my $args_quoted = shell_quote @{$files};

  # Option values come from the command line, so they are quoted as well.
  my $opts_quoted = shell_quote( '-t', $timeout, '-n', $parallel );

  my ( $expected_symbol, $opposite_symbol ) =
    $spam
    ? ( $spam_symbol, $ham_symbol )
    : ( $ham_symbol, $spam_symbol );

  # NOTE(review): the Settings header always names BAYES_SPAM, even when
  # --spam-symbol / --ham-symbol select other symbols - confirm intended.
  open( my $p, '-|',
"$rspamc $opts_quoted --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted"
  ) or die "cannot spawn $rspamc: $!";

  while ( my $line = <$p> ) {

    # Block eval: lines that are not valid JSON are skipped.
    my $res = eval { decode_json($line) };
    next unless ref $res eq 'HASH' && ref $res->{'default'} eq 'HASH';

    my $scan = $res->{'default'};
    $processed++;

    if ( my $wrong = $scan->{$opposite_symbol} ) {
      my $m = $wrong->{'options'}->[0];
      if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
        my $percentage = int($1);
        if ( $percentage >= $rspamc_prob_trigger ) {
          $$fp_cnt++;
        }
      }
      else {
        # No usable confidence value: treat the wrong verdict as definite.
        $$fp_cnt++;
      }
    }
    elsif ( !$scan->{$expected_symbol} ) {
      $$fn_cnt++;
    }
    else {
      $$detected_cnt++;
    }
  }

  # Reap the child; its exit status is deliberately not treated as fatal.
  close($p);

  return $processed;
}

# Classifies every file of the batch with bogofilter and tallies the verdicts.
#   $files        - array ref of message file paths
#   $spam         - true when the batch consists of spam, false for ham
#   $fp_cnt       - scalar ref, bumped on the opposite verdict
#                   ('H' for spam, 'S' for ham)
#   $fn_cnt       - scalar ref, bumped on verdict 'U'
#   $detected_cnt - scalar ref, bumped on the remaining (matching) verdict
# Returns the number of output lines that carried a verdict.
# Dies if bogofilter cannot be spawned.
sub check_bogofilter {
  my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
  my $seen = 0;

  # Verdict letter that means "misclassified" for the class under test.
  my $wrong_verdict = $spam ? 'H' : 'S';

  for my $path ( @{$files} ) {
    my $quoted = shell_quote $path;

    open( my $out, "$bogofilter -t -I $quoted |" )
      or die "cannot spawn $bogofilter: $!";

    while ( my $line = <$out> ) {
      next unless $line =~ /^([SHU])\s+.*$/;

      my $verdict = $1;
      $seen++;

      if    ( $verdict eq $wrong_verdict ) { $$fp_cnt++; }
      elsif ( $verdict eq 'U' )            { $$fn_cnt++; }
      else                                 { $$detected_cnt++; }
    }
  }

  return $seen;
}

# Classifies every file of the batch with dspam (one process per message) and
# tallies the X-DSPAM-Result lines it prints.
#   $files        - array ref of message file paths
#   $spam         - true when the batch consists of spam, false for ham
#   $fp_cnt       - scalar ref, bumped on a wrong-class result that also
#                   passes the $rspamc_prob_trigger threshold
#   $fn_cnt       - scalar ref, bumped when the result is neither "Spam" nor
#                   "Innocent"
#   $detected_cnt - scalar ref, bumped when the result matches the class
# Returns the number of result lines parsed.
# Unreadable files are skipped with a warning.
sub check_dspam {
  my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
  my $processed = 0;

  foreach my $f ( @{$files} ) {

    # Open the message first so that no dspam process is started for a file
    # that cannot be read.
    my $inp;
    if ( !open( $inp, '<', $f ) ) {
      warn "cannot open $f: $!\n";
      next;
    }

    # Lexical handles keep the pipes private to this iteration.
    my ( $reader, $writer );
    my $pid = open2( $reader, $writer,
      "$dspam --user nobody --classify --stdout --mode=notrain" );

    # NOTE(review): the whole message is written before any output is read;
    # this assumes dspam does not produce much output before consuming input.
    while ( my $line = <$inp> ) {
      print {$writer} $line;
    }
    close($inp);
    close($writer);

    while ( my $line = <$reader> ) {
      next
        unless $line =~
qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$);

      my ( $result, $probability ) = ( $1, $2 );
      $processed++;
      my $percentage = int( $probability * 100.0 );

      if ($spam) {
        if ( $result eq 'Innocent' ) {
          if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
            $$fp_cnt++;
          }
        }
        elsif ( $result ne 'Spam' ) {
          $$fn_cnt++;
        }
        else {
          $$detected_cnt++;
        }
      }
      else {
        if ( $result eq 'Spam' ) {
          if ( $percentage >= $rspamc_prob_trigger ) {
            $$fp_cnt++;
          }
        }
        elsif ( $result ne 'Innocent' ) {
          $$fn_cnt++;
        }
        else {
          $$detected_cnt++;
        }
      }
    }
    close($reader);
    waitpid( $pid, 0 );
  }

  return $processed;
}

# Runs the selected checker over the validation set and prints a report.
#   $hr - hash ref mapping a message file path to 1 (spam) or 0 (ham)
# Files are grouped per class into batches of at most $parallel entries, the
# batches are shuffled, and each one is handed to the checker chosen by
# $use_dspam / $use_bogofilter (rspamc otherwise). The totals and the
# false-positive / false-negative rates are printed to stdout.
sub cross_validate {
  my ($hr)          = @_;
  my $processed     = 0;
  my $fp_spam       = 0;
  my $fn_spam       = 0;
  my $fp_ham        = 0;
  my $fn_ham        = 0;
  my $total_spam    = 0;
  my $total_ham     = 0;
  my $detected_spam = 0;
  my $detected_ham  = 0;
  my @files_spam;
  my @files_ham;
  my @cur_spam;
  my @cur_ham;

  my $check_func =
      $use_dspam      ? \&check_dspam
    : $use_bogofilter ? \&check_bogofilter
    :                   \&check_rspamc;

  # Guard against a zero or negative batch size.
  my $batch_size = $parallel > 0 ? int($parallel) : 1;

  while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
    my ( $cur, $batches ) =
      $spam ? ( \@cur_spam, \@files_spam ) : ( \@cur_ham, \@files_ham );

    push @{$cur}, $fn;
    if ( scalar @{$cur} >= $batch_size ) {
      push @{$batches}, [ @{$cur} ];
      @{$cur} = ();
    }
  }

  # Flush the partially filled batches so that no file is left unchecked.
  push @files_spam, [@cur_spam] if @cur_spam;
  push @files_ham,  [@cur_ham]  if @cur_ham;

  shuffle_array( \@files_spam );

  foreach my $fn (@files_spam) {
    my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
    $total_spam += $r;
    $processed  += $r;
  }

  shuffle_array( \@files_ham );

  foreach my $fn (@files_ham) {
    my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
    $total_ham += $r;
    $processed += $r;
  }

  # Percentage helper that reports 0 instead of dying when a class had no
  # scanned messages.
  my $rate = sub {
    my ( $count, $total ) = @_;
    return $total ? $count / $total * 100.0 : 0;
  };

  printf "Scanned %d messages
%d spam messages (%d detected)
%d ham messages (%d detected)\n",
    $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;

  printf "\nHam FP rate: %.2f%% (%d messages)
Ham FN rate: %.2f%% (%d messages)\n",
    $rate->( $fp_ham, $total_ham ), $fp_ham,
    $rate->( $fn_ham, $total_ham ), $fn_ham;

  printf "\nSpam FP rate: %.2f%% (%d messages)
Spam FN rate: %.2f%% (%d messages)\n",
    $rate->( $fp_spam, $total_spam ), $fp_spam,
    $rate->( $fn_spam, $total_spam ), $fn_spam;
}

# Main flow: load and shuffle both corpora, optionally train on the leading
# $train_fraction share of each, then validate on the remaining files.
if ( !$spam_dir || !$ham_dir ) {
  die "spam or/and ham directories are not specified";
}

my @spam_samples;
my @ham_samples;

read_dir_files( $spam_dir, \@spam_samples );
read_dir_files( $ham_dir,  \@ham_samples );
shuffle_array( \@spam_samples );
shuffle_array( \@ham_samples );

if ( !$check_only ) {
  my $t0      = [gettimeofday];
  my $learned = learn_samples( \@ham_samples, \@spam_samples );
  my $t1      = [gettimeofday];

  printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
    $learned, tv_interval( $t0, $t1 );
}

# Everything past the training slice of each corpus forms the validation
# set; the value records the true class (1 = spam, 0 = ham).
my %validation_set;

my $first_spam = int( scalar @spam_samples * $train_fraction );
for my $i ( $first_spam .. $#spam_samples ) {
  $validation_set{ $spam_samples[$i] } = 1;
}

# The ham slice is bounded by the ham corpus itself, not by the spam corpus.
my $first_ham = int( scalar @ham_samples * $train_fraction );
for my $i ( $first_ham .. $#ham_samples ) {
  $validation_set{ $ham_samples[$i] } = 0;
}

cross_validate( \%validation_set );

__END__

=head1 NAME

classifier_test.pl - test various parameters for a classifier

=head1 SYNOPSIS

classifier_test.pl [options]

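 Options:
   --spam                 Directory with spam files
   --ham                  Directory with ham files
   --spam-symbol          Symbol for spam (default: BAYES_SPAM)
   --ham-symbol           Symbol for ham (default: BAYES_HAM)
   --classifier           Classifier to test (default: bayes)
   --timeout              Timeout for rspamc (default: 10)
   --parallel             Parallel execution (default: 1)
   --train-fraction       Fraction of the corpus used for training (default: 0.5)
   --bogofilter           Test bogofilter instead of rspamc
   --dspam                Test dspam instead of rspamc
   --check-only           Skip the learning phase and only check messages
   --help                 Brief help message
   --man                  Full documentation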

=head1 OPTIONS

=over 8

=item B<--spam>

Directory with spam files.

=item B<--ham>

Directory with ham files.

=item B<--classifier>

Specifies classifier name to test.

=item B<--help>

Prints a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=back

=head1 DESCRIPTION

B<classifier_test.pl> is intended to test the Rspamd classifier for false
positives, false negatives and other parameters. By default it uses half of
the corpus for training and the other half for cross-validation; the split can
be changed with B<--train-fraction>.

=cut