Browse Source

[Feature] Add utility to perform classifier tests

tags/1.4.0
Vsevolod Stakhov 7 years ago
parent
commit
26f0b37daa
1 changed files with 378 additions and 0 deletions
  1. 378
    0
      utils/classifier_test.pl

+ 378
- 0
utils/classifier_test.pl View File

@@ -0,0 +1,378 @@
#!/usr/bin/env perl

use warnings;
use strict;
use Pod::Usage;
use Getopt::Long;
use Time::HiRes qw(gettimeofday tv_interval);
use JSON::XS;
use String::ShellQuote;
use Data::Dumper;

my $spam_dir;
my $ham_dir;
my $parallel = 1;
my $classifier = "bayes";
my $spam_symbol = "BAYES_SPAM";
my $ham_symbol = "BAYES_HAM";
my $timeout = 10;
my $rspamc = $ENV{'RSPAMC'} || "rspamc";
my $train_fraction = 0.5;
my $man;
my $help;

GetOptions(
"spam|s=s" => \$spam_dir,
"ham|h=s" => \$ham_dir,
"spam-symbol=s" => \$spam_symbol,
"ham-symbol=s" => \$ham_symbol,
"classifier|c=s" => \$classifier,
"timeout|t=f" => \$timeout,
"parallel|p=i" => \$parallel,
"help|?" => \$help,
"man" => \$man
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

sub read_dir_files {
my ( $dir, $target ) = @_;
opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
while ( my $file = readdir $dh ) {
if ( -f "$dir/$file" ) {
push @{$target}, "$dir/$file";
}
}
}

sub shuffle_array {
my ($ar) = @_;

for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
if ( $i > 1 ) {
my $sel = int( rand( $i - 1 ) );
( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
}
}
}

sub learn_samples {
my ( $ar_ham, $ar_spam ) = @_;
my $len;
my $processed = 0;
my $total = 0;

my @files_spam;
my @files_ham;

$len = int( scalar @{$ar_ham} * $train_fraction );
my @cur_vec;

# Shuffle spam and ham samples
for ( my $i = 0 ; $i < $len ; $i++ ) {
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
push @cur_vec, @{$ar_ham}[$i];
push @files_ham, [@cur_vec];
@cur_vec = ();
$total++;
}
else {
push @cur_vec, @{$ar_ham}[$i];
}
}

$len = int( scalar @{$ar_spam} * $train_fraction );
@cur_vec = ();
for ( my $i = 0 ; $i < $len ; $i++ ) {
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
push @cur_vec, @{$ar_spam}[$i];
push @files_spam, [@cur_vec];
@cur_vec = ();
$total++;
}
else {
push @cur_vec, @{$ar_spam}[$i];
}
}

for ( my $i = 0 ; $i < $total ; $i++ ) {
my $args;
my $cmd;

if ( $i % 2 == 0 ) {
$args = pop @files_spam;

if ( !$args ) {
$args = pop @files_ham;
$cmd = 'learn_ham';
}
else {
$cmd = 'learn_spam';
}
}
else {
$args = pop @files_ham;
if ( !$args ) {
$args = pop @files_spam;
$cmd = 'learn_spam';
}
else {
$cmd = 'learn_ham';
}
}

my $args_quoted = shell_quote @{$args};
open(
my $p,
"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |"
) or die "cannot spawn $rspamc: $!";

while (<$p>) {
my $res = eval('decode_json($_)');
if ( $res && $res->{'success'} ) {
$processed++;
}
}
}

return $processed;
}

sub cross_validate {
my ($hr) = @_;
my $args = "";
my $processed = 0;
my $fp_spam = 0;
my $fn_spam = 0;
my $fp_ham = 0;
my $fn_ham = 0;
my $total_spam = 0;
my $total_ham = 0;
my $detected_spam = 0;
my $detected_ham = 0;
my $i = 0;
my $len = scalar keys %{$hr};
my @files_spam;
my @files_ham;
my @cur_spam;
my @cur_ham;

while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
if ($spam) {
if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
push @cur_spam, $fn;
push @files_spam, [@cur_spam];
@cur_spam = ();
}
else {
push @cur_spam, $fn;
}
}
else {
if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
push @cur_ham, $fn;
push @files_ham, [@cur_ham];
@cur_ham = ();
}
else {
push @cur_ham, $fn;
}
}
}

shuffle_array( \@files_spam );

foreach my $fn (@files_spam) {
my $args_quoted = shell_quote @{$fn};
my $spam = 1;

open(
my $p,
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
) or die "cannot spawn $rspamc: $!";

while (<$p>) {
my $res = eval('decode_json($_)');
if ( $res && $res->{'default'} ) {
$processed++;

if ($spam) {
$total_spam++;

if ( $res->{'default'}->{$ham_symbol} ) {
$fp_spam++;
}
elsif ( !$res->{'default'}->{$spam_symbol} ) {
$fn_spam++;
}
else {
$detected_spam++;
}
}
else {
$total_ham++;

if ( $res->{'default'}->{$spam_symbol} ) {
$fp_ham++;
}
elsif ( !$res->{'default'}->{$ham_symbol} ) {
$fn_ham++;
}
else {
$detected_ham++;
}
}
}
}
}

shuffle_array( \@files_ham );

foreach my $fn (@files_ham) {
my $args_quoted = shell_quote @{$fn};
my $spam = 0;

open(
my $p,
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
) or die "cannot spawn $rspamc: $!";

while (<$p>) {
my $res = eval('decode_json($_)');
if ( $res && $res->{'default'} ) {
$processed++;

if ($spam) {
$total_spam++;

if ( $res->{'default'}->{$ham_symbol} ) {
$fp_spam++;
}
elsif ( !$res->{'default'}->{$spam_symbol} ) {
$fn_spam++;
}
else {
$detected_spam++;
}
}
else {
$total_ham++;

if ( $res->{'default'}->{$spam_symbol} ) {
$fp_ham++;
}
elsif ( !$res->{'default'}->{$ham_symbol} ) {
$fn_ham++;
}
else {
$detected_ham++;
}
}
}
}
}

printf "Scanned %d messages
%d spam messages (%d detected)
%d ham messages (%d detected)\n",
$processed, $total_spam, $detected_spam, $total_ham, $detected_ham;

printf "\nHam FP rate: %.2f%% (%d messages)
Ham FN rate: %.2f%% (%d messages)\n",
$fp_ham / $total_ham * 100.0, $fp_ham,
$fn_ham / $total_ham * 100.0, $fn_ham;

printf "\nSpam FP rate: %.2f%% (%d messages)
Spam FN rate: %.2f%% (%d messages)\n",
$fp_spam / $total_spam * 100.0, $fp_spam,
$fn_spam / $total_spam * 100.0, $fn_spam;
}

if ( !$spam_dir || !$ham_dir ) {
die "spam or/and ham directories are not specified";
}

my @spam_samples;
my @ham_samples;

read_dir_files( $spam_dir, \@spam_samples );
read_dir_files( $ham_dir, \@ham_samples );
shuffle_array( \@spam_samples );
shuffle_array( \@ham_samples );

my $learned = 0;
my $t0 = [gettimeofday];
$learned = learn_samples( \@ham_samples, \@spam_samples );
my $t1 = [gettimeofday];

printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
$learned, tv_interval( $t0, $t1 );

my %validation_set;
my $len = int( scalar @spam_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
$validation_set{ $spam_samples[$i] } = 1;
}

$len = int( scalar @ham_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
$validation_set{ $ham_samples[$i] } = 0;
}

cross_validate( \%validation_set );

__END__

=head1 NAME

classifier_test.pl - test various parameters for a classifier

=head1 SYNOPSIS

classifier_test.pl [options]

Options:
--spam Directory with spam files
--ham Directory with ham files
--spam-symbol Symbol for spam (default: BAYES_SPAM)
--ham-symbol Symbol for ham (default: BAYES_HAM)
--classifier Classifier to test (default: bayes)
--timeout Timeout for rspamc (default: 10)
--parallel Parallel execution (default: 1)
--help Brief help message
--man Full documentation

=head1 OPTIONS

=over 8

=item B<--spam>

Directory with spam files.

=item B<--ham>

Directory with ham files.

=item B<--classifier>

Specifies classifier name to test.

=item B<--help>

Print a brief help message and exits.

=item B<--man>

Prints the manual page and exits.

=back

=head1 DESCRIPTION

B<classifier_test.pl> is intended to test Rspamd classifier for false positives,
false negatives and other parameters. It uses half of the corpus for training
and half for cross-validation.

=cut

Loading…
Cancel
Save