2015-12-22 01:17:59 +01:00
|
|
|
#!/usr/bin/env perl
|
|
|
|
|
|
|
|
# This script is a very simple prototype to learn fann from rspamd logs
|
|
|
|
# For now, it is intended for internal use only
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
use warnings FATAL => 'all';
|
|
|
|
use AI::FANN qw(:all);
|
|
|
|
use Getopt::Std;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
my %sym_idx; # Symbols by index
|
|
|
|
my %sym_names; # Symbols by name
|
|
|
|
my $num = 1; # Number of symbols
|
2015-12-22 01:17:59 +01:00
|
|
|
my @spam;
|
|
|
|
my @ham;
|
2018-10-17 14:06:14 +02:00
|
|
|
my $max_samples = -1;
|
|
|
|
my $split = 1;
|
|
|
|
my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN>
|
|
|
|
my $score_spam = 12;
|
|
|
|
my $score_ham = -6;
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
sub process {
|
2018-10-17 14:06:14 +02:00
|
|
|
my ( $input, $spam, $ham ) = @_;
|
2015-12-22 01:17:59 +01:00
|
|
|
my $samples = 0;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
while (<$input>) {
|
|
|
|
if ( !$preprocessed ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) {
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $1 > $score_spam ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$_ = "$1:1: $3";
|
|
|
|
}
|
2018-10-17 14:06:14 +02:00
|
|
|
elsif ( $1 < $score_ham ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$_ = "$1:0: $3\n";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
# Out of boundary
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
# Not our log message
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$_ =~ /^(-?\d+\.?\d*):([01]):\s*(\S.*)$/;
|
|
|
|
|
|
|
|
my $is_spam = 0;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $2 == 1 ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$is_spam = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
my @ar = split /,/, $3;
|
|
|
|
my %sample;
|
|
|
|
|
|
|
|
foreach my $sym (@ar) {
|
|
|
|
chomp $sym;
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( !$sym_idx{$sym} ) {
|
|
|
|
$sym_idx{$sym} = $num;
|
2015-12-22 01:17:59 +01:00
|
|
|
$sym_names{$num} = $sym;
|
|
|
|
$num++;
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
$sample{ $sym_idx{$sym} } = 1;
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if ($is_spam) {
|
|
|
|
push @{$spam}, \%sample;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
push @{$ham}, \%sample;
|
|
|
|
}
|
|
|
|
|
|
|
|
$samples++;
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $max_samples > 0 && $samples > $max_samples ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Shuffle array
|
2018-10-17 14:06:14 +02:00
|
|
|
sub fisher_yates_shuffle {
|
2015-12-22 01:17:59 +01:00
|
|
|
my $array = shift;
|
2018-10-17 14:06:14 +02:00
|
|
|
my $i = @$array;
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
while ( --$i ) {
|
|
|
|
my $j = int rand( $i + 1 );
|
2018-10-17 14:06:14 +02:00
|
|
|
@$array[ $i, $j ] = @$array[ $j, $i ];
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Train network
|
|
|
|
sub train {
|
2018-10-17 14:06:14 +02:00
|
|
|
my ( $ann, $sample, $result ) = @_;
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
my @row;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = 1 ; $i < $num ; $i++ ) {
|
|
|
|
if ( $sample->{$i} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
push @row, 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
push @row, 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#print "@row -> @{$result}\n";
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
$ann->train( \@row, \@{$result} );
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
sub test {
|
2018-10-17 14:06:14 +02:00
|
|
|
my ( $ann, $sample ) = @_;
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
my @row;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = 1 ; $i < $num ; $i++ ) {
|
|
|
|
if ( $sample->{$i} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
push @row, 1;
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
push @row, 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
my $ret = $ann->run( \@row );
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
return $ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
my %opts;
|
2018-10-17 14:06:14 +02:00
|
|
|
getopts( 'o:i:s:n:t:hpS:H:', \%opts );
|
2015-12-22 01:17:59 +01:00
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'h'} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n";
|
|
|
|
exit;
|
|
|
|
}
|
|
|
|
|
|
|
|
my $input = *STDIN;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'i'} ) {
|
|
|
|
open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}";
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'n'} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$max_samples = $opts{'n'};
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'t'} ) {
|
|
|
|
|
2015-12-22 01:17:59 +01:00
|
|
|
# Test split
|
|
|
|
$split = $opts{'t'};
|
|
|
|
}
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'p'} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$preprocessed = 1;
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'H'} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$score_ham = $opts{'H'};
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'S'} ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$score_spam = $opts{'S'};
|
|
|
|
}
|
|
|
|
|
|
|
|
# ham_prob, spam_prob
|
|
|
|
my @spam_out = (1);
|
2018-10-17 14:06:14 +02:00
|
|
|
my @ham_out = (0);
|
2015-12-22 01:17:59 +01:00
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
process( $input, \@spam, \@ham );
|
|
|
|
fisher_yates_shuffle( \@spam );
|
|
|
|
fisher_yates_shuffle( \@ham );
|
2015-12-22 01:17:59 +01:00
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
my $nspam = int( scalar(@spam) / $split );
|
|
|
|
my $nham = int( scalar(@ham) / $split );
|
2015-12-22 01:17:59 +01:00
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 );
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
my @train_data;
|
2018-10-17 14:06:14 +02:00
|
|
|
|
2015-12-22 01:17:59 +01:00
|
|
|
# Train ANN
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = 0 ; $i < $nham ; $i++ ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
push @train_data, [ $ham[$i], \@ham_out ];
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = 0 ; $i < $nspam ; $i++ ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
push @train_data, [ $spam[$i], \@spam_out ];
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
fisher_yates_shuffle( \@train_data );
|
2015-12-22 01:17:59 +01:00
|
|
|
|
|
|
|
foreach my $train_row (@train_data) {
|
2018-10-17 14:06:14 +02:00
|
|
|
train( $ann, @{$train_row}[0], @{$train_row}[1] );
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
print "Trained $nspam SPAM and $nham HAM samples\n";
|
|
|
|
|
|
|
|
# Now run fann
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $split > 1 ) {
|
|
|
|
my $sample = 0.0;
|
2015-12-22 01:17:59 +01:00
|
|
|
my $correct = 0.0;
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = $nham ; $i < $nham * $split ; $i++ ) {
|
|
|
|
my $ret = test( $ann, $ham[$i] );
|
|
|
|
|
2015-12-22 01:17:59 +01:00
|
|
|
#print "@{$ret}\n";
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( @{$ret}[0] < 0.5 ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$correct++;
|
|
|
|
}
|
|
|
|
$sample++;
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
|
2015-12-22 01:17:59 +01:00
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
$sample = 0.0;
|
2015-12-22 01:17:59 +01:00
|
|
|
$correct = 0.0;
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) {
|
|
|
|
my $ret = test( $ann, $spam[$i] );
|
|
|
|
|
2015-12-22 01:17:59 +01:00
|
|
|
#print "@{$ret}\n";
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( @{$ret}[0] > 0.5 ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
$correct++;
|
|
|
|
}
|
|
|
|
$sample++;
|
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'o'} ) {
|
|
|
|
$ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}";
|
2015-12-22 01:17:59 +01:00
|
|
|
}
|
|
|
|
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $opts{'s'} ) {
|
|
|
|
open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}";
|
2015-12-22 01:17:59 +01:00
|
|
|
print $scores "{";
|
2018-10-17 14:06:14 +02:00
|
|
|
for ( my $i = 1 ; $i < $num ; $i++ ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
my $n = $i - 1;
|
2018-10-17 14:06:14 +02:00
|
|
|
if ( $i != $num - 1 ) {
|
2015-12-22 01:17:59 +01:00
|
|
|
print $scores "\"$sym_names{$i}\":$n,";
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
print $scores "\"$sym_names{$i}\":$n}\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|