@@ -0,0 +1,2 @@ | |||
# Code::TidyAll | |||
/.tidyall.d/ |
@@ -0,0 +1,25 @@ | |||
; Run "tidyall -a" to process all files. | |||
; Run "tidyall -g" to process all added or modified files in the current git working directory. | |||
; Ignore third-party code | |||
ignore = contrib/**/* doc/doxydown/doxydown.pl | |||
;[PerlCritic] | |||
;select = **/*.{pl,pm,t} | |||
[PerlTidy] | |||
select = **/*.{pl,pm,t} | |||
argv = -l=120 | |||
[PodChecker] | |||
select = **/*.{pl,pm,pod} | |||
;[PodSpell] | |||
;select = **/*.{pl,pm,pod} | |||
;[PodTidy] | |||
;select = **/*.{pl,pm,pod} | |||
;argv = --columns=120 | |||
[Test::Vars] | |||
select = **/*.{pl,pm,t} |
@@ -16,14 +16,14 @@ $LWP::Simple::ua->show_progress(1); | |||
$Net::MRT::USE_RFC4760 = -1; | |||
my %config = ( | |||
asn_sources => [ | |||
'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest', | |||
'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest', | |||
'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest', | |||
'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest', | |||
'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest' | |||
], | |||
bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz'] | |||
asn_sources => [ | |||
'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest', | |||
'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest', | |||
'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest', | |||
'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest', | |||
'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest' | |||
], | |||
bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz'] | |||
); | |||
my $download_asn = 0; | |||
@@ -38,171 +38,169 @@ my $v4_zone = "asn.rspamd.com"; | |||
my $v6_zone = "asn6.rspamd.com"; | |||
my $v4_file = "asn.zone"; | |||
my $v6_file = "asn6.zone"; | |||
my $ns_servers = ["asn-ns.rspamd.com", "asn-ns2.rspamd.com"]; | |||
my $ns_servers = [ "asn-ns.rspamd.com", "asn-ns2.rspamd.com" ]; | |||
GetOptions( | |||
"download-asn" => \$download_asn, | |||
"download-bgp" => \$download_bgp, | |||
"4!" => \$v4, | |||
"6!" => \$v6, | |||
"parse!" => \$parse, | |||
"target=s" => \$download_target, | |||
"zone-v4=s" => \$v4_zone, | |||
"zone-v6=s" => \$v6_zone, | |||
"file-v4=s" => \$v4_file, | |||
"file-v6=s" => \$v6_file, | |||
"ns-server=s@" => \$ns_servers, | |||
"help|?" => \$help, | |||
"man" => \$man | |||
"download-asn" => \$download_asn, | |||
"download-bgp" => \$download_bgp, | |||
"4!" => \$v4, | |||
"6!" => \$v6, | |||
"parse!" => \$parse, | |||
"target=s" => \$download_target, | |||
"zone-v4=s" => \$v4_zone, | |||
"zone-v6=s" => \$v6_zone, | |||
"file-v4=s" => \$v4_file, | |||
"file-v6=s" => \$v6_file, | |||
"ns-server=s@" => \$ns_servers, | |||
"help|?" => \$help, | |||
"man" => \$man | |||
) or pod2usage(2); | |||
pod2usage(1) if $help; | |||
pod2usage( -exitval => 0, -verbose => 2 ) if $man; | |||
sub download_file { | |||
my ($u) = @_; | |||
my ($u) = @_; | |||
print "Fetching $u\n"; | |||
my $ff = File::Fetch->new( uri => $u ); | |||
my $where = $ff->fetch( to => $download_target ) or die $ff->error; | |||
print "Fetching $u\n"; | |||
my $ff = File::Fetch->new( uri => $u ); | |||
my $where = $ff->fetch( to => $download_target ) or die $ff->error; | |||
return $where; | |||
return $where; | |||
} | |||
if ($download_asn) { | |||
foreach my $u ( @{ $config{'asn_sources'} } ) { | |||
download_file($u); | |||
} | |||
foreach my $u ( @{ $config{'asn_sources'} } ) { | |||
download_file($u); | |||
} | |||
} | |||
if ($download_bgp) { | |||
foreach my $u ( @{ $config{'bgp_sources'} } ) { | |||
download_file($u); | |||
} | |||
foreach my $u ( @{ $config{'bgp_sources'} } ) { | |||
download_file($u); | |||
} | |||
} | |||
if ( !$parse ) { | |||
exit 0; | |||
exit 0; | |||
} | |||
my $v4_fh; | |||
my $v6_fh; | |||
if ($v4) { | |||
open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!"; | |||
print $v4_fh | |||
"\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; | |||
foreach my $ns (@{$ns_servers}) { | |||
print $v4_fh "\$NS 43200 $ns\n"; | |||
} | |||
open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!"; | |||
print $v4_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; | |||
foreach my $ns ( @{$ns_servers} ) { | |||
print $v4_fh "\$NS 43200 $ns\n"; | |||
} | |||
} | |||
if ($v6) { | |||
open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!"; | |||
print $v6_fh | |||
"\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; | |||
foreach my $ns (@{$ns_servers}) { | |||
print $v6_fh "\$NS 43200 $ns\n"; | |||
} | |||
open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!"; | |||
print $v6_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; | |||
foreach my $ns ( @{$ns_servers} ) { | |||
print $v6_fh "\$NS 43200 $ns\n"; | |||
} | |||
} | |||
# Now load BGP data | |||
my $networks = {}; | |||
foreach my $u ( @{ $config{'bgp_sources'} } ) { | |||
my $parsed = URI->new($u); | |||
my $fname = $download_target . '/' . basename( $parsed->path ); | |||
open( my $fh, "<:gzip", $fname ) | |||
or die "Cannot open $fname: $!"; | |||
while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) { | |||
if ( $dd->{'prefix'} && $dd->{'bits'} ) { | |||
next if $dd->{'subtype'} == 2 and !$v4; | |||
next if $dd->{'subtype'} == 4 and !$v6; | |||
my $entry = $dd->{'entries'}->[0]; | |||
my $net = $dd->{'prefix'} . '/' . $dd->{'bits'}; | |||
if ( $entry && $entry->{'AS_PATH'} ) { | |||
my $as = $entry->{'AS_PATH'}->[-1]; | |||
if (ref($as) eq "ARRAY") { | |||
$as = @{$as}[0]; | |||
my $parsed = URI->new($u); | |||
my $fname = $download_target . '/' . basename( $parsed->path ); | |||
open( my $fh, "<:gzip", $fname ) | |||
or die "Cannot open $fname: $!"; | |||
while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) { | |||
if ( $dd->{'prefix'} && $dd->{'bits'} ) { | |||
next if $dd->{'subtype'} == 2 and !$v4; | |||
next if $dd->{'subtype'} == 4 and !$v6; | |||
my $entry = $dd->{'entries'}->[0]; | |||
my $net = $dd->{'prefix'} . '/' . $dd->{'bits'}; | |||
if ( $entry && $entry->{'AS_PATH'} ) { | |||
my $as = $entry->{'AS_PATH'}->[-1]; | |||
if ( ref($as) eq "ARRAY" ) { | |||
$as = @{$as}[0]; | |||
} | |||
if ( !$networks->{$as} ) { | |||
if ( $dd->{'subtype'} == 2 ) { | |||
$networks->{$as} = { nets_v4 => [$net], nets_v6 => [] }; | |||
} | |||
else { | |||
$networks->{$as} = { nets_v6 => [$net], nets_v4 => [] }; | |||
} | |||
} | |||
else { | |||
if ( $dd->{'subtype'} == 2 ) { | |||
push @{ $networks->{$as}->{'nets_v4'} }, $net; | |||
} | |||
else { | |||
push @{ $networks->{$as}->{'nets_v6'} }, $net; | |||
} | |||
} | |||
} | |||
} | |||
if ( !$networks->{$as} ) { | |||
if ( $dd->{'subtype'} == 2 ) { | |||
$networks->{$as} = { nets_v4 => [$net], nets_v6 => [] }; | |||
} | |||
else { | |||
$networks->{$as} = { nets_v6 => [$net], nets_v4 => [] }; | |||
} | |||
} | |||
else { | |||
if ( $dd->{'subtype'} == 2 ) { | |||
push @{ $networks->{$as}->{'nets_v4'} }, $net; | |||
} | |||
else { | |||
push @{ $networks->{$as}->{'nets_v6'} }, $net; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
# Now roughly detect countries | |||
foreach my $u ( @{ $config{'asn_sources'} } ) { | |||
my $parsed = URI->new($u); | |||
my $fname = $download_target . '/' . basename( $parsed->path ); | |||
open( my $fh, "<", $fname ) or die "Cannot open $fname: $!"; | |||
while (<$fh>) { | |||
next if /^\#/; | |||
chomp; | |||
my @elts = split /\|/; | |||
if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) { | |||
my $as_start = int( $elts[3] ); | |||
my $as_end = $as_start + int( $elts[4] ); | |||
for ( my $as = $as_start ; $as < $as_end ; $as++ ) { | |||
my $real_as = $as; | |||
if (ref($as) eq "ARRAY") { | |||
$real_as = @{$as}[0]; | |||
my $parsed = URI->new($u); | |||
my $fname = $download_target . '/' . basename( $parsed->path ); | |||
open( my $fh, "<", $fname ) or die "Cannot open $fname: $!"; | |||
while (<$fh>) { | |||
next if /^\#/; | |||
chomp; | |||
my @elts = split /\|/; | |||
if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) { | |||
my $as_start = int( $elts[3] ); | |||
my $as_end = $as_start + int( $elts[4] ); | |||
for ( my $as = $as_start ; $as < $as_end ; $as++ ) { | |||
my $real_as = $as; | |||
if ( ref($as) eq "ARRAY" ) { | |||
$real_as = @{$as}[0]; | |||
} | |||
if ( $networks->{"$real_as"} ) { | |||
$networks->{"$real_as"}->{'country'} = $elts[1]; | |||
$networks->{"$real_as"}->{'rir'} = $elts[0]; | |||
} | |||
} | |||
} | |||
if ( $networks->{"$real_as"} ) { | |||
$networks->{"$real_as"}->{'country'} = $elts[1]; | |||
$networks->{"$real_as"}->{'rir'} = $elts[0]; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
while ( my ( $k, $v ) = each( %{$networks} ) ) { | |||
if ($v4) { | |||
foreach my $n ( @{ $v->{'nets_v4'} } ) { | |||
# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 | |||
if ( $v->{'country'} ) { | |||
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; | |||
} | |||
else { | |||
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; | |||
} | |||
if ($v4) { | |||
foreach my $n ( @{ $v->{'nets_v4'} } ) { | |||
# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 | |||
if ( $v->{'country'} ) { | |||
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; | |||
} | |||
else { | |||
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; | |||
} | |||
} | |||
} | |||
} | |||
if ($v6) { | |||
foreach my $n ( @{ $v->{'nets_v6'} } ) { | |||
# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 | |||
if ( $v->{'country'} ) { | |||
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; | |||
} | |||
else { | |||
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; | |||
} | |||
if ($v6) { | |||
foreach my $n ( @{ $v->{'nets_v6'} } ) { | |||
# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 | |||
if ( $v->{'country'} ) { | |||
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; | |||
} | |||
else { | |||
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
__END__ |
@@ -359,6 +359,4 @@ protocol. On scan requests, this filter can query Rspamd to process a message. | |||
B<cgp_rspamd> can tell CGP to add header or reject SPAM messages depending on | |||
Rspamd scan result. | |||
=back | |||
=cut |
@@ -30,438 +30,428 @@ my $man; | |||
my $help; | |||
GetOptions( | |||
"spam|s=s" => \$spam_dir, | |||
"ham|h=s" => \$ham_dir, | |||
"spam-symbol=s" => \$spam_symbol, | |||
"ham-symbol=s" => \$ham_symbol, | |||
"classifier|c=s" => \$classifier, | |||
"timeout|t=f" => \$timeout, | |||
"parallel|p=i" => \$parallel, | |||
"train-fraction|t=f" => \$train_fraction, | |||
"bogofilter|b" => \$use_bogofilter, | |||
"dspam|d" => \$use_dspam, | |||
"check-only" => \$check_only, | |||
"help|?" => \$help, | |||
"man" => \$man | |||
"spam|s=s" => \$spam_dir, | |||
"ham|h=s" => \$ham_dir, | |||
"spam-symbol=s" => \$spam_symbol, | |||
"ham-symbol=s" => \$ham_symbol, | |||
"classifier|c=s" => \$classifier, | |||
"timeout|t=f" => \$timeout, | |||
"parallel|p=i" => \$parallel, | |||
"train-fraction|t=f" => \$train_fraction, | |||
"bogofilter|b" => \$use_bogofilter, | |||
"dspam|d" => \$use_dspam, | |||
"check-only" => \$check_only, | |||
"help|?" => \$help, | |||
"man" => \$man | |||
) or pod2usage(2); | |||
pod2usage(1) if $help; | |||
pod2usage( -exitval => 0, -verbose => 2 ) if $man; | |||
sub read_dir_files { | |||
my ( $dir, $target ) = @_; | |||
opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; | |||
while ( my $file = readdir $dh ) { | |||
if ( -f "$dir/$file" ) { | |||
push @{$target}, "$dir/$file"; | |||
my ( $dir, $target ) = @_; | |||
opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; | |||
while ( my $file = readdir $dh ) { | |||
if ( -f "$dir/$file" ) { | |||
push @{$target}, "$dir/$file"; | |||
} | |||
} | |||
} | |||
} | |||
sub shuffle_array { | |||
my ($ar) = @_; | |||
my ($ar) = @_; | |||
for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { | |||
if ( $i > 1 ) { | |||
my $sel = int( rand( $i - 1 ) ); | |||
( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); | |||
for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { | |||
if ( $i > 1 ) { | |||
my $sel = int( rand( $i - 1 ) ); | |||
( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); | |||
} | |||
} | |||
} | |||
} | |||
sub learn_rspamc { | |||
my ( $files, $spam ) = @_; | |||
my $processed = 0; | |||
my $cmd = $spam ? "learn_spam" : "learn_ham"; | |||
my $args_quoted = shell_quote @{$files}; | |||
open( | |||
my $p, | |||
"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" | |||
) or die "cannot spawn $rspamc: $!"; | |||
while (<$p>) { | |||
my $res = eval('decode_json($_)'); | |||
if ( $res && $res->{'success'} ) { | |||
$processed++; | |||
my ( $files, $spam ) = @_; | |||
my $processed = 0; | |||
my $cmd = $spam ? "learn_spam" : "learn_ham"; | |||
my $args_quoted = shell_quote @{$files}; | |||
open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" ) | |||
or die "cannot spawn $rspamc: $!"; | |||
while (<$p>) { | |||
my $res = eval('decode_json($_)'); | |||
if ( $res && $res->{'success'} ) { | |||
$processed++; | |||
} | |||
} | |||
} | |||
return $processed; | |||
return $processed; | |||
} | |||
sub learn_bogofilter { | |||
my ( $files, $spam ) = @_; | |||
my $processed = 0; | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
my $fl = $spam ? "-s" : "-n"; | |||
`$bogofilter -I $args_quoted $fl`; | |||
if ( $? == 0 ) { | |||
$processed++; | |||
my ( $files, $spam ) = @_; | |||
my $processed = 0; | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
my $fl = $spam ? "-s" : "-n"; | |||
`$bogofilter -I $args_quoted $fl`; | |||
if ( $? == 0 ) { | |||
$processed++; | |||
} | |||
} | |||
} | |||
return $processed; | |||
return $processed; | |||
} | |||
sub learn_dspam { | |||
my ( $files, $spam ) = @_; | |||
my $processed = 0; | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
my $fl = $spam ? "--class=spam" : "--class=innocent"; | |||
open( my $p, | |||
"|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) | |||
or die "cannot run $dspam: $!"; | |||
open( my $inp, "< $f" ); | |||
while (<$inp>) { | |||
print $p $_; | |||
my ( $files, $spam ) = @_; | |||
my $processed = 0; | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
my $fl = $spam ? "--class=spam" : "--class=innocent"; | |||
open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) | |||
or die "cannot run $dspam: $!"; | |||
open( my $inp, "< $f" ); | |||
while (<$inp>) { | |||
print $p $_; | |||
} | |||
} | |||
} | |||
return $processed; | |||
return $processed; | |||
} | |||
sub learn_samples { | |||
my ( $ar_ham, $ar_spam ) = @_; | |||
my $len; | |||
my $processed = 0; | |||
my $total = 0; | |||
my $learn_func; | |||
my @files_spam; | |||
my @files_ham; | |||
if ($use_dspam) { | |||
$learn_func = \&learn_dspam; | |||
} | |||
elsif ($use_bogofilter) { | |||
$learn_func = \&learn_bogofilter; | |||
} | |||
else { | |||
$learn_func = \&learn_rspamc; | |||
} | |||
$len = int( scalar @{$ar_ham} * $train_fraction ); | |||
my @cur_vec; | |||
# Shuffle spam and ham samples | |||
for ( my $i = 0 ; $i < $len ; $i++ ) { | |||
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { | |||
push @cur_vec, @{$ar_ham}[$i]; | |||
push @files_ham, [@cur_vec]; | |||
@cur_vec = (); | |||
$total++; | |||
} | |||
else { | |||
push @cur_vec, @{$ar_ham}[$i]; | |||
} | |||
} | |||
$len = int( scalar @{$ar_spam} * $train_fraction ); | |||
@cur_vec = (); | |||
for ( my $i = 0 ; $i < $len ; $i++ ) { | |||
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { | |||
push @cur_vec, @{$ar_spam}[$i]; | |||
push @files_spam, [@cur_vec]; | |||
@cur_vec = (); | |||
$total++; | |||
} | |||
else { | |||
push @cur_vec, @{$ar_spam}[$i]; | |||
my ( $ar_ham, $ar_spam ) = @_; | |||
my $len; | |||
my $processed = 0; | |||
my $total = 0; | |||
my $learn_func; | |||
my @files_spam; | |||
my @files_ham; | |||
if ($use_dspam) { | |||
$learn_func = \&learn_dspam; | |||
} | |||
} | |||
for ( my $i = 0 ; $i < $total ; $i++ ) { | |||
my $args; | |||
my $spam; | |||
if ( $i % 2 == 0 ) { | |||
$args = pop @files_spam; | |||
if ( !$args ) { | |||
$args = pop @files_ham; | |||
$spam = 0; | |||
} | |||
else { | |||
$spam = 1; | |||
} | |||
elsif ($use_bogofilter) { | |||
$learn_func = \&learn_bogofilter; | |||
} | |||
else { | |||
$args = pop @files_ham; | |||
if ( !$args ) { | |||
$args = pop @files_spam; | |||
$spam = 1; | |||
} | |||
else { | |||
$spam = 0; | |||
} | |||
$learn_func = \&learn_rspamc; | |||
} | |||
my $r = $learn_func->( $args, $spam ); | |||
if ($r) { | |||
$processed += $r; | |||
$len = int( scalar @{$ar_ham} * $train_fraction ); | |||
my @cur_vec; | |||
# Shuffle spam and ham samples | |||
for ( my $i = 0 ; $i < $len ; $i++ ) { | |||
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { | |||
push @cur_vec, @{$ar_ham}[$i]; | |||
push @files_ham, [@cur_vec]; | |||
@cur_vec = (); | |||
$total++; | |||
} | |||
else { | |||
push @cur_vec, @{$ar_ham}[$i]; | |||
} | |||
} | |||
} | |||
return $processed; | |||
} | |||
$len = int( scalar @{$ar_spam} * $train_fraction ); | |||
@cur_vec = (); | |||
for ( my $i = 0 ; $i < $len ; $i++ ) { | |||
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { | |||
push @cur_vec, @{$ar_spam}[$i]; | |||
push @files_spam, [@cur_vec]; | |||
@cur_vec = (); | |||
$total++; | |||
} | |||
else { | |||
push @cur_vec, @{$ar_spam}[$i]; | |||
} | |||
} | |||
sub check_rspamc { | |||
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; | |||
for ( my $i = 0 ; $i < $total ; $i++ ) { | |||
my $args; | |||
my $spam; | |||
my $args_quoted = shell_quote @{$files}; | |||
my $processed = 0; | |||
if ( $i % 2 == 0 ) { | |||
$args = pop @files_spam; | |||
open( | |||
my $p, | |||
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" | |||
) or die "cannot spawn $rspamc: $!"; | |||
while (<$p>) { | |||
my $res = eval('decode_json($_)'); | |||
if ( $res && $res->{'default'} ) { | |||
$processed++; | |||
if ($spam) { | |||
if ( $res->{'default'}->{$ham_symbol} ) { | |||
my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; | |||
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { | |||
my $percentage = int($1); | |||
if ( $percentage >= $rspamc_prob_trigger ) { | |||
$$fp_cnt++; | |||
if ( !$args ) { | |||
$args = pop @files_ham; | |||
$spam = 0; | |||
} | |||
else { | |||
$spam = 1; | |||
} | |||
} | |||
else { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( !$res->{'default'}->{$spam_symbol} ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
else { | |||
if ( $res->{'default'}->{$spam_symbol} ) { | |||
my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; | |||
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { | |||
my $percentage = int($1); | |||
if ( $percentage >= $rspamc_prob_trigger ) { | |||
$$fp_cnt++; | |||
$args = pop @files_ham; | |||
if ( !$args ) { | |||
$args = pop @files_spam; | |||
$spam = 1; | |||
} | |||
else { | |||
$spam = 0; | |||
} | |||
} | |||
else { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( !$res->{'default'}->{$ham_symbol} ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
my $r = $learn_func->( $args, $spam ); | |||
if ($r) { | |||
$processed += $r; | |||
} | |||
} | |||
} | |||
} | |||
return $processed; | |||
return $processed; | |||
} | |||
sub check_bogofilter { | |||
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; | |||
my $processed = 0; | |||
sub check_rspamc { | |||
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
my $args_quoted = shell_quote @{$files}; | |||
my $processed = 0; | |||
open( my $p, "$bogofilter -t -I $args_quoted |" ) | |||
or die "cannot spawn $bogofilter: $!"; | |||
open( | |||
my $p, | |||
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" | |||
) or die "cannot spawn $rspamc: $!"; | |||
while (<$p>) { | |||
if ( $_ =~ /^([SHU])\s+.*$/ ) { | |||
$processed++; | |||
if ($spam) { | |||
if ( $1 eq 'H' ) { | |||
$$fp_cnt++; | |||
} | |||
elsif ( $1 eq 'U' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
else { | |||
if ( $1 eq 'S' ) { | |||
$$fp_cnt++; | |||
} | |||
elsif ( $1 eq 'U' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
my $res = eval('decode_json($_)'); | |||
if ( $res && $res->{'default'} ) { | |||
$processed++; | |||
if ($spam) { | |||
if ( $res->{'default'}->{$ham_symbol} ) { | |||
my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; | |||
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { | |||
my $percentage = int($1); | |||
if ( $percentage >= $rspamc_prob_trigger ) { | |||
$$fp_cnt++; | |||
} | |||
} | |||
else { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( !$res->{'default'}->{$spam_symbol} ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
else { | |||
if ( $res->{'default'}->{$spam_symbol} ) { | |||
my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; | |||
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { | |||
my $percentage = int($1); | |||
if ( $percentage >= $rspamc_prob_trigger ) { | |||
$$fp_cnt++; | |||
} | |||
} | |||
else { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( !$res->{'default'}->{$ham_symbol} ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
} | |||
return $processed; | |||
return $processed; | |||
} | |||
sub check_dspam { | |||
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; | |||
my $processed = 0; | |||
sub check_bogofilter { | |||
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; | |||
my $processed = 0; | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
open( my $p, "$bogofilter -t -I $args_quoted |" ) | |||
or die "cannot spawn $bogofilter: $!"; | |||
while (<$p>) { | |||
if ( $_ =~ /^([SHU])\s+.*$/ ) { | |||
$processed++; | |||
if ($spam) { | |||
if ( $1 eq 'H' ) { | |||
$$fp_cnt++; | |||
} | |||
elsif ( $1 eq 'U' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
else { | |||
if ( $1 eq 'S' ) { | |||
$$fp_cnt++; | |||
} | |||
elsif ( $1 eq 'U' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
} | |||
} | |||
} | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
return $processed; | |||
} | |||
my $pid = open2( *Reader, *Writer, | |||
"$dspam --user nobody --classify --stdout --mode=notrain" ); | |||
open( my $inp, "< $f" ); | |||
while (<$inp>) { | |||
print Writer $_; | |||
} | |||
close Writer; | |||
sub check_dspam { | |||
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; | |||
my $processed = 0; | |||
while (<Reader>) { | |||
if ( $_ =~ | |||
qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) | |||
) | |||
{ | |||
$processed++; | |||
my $percentage = int($2 * 100.0); | |||
foreach my $f ( @{$files} ) { | |||
my $args_quoted = shell_quote $f; | |||
if ($spam) { | |||
if ( $1 eq 'Innocent') { | |||
if ( $percentage <= (100 - $rspamc_prob_trigger) ) { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( $1 ne 'Spam' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" ); | |||
open( my $inp, "< $f" ); | |||
while (<$inp>) { | |||
print Writer $_; | |||
} | |||
else { | |||
if ( $1 eq 'Spam' ) { | |||
if ( $percentage >= $rspamc_prob_trigger ) { | |||
$$fp_cnt++; | |||
close Writer; | |||
while (<Reader>) { | |||
if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) { | |||
$processed++; | |||
my $percentage = int( $2 * 100.0 ); | |||
if ($spam) { | |||
if ( $1 eq 'Innocent' ) { | |||
if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( $1 ne 'Spam' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
else { | |||
if ( $1 eq 'Spam' ) { | |||
if ( $percentage >= $rspamc_prob_trigger ) { | |||
$$fp_cnt++; | |||
} | |||
} | |||
elsif ( $1 ne 'Innocent' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
} | |||
} | |||
elsif ( $1 ne 'Innocent' ) { | |||
$$fn_cnt++; | |||
} | |||
else { | |||
$$detected_cnt++; | |||
} | |||
} | |||
} | |||
close Reader; | |||
waitpid( $pid, 0 ); | |||
} | |||
close Reader; | |||
waitpid( $pid, 0 ); | |||
} | |||
return $processed; | |||
return $processed; | |||
} | |||
sub cross_validate { | |||
my ($hr) = @_; | |||
my $args = ""; | |||
my $processed = 0; | |||
my $fp_spam = 0; | |||
my $fn_spam = 0; | |||
my $fp_ham = 0; | |||
my $fn_ham = 0; | |||
my $total_spam = 0; | |||
my $total_ham = 0; | |||
my $detected_spam = 0; | |||
my $detected_ham = 0; | |||
my $i = 0; | |||
my $len = scalar keys %{$hr}; | |||
my @files_spam; | |||
my @files_ham; | |||
my @cur_spam; | |||
my @cur_ham; | |||
my $check_func; | |||
if ($use_dspam) { | |||
$check_func = \&check_dspam; | |||
} | |||
elsif ($use_bogofilter) { | |||
$check_func = \&check_bogofilter; | |||
} | |||
else { | |||
$check_func = \&check_rspamc; | |||
} | |||
while ( my ( $fn, $spam ) = each( %{$hr} ) ) { | |||
if ($spam) { | |||
if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { | |||
push @cur_spam, $fn; | |||
push @files_spam, [@cur_spam]; | |||
@cur_spam = (); | |||
} | |||
else { | |||
push @cur_spam, $fn; | |||
} | |||
my ($hr) = @_; | |||
my $args = ""; | |||
my $processed = 0; | |||
my $fp_spam = 0; | |||
my $fn_spam = 0; | |||
my $fp_ham = 0; | |||
my $fn_ham = 0; | |||
my $total_spam = 0; | |||
my $total_ham = 0; | |||
my $detected_spam = 0; | |||
my $detected_ham = 0; | |||
my $i = 0; | |||
my $len = scalar keys %{$hr}; | |||
my @files_spam; | |||
my @files_ham; | |||
my @cur_spam; | |||
my @cur_ham; | |||
my $check_func; | |||
if ($use_dspam) { | |||
$check_func = \&check_dspam; | |||
} | |||
elsif ($use_bogofilter) { | |||
$check_func = \&check_bogofilter; | |||
} | |||
else { | |||
if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { | |||
push @cur_ham, $fn; | |||
push @files_ham, [@cur_ham]; | |||
@cur_ham = (); | |||
} | |||
else { | |||
push @cur_ham, $fn; | |||
} | |||
$check_func = \&check_rspamc; | |||
} | |||
while ( my ( $fn, $spam ) = each( %{$hr} ) ) { | |||
if ($spam) { | |||
if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { | |||
push @cur_spam, $fn; | |||
push @files_spam, [@cur_spam]; | |||
@cur_spam = (); | |||
} | |||
else { | |||
push @cur_spam, $fn; | |||
} | |||
} | |||
else { | |||
if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { | |||
push @cur_ham, $fn; | |||
push @files_ham, [@cur_ham]; | |||
@cur_ham = (); | |||
} | |||
else { | |||
push @cur_ham, $fn; | |||
} | |||
} | |||
} | |||
} | |||
shuffle_array( \@files_spam ); | |||
shuffle_array( \@files_spam ); | |||
foreach my $fn (@files_spam) { | |||
my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); | |||
$total_spam += $r; | |||
$processed += $r; | |||
} | |||
foreach my $fn (@files_spam) { | |||
my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); | |||
$total_spam += $r; | |||
$processed += $r; | |||
} | |||
shuffle_array( \@files_ham ); | |||
shuffle_array( \@files_ham ); | |||
foreach my $fn (@files_ham) { | |||
my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); | |||
$total_ham += $r; | |||
$processed += $r; | |||
} | |||
foreach my $fn (@files_ham) { | |||
my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); | |||
$total_ham += $r; | |||
$processed += $r; | |||
} | |||
printf "Scanned %d messages | |||
printf "Scanned %d messages | |||
%d spam messages (%d detected) | |||
%d ham messages (%d detected)\n", | |||
$processed, $total_spam, $detected_spam, $total_ham, $detected_ham; | |||
%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; | |||
printf "\nHam FP rate: %.2f%% (%d messages) | |||
Ham FN rate: %.2f%% (%d messages)\n", | |||
$fp_ham / $total_ham * 100.0, $fp_ham, | |||
$fn_ham / $total_ham * 100.0, $fn_ham; | |||
printf "\nHam FP rate: %.2f%% (%d messages) | |||
Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham; | |||
printf "\nSpam FP rate: %.2f%% (%d messages) | |||
printf "\nSpam FP rate: %.2f%% (%d messages) | |||
Spam FN rate: %.2f%% (%d messages)\n", | |||
$fp_spam / $total_spam * 100.0, $fp_spam, | |||
$fn_spam / $total_spam * 100.0, $fn_spam; | |||
$fp_spam / $total_spam * 100.0, $fp_spam, | |||
$fn_spam / $total_spam * 100.0, $fn_spam; | |||
} | |||
if ( !$spam_dir || !$ham_dir ) { | |||
die "spam or/and ham directories are not specified"; | |||
die "spam or/and ham directories are not specified"; | |||
} | |||
my @spam_samples; | |||
@@ -473,24 +463,23 @@ shuffle_array( \@spam_samples ); | |||
shuffle_array( \@ham_samples ); | |||
if ( !$check_only ) { | |||
my $learned = 0; | |||
my $t0 = [gettimeofday]; | |||
$learned = learn_samples( \@ham_samples, \@spam_samples ); | |||
my $t1 = [gettimeofday]; | |||
my $learned = 0; | |||
my $t0 = [gettimeofday]; | |||
$learned = learn_samples( \@ham_samples, \@spam_samples ); | |||
my $t1 = [gettimeofday]; | |||
printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", | |||
$learned, tv_interval( $t0, $t1 ); | |||
printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 ); | |||
} | |||
my %validation_set; | |||
my $len = int( scalar @spam_samples * $train_fraction ); | |||
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { | |||
$validation_set{ $spam_samples[$i] } = 1; | |||
$validation_set{ $spam_samples[$i] } = 1; | |||
} | |||
$len = int( scalar @ham_samples * $train_fraction ); | |||
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { | |||
$validation_set{ $ham_samples[$i] } = 0; | |||
$validation_set{ $ham_samples[$i] } = 0; | |||
} | |||
cross_validate( \%validation_set ); |
@@ -8,28 +8,28 @@ use warnings FATAL => 'all'; | |||
use AI::FANN qw(:all); | |||
use Getopt::Std; | |||
my %sym_idx; # Symbols by index | |||
my %sym_names; # Symbols by name | |||
my $num = 1; # Number of symbols | |||
my %sym_idx; # Symbols by index | |||
my %sym_names; # Symbols by name | |||
my $num = 1; # Number of symbols | |||
my @spam; | |||
my @ham; | |||
my $max_samples = -1; | |||
my $split = 1; | |||
my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN> | |||
my $score_spam = 12; | |||
my $score_ham = -6; | |||
my $max_samples = -1; | |||
my $split = 1; | |||
my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN> | |||
my $score_spam = 12; | |||
my $score_ham = -6; | |||
sub process { | |||
my ($input, $spam, $ham) = @_; | |||
my ( $input, $spam, $ham ) = @_; | |||
my $samples = 0; | |||
while(<$input>) { | |||
if (!$preprocessed) { | |||
while (<$input>) { | |||
if ( !$preprocessed ) { | |||
if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) { | |||
if ($1 > $score_spam) { | |||
if ( $1 > $score_spam ) { | |||
$_ = "$1:1: $3"; | |||
} | |||
elsif ($1 < $score_ham) { | |||
elsif ( $1 < $score_ham ) { | |||
$_ = "$1:0: $3\n"; | |||
} | |||
else { | |||
@@ -47,7 +47,7 @@ sub process { | |||
my $is_spam = 0; | |||
if ($2 == 1) { | |||
if ( $2 == 1 ) { | |||
$is_spam = 1; | |||
} | |||
@@ -56,13 +56,13 @@ sub process { | |||
foreach my $sym (@ar) { | |||
chomp $sym; | |||
if (!$sym_idx{$sym}) { | |||
$sym_idx{$sym} = $num; | |||
if ( !$sym_idx{$sym} ) { | |||
$sym_idx{$sym} = $num; | |||
$sym_names{$num} = $sym; | |||
$num++; | |||
} | |||
$sample{$sym_idx{$sym}} = 1; | |||
$sample{ $sym_idx{$sym} } = 1; | |||
} | |||
if ($is_spam) { | |||
@@ -73,32 +73,31 @@ sub process { | |||
} | |||
$samples++; | |||
if ($max_samples > 0 && $samples > $max_samples) { | |||
if ( $max_samples > 0 && $samples > $max_samples ) { | |||
return; | |||
} | |||
} | |||
} | |||
# Shuffle array | |||
sub fisher_yates_shuffle | |||
{ | |||
sub fisher_yates_shuffle { | |||
my $array = shift; | |||
my $i = @$array; | |||
my $i = @$array; | |||
while ( --$i ) { | |||
my $j = int rand( $i + 1 ); | |||
@$array[$i, $j] = @$array[$j, $i]; | |||
@$array[ $i, $j ] = @$array[ $j, $i ]; | |||
} | |||
} | |||
# Train network | |||
sub train { | |||
my ($ann, $sample, $result) = @_; | |||
my ( $ann, $sample, $result ) = @_; | |||
my @row; | |||
for (my $i = 1; $i < $num; $i++) { | |||
if ($sample->{$i}) { | |||
for ( my $i = 1 ; $i < $num ; $i++ ) { | |||
if ( $sample->{$i} ) { | |||
push @row, 1; | |||
} | |||
else { | |||
@@ -108,16 +107,16 @@ sub train { | |||
#print "@row -> @{$result}\n"; | |||
$ann->train(\@row, \@{$result}); | |||
$ann->train( \@row, \@{$result} ); | |||
} | |||
sub test { | |||
my ($ann, $sample) = @_; | |||
my ( $ann, $sample ) = @_; | |||
my @row; | |||
for (my $i = 1; $i < $num; $i++) { | |||
if ($sample->{$i}) { | |||
for ( my $i = 1 ; $i < $num ; $i++ ) { | |||
if ( $sample->{$i} ) { | |||
push @row, 1; | |||
} | |||
else { | |||
@@ -125,117 +124,120 @@ sub test { | |||
} | |||
} | |||
my $ret = $ann->run(\@row); | |||
my $ret = $ann->run( \@row ); | |||
return $ret; | |||
} | |||
my %opts; | |||
getopts('o:i:s:n:t:hpS:H:', \%opts); | |||
getopts( 'o:i:s:n:t:hpS:H:', \%opts ); | |||
if ($opts{'h'}) { | |||
if ( $opts{'h'} ) { | |||
print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n"; | |||
exit; | |||
} | |||
my $input = *STDIN; | |||
if ($opts{'i'}) { | |||
open($input, '<', $opts{'i'}) or die "cannot open $opts{i}"; | |||
if ( $opts{'i'} ) { | |||
open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}"; | |||
} | |||
if ($opts{'n'}) { | |||
if ( $opts{'n'} ) { | |||
$max_samples = $opts{'n'}; | |||
} | |||
if ($opts{'t'}) { | |||
if ( $opts{'t'} ) { | |||
# Test split | |||
$split = $opts{'t'}; | |||
} | |||
if ($opts{'p'}) { | |||
if ( $opts{'p'} ) { | |||
$preprocessed = 1; | |||
} | |||
if ($opts{'H'}) { | |||
if ( $opts{'H'} ) { | |||
$score_ham = $opts{'H'}; | |||
} | |||
if ($opts{'S'}) { | |||
if ( $opts{'S'} ) { | |||
$score_spam = $opts{'S'}; | |||
} | |||
# ham_prob, spam_prob | |||
my @spam_out = (1); | |||
my @ham_out = (0); | |||
my @ham_out = (0); | |||
process($input, \@spam, \@ham); | |||
fisher_yates_shuffle(\@spam); | |||
fisher_yates_shuffle(\@ham); | |||
process( $input, \@spam, \@ham ); | |||
fisher_yates_shuffle( \@spam ); | |||
fisher_yates_shuffle( \@ham ); | |||
my $nspam = int(scalar(@spam) / $split); | |||
my $nham = int(scalar(@ham) / $split); | |||
my $nspam = int( scalar(@spam) / $split ); | |||
my $nham = int( scalar(@ham) / $split ); | |||
my $ann = AI::FANN->new_standard($num - 1, ($num + 2) / 2, 1); | |||
my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 ); | |||
my @train_data; | |||
# Train ANN | |||
for (my $i = 0; $i < $nham; $i++) { | |||
for ( my $i = 0 ; $i < $nham ; $i++ ) { | |||
push @train_data, [ $ham[$i], \@ham_out ]; | |||
} | |||
for (my $i = 0; $i < $nspam; $i++) { | |||
for ( my $i = 0 ; $i < $nspam ; $i++ ) { | |||
push @train_data, [ $spam[$i], \@spam_out ]; | |||
} | |||
fisher_yates_shuffle(\@train_data); | |||
fisher_yates_shuffle( \@train_data ); | |||
foreach my $train_row (@train_data) { | |||
train($ann, @{$train_row}[0], @{$train_row}[1]); | |||
train( $ann, @{$train_row}[0], @{$train_row}[1] ); | |||
} | |||
print "Trained $nspam SPAM and $nham HAM samples\n"; | |||
# Now run fann | |||
if ($split > 1) { | |||
my $sample = 0.0; | |||
if ( $split > 1 ) { | |||
my $sample = 0.0; | |||
my $correct = 0.0; | |||
for (my $i = $nham; $i < $nham * $split; $i++) { | |||
my $ret = test($ann, $ham[$i]); | |||
for ( my $i = $nham ; $i < $nham * $split ; $i++ ) { | |||
my $ret = test( $ann, $ham[$i] ); | |||
#print "@{$ret}\n"; | |||
if (@{$ret}[0] < 0.5) { | |||
if ( @{$ret}[0] < 0.5 ) { | |||
$correct++; | |||
} | |||
$sample++; | |||
} | |||
print "Tested $sample HAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n"; | |||
print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n"; | |||
$sample = 0.0; | |||
$sample = 0.0; | |||
$correct = 0.0; | |||
for (my $i = $nspam; $i < $nspam * $split; $i++) { | |||
my $ret = test($ann, $spam[$i]); | |||
for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) { | |||
my $ret = test( $ann, $spam[$i] ); | |||
#print "@{$ret}\n"; | |||
if (@{$ret}[0] > 0.5) { | |||
if ( @{$ret}[0] > 0.5 ) { | |||
$correct++; | |||
} | |||
$sample++; | |||
} | |||
print "Tested $sample SPAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n"; | |||
print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n"; | |||
} | |||
if ($opts{'o'}) { | |||
$ann->save($opts{'o'}) or die "cannot save ann into $opts{o}"; | |||
if ( $opts{'o'} ) { | |||
$ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}"; | |||
} | |||
if ($opts{'s'}) { | |||
open(my $scores, '>', | |||
$opts{'s'}) or die "cannot open score file $opts{'s'}"; | |||
if ( $opts{'s'} ) { | |||
open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}"; | |||
print $scores "{"; | |||
for (my $i = 1; $i < $num; $i++) { | |||
for ( my $i = 1 ; $i < $num ; $i++ ) { | |||
my $n = $i - 1; | |||
if ($i != $num - 1) { | |||
if ( $i != $num - 1 ) { | |||
print $scores "\"$sym_names{$i}\":$n,"; | |||
} | |||
else { |