Browse Source

[Minor] Reformat Perl scripts

tags/1.8.2
Alexander Moisseev 5 years ago
parent
commit
4582ebe807
7 changed files with 1148 additions and 1136 deletions
  1. 2
    0
      .gitignore
  2. 25
    0
      .tidyallrc
  3. 125
    127
      utils/asn.pl
  4. 0
    2
      utils/cgp_rspamd.pl
  5. 343
    354
      utils/classifier_test.pl
  6. 70
    68
      utils/fann_train.pl
  7. 583
    585
      utils/rspamd_stats.pl

+ 2
- 0
.gitignore View File

@@ -0,0 +1,2 @@
# Code::TidyAll
/.tidyall.d/

+ 25
- 0
.tidyallrc View File

@@ -0,0 +1,25 @@
; Run "tidyall -a" to process all files.
; Run "tidyall -g" to process all added or modified files in the current git working directory.

; Ignore third-party code
ignore = contrib/**/* doc/doxydown/doxydown.pl

;[PerlCritic]
;select = **/*.{pl,pm,t}

[PerlTidy]
select = **/*.{pl,pm,t}
argv = -l=120

[PodChecker]
select = **/*.{pl,pm,pod}

;[PodSpell]
;select = **/*.{pl,pm,pod}

;[PodTidy]
;select = **/*.{pl,pm,pod}
;argv = --columns=120

[Test::Vars]
select = **/*.{pl,pm,t}

+ 125
- 127
utils/asn.pl View File

@@ -16,14 +16,14 @@ $LWP::Simple::ua->show_progress(1);
$Net::MRT::USE_RFC4760 = -1;

my %config = (
asn_sources => [
'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest',
'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest',
'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest',
'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest',
'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest'
],
bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz']
asn_sources => [
'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest',
'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest',
'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest',
'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest',
'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest'
],
bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz']
);

my $download_asn = 0;
@@ -38,171 +38,169 @@ my $v4_zone = "asn.rspamd.com";
my $v6_zone = "asn6.rspamd.com";
my $v4_file = "asn.zone";
my $v6_file = "asn6.zone";
my $ns_servers = ["asn-ns.rspamd.com", "asn-ns2.rspamd.com"];
my $ns_servers = [ "asn-ns.rspamd.com", "asn-ns2.rspamd.com" ];

GetOptions(
"download-asn" => \$download_asn,
"download-bgp" => \$download_bgp,
"4!" => \$v4,
"6!" => \$v6,
"parse!" => \$parse,
"target=s" => \$download_target,
"zone-v4=s" => \$v4_zone,
"zone-v6=s" => \$v6_zone,
"file-v4=s" => \$v4_file,
"file-v6=s" => \$v6_file,
"ns-server=s@" => \$ns_servers,
"help|?" => \$help,
"man" => \$man
"download-asn" => \$download_asn,
"download-bgp" => \$download_bgp,
"4!" => \$v4,
"6!" => \$v6,
"parse!" => \$parse,
"target=s" => \$download_target,
"zone-v4=s" => \$v4_zone,
"zone-v6=s" => \$v6_zone,
"file-v4=s" => \$v4_file,
"file-v6=s" => \$v6_file,
"ns-server=s@" => \$ns_servers,
"help|?" => \$help,
"man" => \$man
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

sub download_file {
my ($u) = @_;
my ($u) = @_;

print "Fetching $u\n";
my $ff = File::Fetch->new( uri => $u );
my $where = $ff->fetch( to => $download_target ) or die $ff->error;
print "Fetching $u\n";
my $ff = File::Fetch->new( uri => $u );
my $where = $ff->fetch( to => $download_target ) or die $ff->error;

return $where;
return $where;
}

if ($download_asn) {
foreach my $u ( @{ $config{'asn_sources'} } ) {
download_file($u);
}
foreach my $u ( @{ $config{'asn_sources'} } ) {
download_file($u);
}
}

if ($download_bgp) {
foreach my $u ( @{ $config{'bgp_sources'} } ) {
download_file($u);
}
foreach my $u ( @{ $config{'bgp_sources'} } ) {
download_file($u);
}
}

if ( !$parse ) {
exit 0;
exit 0;
}

my $v4_fh;
my $v6_fh;

if ($v4) {
open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!";
print $v4_fh
"\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
foreach my $ns (@{$ns_servers}) {
print $v4_fh "\$NS 43200 $ns\n";
}
open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!";
print $v4_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
foreach my $ns ( @{$ns_servers} ) {
print $v4_fh "\$NS 43200 $ns\n";
}
}
if ($v6) {
open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!";
print $v6_fh
"\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
foreach my $ns (@{$ns_servers}) {
print $v6_fh "\$NS 43200 $ns\n";
}
open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!";
print $v6_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
foreach my $ns ( @{$ns_servers} ) {
print $v6_fh "\$NS 43200 $ns\n";
}
}

# Now load BGP data
my $networks = {};

foreach my $u ( @{ $config{'bgp_sources'} } ) {
my $parsed = URI->new($u);
my $fname = $download_target . '/' . basename( $parsed->path );
open( my $fh, "<:gzip", $fname )
or die "Cannot open $fname: $!";

while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) {
if ( $dd->{'prefix'} && $dd->{'bits'} ) {
next if $dd->{'subtype'} == 2 and !$v4;
next if $dd->{'subtype'} == 4 and !$v6;
my $entry = $dd->{'entries'}->[0];
my $net = $dd->{'prefix'} . '/' . $dd->{'bits'};
if ( $entry && $entry->{'AS_PATH'} ) {
my $as = $entry->{'AS_PATH'}->[-1];
if (ref($as) eq "ARRAY") {
$as = @{$as}[0];
my $parsed = URI->new($u);
my $fname = $download_target . '/' . basename( $parsed->path );
open( my $fh, "<:gzip", $fname )
or die "Cannot open $fname: $!";

while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) {
if ( $dd->{'prefix'} && $dd->{'bits'} ) {
next if $dd->{'subtype'} == 2 and !$v4;
next if $dd->{'subtype'} == 4 and !$v6;
my $entry = $dd->{'entries'}->[0];
my $net = $dd->{'prefix'} . '/' . $dd->{'bits'};
if ( $entry && $entry->{'AS_PATH'} ) {
my $as = $entry->{'AS_PATH'}->[-1];
if ( ref($as) eq "ARRAY" ) {
$as = @{$as}[0];
}

if ( !$networks->{$as} ) {
if ( $dd->{'subtype'} == 2 ) {
$networks->{$as} = { nets_v4 => [$net], nets_v6 => [] };
}
else {
$networks->{$as} = { nets_v6 => [$net], nets_v4 => [] };
}
}
else {
if ( $dd->{'subtype'} == 2 ) {
push @{ $networks->{$as}->{'nets_v4'} }, $net;
}
else {
push @{ $networks->{$as}->{'nets_v6'} }, $net;
}
}
}
}

if ( !$networks->{$as} ) {
if ( $dd->{'subtype'} == 2 ) {
$networks->{$as} = { nets_v4 => [$net], nets_v6 => [] };
}
else {
$networks->{$as} = { nets_v6 => [$net], nets_v4 => [] };
}
}
else {
if ( $dd->{'subtype'} == 2 ) {
push @{ $networks->{$as}->{'nets_v4'} }, $net;
}
else {
push @{ $networks->{$as}->{'nets_v6'} }, $net;
}
}
}
}
}
}

# Now roughly detect countries
foreach my $u ( @{ $config{'asn_sources'} } ) {
my $parsed = URI->new($u);
my $fname = $download_target . '/' . basename( $parsed->path );
open( my $fh, "<", $fname ) or die "Cannot open $fname: $!";

while (<$fh>) {
next if /^\#/;
chomp;
my @elts = split /\|/;

if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) {
my $as_start = int( $elts[3] );
my $as_end = $as_start + int( $elts[4] );

for ( my $as = $as_start ; $as < $as_end ; $as++ ) {
my $real_as = $as;

if (ref($as) eq "ARRAY") {
$real_as = @{$as}[0];
my $parsed = URI->new($u);
my $fname = $download_target . '/' . basename( $parsed->path );
open( my $fh, "<", $fname ) or die "Cannot open $fname: $!";

while (<$fh>) {
next if /^\#/;
chomp;
my @elts = split /\|/;

if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) {
my $as_start = int( $elts[3] );
my $as_end = $as_start + int( $elts[4] );

for ( my $as = $as_start ; $as < $as_end ; $as++ ) {
my $real_as = $as;

if ( ref($as) eq "ARRAY" ) {
$real_as = @{$as}[0];
}

if ( $networks->{"$real_as"} ) {
$networks->{"$real_as"}->{'country'} = $elts[1];
$networks->{"$real_as"}->{'rir'} = $elts[0];
}
}
}

if ( $networks->{"$real_as"} ) {
$networks->{"$real_as"}->{'country'} = $elts[1];
$networks->{"$real_as"}->{'rir'} = $elts[0];
}
}
}
}
}

while ( my ( $k, $v ) = each( %{$networks} ) ) {
if ($v4) {
foreach my $n ( @{ $v->{'nets_v4'} } ) {

# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
if ( $v->{'country'} ) {
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
}
else {
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
}
if ($v4) {
foreach my $n ( @{ $v->{'nets_v4'} } ) {

# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
if ( $v->{'country'} ) {
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
}
else {
printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
}
}
}
}
if ($v6) {
foreach my $n ( @{ $v->{'nets_v6'} } ) {
# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
if ( $v->{'country'} ) {
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
}
else {
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
}
if ($v6) {
foreach my $n ( @{ $v->{'nets_v6'} } ) {
# "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
if ( $v->{'country'} ) {
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
}
else {
printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
}
}
}
}
}

__END__

+ 0
- 2
utils/cgp_rspamd.pl View File

@@ -359,6 +359,4 @@ protocol. On scan requests, this filter can query Rspamd to process a message.
B<cgp_rspamd> can tell CGP to add header or reject SPAM messages depending on
Rspamd scan result.

=back

=cut

+ 343
- 354
utils/classifier_test.pl View File

@@ -30,438 +30,428 @@ my $man;
my $help;

GetOptions(
"spam|s=s" => \$spam_dir,
"ham|h=s" => \$ham_dir,
"spam-symbol=s" => \$spam_symbol,
"ham-symbol=s" => \$ham_symbol,
"classifier|c=s" => \$classifier,
"timeout|t=f" => \$timeout,
"parallel|p=i" => \$parallel,
"train-fraction|t=f" => \$train_fraction,
"bogofilter|b" => \$use_bogofilter,
"dspam|d" => \$use_dspam,
"check-only" => \$check_only,
"help|?" => \$help,
"man" => \$man
"spam|s=s" => \$spam_dir,
"ham|h=s" => \$ham_dir,
"spam-symbol=s" => \$spam_symbol,
"ham-symbol=s" => \$ham_symbol,
"classifier|c=s" => \$classifier,
"timeout|t=f" => \$timeout,
"parallel|p=i" => \$parallel,
"train-fraction|t=f" => \$train_fraction,
"bogofilter|b" => \$use_bogofilter,
"dspam|d" => \$use_dspam,
"check-only" => \$check_only,
"help|?" => \$help,
"man" => \$man
) or pod2usage(2);

pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;

sub read_dir_files {
my ( $dir, $target ) = @_;
opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
while ( my $file = readdir $dh ) {
if ( -f "$dir/$file" ) {
push @{$target}, "$dir/$file";
my ( $dir, $target ) = @_;
opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
while ( my $file = readdir $dh ) {
if ( -f "$dir/$file" ) {
push @{$target}, "$dir/$file";
}
}
}
}

sub shuffle_array {
my ($ar) = @_;
my ($ar) = @_;

for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
if ( $i > 1 ) {
my $sel = int( rand( $i - 1 ) );
( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
if ( $i > 1 ) {
my $sel = int( rand( $i - 1 ) );
( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
}
}
}
}

sub learn_rspamc {
my ( $files, $spam ) = @_;
my $processed = 0;

my $cmd = $spam ? "learn_spam" : "learn_ham";
my $args_quoted = shell_quote @{$files};
open(
my $p,
"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |"
) or die "cannot spawn $rspamc: $!";

while (<$p>) {
my $res = eval('decode_json($_)');
if ( $res && $res->{'success'} ) {
$processed++;
my ( $files, $spam ) = @_;
my $processed = 0;

my $cmd = $spam ? "learn_spam" : "learn_ham";
my $args_quoted = shell_quote @{$files};
open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" )
or die "cannot spawn $rspamc: $!";

while (<$p>) {
my $res = eval('decode_json($_)');
if ( $res && $res->{'success'} ) {
$processed++;
}
}
}

return $processed;
return $processed;
}

sub learn_bogofilter {
my ( $files, $spam ) = @_;
my $processed = 0;

foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;
my $fl = $spam ? "-s" : "-n";
`$bogofilter -I $args_quoted $fl`;
if ( $? == 0 ) {
$processed++;
my ( $files, $spam ) = @_;
my $processed = 0;

foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;
my $fl = $spam ? "-s" : "-n";
`$bogofilter -I $args_quoted $fl`;
if ( $? == 0 ) {
$processed++;
}
}
}

return $processed;
return $processed;
}

sub learn_dspam {
my ( $files, $spam ) = @_;
my $processed = 0;
foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;
my $fl = $spam ? "--class=spam" : "--class=innocent";
open( my $p,
"|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
or die "cannot run $dspam: $!";
open( my $inp, "< $f" );
while (<$inp>) {
print $p $_;
my ( $files, $spam ) = @_;
my $processed = 0;
foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;
my $fl = $spam ? "--class=spam" : "--class=innocent";
open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
or die "cannot run $dspam: $!";
open( my $inp, "< $f" );
while (<$inp>) {
print $p $_;
}
}
}

return $processed;
return $processed;
}

sub learn_samples {
my ( $ar_ham, $ar_spam ) = @_;
my $len;
my $processed = 0;
my $total = 0;
my $learn_func;

my @files_spam;
my @files_ham;

if ($use_dspam) {
$learn_func = \&learn_dspam;
}
elsif ($use_bogofilter) {
$learn_func = \&learn_bogofilter;
}
else {
$learn_func = \&learn_rspamc;
}

$len = int( scalar @{$ar_ham} * $train_fraction );
my @cur_vec;

# Shuffle spam and ham samples
for ( my $i = 0 ; $i < $len ; $i++ ) {
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
push @cur_vec, @{$ar_ham}[$i];
push @files_ham, [@cur_vec];
@cur_vec = ();
$total++;
}
else {
push @cur_vec, @{$ar_ham}[$i];
}
}

$len = int( scalar @{$ar_spam} * $train_fraction );
@cur_vec = ();
for ( my $i = 0 ; $i < $len ; $i++ ) {
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
push @cur_vec, @{$ar_spam}[$i];
push @files_spam, [@cur_vec];
@cur_vec = ();
$total++;
}
else {
push @cur_vec, @{$ar_spam}[$i];
my ( $ar_ham, $ar_spam ) = @_;
my $len;
my $processed = 0;
my $total = 0;
my $learn_func;

my @files_spam;
my @files_ham;

if ($use_dspam) {
$learn_func = \&learn_dspam;
}
}

for ( my $i = 0 ; $i < $total ; $i++ ) {
my $args;
my $spam;

if ( $i % 2 == 0 ) {
$args = pop @files_spam;

if ( !$args ) {
$args = pop @files_ham;
$spam = 0;
}
else {
$spam = 1;
}
elsif ($use_bogofilter) {
$learn_func = \&learn_bogofilter;
}
else {
$args = pop @files_ham;
if ( !$args ) {
$args = pop @files_spam;
$spam = 1;
}
else {
$spam = 0;
}
$learn_func = \&learn_rspamc;
}

my $r = $learn_func->( $args, $spam );
if ($r) {
$processed += $r;
$len = int( scalar @{$ar_ham} * $train_fraction );
my @cur_vec;

# Shuffle spam and ham samples
for ( my $i = 0 ; $i < $len ; $i++ ) {
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
push @cur_vec, @{$ar_ham}[$i];
push @files_ham, [@cur_vec];
@cur_vec = ();
$total++;
}
else {
push @cur_vec, @{$ar_ham}[$i];
}
}
}

return $processed;
}
$len = int( scalar @{$ar_spam} * $train_fraction );
@cur_vec = ();
for ( my $i = 0 ; $i < $len ; $i++ ) {
if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
push @cur_vec, @{$ar_spam}[$i];
push @files_spam, [@cur_vec];
@cur_vec = ();
$total++;
}
else {
push @cur_vec, @{$ar_spam}[$i];
}
}

sub check_rspamc {
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
for ( my $i = 0 ; $i < $total ; $i++ ) {
my $args;
my $spam;

my $args_quoted = shell_quote @{$files};
my $processed = 0;
if ( $i % 2 == 0 ) {
$args = pop @files_spam;

open(
my $p,
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
) or die "cannot spawn $rspamc: $!";

while (<$p>) {
my $res = eval('decode_json($_)');
if ( $res && $res->{'default'} ) {
$processed++;

if ($spam) {
if ( $res->{'default'}->{$ham_symbol} ) {
my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
my $percentage = int($1);
if ( $percentage >= $rspamc_prob_trigger ) {
$$fp_cnt++;
if ( !$args ) {
$args = pop @files_ham;
$spam = 0;
}
else {
$spam = 1;
}
}
else {
$$fp_cnt++;
}
}
elsif ( !$res->{'default'}->{$spam_symbol} ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
else {
if ( $res->{'default'}->{$spam_symbol} ) {
my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {

my $percentage = int($1);
if ( $percentage >= $rspamc_prob_trigger ) {
$$fp_cnt++;
$args = pop @files_ham;
if ( !$args ) {
$args = pop @files_spam;
$spam = 1;
}
else {
$spam = 0;
}
}
else {
$$fp_cnt++;
}
}
elsif ( !$res->{'default'}->{$ham_symbol} ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;

my $r = $learn_func->( $args, $spam );
if ($r) {
$processed += $r;
}
}
}
}

return $processed;
return $processed;
}

sub check_bogofilter {
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
my $processed = 0;
sub check_rspamc {
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;

foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;
my $args_quoted = shell_quote @{$files};
my $processed = 0;

open( my $p, "$bogofilter -t -I $args_quoted |" )
or die "cannot spawn $bogofilter: $!";
open(
my $p,
"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
) or die "cannot spawn $rspamc: $!";

while (<$p>) {
if ( $_ =~ /^([SHU])\s+.*$/ ) {
$processed++;

if ($spam) {
if ( $1 eq 'H' ) {
$$fp_cnt++;
}
elsif ( $1 eq 'U' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
else {
if ( $1 eq 'S' ) {
$$fp_cnt++;
}
elsif ( $1 eq 'U' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
my $res = eval('decode_json($_)');
if ( $res && $res->{'default'} ) {
$processed++;

if ($spam) {
if ( $res->{'default'}->{$ham_symbol} ) {
my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
my $percentage = int($1);
if ( $percentage >= $rspamc_prob_trigger ) {
$$fp_cnt++;
}
}
else {
$$fp_cnt++;
}
}
elsif ( !$res->{'default'}->{$spam_symbol} ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
else {
if ( $res->{'default'}->{$spam_symbol} ) {
my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {

my $percentage = int($1);
if ( $percentage >= $rspamc_prob_trigger ) {
$$fp_cnt++;
}
}
else {
$$fp_cnt++;
}
}
elsif ( !$res->{'default'}->{$ham_symbol} ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
}
}
}
}

return $processed;
return $processed;
}

sub check_dspam {
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
my $processed = 0;
sub check_bogofilter {
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
my $processed = 0;

foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;

open( my $p, "$bogofilter -t -I $args_quoted |" )
or die "cannot spawn $bogofilter: $!";

while (<$p>) {
if ( $_ =~ /^([SHU])\s+.*$/ ) {
$processed++;

if ($spam) {
if ( $1 eq 'H' ) {
$$fp_cnt++;
}
elsif ( $1 eq 'U' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
else {
if ( $1 eq 'S' ) {
$$fp_cnt++;
}
elsif ( $1 eq 'U' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
}
}
}

foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;
return $processed;
}

my $pid = open2( *Reader, *Writer,
"$dspam --user nobody --classify --stdout --mode=notrain" );
open( my $inp, "< $f" );
while (<$inp>) {
print Writer $_;
}
close Writer;
sub check_dspam {
my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
my $processed = 0;

while (<Reader>) {
if ( $_ =~
qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$)
)
{
$processed++;
my $percentage = int($2 * 100.0);
foreach my $f ( @{$files} ) {
my $args_quoted = shell_quote $f;

if ($spam) {
if ( $1 eq 'Innocent') {
if ( $percentage <= (100 - $rspamc_prob_trigger) ) {
$$fp_cnt++;
}
}
elsif ( $1 ne 'Spam' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" );
open( my $inp, "< $f" );
while (<$inp>) {
print Writer $_;
}
else {
if ( $1 eq 'Spam' ) {
if ( $percentage >= $rspamc_prob_trigger ) {
$$fp_cnt++;
close Writer;

while (<Reader>) {
if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) {
$processed++;
my $percentage = int( $2 * 100.0 );

if ($spam) {
if ( $1 eq 'Innocent' ) {
if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
$$fp_cnt++;
}
}
elsif ( $1 ne 'Spam' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
else {
if ( $1 eq 'Spam' ) {
if ( $percentage >= $rspamc_prob_trigger ) {
$$fp_cnt++;
}
}
elsif ( $1 ne 'Innocent' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
}
}
elsif ( $1 ne 'Innocent' ) {
$$fn_cnt++;
}
else {
$$detected_cnt++;
}
}
}
close Reader;
waitpid( $pid, 0 );
}
close Reader;
waitpid( $pid, 0 );
}

return $processed;
return $processed;
}

sub cross_validate {
my ($hr) = @_;
my $args = "";
my $processed = 0;
my $fp_spam = 0;
my $fn_spam = 0;
my $fp_ham = 0;
my $fn_ham = 0;
my $total_spam = 0;
my $total_ham = 0;
my $detected_spam = 0;
my $detected_ham = 0;
my $i = 0;
my $len = scalar keys %{$hr};
my @files_spam;
my @files_ham;
my @cur_spam;
my @cur_ham;
my $check_func;

if ($use_dspam) {
$check_func = \&check_dspam;
}
elsif ($use_bogofilter) {
$check_func = \&check_bogofilter;
}
else {
$check_func = \&check_rspamc;
}

while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
if ($spam) {
if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
push @cur_spam, $fn;
push @files_spam, [@cur_spam];
@cur_spam = ();
}
else {
push @cur_spam, $fn;
}
my ($hr) = @_;
my $args = "";
my $processed = 0;
my $fp_spam = 0;
my $fn_spam = 0;
my $fp_ham = 0;
my $fn_ham = 0;
my $total_spam = 0;
my $total_ham = 0;
my $detected_spam = 0;
my $detected_ham = 0;
my $i = 0;
my $len = scalar keys %{$hr};
my @files_spam;
my @files_ham;
my @cur_spam;
my @cur_ham;
my $check_func;

if ($use_dspam) {
$check_func = \&check_dspam;
}
elsif ($use_bogofilter) {
$check_func = \&check_bogofilter;
}
else {
if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
push @cur_ham, $fn;
push @files_ham, [@cur_ham];
@cur_ham = ();
}
else {
push @cur_ham, $fn;
}
$check_func = \&check_rspamc;
}

while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
if ($spam) {
if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
push @cur_spam, $fn;
push @files_spam, [@cur_spam];
@cur_spam = ();
}
else {
push @cur_spam, $fn;
}
}
else {
if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
push @cur_ham, $fn;
push @files_ham, [@cur_ham];
@cur_ham = ();
}
else {
push @cur_ham, $fn;
}
}
}
}

shuffle_array( \@files_spam );
shuffle_array( \@files_spam );

foreach my $fn (@files_spam) {
my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
$total_spam += $r;
$processed += $r;
}
foreach my $fn (@files_spam) {
my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
$total_spam += $r;
$processed += $r;
}

shuffle_array( \@files_ham );
shuffle_array( \@files_ham );

foreach my $fn (@files_ham) {
my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
$total_ham += $r;
$processed += $r;
}
foreach my $fn (@files_ham) {
my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
$total_ham += $r;
$processed += $r;
}

printf "Scanned %d messages
printf "Scanned %d messages
%d spam messages (%d detected)
%d ham messages (%d detected)\n",
$processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;

printf "\nHam FP rate: %.2f%% (%d messages)
Ham FN rate: %.2f%% (%d messages)\n",
$fp_ham / $total_ham * 100.0, $fp_ham,
$fn_ham / $total_ham * 100.0, $fn_ham;
printf "\nHam FP rate: %.2f%% (%d messages)
Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham;

printf "\nSpam FP rate: %.2f%% (%d messages)
printf "\nSpam FP rate: %.2f%% (%d messages)
Spam FN rate: %.2f%% (%d messages)\n",
$fp_spam / $total_spam * 100.0, $fp_spam,
$fn_spam / $total_spam * 100.0, $fn_spam;
$fp_spam / $total_spam * 100.0, $fp_spam,
$fn_spam / $total_spam * 100.0, $fn_spam;
}

if ( !$spam_dir || !$ham_dir ) {
die "spam or/and ham directories are not specified";
die "spam or/and ham directories are not specified";
}

my @spam_samples;
@@ -473,24 +463,23 @@ shuffle_array( \@spam_samples );
shuffle_array( \@ham_samples );

if ( !$check_only ) {
my $learned = 0;
my $t0 = [gettimeofday];
$learned = learn_samples( \@ham_samples, \@spam_samples );
my $t1 = [gettimeofday];
my $learned = 0;
my $t0 = [gettimeofday];
$learned = learn_samples( \@ham_samples, \@spam_samples );
my $t1 = [gettimeofday];

printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
$learned, tv_interval( $t0, $t1 );
printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 );
}

my %validation_set;
my $len = int( scalar @spam_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
$validation_set{ $spam_samples[$i] } = 1;
$validation_set{ $spam_samples[$i] } = 1;
}

$len = int( scalar @ham_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
$validation_set{ $ham_samples[$i] } = 0;
$validation_set{ $ham_samples[$i] } = 0;
}

cross_validate( \%validation_set );

+ 70
- 68
utils/fann_train.pl View File

@@ -8,28 +8,28 @@ use warnings FATAL => 'all';
use AI::FANN qw(:all);
use Getopt::Std;

my %sym_idx; # Symbols by index
my %sym_names; # Symbols by name
my $num = 1; # Number of symbols
my %sym_idx; # Symbols by index
my %sym_names; # Symbols by name
my $num = 1; # Number of symbols
my @spam;
my @ham;
my $max_samples = -1;
my $split = 1;
my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN>
my $score_spam = 12;
my $score_ham = -6;
my $max_samples = -1;
my $split = 1;
my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN>
my $score_spam = 12;
my $score_ham = -6;

sub process {
my ($input, $spam, $ham) = @_;
my ( $input, $spam, $ham ) = @_;
my $samples = 0;

while(<$input>) {
if (!$preprocessed) {
while (<$input>) {
if ( !$preprocessed ) {
if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) {
if ($1 > $score_spam) {
if ( $1 > $score_spam ) {
$_ = "$1:1: $3";
}
elsif ($1 < $score_ham) {
elsif ( $1 < $score_ham ) {
$_ = "$1:0: $3\n";
}
else {
@@ -47,7 +47,7 @@ sub process {

my $is_spam = 0;

if ($2 == 1) {
if ( $2 == 1 ) {
$is_spam = 1;
}

@@ -56,13 +56,13 @@ sub process {

foreach my $sym (@ar) {
chomp $sym;
if (!$sym_idx{$sym}) {
$sym_idx{$sym} = $num;
if ( !$sym_idx{$sym} ) {
$sym_idx{$sym} = $num;
$sym_names{$num} = $sym;
$num++;
}

$sample{$sym_idx{$sym}} = 1;
$sample{ $sym_idx{$sym} } = 1;
}

if ($is_spam) {
@@ -73,32 +73,31 @@ sub process {
}

$samples++;
if ($max_samples > 0 && $samples > $max_samples) {
if ( $max_samples > 0 && $samples > $max_samples ) {
return;
}
}
}

# Shuffle array
sub fisher_yates_shuffle
{
sub fisher_yates_shuffle {
my $array = shift;
my $i = @$array;
my $i = @$array;

while ( --$i ) {
my $j = int rand( $i + 1 );
@$array[$i, $j] = @$array[$j, $i];
@$array[ $i, $j ] = @$array[ $j, $i ];
}
}

# Train network
sub train {
my ($ann, $sample, $result) = @_;
my ( $ann, $sample, $result ) = @_;

my @row;

for (my $i = 1; $i < $num; $i++) {
if ($sample->{$i}) {
for ( my $i = 1 ; $i < $num ; $i++ ) {
if ( $sample->{$i} ) {
push @row, 1;
}
else {
@@ -108,16 +107,16 @@ sub train {

#print "@row -> @{$result}\n";

$ann->train(\@row, \@{$result});
$ann->train( \@row, \@{$result} );
}

sub test {
my ($ann, $sample) = @_;
my ( $ann, $sample ) = @_;

my @row;

for (my $i = 1; $i < $num; $i++) {
if ($sample->{$i}) {
for ( my $i = 1 ; $i < $num ; $i++ ) {
if ( $sample->{$i} ) {
push @row, 1;
}
else {
@@ -125,117 +124,120 @@ sub test {
}
}

my $ret = $ann->run(\@row);
my $ret = $ann->run( \@row );

return $ret;
}

my %opts;
getopts('o:i:s:n:t:hpS:H:', \%opts);
getopts( 'o:i:s:n:t:hpS:H:', \%opts );

if ($opts{'h'}) {
if ( $opts{'h'} ) {
print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n";
exit;
}

my $input = *STDIN;

if ($opts{'i'}) {
open($input, '<', $opts{'i'}) or die "cannot open $opts{i}";
if ( $opts{'i'} ) {
open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}";
}

if ($opts{'n'}) {
if ( $opts{'n'} ) {
$max_samples = $opts{'n'};
}

if ($opts{'t'}) {
if ( $opts{'t'} ) {

# Test split
$split = $opts{'t'};
}
if ($opts{'p'}) {
if ( $opts{'p'} ) {
$preprocessed = 1;
}

if ($opts{'H'}) {
if ( $opts{'H'} ) {
$score_ham = $opts{'H'};
}

if ($opts{'S'}) {
if ( $opts{'S'} ) {
$score_spam = $opts{'S'};
}

# ham_prob, spam_prob
my @spam_out = (1);
my @ham_out = (0);
my @ham_out = (0);

process($input, \@spam, \@ham);
fisher_yates_shuffle(\@spam);
fisher_yates_shuffle(\@ham);
process( $input, \@spam, \@ham );
fisher_yates_shuffle( \@spam );
fisher_yates_shuffle( \@ham );

my $nspam = int(scalar(@spam) / $split);
my $nham = int(scalar(@ham) / $split);
my $nspam = int( scalar(@spam) / $split );
my $nham = int( scalar(@ham) / $split );

my $ann = AI::FANN->new_standard($num - 1, ($num + 2) / 2, 1);
my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 );

my @train_data;

# Train ANN
for (my $i = 0; $i < $nham; $i++) {
for ( my $i = 0 ; $i < $nham ; $i++ ) {
push @train_data, [ $ham[$i], \@ham_out ];
}

for (my $i = 0; $i < $nspam; $i++) {
for ( my $i = 0 ; $i < $nspam ; $i++ ) {
push @train_data, [ $spam[$i], \@spam_out ];
}

fisher_yates_shuffle(\@train_data);
fisher_yates_shuffle( \@train_data );

foreach my $train_row (@train_data) {
train($ann, @{$train_row}[0], @{$train_row}[1]);
train( $ann, @{$train_row}[0], @{$train_row}[1] );
}

print "Trained $nspam SPAM and $nham HAM samples\n";

# Now run fann
if ($split > 1) {
my $sample = 0.0;
if ( $split > 1 ) {
my $sample = 0.0;
my $correct = 0.0;
for (my $i = $nham; $i < $nham * $split; $i++) {
my $ret = test($ann, $ham[$i]);
for ( my $i = $nham ; $i < $nham * $split ; $i++ ) {
my $ret = test( $ann, $ham[$i] );

#print "@{$ret}\n";
if (@{$ret}[0] < 0.5) {
if ( @{$ret}[0] < 0.5 ) {
$correct++;
}
$sample++;
}

print "Tested $sample HAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n";
print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";

$sample = 0.0;
$sample = 0.0;
$correct = 0.0;

for (my $i = $nspam; $i < $nspam * $split; $i++) {
my $ret = test($ann, $spam[$i]);
for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) {
my $ret = test( $ann, $spam[$i] );

#print "@{$ret}\n";
if (@{$ret}[0] > 0.5) {
if ( @{$ret}[0] > 0.5 ) {
$correct++;
}
$sample++;
}

print "Tested $sample SPAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n";
print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
}

if ($opts{'o'}) {
$ann->save($opts{'o'}) or die "cannot save ann into $opts{o}";
if ( $opts{'o'} ) {
$ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}";
}

if ($opts{'s'}) {
open(my $scores, '>',
$opts{'s'}) or die "cannot open score file $opts{'s'}";
if ( $opts{'s'} ) {
open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}";
print $scores "{";
for (my $i = 1; $i < $num; $i++) {
for ( my $i = 1 ; $i < $num ; $i++ ) {
my $n = $i - 1;
if ($i != $num - 1) {
if ( $i != $num - 1 ) {
print $scores "\"$sym_names{$i}\":$n,";
}
else {

+ 583
- 585
utils/rspamd_stats.pl
File diff suppressed because it is too large
View File


Loading…
Cancel
Save