diff options
-rw-r--r-- | .gitignore | 2 | ||||
-rw-r--r-- | .tidyallrc | 25 | ||||
-rw-r--r-- | utils/asn.pl | 252 | ||||
-rw-r--r-- | utils/cgp_rspamd.pl | 2 | ||||
-rw-r--r-- | utils/classifier_test.pl | 697 | ||||
-rwxr-xr-x | utils/fann_train.pl | 138 | ||||
-rwxr-xr-x | utils/rspamd_stats.pl | 1168 |
7 files changed, 1148 insertions, 1136 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..b8c677bfc --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +# Code::TidyAll +/.tidyall.d/ diff --git a/.tidyallrc b/.tidyallrc new file mode 100644 index 000000000..272ba1936 --- /dev/null +++ b/.tidyallrc @@ -0,0 +1,25 @@ +; Run "tidyall -a" to process all files. +; Run "tidyall -g" to process all added or modified files in the current git working directory. + +; Ignore third-party code +ignore = contrib/**/* doc/doxydown/doxydown.pl + +;[PerlCritic] +;select = **/*.{pl,pm,t} + +[PerlTidy] +select = **/*.{pl,pm,t} +argv = -l=120 + +[PodChecker] +select = **/*.{pl,pm,pod} + +;[PodSpell] +;select = **/*.{pl,pm,pod} + +;[PodTidy] +;select = **/*.{pl,pm,pod} +;argv = --columns=120 + +[Test::Vars] +select = **/*.{pl,pm,t} diff --git a/utils/asn.pl b/utils/asn.pl index 11bb6746b..b5f2ca41e 100644 --- a/utils/asn.pl +++ b/utils/asn.pl @@ -16,14 +16,14 @@ $LWP::Simple::ua->show_progress(1); $Net::MRT::USE_RFC4760 = -1; my %config = ( - asn_sources => [ - 'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest', - 'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest', - 'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest', - 'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest', - 'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest' - ], - bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz'] + asn_sources => [ + 'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest', + 'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest', + 'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest', + 'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest', + 'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest' + ], + bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz'] ); my $download_asn = 0; @@ -38,171 +38,169 @@ my $v4_zone = "asn.rspamd.com"; my $v6_zone = "asn6.rspamd.com"; my $v4_file = "asn.zone"; my $v6_file = "asn6.zone"; -my $ns_servers = ["asn-ns.rspamd.com", "asn-ns2.rspamd.com"]; +my $ns_servers = [ "asn-ns.rspamd.com", "asn-ns2.rspamd.com" ]; GetOptions( - "download-asn" => \$download_asn, - "download-bgp" => \$download_bgp, - "4!" => \$v4, - "6!" => \$v6, - "parse!" => \$parse, - "target=s" => \$download_target, - "zone-v4=s" => \$v4_zone, - "zone-v6=s" => \$v6_zone, - "file-v4=s" => \$v4_file, - "file-v6=s" => \$v6_file, - "ns-server=s@" => \$ns_servers, - "help|?" => \$help, - "man" => \$man + "download-asn" => \$download_asn, + "download-bgp" => \$download_bgp, + "4!" => \$v4, + "6!" => \$v6, + "parse!" => \$parse, + "target=s" => \$download_target, + "zone-v4=s" => \$v4_zone, + "zone-v6=s" => \$v6_zone, + "file-v4=s" => \$v4_file, + "file-v6=s" => \$v6_file, + "ns-server=s@" => \$ns_servers, + "help|?" => \$help, + "man" => \$man ) or pod2usage(2); pod2usage(1) if $help; pod2usage( -exitval => 0, -verbose => 2 ) if $man; sub download_file { - my ($u) = @_; + my ($u) = @_; - print "Fetching $u\n"; - my $ff = File::Fetch->new( uri => $u ); - my $where = $ff->fetch( to => $download_target ) or die $ff->error; + print "Fetching $u\n"; + my $ff = File::Fetch->new( uri => $u ); + my $where = $ff->fetch( to => $download_target ) or die $ff->error; - return $where; + return $where; } if ($download_asn) { - foreach my $u ( @{ $config{'asn_sources'} } ) { - download_file($u); - } + foreach my $u ( @{ $config{'asn_sources'} } ) { + download_file($u); + } } if ($download_bgp) { - foreach my $u ( @{ $config{'bgp_sources'} } ) { - download_file($u); - } + foreach my $u ( @{ $config{'bgp_sources'} } ) { + download_file($u); + } } if ( !$parse ) { - exit 0; + exit 0; } my $v4_fh; my $v6_fh; if ($v4) { - open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!"; - print $v4_fh - "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; - foreach my $ns (@{$ns_servers}) { - print $v4_fh "\$NS 43200 $ns\n"; - } + open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!"; + print $v4_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; + foreach my $ns ( @{$ns_servers} ) { + print $v4_fh "\$NS 43200 $ns\n"; + } } if ($v6) { - open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!"; - print $v6_fh - "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; - foreach my $ns (@{$ns_servers}) { - print $v6_fh "\$NS 43200 $ns\n"; - } + open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!"; + print $v6_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n"; + foreach my $ns ( @{$ns_servers} ) { + print $v6_fh "\$NS 43200 $ns\n"; + } } # Now load BGP data my $networks = {}; foreach my $u ( @{ $config{'bgp_sources'} } ) { - my $parsed = URI->new($u); - my $fname = $download_target . '/' . basename( $parsed->path ); - open( my $fh, "<:gzip", $fname ) - or die "Cannot open $fname: $!"; - - while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) { - if ( $dd->{'prefix'} && $dd->{'bits'} ) { - next if $dd->{'subtype'} == 2 and !$v4; - next if $dd->{'subtype'} == 4 and !$v6; - my $entry = $dd->{'entries'}->[0]; - my $net = $dd->{'prefix'} . '/' . $dd->{'bits'}; - if ( $entry && $entry->{'AS_PATH'} ) { - my $as = $entry->{'AS_PATH'}->[-1]; - if (ref($as) eq "ARRAY") { - $as = @{$as}[0]; + my $parsed = URI->new($u); + my $fname = $download_target . '/' . basename( $parsed->path ); + open( my $fh, "<:gzip", $fname ) + or die "Cannot open $fname: $!"; + + while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) { + if ( $dd->{'prefix'} && $dd->{'bits'} ) { + next if $dd->{'subtype'} == 2 and !$v4; + next if $dd->{'subtype'} == 4 and !$v6; + my $entry = $dd->{'entries'}->[0]; + my $net = $dd->{'prefix'} . '/' . $dd->{'bits'}; + if ( $entry && $entry->{'AS_PATH'} ) { + my $as = $entry->{'AS_PATH'}->[-1]; + if ( ref($as) eq "ARRAY" ) { + $as = @{$as}[0]; + } + + if ( !$networks->{$as} ) { + if ( $dd->{'subtype'} == 2 ) { + $networks->{$as} = { nets_v4 => [$net], nets_v6 => [] }; + } + else { + $networks->{$as} = { nets_v6 => [$net], nets_v4 => [] }; + } + } + else { + if ( $dd->{'subtype'} == 2 ) { + push @{ $networks->{$as}->{'nets_v4'} }, $net; + } + else { + push @{ $networks->{$as}->{'nets_v6'} }, $net; + } + } + } } - - if ( !$networks->{$as} ) { - if ( $dd->{'subtype'} == 2 ) { - $networks->{$as} = { nets_v4 => [$net], nets_v6 => [] }; - } - else { - $networks->{$as} = { nets_v6 => [$net], nets_v4 => [] }; - } - } - else { - if ( $dd->{'subtype'} == 2 ) { - push @{ $networks->{$as}->{'nets_v4'} }, $net; - } - else { - push @{ $networks->{$as}->{'nets_v6'} }, $net; - } - } - } } - } } # Now roughly detect countries foreach my $u ( @{ $config{'asn_sources'} } ) { - my $parsed = URI->new($u); - my $fname = $download_target . '/' . basename( $parsed->path ); - open( my $fh, "<", $fname ) or die "Cannot open $fname: $!"; - - while (<$fh>) { - next if /^\#/; - chomp; - my @elts = split /\|/; - - if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) { - my $as_start = int( $elts[3] ); - my $as_end = $as_start + int( $elts[4] ); - - for ( my $as = $as_start ; $as < $as_end ; $as++ ) { - my $real_as = $as; - - if (ref($as) eq "ARRAY") { - $real_as = @{$as}[0]; + my $parsed = URI->new($u); + my $fname = $download_target . '/' . basename( $parsed->path ); + open( my $fh, "<", $fname ) or die "Cannot open $fname: $!"; + + while (<$fh>) { + next if /^\#/; + chomp; + my @elts = split /\|/; + + if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) { + my $as_start = int( $elts[3] ); + my $as_end = $as_start + int( $elts[4] ); + + for ( my $as = $as_start ; $as < $as_end ; $as++ ) { + my $real_as = $as; + + if ( ref($as) eq "ARRAY" ) { + $real_as = @{$as}[0]; + } + + if ( $networks->{"$real_as"} ) { + $networks->{"$real_as"}->{'country'} = $elts[1]; + $networks->{"$real_as"}->{'rir'} = $elts[0]; + } + } } - - if ( $networks->{"$real_as"} ) { - $networks->{"$real_as"}->{'country'} = $elts[1]; - $networks->{"$real_as"}->{'rir'} = $elts[0]; - } - } } - } } while ( my ( $k, $v ) = each( %{$networks} ) ) { - if ($v4) { - foreach my $n ( @{ $v->{'nets_v4'} } ) { - - # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 - if ( $v->{'country'} ) { - printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; - } - else { - printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; - } + if ($v4) { + foreach my $n ( @{ $v->{'nets_v4'} } ) { + + # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 + if ( $v->{'country'} ) { + printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; + } + else { + printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; + } + } } - } - if ($v6) { - foreach my $n ( @{ $v->{'nets_v6'} } ) { - - # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 - if ( $v->{'country'} ) { - printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; - } - else { - printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; - } + if ($v6) { + foreach my $n ( @{ $v->{'nets_v6'} } ) { + + # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8 + if ( $v->{'country'} ) { + printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'}; + } + else { + printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN'; + } + } } - } } __END__ diff --git a/utils/cgp_rspamd.pl b/utils/cgp_rspamd.pl index b1d30b905..e55ac5791 100644 --- a/utils/cgp_rspamd.pl +++ b/utils/cgp_rspamd.pl @@ -359,6 +359,4 @@ protocol. On scan requests, this filter can query Rspamd to process a message. B<cgp_rspamd> can tell CGP to add header or reject SPAM messages depending on Rspamd scan result. -=back - =cut diff --git a/utils/classifier_test.pl b/utils/classifier_test.pl index 2dbb4e903..08febe257 100644 --- a/utils/classifier_test.pl +++ b/utils/classifier_test.pl @@ -30,438 +30,428 @@ my $man; my $help; GetOptions( - "spam|s=s" => \$spam_dir, - "ham|h=s" => \$ham_dir, - "spam-symbol=s" => \$spam_symbol, - "ham-symbol=s" => \$ham_symbol, - "classifier|c=s" => \$classifier, - "timeout|t=f" => \$timeout, - "parallel|p=i" => \$parallel, - "train-fraction|t=f" => \$train_fraction, - "bogofilter|b" => \$use_bogofilter, - "dspam|d" => \$use_dspam, - "check-only" => \$check_only, - "help|?" => \$help, - "man" => \$man + "spam|s=s" => \$spam_dir, + "ham|h=s" => \$ham_dir, + "spam-symbol=s" => \$spam_symbol, + "ham-symbol=s" => \$ham_symbol, + "classifier|c=s" => \$classifier, + "timeout|t=f" => \$timeout, + "parallel|p=i" => \$parallel, + "train-fraction|t=f" => \$train_fraction, + "bogofilter|b" => \$use_bogofilter, + "dspam|d" => \$use_dspam, + "check-only" => \$check_only, + "help|?" => \$help, + "man" => \$man ) or pod2usage(2); pod2usage(1) if $help; pod2usage( -exitval => 0, -verbose => 2 ) if $man; sub read_dir_files { - my ( $dir, $target ) = @_; - opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; - while ( my $file = readdir $dh ) { - if ( -f "$dir/$file" ) { - push @{$target}, "$dir/$file"; + my ( $dir, $target ) = @_; + opendir( my $dh, $dir ) or die "cannot open dir $dir: $!"; + while ( my $file = readdir $dh ) { + if ( -f "$dir/$file" ) { + push @{$target}, "$dir/$file"; + } } - } } sub shuffle_array { - my ($ar) = @_; + my ($ar) = @_; - for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { - if ( $i > 1 ) { - my $sel = int( rand( $i - 1 ) ); - ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); + for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) { + if ( $i > 1 ) { + my $sel = int( rand( $i - 1 ) ); + ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] ); + } } - } } sub learn_rspamc { - my ( $files, $spam ) = @_; - my $processed = 0; - - my $cmd = $spam ? "learn_spam" : "learn_ham"; - my $args_quoted = shell_quote @{$files}; - open( - my $p, -"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" - ) or die "cannot spawn $rspamc: $!"; - - while (<$p>) { - my $res = eval('decode_json($_)'); - if ( $res && $res->{'success'} ) { - $processed++; + my ( $files, $spam ) = @_; + my $processed = 0; + + my $cmd = $spam ? "learn_spam" : "learn_ham"; + my $args_quoted = shell_quote @{$files}; + open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" ) + or die "cannot spawn $rspamc: $!"; + + while (<$p>) { + my $res = eval('decode_json($_)'); + if ( $res && $res->{'success'} ) { + $processed++; + } } - } - return $processed; + return $processed; } sub learn_bogofilter { - my ( $files, $spam ) = @_; - my $processed = 0; - - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; - my $fl = $spam ? "-s" : "-n"; - `$bogofilter -I $args_quoted $fl`; - if ( $? == 0 ) { - $processed++; + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "-s" : "-n"; + `$bogofilter -I $args_quoted $fl`; + if ( $? == 0 ) { + $processed++; + } } - } - return $processed; + return $processed; } sub learn_dspam { - my ( $files, $spam ) = @_; - my $processed = 0; - - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; - my $fl = $spam ? "--class=spam" : "--class=innocent"; - open( my $p, - "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) - or die "cannot run $dspam: $!"; - - open( my $inp, "< $f" ); - while (<$inp>) { - print $p $_; + my ( $files, $spam ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + my $fl = $spam ? "--class=spam" : "--class=innocent"; + open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" ) + or die "cannot run $dspam: $!"; + + open( my $inp, "< $f" ); + while (<$inp>) { + print $p $_; + } } - } - return $processed; + return $processed; } sub learn_samples { - my ( $ar_ham, $ar_spam ) = @_; - my $len; - my $processed = 0; - my $total = 0; - my $learn_func; - - my @files_spam; - my @files_ham; - - if ($use_dspam) { - $learn_func = \&learn_dspam; - } - elsif ($use_bogofilter) { - $learn_func = \&learn_bogofilter; - } - else { - $learn_func = \&learn_rspamc; - } - - $len = int( scalar @{$ar_ham} * $train_fraction ); - my @cur_vec; - - # Shuffle spam and ham samples - for ( my $i = 0 ; $i < $len ; $i++ ) { - if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { - push @cur_vec, @{$ar_ham}[$i]; - push @files_ham, [@cur_vec]; - @cur_vec = (); - $total++; - } - else { - push @cur_vec, @{$ar_ham}[$i]; - } - } - - $len = int( scalar @{$ar_spam} * $train_fraction ); - @cur_vec = (); - for ( my $i = 0 ; $i < $len ; $i++ ) { - if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { - push @cur_vec, @{$ar_spam}[$i]; - push @files_spam, [@cur_vec]; - @cur_vec = (); - $total++; - } - else { - push @cur_vec, @{$ar_spam}[$i]; + my ( $ar_ham, $ar_spam ) = @_; + my $len; + my $processed = 0; + my $total = 0; + my $learn_func; + + my @files_spam; + my @files_ham; + + if ($use_dspam) { + $learn_func = \&learn_dspam; } - } - - for ( my $i = 0 ; $i < $total ; $i++ ) { - my $args; - my $spam; - - if ( $i % 2 == 0 ) { - $args = pop @files_spam; - - if ( !$args ) { - $args = pop @files_ham; - $spam = 0; - } - else { - $spam = 1; - } + elsif ($use_bogofilter) { + $learn_func = \&learn_bogofilter; } else { - $args = pop @files_ham; - if ( !$args ) { - $args = pop @files_spam; - $spam = 1; - } - else { - $spam = 0; - } + $learn_func = \&learn_rspamc; } - my $r = $learn_func->( $args, $spam ); - if ($r) { - $processed += $r; + $len = int( scalar @{$ar_ham} * $train_fraction ); + my @cur_vec; + + # Shuffle spam and ham samples + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_ham}[$i]; + push @files_ham, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_ham}[$i]; + } } - } - return $processed; -} + $len = int( scalar @{$ar_spam} * $train_fraction ); + @cur_vec = (); + for ( my $i = 0 ; $i < $len ; $i++ ) { + if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) { + push @cur_vec, @{$ar_spam}[$i]; + push @files_spam, [@cur_vec]; + @cur_vec = (); + $total++; + } + else { + push @cur_vec, @{$ar_spam}[$i]; + } + } -sub check_rspamc { - my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + for ( my $i = 0 ; $i < $total ; $i++ ) { + my $args; + my $spam; - my $args_quoted = shell_quote @{$files}; - my $processed = 0; + if ( $i % 2 == 0 ) { + $args = pop @files_spam; - open( - my $p, -"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" - ) or die "cannot spawn $rspamc: $!"; - - while (<$p>) { - my $res = eval('decode_json($_)'); - if ( $res && $res->{'default'} ) { - $processed++; - - if ($spam) { - if ( $res->{'default'}->{$ham_symbol} ) { - my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; - if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { - my $percentage = int($1); - if ( $percentage >= $rspamc_prob_trigger ) { - $$fp_cnt++; + if ( !$args ) { + $args = pop @files_ham; + $spam = 0; + } + else { + $spam = 1; } - } - else { - $$fp_cnt++; - } - } - elsif ( !$res->{'default'}->{$spam_symbol} ) { - $$fn_cnt++; } else { - $$detected_cnt++; - } - } - else { - if ( $res->{'default'}->{$spam_symbol} ) { - my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; - if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { - - my $percentage = int($1); - if ( $percentage >= $rspamc_prob_trigger ) { - $$fp_cnt++; + $args = pop @files_ham; + if ( !$args ) { + $args = pop @files_spam; + $spam = 1; + } + else { + $spam = 0; } - } - else { - $$fp_cnt++; - } - } - elsif ( !$res->{'default'}->{$ham_symbol} ) { - $$fn_cnt++; } - else { - $$detected_cnt++; + + my $r = $learn_func->( $args, $spam ); + if ($r) { + $processed += $r; } - } } - } - return $processed; + return $processed; } -sub check_bogofilter { - my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; - my $processed = 0; +sub check_rspamc { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; + my $args_quoted = shell_quote @{$files}; + my $processed = 0; - open( my $p, "$bogofilter -t -I $args_quoted |" ) - or die "cannot spawn $bogofilter: $!"; + open( + my $p, +"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |" + ) or die "cannot spawn $rspamc: $!"; while (<$p>) { - if ( $_ =~ /^([SHU])\s+.*$/ ) { - $processed++; - - if ($spam) { - if ( $1 eq 'H' ) { - $$fp_cnt++; - } - elsif ( $1 eq 'U' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } - } - else { - if ( $1 eq 'S' ) { - $$fp_cnt++; - } - elsif ( $1 eq 'U' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } + my $res = eval('decode_json($_)'); + if ( $res && $res->{'default'} ) { + $processed++; + + if ($spam) { + if ( $res->{'default'}->{$ham_symbol} ) { + my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$spam_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $res->{'default'}->{$spam_symbol} ) { + my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0]; + if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) { + + my $percentage = int($1); + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + else { + $$fp_cnt++; + } + } + elsif ( !$res->{'default'}->{$ham_symbol} ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } } - } } - } - return $processed; + return $processed; } -sub check_dspam { - my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; - my $processed = 0; +sub check_bogofilter { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; + + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; + + open( my $p, "$bogofilter -t -I $args_quoted |" ) + or die "cannot spawn $bogofilter: $!"; + + while (<$p>) { + if ( $_ =~ /^([SHU])\s+.*$/ ) { + $processed++; + + if ($spam) { + if ( $1 eq 'H' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'S' ) { + $$fp_cnt++; + } + elsif ( $1 eq 'U' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + } + } + } - foreach my $f ( @{$files} ) { - my $args_quoted = shell_quote $f; + return $processed; +} - my $pid = open2( *Reader, *Writer, - "$dspam --user nobody --classify --stdout --mode=notrain" ); - open( my $inp, "< $f" ); - while (<$inp>) { - print Writer $_; - } - close Writer; +sub check_dspam { + my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_; + my $processed = 0; - while (<Reader>) { - if ( $_ =~ -qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) - ) - { - $processed++; - my $percentage = int($2 * 100.0); + foreach my $f ( @{$files} ) { + my $args_quoted = shell_quote $f; - if ($spam) { - if ( $1 eq 'Innocent') { - if ( $percentage <= (100 - $rspamc_prob_trigger) ) { - $$fp_cnt++; - } - } - elsif ( $1 ne 'Spam' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } + my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" ); + open( my $inp, "< $f" ); + while (<$inp>) { + print Writer $_; } - else { - if ( $1 eq 'Spam' ) { - if ( $percentage >= $rspamc_prob_trigger ) { - $$fp_cnt++; + close Writer; + + while (<Reader>) { + if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) { + $processed++; + my $percentage = int( $2 * 100.0 ); + + if ($spam) { + if ( $1 eq 'Innocent' ) { + if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Spam' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } + else { + if ( $1 eq 'Spam' ) { + if ( $percentage >= $rspamc_prob_trigger ) { + $$fp_cnt++; + } + } + elsif ( $1 ne 'Innocent' ) { + $$fn_cnt++; + } + else { + $$detected_cnt++; + } + } } - } - elsif ( $1 ne 'Innocent' ) { - $$fn_cnt++; - } - else { - $$detected_cnt++; - } } - } + close Reader; + waitpid( $pid, 0 ); } - close Reader; - waitpid( $pid, 0 ); - } - return $processed; + return $processed; } sub cross_validate { - my ($hr) = @_; - my $args = ""; - my $processed = 0; - my $fp_spam = 0; - my $fn_spam = 0; - my $fp_ham = 0; - my $fn_ham = 0; - my $total_spam = 0; - my $total_ham = 0; - my $detected_spam = 0; - my $detected_ham = 0; - my $i = 0; - my $len = scalar keys %{$hr}; - my @files_spam; - my @files_ham; - my @cur_spam; - my @cur_ham; - my $check_func; - - if ($use_dspam) { - $check_func = \&check_dspam; - } - elsif ($use_bogofilter) { - $check_func = \&check_bogofilter; - } - else { - $check_func = \&check_rspamc; - } - - while ( my ( $fn, $spam ) = each( %{$hr} ) ) { - if ($spam) { - if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { - push @cur_spam, $fn; - push @files_spam, [@cur_spam]; - @cur_spam = (); - } - else { - push @cur_spam, $fn; - } + my ($hr) = @_; + my $args = ""; + my $processed = 0; + my $fp_spam = 0; + my $fn_spam = 0; + my $fp_ham = 0; + my $fn_ham = 0; + my $total_spam = 0; + my $total_ham = 0; + my $detected_spam = 0; + my $detected_ham = 0; + my $i = 0; + my $len = scalar keys %{$hr}; + my @files_spam; + my @files_ham; + my @cur_spam; + my @cur_ham; + my $check_func; + + if ($use_dspam) { + $check_func = \&check_dspam; + } + elsif ($use_bogofilter) { + $check_func = \&check_bogofilter; } else { - if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { - push @cur_ham, $fn; - push @files_ham, [@cur_ham]; - @cur_ham = (); - } - else { - push @cur_ham, $fn; - } + $check_func = \&check_rspamc; + } + + while ( my ( $fn, $spam ) = each( %{$hr} ) ) { + if ($spam) { + if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) { + push @cur_spam, $fn; + push @files_spam, [@cur_spam]; + @cur_spam = (); + } + else { + push @cur_spam, $fn; + } + } + else { + if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) { + push @cur_ham, $fn; + push @files_ham, [@cur_ham]; + @cur_ham = (); + } + else { + push @cur_ham, $fn; + } + } } - } - shuffle_array( \@files_spam ); + shuffle_array( \@files_spam ); - foreach my $fn (@files_spam) { - my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); - $total_spam += $r; - $processed += $r; - } + foreach my $fn (@files_spam) { + my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam ); + $total_spam += $r; + $processed += $r; + } - shuffle_array( \@files_ham ); + shuffle_array( \@files_ham ); - foreach my $fn (@files_ham) { - my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); - $total_ham += $r; - $processed += $r; - } + foreach my $fn (@files_ham) { + my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham ); + $total_ham += $r; + $processed += $r; + } - printf "Scanned %d messages + printf "Scanned %d messages %d spam messages (%d detected) -%d ham messages (%d detected)\n", - $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; +%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham; - printf "\nHam FP rate: %.2f%% (%d messages) -Ham FN rate: %.2f%% (%d messages)\n", - $fp_ham / $total_ham * 100.0, $fp_ham, - $fn_ham / $total_ham * 100.0, $fn_ham; + printf "\nHam FP rate: %.2f%% (%d messages) +Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham; - printf "\nSpam FP rate: %.2f%% (%d messages) + printf "\nSpam FP rate: %.2f%% (%d messages) Spam FN rate: %.2f%% (%d messages)\n", - $fp_spam / $total_spam * 100.0, $fp_spam, - $fn_spam / $total_spam * 100.0, $fn_spam; + $fp_spam / $total_spam * 100.0, $fp_spam, + $fn_spam / $total_spam * 100.0, $fn_spam; } if ( !$spam_dir || !$ham_dir ) { - die "spam or/and ham directories are not specified"; + die "spam or/and ham directories are not specified"; } my @spam_samples; @@ -473,24 +463,23 @@ shuffle_array( \@spam_samples ); shuffle_array( \@ham_samples ); if ( !$check_only ) { - my $learned = 0; - my $t0 = [gettimeofday]; - $learned = learn_samples( \@ham_samples, \@spam_samples ); - my $t1 = [gettimeofday]; + my $learned = 0; + my $t0 = [gettimeofday]; + $learned = learn_samples( \@ham_samples, \@spam_samples ); + my $t1 = [gettimeofday]; - printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", - $learned, tv_interval( $t0, $t1 ); + printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 ); } my %validation_set; my $len = int( scalar @spam_samples * $train_fraction ); for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { - $validation_set{ $spam_samples[$i] } = 1; + $validation_set{ $spam_samples[$i] } = 1; } $len = int( scalar @ham_samples * $train_fraction ); for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) { - $validation_set{ $ham_samples[$i] } = 0; + $validation_set{ $ham_samples[$i] } = 0; } cross_validate( \%validation_set ); diff --git a/utils/fann_train.pl b/utils/fann_train.pl index 46b539489..2ce422eb4 100755 --- a/utils/fann_train.pl +++ b/utils/fann_train.pl @@ -8,28 +8,28 @@ use warnings FATAL => 'all'; use AI::FANN qw(:all); use Getopt::Std; -my %sym_idx; # Symbols by index -my %sym_names; # Symbols by name -my $num = 1; # Number of symbols +my %sym_idx; # Symbols by index +my %sym_names; # Symbols by name +my $num = 1; # Number of symbols my @spam; my @ham; -my $max_samples = -1; -my $split = 1; -my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN> -my $score_spam = 12; -my $score_ham = -6; +my $max_samples = -1; +my $split = 1; +my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN> +my $score_spam = 12; +my $score_ham = -6; sub process { - my ($input, $spam, $ham) = @_; + my ( $input, $spam, $ham ) = @_; my $samples = 0; - while(<$input>) { - if (!$preprocessed) { + while (<$input>) { + if ( !$preprocessed ) { if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) { - if ($1 > $score_spam) { + if ( $1 > $score_spam ) { $_ = "$1:1: $3"; } - elsif ($1 < $score_ham) { + elsif ( $1 < $score_ham ) { $_ = "$1:0: $3\n"; } else { @@ -47,7 +47,7 @@ sub process { my $is_spam = 0; - if ($2 == 1) { + if ( $2 == 1 ) { $is_spam = 1; } @@ -56,13 +56,13 @@ sub process { foreach my $sym (@ar) { chomp $sym; - if (!$sym_idx{$sym}) { - $sym_idx{$sym} = $num; + if ( !$sym_idx{$sym} ) { + $sym_idx{$sym} = $num; $sym_names{$num} = $sym; $num++; } - $sample{$sym_idx{$sym}} = 1; + $sample{ $sym_idx{$sym} } = 1; } if ($is_spam) { @@ -73,32 +73,31 @@ sub process { } $samples++; - if ($max_samples > 0 && $samples > $max_samples) { + if ( $max_samples > 0 && $samples > $max_samples ) { return; } } } # Shuffle array -sub fisher_yates_shuffle -{ +sub fisher_yates_shuffle { my $array = shift; - my $i = @$array; + my $i = @$array; while ( --$i ) { my $j = int rand( $i + 1 ); - @$array[$i, $j] = @$array[$j, $i]; + @$array[ $i, $j ] = @$array[ $j, $i ]; } } # Train network sub train { - my ($ann, $sample, $result) = @_; + my ( $ann, $sample, $result ) = @_; my @row; - for (my $i = 1; $i < $num; $i++) { - if ($sample->{$i}) { + for ( my $i = 1 ; $i < $num ; $i++ ) { + if ( $sample->{$i} ) { push @row, 1; } else { @@ -108,16 +107,16 @@ sub train { #print "@row -> @{$result}\n"; - $ann->train(\@row, \@{$result}); + $ann->train( \@row, \@{$result} ); } sub test { - my ($ann, $sample) = @_; + my ( $ann, $sample ) = @_; my @row; - for (my $i = 1; $i < $num; $i++) { - if ($sample->{$i}) { + for ( my $i = 1 ; $i < $num ; $i++ ) { + if ( $sample->{$i} ) { push @row, 1; } else { @@ -125,117 +124,120 @@ sub test { } } - my $ret = $ann->run(\@row); + my $ret = $ann->run( \@row ); return $ret; } my %opts; -getopts('o:i:s:n:t:hpS:H:', \%opts); +getopts( 'o:i:s:n:t:hpS:H:', \%opts ); -if ($opts{'h'}) { +if ( $opts{'h'} ) { print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n"; exit; } my $input = *STDIN; -if ($opts{'i'}) { - open($input, '<', $opts{'i'}) or die "cannot open $opts{i}"; +if ( $opts{'i'} ) { + open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}"; } -if ($opts{'n'}) { +if ( $opts{'n'} ) { $max_samples = $opts{'n'}; } -if ($opts{'t'}) { +if ( $opts{'t'} ) { + # Test split $split = $opts{'t'}; } -if ($opts{'p'}) { +if ( $opts{'p'} ) { $preprocessed = 1; } -if ($opts{'H'}) { +if ( $opts{'H'} ) { $score_ham = $opts{'H'}; } -if ($opts{'S'}) { +if ( $opts{'S'} ) { $score_spam = $opts{'S'}; } # ham_prob, spam_prob my @spam_out = (1); -my @ham_out = (0); +my @ham_out = (0); -process($input, \@spam, \@ham); -fisher_yates_shuffle(\@spam); -fisher_yates_shuffle(\@ham); +process( $input, \@spam, \@ham ); +fisher_yates_shuffle( \@spam ); +fisher_yates_shuffle( \@ham ); -my $nspam = int(scalar(@spam) / $split); -my $nham = int(scalar(@ham) / $split); +my $nspam = int( scalar(@spam) / $split ); +my $nham = int( scalar(@ham) / $split ); -my $ann = AI::FANN->new_standard($num - 1, ($num + 2) / 2, 1); +my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 ); my @train_data; + # Train ANN -for (my $i = 0; $i < $nham; $i++) { +for ( my $i = 0 ; $i < $nham ; $i++ ) { push @train_data, [ $ham[$i], \@ham_out ]; } -for (my $i = 0; $i < $nspam; $i++) { +for ( my $i = 0 ; $i < $nspam ; $i++ ) { push @train_data, [ $spam[$i], \@spam_out ]; } -fisher_yates_shuffle(\@train_data); +fisher_yates_shuffle( \@train_data ); foreach my $train_row (@train_data) { - train($ann, @{$train_row}[0], @{$train_row}[1]); + train( $ann, @{$train_row}[0], @{$train_row}[1] ); } print "Trained $nspam SPAM and $nham HAM samples\n"; # Now run fann -if ($split > 1) { - my $sample = 0.0; +if ( $split > 1 ) { + my $sample = 0.0; my $correct = 0.0; - for (my $i = $nham; $i < $nham * $split; $i++) { - my $ret = test($ann, $ham[$i]); + for ( my $i = $nham ; $i < $nham * $split ; $i++ ) { + my $ret = test( $ann, $ham[$i] ); + #print "@{$ret}\n"; - if (@{$ret}[0] < 0.5) { + if ( @{$ret}[0] < 0.5 ) { $correct++; } $sample++; } - print "Tested $sample HAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n"; + print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n"; - $sample = 0.0; + $sample = 0.0; $correct = 0.0; - for (my $i = $nspam; $i < $nspam * $split; $i++) { - my $ret = test($ann, $spam[$i]); + for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) { + my $ret = test( $ann, $spam[$i] ); + #print "@{$ret}\n"; - if (@{$ret}[0] > 0.5) { + if ( @{$ret}[0] > 0.5 ) { $correct++; } $sample++; } - print "Tested $sample SPAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n"; + print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n"; } -if ($opts{'o'}) { - $ann->save($opts{'o'}) or die "cannot save ann into $opts{o}"; +if ( $opts{'o'} ) { + $ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}"; } -if ($opts{'s'}) { - open(my $scores, '>', - $opts{'s'}) or die "cannot open score file $opts{'s'}"; +if ( $opts{'s'} ) { + open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}"; print $scores "{"; - for (my $i = 1; $i < $num; $i++) { + for ( my $i = 1 ; $i < $num ; $i++ ) { my $n = $i - 1; - if ($i != $num - 1) { + if ( $i != $num - 1 ) { print $scores "\"$sym_names{$i}\":$n,"; } else { diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl index ac7b1349f..f97e35188 100755 --- a/utils/rspamd_stats.pl +++ b/utils/rspamd_stats.pl @@ -15,20 +15,20 @@ my @symbols_bidirectional; my @symbols_groups; my @symbols_ignored; my %groups; -my $reject_score = 15.0; -my $junk_score = 6.0; -my $diff_alpha = 0.1; -my $correlations = 0; -my $nrelated = 10; -my $log_file = ""; +my $reject_score = 15.0; +my $junk_score = 6.0; +my $diff_alpha = 0.1; +my $correlations = 0; +my $nrelated = 10; +my $log_file = ""; my $search_pattern = ""; -my $startTime=""; +my $startTime = ""; my $endTime; my $num_logs; my $exclude_logs = 0; -my $man = 0; -my $json = 0; -my $help = 0; +my $man = 0; +my $json = 0; +my $help = 0; # Associate file extensions with decompressors my %decompressor = ( @@ -39,44 +39,43 @@ my %decompressor = ( ); GetOptions( - "reject-score|r=f" => \$reject_score, - "junk-score|j=f" => \$junk_score, - "symbol|s=s@" => \@symbols_search, - "symbol-bidir|S=s@" => \@symbols_bidirectional, - "exclude|X=s@" => \@symbols_exclude, - "ignore=s@" => \@symbols_ignored, - "group|g=s@" => \@symbols_groups, - "log|l=s" => \$log_file, - "alpha-score|alpha|a=f" => \$diff_alpha, - "correlations|c" => \$correlations, - "nrelated=i" => \$nrelated, - "search-pattern=s" => \$search_pattern, - "start=s" => \$startTime, - "end=s" => \$endTime, - "num-logs|n=i" => \$num_logs, - "exclude-logs|x=i" => \$exclude_logs, - "json|j" => \$json, - "help|?" => \$help, - "man" => \$man + "reject-score|r=f" => \$reject_score, + "junk-score|j=f" => \$junk_score, + "symbol|s=s@" => \@symbols_search, + "symbol-bidir|S=s@" => \@symbols_bidirectional, + "exclude|X=s@" => \@symbols_exclude, + "ignore=s@" => \@symbols_ignored, + "group|g=s@" => \@symbols_groups, + "log|l=s" => \$log_file, + "alpha-score|alpha|a=f" => \$diff_alpha, + "correlations|c" => \$correlations, + "nrelated=i" => \$nrelated, + "search-pattern=s" => \$search_pattern, + "start=s" => \$startTime, + "end=s" => \$endTime, + "num-logs|n=i" => \$num_logs, + "exclude-logs|x=i" => \$exclude_logs, + "json|j" => \$json, + "help|?" => \$help, + "man" => \$man ) or pod2usage(2); pod2usage(1) if $help; -pod2usage(-exitval => 0, -verbose => 2) if $man; - +pod2usage( -exitval => 0, -verbose => 2 ) if $man; # Global vars -my $total = 0; -my $total_spam = 0; -my $total_junk = 0; -my $junk_symbols = 0; -my $spam_symbols = 0; -my $ham_symbols = 0; +my $total = 0; +my $total_spam = 0; +my $total_junk = 0; +my $junk_symbols = 0; +my $spam_symbols = 0; +my $ham_symbols = 0; my $ham_spam_change = 0; my $ham_junk_change = 0; my %sym_res; my $rspamd_log; -my $enabled = 0; -my $log_file_num = 1; +my $enabled = 0; +my $log_file_num = 1; my $spinner_update_time = 0; my %action; @@ -91,688 +90,689 @@ foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) } # Convert bidirectional symbols foreach my $s (@symbols_bidirectional) { - $bidir_match{$s} = { - spam => "${s}_SPAM", - ham => "${s}_HAM", - }; - push @symbols_search, $s unless grep /^$s$/, @symbols_search; + $bidir_match{$s} = { + spam => "${s}_SPAM", + ham => "${s}_HAM", + }; + push @symbols_search, $s unless grep /^$s$/, @symbols_search; } # Deal with groups my $group_id = 0; foreach my $g (@symbols_groups) { - my @symbols = split /,/,$g; - my $group_name = "group$group_id"; + my @symbols = split /,/, $g; + my $group_name = "group$group_id"; - foreach my $s (@symbols) { - $groups{$s} = $group_name; - push @symbols_search, $s unless grep /^$s$/, @symbols_search; - } + foreach my $s (@symbols) { + $groups{$s} = $group_name; + push @symbols_search, $s unless grep /^$s$/, @symbols_search; + } } @symbols_search = '.*' unless @symbols_search; -if ($log_file eq '-' || $log_file eq '') { - $rspamd_log = \*STDIN; - &ProcessLog(); +if ( $log_file eq '-' || $log_file eq '' ) { + $rspamd_log = \*STDIN; + &ProcessLog(); } elsif ( -d "$log_file" ) { - my $log_dir = "$log_file"; + my $log_dir = "$log_file"; - my @logs = &GetLogfilesList($log_dir); + my @logs = &GetLogfilesList($log_dir); - # Process logs - foreach (@logs) { - my $ext = (/[^.]+\.?([^.]*?)$/)[0]; - my $dc = $decompressor{$ext} || 'cat'; + # Process logs + foreach (@logs) { + my $ext = (/[^.]+\.?([^.]*?)$/)[0]; + my $dc = $decompressor{$ext} || 'cat'; - open( $rspamd_log, "-|", "$dc $log_dir/$_" ) - or die "cannot execute $dc $log_dir/$_ : $!"; + open( $rspamd_log, "-|", "$dc $log_dir/$_" ) + or die "cannot execute $dc $log_dir/$_ : $!"; - printf {interactive(*STDERR)} "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, $_; - $spinner_update_time = 0; # Force spinner update - &spinner; + printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, + $_; + $spinner_update_time = 0; # Force spinner update + &spinner; - &ProcessLog; + &ProcessLog; - close($rspamd_log) - or warn "cannot close $dc $log_dir/$_: $!"; - } - print {interactive(*STDERR)} "\033[J\033[G"; # Progress indicator clean-up + close($rspamd_log) + or warn "cannot close $dc $log_dir/$_: $!"; + } + print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up } else { - my $ext = ($log_file =~ /[^.]+\.?([^.]*?)$/)[0]; - my $dc = $decompressor{$ext} || 'cat'; - open( $rspamd_log, "-|", "$dc $log_file" ) - or die "cannot execute $dc $log_file : $!"; - $spinner_update_time = 0; # Force spinner update - &spinner; - &ProcessLog(); + my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0]; + my $dc = $decompressor{$ext} || 'cat'; + open( $rspamd_log, "-|", "$dc $log_file" ) + or die "cannot execute $dc $log_file : $!"; + $spinner_update_time = 0; # Force spinner update + &spinner; + &ProcessLog(); } -my $total_ham = $total - ($total_spam + $total_junk); +my $total_ham = $total - ( $total_spam + $total_junk ); if ($json) { - print "{"; - &Summary(); - print '"symbols":{'; - &SymbolsStat(); - print "}}\n"; + print "{"; + &Summary(); + print '"symbols":{'; + &SymbolsStat(); + print "}}\n"; } else { - &SymbolsStat(); - &Summary(); + &SymbolsStat(); + &Summary(); } exit; sub IsIgnored { - my ($sym) = @_; + my ($sym) = @_; - foreach my $ex (@symbols_ignored) { - if ($sym =~ /^$ex$/) { - return 1; + foreach my $ex (@symbols_ignored) { + if ( $sym =~ /^$ex$/ ) { + return 1; + } } - } - return 0; + return 0; } sub GenRelated { - my ($htb, $target_sym) = @_; - - my @result; - my $i = 0; - foreach my $sym (sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb}) { - if ($sym ne $target_sym) { - my @elt = ($sym, $htb->{$sym}); - push @result, \@elt; - $i ++; - } + my ( $htb, $target_sym ) = @_; + + my @result; + my $i = 0; + foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) { + if ( $sym ne $target_sym ) { + my @elt = ( $sym, $htb->{$sym} ); + push @result, \@elt; + $i++; + } - last if $i > $nrelated; - } + last if $i > $nrelated; + } - return \@result; + return \@result; } sub StringifyRelated { - my ($ar, $total) = @_; - return join("\n", (map { sprintf "\t%s(%s: %.1f%%)", - $_->[0], $_->[1], $_->[1] / ($total * 1.0) * 100.0 } @{$ar})); + my ( $ar, $total ) = @_; + return + join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) ); } sub SymbolsStat { - if ($total > 0) { - my $has_comma = 0; - while (my ($s, $r) = each(%sym_res)) { - if ($r->{hits} > 0) { - my $th = $r->{hits}; - my $sh = $r->{spam_hits}; - my $jh = $r->{junk_hits}; - my $hh = $r->{hits} - $sh - $jh; - my $htp = $hh * 100.0 / $total_ham if $total_ham != 0; - my $stp = $sh * 100.0 / $total_spam if $total_spam != 0; - my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0; - - if ($json) { - if ($has_comma) { - print ","; - } - else { - $has_comma = 1; - } - print "\"$s\":{"; - JsonObjectElt("avg_weight", $r->{'weight'},"%.4f"); - print ","; - JsonObjectElt("hits", $th, "%d"); - print ","; - JsonObjectElt("hits_percentage", $th/$total, "%.4f"); - print ","; - JsonObjectElt("spam_hits", $sh, "%d"); - print ","; - JsonObjectElt("spam_to_total", $sh/$th, "%.4f"); - print ","; - JsonObjectElt("spam_percentage", $stp/100.0 || 0, "%.4f"); - print ","; - JsonObjectElt("ham_hits", $hh, "%d"); - print ","; - JsonObjectElt("ham_to_total", $hh/$th, "%.4f"); - print ","; - JsonObjectElt("ham_percentage", $htp/100.0 || 0, "%.4f"); - print ","; - JsonObjectElt("junk_hits", $jh, "%d"); - print ","; - JsonObjectElt("junk_to_total", $jh/$th, "%.4f"); - print ","; - JsonObjectElt("junk_percentage", $jtp/100.0 || 0, "%.4f"); - } - else { - printf "%s avg. weight %.3f, hits %d(%.3f%%): + if ( $total > 0 ) { + my $has_comma = 0; + while ( my ( $s, $r ) = each(%sym_res) ) { + if ( $r->{hits} > 0 ) { + my $th = $r->{hits}; + my $sh = $r->{spam_hits}; + my $jh = $r->{junk_hits}; + my $hh = $r->{hits} - $sh - $jh; + my $htp = $hh * 100.0 / $total_ham if $total_ham != 0; + my $stp = $sh * 100.0 / $total_spam if $total_spam != 0; + my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0; + + if ($json) { + if ($has_comma) { + print ","; + } + else { + $has_comma = 1; + } + print "\"$s\":{"; + JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" ); + print ","; + JsonObjectElt( "hits", $th, "%d" ); + print ","; + JsonObjectElt( "hits_percentage", $th / $total, "%.4f" ); + print ","; + JsonObjectElt( "spam_hits", $sh, "%d" ); + print ","; + JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" ); + print ","; + JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" ); + print ","; + JsonObjectElt( "ham_hits", $hh, "%d" ); + print ","; + JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" ); + print ","; + JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" ); + print ","; + JsonObjectElt( "junk_hits", $jh, "%d" ); + print ","; + JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" ); + print ","; + JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" ); + } + else { + printf "%s avg. weight %.3f, hits %d(%.3f%%): Ham %7.3f%%, %6d/%-6d (%7.3f%%) Spam %7.3f%%, %6d/%-6d (%7.3f%%) Junk %7.3f%%, %6d/%-6d (%7.3f%%) -", - $s, $r->{weight} / $r->{hits}, $th, ($th / $total * 100), - ($hh / $th * 100), $hh, $total_ham, ($htp or 0), - ($sh / $th * 100), $sh, $total_spam, ($stp or 0), - ($jh / $th * 100), $jh, $total_junk, ($jtp or 0); - } +", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ), + ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ), + ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ), + ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 ); + } - my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam; - my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk; + my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam; + my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk; - if ($r->{weight} != 0) { - if (!$json) { - if ($r->{weight} > 0) { - printf " + if ( $r->{weight} != 0 ) { + if ( !$json ) { + if ( $r->{weight} > 0 ) { + printf " Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%) Spam changes / total spam hits: %6d/%-6d (%7.3f%%) Junk changes (ham -> junk): %6d/%-6d (%7.3f%%) Junk changes / total junk hits: %6d/%-6d (%7.3f%%) ", - $r->{spam_change}, $th, ($r->{spam_change} / $th * 100), - $r->{spam_change}, $total_spam, ($schp or 0), - $r->{junk_change}, $th, ($r->{junk_change} / $th * 100), - $r->{junk_change}, $total_junk, ($jchp or 0); - } - else { - printf " + $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ), + $r->{spam_change}, $total_spam, ( $schp or 0 ), + $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ), + $r->{junk_change}, $total_junk, ( $jchp or 0 ); + } + else { + printf " Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%) Spam changes / total spam hits : %6d/%-6d (%7.3f%%) Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%) Junk changes / total junk hits : %6d/%-6d (%7.3f%%) ", - $r->{spam_change}, $th, ($r->{spam_change} / $th * 100), - $r->{spam_change}, $total_spam, ($schp or 0), - $r->{junk_change}, $th, ($r->{junk_change} / $th * 100), - $r->{junk_change}, $total_junk, ($jchp or 0); - } - } - else { - print ","; - JsonObjectElt("spam_change", $r->{spam_change}, "%.4f"); - print ","; - JsonObjectElt("junk_change", $r->{junk_change}, "%.4f"); - } - } - - if ($correlations) { - - my $spam_related = GenRelated($r->{symbols_met_spam}, $s); - my $junk_related = GenRelated($r->{symbols_met_junk}, $s); - my $ham_related = GenRelated($r->{symbols_met_ham}, $s); - - if (!$json) { - print "Correlations report:\n"; - - while (my ($cs, $hits) = each %{$r->{corr}}) { - my $corr_prob = $r->{'hits'} / $total; - my $merged_hits = 0; - if($r->{symbols_met_spam}->{$cs}) { - $merged_hits += $r->{symbols_met_spam}->{$cs}; - } - if($r->{symbols_met_junk}->{$cs}) { - $merged_hits += $r->{symbols_met_junk}->{$cs}; - } - if($r->{symbols_met_ham}->{$cs}) { - $merged_hits += $r->{symbols_met_ham}->{$cs}; - } - - if ($merged_hits > 0) { - printf "Probability of %s when %s fires: %.3f\n", $cs, $s, - (($merged_hits / $total) / $corr_prob); - } - } + $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ), + $r->{spam_change}, $total_spam, ( $schp or 0 ), + $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ), + $r->{junk_change}, $total_junk, ( $jchp or 0 ); + } + } + else { + print ","; + JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" ); + print ","; + JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" ); + } + } - print "Related symbols report:\n"; - printf "Top related in spam:\n %s\n", StringifyRelated($spam_related, - $r->{spam_hits}); - printf "Top related in junk:\n %s\n", StringifyRelated($junk_related, - $r->{junk_hits}); - printf "Top related in ham:\n %s\n", StringifyRelated($ham_related, - $r->{hits} - $r->{spam_hits} - $r->{junk_hits}); - } - else { - print ","; - print "\"correllations\":{"; + if ($correlations) { + + my $spam_related = GenRelated( $r->{symbols_met_spam}, $s ); + my $junk_related = GenRelated( $r->{symbols_met_junk}, $s ); + my $ham_related = GenRelated( $r->{symbols_met_ham}, $s ); + + if ( !$json ) { + print "Correlations report:\n"; + + while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) { + my $corr_prob = $r->{'hits'} / $total; + my $merged_hits = 0; + if ( $r->{symbols_met_spam}->{$cs} ) { + $merged_hits += $r->{symbols_met_spam}->{$cs}; + } + if ( $r->{symbols_met_junk}->{$cs} ) { + $merged_hits += $r->{symbols_met_junk}->{$cs}; + } + if ( $r->{symbols_met_ham}->{$cs} ) { + $merged_hits += $r->{symbols_met_ham}->{$cs}; + } + + if ( $merged_hits > 0 ) { + printf "Probability of %s when %s fires: %.3f\n", $cs, $s, + ( ( $merged_hits / $total ) / $corr_prob ); + } + } + + print "Related symbols report:\n"; + printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} ); + printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} ); + printf "Top related in ham:\n %s\n", + StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} ); + } + else { + print ","; + print "\"correllations\":{"; + + my $has_comma_ = 0; + while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) { + if ($has_comma_) { + print ","; + } + else { + $has_comma_ = 1; + } + my $corr_prob = $hits / $total; + my $sym_prob = $r->{hits} / $total; + JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" ); + } + + print "}"; + } + } - my $has_comma_ = 0; - while (my ($cs, $hits) = each %{$r->{corr}}) { - if ($has_comma_) { - print ","; - } - else { - $has_comma_ = 1; - } - my $corr_prob = $hits / $total; - my $sym_prob = $r->{hits} / $total; - JsonObjectElt($cs, ($corr_prob / $sym_prob) ,"%.4f"); + print "}" if $json; + } + else { + print "Symbol $s has not been met\n" if !$json; } - print "}"; - } + print '-' x 80 . "\n" if !$json; } - - print "}" if $json; - } - else { - print "Symbol $s has not been met\n" if !$json; - } - - print '-' x 80 . "\n" if !$json; } - } } sub Summary() { - if (!$json) { - print " + if ( !$json ) { + print " === Summary ", '=' x 68, " Messages scanned: $total"; - printf " [ %s / %s ] + printf " [ %s / %s ] ", $timeStamp{'start'}, $timeStamp{'end'} - if defined $timeStamp{'start'}; - say ''; - printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} - for sort keys %action; - say ''; - printf "scan time min/avg/max = %.2f/%.2f/%.2f s -", $scanTime{'min'} / 1000, - ($total) ? $scanTime{'total'} / $total / 1000 : undef, - $scanTime{'max'} / 1000 - if exists $scanTime{'min'}; - say '=' x 80; - } - else { - JsonObjectElt("total", $total, "%d"); - print ","; - - if (defined $timeStamp{'start'}) { - JsonObjectElt("start", $timeStamp{'start'}); - print ","; + if defined $timeStamp{'start'}; + say ''; + printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action; + say ''; + printf "scan time min/avg/max = %.2f/%.2f/%.2f s +", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000 + if exists $scanTime{'min'}; + say '=' x 80; } + else { + JsonObjectElt( "total", $total, "%d" ); + print ","; - if (defined $timeStamp{'end'}) { - JsonObjectElt("end", $timeStamp{'end'}); - print ","; - } + if ( defined $timeStamp{'start'} ) { + JsonObjectElt( "start", $timeStamp{'start'} ); + print ","; + } - print "\"actions\":{"; + if ( defined $timeStamp{'end'} ) { + JsonObjectElt( "end", $timeStamp{'end'} ); + print ","; + } - my $has_comma = 0; - foreach my $a (sort keys %action) { - if ($has_comma) { - print ","; - } - else { - $has_comma = 1; - } - JsonObjectElt($a, $action{$a}, "%d"); + print "\"actions\":{"; + + my $has_comma = 0; + foreach my $a ( sort keys %action ) { + if ($has_comma) { + print ","; + } + else { + $has_comma = 1; + } + JsonObjectElt( $a, $action{$a}, "%d" ); + } + print "},"; } - print "},"; - } } sub ProcessRelated { - my ($symbols, $target, $source) = @_; + my ( $symbols, $target, $source ) = @_; - foreach my $s (@{$symbols}) { - $s =~ /^([^\(]+)(\(([^\)]+)\))?/; - my $sym_name = $1; - my $sym_score = 0; + foreach my $s ( @{$symbols} ) { + $s =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; - if ($groups{$sym_name}) { - $sym_name = $groups{$sym_name}; - } + if ( $groups{$sym_name} ) { + $sym_name = $groups{$sym_name}; + } - next if ($source eq $sym_name); + next if ( $source eq $sym_name ); - next if IsIgnored($sym_name); + next if IsIgnored($sym_name); - if ($2) { - $sym_score = $3 * 1.0; + if ($2) { + $sym_score = $3 * 1.0; - if (abs($sym_score) < $diff_alpha) { - next; - } + if ( abs($sym_score) < $diff_alpha ) { + next; + } + + my $bm = $bidir_match{$sym_name}; + if ($bm) { + if ( $sym_score >= 0 ) { + $sym_name = $bm->{'spam'}; + } + else { + $sym_name = $bm->{'ham'}; + } + } + } - my $bm = $bidir_match{$sym_name}; - if ($bm) { - if ($sym_score >= 0) { - $sym_name = $bm->{'spam'}; + if ( exists( $target->{$sym_name} ) ) { + $target->{$sym_name}++; } else { - $sym_name = $bm->{'ham'}; + $target->{$sym_name} = 1; } - } - } - - if (exists($target->{$sym_name})) { - $target->{$sym_name} ++; - } - else { - $target->{$sym_name} = 1; } - } } sub ProcessLog { - my ( $ts_format, @line ) = &log_time_format($rspamd_log); + my ( $ts_format, @line ) = &log_time_format($rspamd_log); - while() { - last if eof $rspamd_log; - $_ = (@line) ? shift @line : <$rspamd_log>; + while () { + last if eof $rspamd_log; + $_ = (@line) ? shift @line : <$rspamd_log>; - if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) { - $enabled = 1; - } - - next if !$enabled; - - if (/^.*rspamd_task_write_log.*$/) { - &spinner; - my $ts; - if ( $ts_format eq 'syslog' ) { - $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] ); - } elsif ( $ts_format eq 'syslog5424' ) { - /^([0-9-]+)T([0-9:]+)/; - $ts = "$1 $2"; - } else { - $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ]; - } - - next if ( $ts lt $startTime ); - next if ( defined $endTime && $ts gt $endTime ); - - if ($_ !~ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/) { - #print "BAD: $_\n"; - next; - } - - my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4; - my $scan_time = $5; - my $act = $1; - my $score = $2 * 1.0; - my $skip = 0; - - foreach my $ex (@symbols_exclude) { - my @found = grep {/^$ex/} @symbols; - - if (scalar(@found) > 0) { - $skip = 1; - last; + if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) { + $enabled = 1; } - } - - next if ( $skip != 0 ); - - if (defined($timeStamp{'end'})) { - $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} ); - } - else { - $timeStamp{'end'} = $ts; - } - - if (defined($timeStamp{'start'})) { - $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} ); - } - else { - $timeStamp{'start'} = $ts; - } - - $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time ); - $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time ); - $scanTime{'total'} += $scan_time; - - $action{$act}++; - $total ++; - if ($score >= $reject_score) { - $total_spam ++; - } - elsif ($score >= $junk_score) { - $total_junk ++; - } + next if !$enabled; - my @sym_names; - - foreach my $s (@symbols_search) { - my @selected = grep /$s/, @symbols; + if (/^.*rspamd_task_write_log.*$/) { + &spinner; + my $ts; + if ( $ts_format eq 'syslog' ) { + $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] ); + } + elsif ( $ts_format eq 'syslog5424' ) { + /^([0-9-]+)T([0-9:]+)/; + $ts = "$1 $2"; + } + else { + $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ]; + } - if (scalar(@selected) > 0) { + next if ( $ts lt $startTime ); + next if ( defined $endTime && $ts gt $endTime ); - foreach my $sym (@selected) { - $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; - my $sym_name = $1; - my $sym_score = 0; - my $orig_name = $sym_name; + if ( $_ !~ + /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/ ) + { + #print "BAD: $_\n"; + next; + } - if ($2) { - $sym_score = $3 * 1.0; + my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4; + my $scan_time = $5; + my $act = $1; + my $score = $2 * 1.0; + my $skip = 0; - if (abs($sym_score) < $diff_alpha) { - next; - } + foreach my $ex (@symbols_exclude) { + my @found = grep { /^$ex/ } @symbols; - my $bm = $bidir_match{$sym_name}; - if ($bm) { - if ($sym_score >= 0) { - $sym_name = $bm->{'spam'}; - } - else { - $sym_name = $bm->{'ham'}; + if ( scalar(@found) > 0 ) { + $skip = 1; + last; } - } } - next if $orig_name !~ /^$s/; + next if ( $skip != 0 ); - if ($groups{$s}) { - # Replace with group - $sym_name = $groups{$s}; + if ( defined( $timeStamp{'end'} ) ) { + $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} ); + } + else { + $timeStamp{'end'} = $ts; } - push @sym_names, $sym_name; - - if (!$sym_res{$sym_name}) { - $sym_res{$sym_name} = { - hits => 0, - spam_hits => 0, - junk_hits => 0, - spam_change => 0, - junk_change => 0, - weight => 0, - corr => {}, - symbols_met_spam => {}, - symbols_met_ham => {}, - symbols_met_junk => {}, - }; + if ( defined( $timeStamp{'start'} ) ) { + $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} ); + } + else { + $timeStamp{'start'} = $ts; } - my $r = $sym_res{$sym_name}; + $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time ); + $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time ); + $scanTime{'total'} += $scan_time; - $r->{hits} ++; - $r->{weight} += $sym_score; - my $is_spam = 0; - my $is_junk = 0; + $action{$act}++; + $total++; - if ($score >= $reject_score) { - $is_spam = 1; - $r->{spam_hits} ++; - if ($correlations) { - ProcessRelated(\@symbols, $r->{symbols_met_spam}, $sym_name); - } + if ( $score >= $reject_score ) { + $total_spam++; } - elsif ($score >= $junk_score) { - $is_junk = 1; - $r->{junk_hits} ++; - if ($correlations) { - ProcessRelated(\@symbols, $r->{symbols_met_junk}, $sym_name); - } - } - else { - if ($correlations) { - ProcessRelated(\@symbols, $r->{symbols_met_ham}, $sym_name); - } + elsif ( $score >= $junk_score ) { + $total_junk++; } - if ($sym_score != 0) { - my $score_without = $score - $sym_score; - - if ($sym_score > 0) { - if ($is_spam && $score_without < $reject_score) { - $r->{spam_change} ++; - } - if ($is_junk && $score_without < $junk_score) { - $r->{junk_change} ++; + my @sym_names; + + foreach my $s (@symbols_search) { + my @selected = grep /$s/, @symbols; + + if ( scalar(@selected) > 0 ) { + + foreach my $sym (@selected) { + $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; + my $orig_name = $sym_name; + + if ($2) { + $sym_score = $3 * 1.0; + + if ( abs($sym_score) < $diff_alpha ) { + next; + } + + my $bm = $bidir_match{$sym_name}; + if ($bm) { + if ( $sym_score >= 0 ) { + $sym_name = $bm->{'spam'}; + } + else { + $sym_name = $bm->{'ham'}; + } + } + } + + next if $orig_name !~ /^$s/; + + if ( $groups{$s} ) { + + # Replace with group + $sym_name = $groups{$s}; + } + + push @sym_names, $sym_name; + + if ( !$sym_res{$sym_name} ) { + $sym_res{$sym_name} = { + hits => 0, + spam_hits => 0, + junk_hits => 0, + spam_change => 0, + junk_change => 0, + weight => 0, + corr => {}, + symbols_met_spam => {}, + symbols_met_ham => {}, + symbols_met_junk => {}, + }; + } + + my $r = $sym_res{$sym_name}; + + $r->{hits}++; + $r->{weight} += $sym_score; + my $is_spam = 0; + my $is_junk = 0; + + if ( $score >= $reject_score ) { + $is_spam = 1; + $r->{spam_hits}++; + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name ); + } + } + elsif ( $score >= $junk_score ) { + $is_junk = 1; + $r->{junk_hits}++; + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name ); + } + } + else { + if ($correlations) { + ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name ); + } + } + + if ( $sym_score != 0 ) { + my $score_without = $score - $sym_score; + + if ( $sym_score > 0 ) { + if ( $is_spam && $score_without < $reject_score ) { + $r->{spam_change}++; + } + if ( $is_junk && $score_without < $junk_score ) { + $r->{junk_change}++; + } + } + else { + if ( !$is_spam && $score_without >= $reject_score ) { + $r->{spam_change}++; + } + if ( !$is_junk && $score_without >= $junk_score ) { + $r->{junk_change}++; + } + } + } + } # End foreach symbols selected } - } - else { - if (!$is_spam && $score_without >= $reject_score) { - $r->{spam_change} ++; - } - if (!$is_junk && $score_without >= $junk_score) { - $r->{junk_change} ++; - } - } } - } # End foreach symbols selected - } - } - - if ($correlations) { - foreach my $sym (@sym_names) { - next if IsIgnored($sym); - my $r = $sym_res{$sym}; - - foreach my $corr_sym (@sym_names) { - if ($corr_sym ne $sym) { - if ($r->{'corr'}->{$corr_sym}) { - $r->{'corr'}->{$corr_sym} ++; - } - else { - $r->{'corr'}->{$corr_sym} = 1; - } + + if ($correlations) { + foreach my $sym (@sym_names) { + next if IsIgnored($sym); + my $r = $sym_res{$sym}; + + foreach my $corr_sym (@sym_names) { + if ( $corr_sym ne $sym ) { + if ( $r->{'corr'}->{$corr_sym} ) { + $r->{'corr'}->{$corr_sym}++; + } + else { + $r->{'corr'}->{$corr_sym} = 1; + } + } + } + } # End of correlations check } - } - } # End of correlations check - } + } } - } } sub JsonObjectElt() { - my ($k, $v) = @_; - my $f = defined $_[2] ? $_[2] : '%s'; + my ( $k, $v ) = @_; + my $f = defined $_[2] ? $_[2] : '%s'; - if ($f eq "%s") { - $f = "\"%s\""; - } + if ( $f eq "%s" ) { + $f = "\"%s\""; + } - printf "\"%s\":$f", $k, $v; + printf "\"%s\":$f", $k, $v; } sub GetLogfilesList { - my ($dir) = @_; - opendir( DIR, $dir ) or die $!; + my ($dir) = @_; + opendir( DIR, $dir ) or die $!; - my $pattern = join( '|', keys %decompressor ); - my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/; + my $pattern = join( '|', keys %decompressor ); + my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/; - # Add unnumbered logs first - my @logs = - grep { -f "$dir/$_" && !/$re/ } readdir(DIR); + # Add unnumbered logs first + my @logs = + grep { -f "$dir/$_" && !/$re/ } readdir(DIR); - # Add numbered logs - rewinddir(DIR); - push( @logs, - ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) ); + # Add numbered logs + rewinddir(DIR); + push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) ); - closedir(DIR); + closedir(DIR); - # Select required logs and revers their order - @logs = - reverse - splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs ); + # Select required logs and revers their order + @logs = + reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs ); - # Loop through array printing out filenames - print {interactive(*STDERR)} "\nLog files to process:\n"; - foreach my $file (@logs) { - print {interactive(*STDERR)} " $file\n"; - } - print {interactive(*STDERR)} "\n"; + # Loop through array printing out filenames + print { interactive(*STDERR) } "\nLog files to process:\n"; + foreach my $file (@logs) { + print { interactive(*STDERR) } " $file\n"; + } + print { interactive(*STDERR) } "\n"; - return @logs; + return @logs; } sub log_time_format { - my $fh = shift; - my ( $format, $line ); - while (<$fh>) { - $line = $_; - - # 2017-08-08 00:00:01 #66984( - # 2017-08-08 00:00:01.001 #66984( - if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3})? #\d+\(/) { - $format = 'rspamd'; - last; - } + my $fh = shift; + my ( $format, $line ); + while (<$fh>) { + $line = $_; + + # 2017-08-08 00:00:01 #66984( + # 2017-08-08 00:00:01.001 #66984( + if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3})? #\d+\(/) { + $format = 'rspamd'; + last; + } - # Aug 8 00:02:50 #66986( - elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) { - $format = 'syslog'; - last; - } + # Aug 8 00:02:50 #66986( + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) { + $format = 'syslog'; + last; + } - # Aug 8 00:02:50 hostname rspamd[66986] - elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) { - $format = 'syslog'; - last; - } + # Aug 8 00:02:50 hostname rspamd[66986] + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) { + $format = 'syslog'; + last; + } - # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968] - elsif(/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) { - $format = 'syslog5424'; - last; - } + # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968] + elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) { + $format = 'syslog5424'; + last; + } - # Skip newsyslog messages - # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over - elsif ( /^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/ ) { - next; - } - # Skip journalctl messages - # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. -- - elsif ( /^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/ ) { - next; - } - else { - print "Unknown log format\n"; - exit 1; + # Skip newsyslog messages + # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over + elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) { + next; + } + + # Skip journalctl messages + # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. -- + elsif ( +/^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/ + ) + { + next; + } + else { + print "Unknown log format\n"; + exit 1; + } } - } - return ( $format, $line ); + return ( $format, $line ); } sub normalized_time { - return undef - if !defined( $_ = shift ); + return undef + if !defined( $_ = shift ); - /^\d\d(?::\d\d){0,2}$/ - ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], - (localtime)[3], $_ - : $_; + /^\d\d(?::\d\d){0,2}$/ + ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_ + : $_; } sub numeric { - $a =~ /\.(\d+)\./; - my $a_num = $1; - $b =~ /\.(\d+)\./; - my $b_num = $1; + $a =~ /\.(\d+)\./; + my $a_num = $1; + $b =~ /\.(\d+)\./; + my $b_num = $1; - $a_num <=> $b_num; + $a_num <=> $b_num; } sub spinner { @@ -780,7 +780,7 @@ sub spinner { return if ( ( time - $spinner_update_time ) < 1 ); $spinner_update_time = time; - printf {interactive(*STDERR)} "%s\r", $spinner[ $spinner_update_time % @spinner ]; + printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ]; select()->flush(); } @@ -788,33 +788,30 @@ sub spinner { # using current year as syslog does not record the year (nor the timezone) # or the last year if the guessed time is in the future. sub syslog2iso { - my %month_map; - @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11; - - my ( $month, @t ) = - $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x; - my $epoch = - timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] ); - sprintf '%04d-%02d-%02d %02d:%02d:%02d', - 1900 + (localtime)[5] - ( $epoch > time ), - $month_map{$month} + 1, @t; + my %month_map; + @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11; + + my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x; + my $epoch = + timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] ); + sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t; } ### Imported from IO::Interactive 1.022 Perl module sub is_interactive { - my ($out_handle) = (@_, select); # Default to default output handle + my ($out_handle) = ( @_, select ); # Default to default output handle # Not interactive if output is not to terminal... return 0 if not -t $out_handle; # If *ARGV is opened, we're interactive if... - if ( tied(*ARGV) or defined(fileno(ARGV)) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to + if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to # ...it's currently opened to the magic '-' file return -t *STDIN if defined $ARGV && $ARGV eq '-'; # ...it's at end-of-file and the next file is the magic '-' file - return @ARGV>0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV; + return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV; # ...it's directly attached to the terminal return -t *ARGV; @@ -828,17 +825,18 @@ sub is_interactive { } ### Imported from IO::Interactive 1.022 Perl module -local (*DEV_NULL, *DEV_NULL2); +local ( *DEV_NULL, *DEV_NULL2 ); my $dev_null; + BEGIN { pipe *DEV_NULL, *DEV_NULL2 - or die "Internal error: can't create null filehandle"; + or die "Internal error: can't create null filehandle"; $dev_null = \*DEV_NULL; } ### Imported from IO::Interactive 1.022 Perl module sub interactive { - my ($out_handle) = (@_, \*STDOUT); # Default to STDOUT + my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT return &is_interactive ? $out_handle : $dev_null; } |