summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 2
-rw-r--r-- .tidyallrc 25
-rw-r--r-- utils/asn.pl 252
-rw-r--r-- utils/cgp_rspamd.pl 2
-rw-r--r-- utils/classifier_test.pl 697
-rwxr-xr-x utils/fann_train.pl 138
-rwxr-xr-x utils/rspamd_stats.pl 1168
7 files changed, 1148 insertions, 1136 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..b8c677bfc
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# Code::TidyAll
+/.tidyall.d/
diff --git a/.tidyallrc b/.tidyallrc
new file mode 100644
index 000000000..272ba1936
--- /dev/null
+++ b/.tidyallrc
@@ -0,0 +1,25 @@
+; Run "tidyall -a" to process all files.
+; Run "tidyall -g" to process all added or modified files in the current git working directory.
+
+; Ignore third-party code
+ignore = contrib/**/* doc/doxydown/doxydown.pl
+
+;[PerlCritic]
+;select = **/*.{pl,pm,t}
+
+[PerlTidy]
+select = **/*.{pl,pm,t}
+argv = -l=120
+
+[PodChecker]
+select = **/*.{pl,pm,pod}
+
+;[PodSpell]
+;select = **/*.{pl,pm,pod}
+
+;[PodTidy]
+;select = **/*.{pl,pm,pod}
+;argv = --columns=120
+
+[Test::Vars]
+select = **/*.{pl,pm,t}
diff --git a/utils/asn.pl b/utils/asn.pl
index 11bb6746b..b5f2ca41e 100644
--- a/utils/asn.pl
+++ b/utils/asn.pl
@@ -16,14 +16,14 @@ $LWP::Simple::ua->show_progress(1);
$Net::MRT::USE_RFC4760 = -1;
my %config = (
- asn_sources => [
- 'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest',
- 'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest',
- 'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest',
- 'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest',
- 'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest'
- ],
- bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz']
+ asn_sources => [
+ 'ftp://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest',
+ 'ftp://ftp.ripe.net/ripe/stats/delegated-ripencc-latest',
+ 'ftp://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-latest',
+ 'ftp://ftp.apnic.net/pub/stats/apnic/delegated-apnic-latest',
+ 'ftp://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-latest'
+ ],
+ bgp_sources => ['http://data.ris.ripe.net/rrc00/latest-bview.gz']
);
my $download_asn = 0;
@@ -38,171 +38,169 @@ my $v4_zone = "asn.rspamd.com";
my $v6_zone = "asn6.rspamd.com";
my $v4_file = "asn.zone";
my $v6_file = "asn6.zone";
-my $ns_servers = ["asn-ns.rspamd.com", "asn-ns2.rspamd.com"];
+my $ns_servers = [ "asn-ns.rspamd.com", "asn-ns2.rspamd.com" ];
GetOptions(
- "download-asn" => \$download_asn,
- "download-bgp" => \$download_bgp,
- "4!" => \$v4,
- "6!" => \$v6,
- "parse!" => \$parse,
- "target=s" => \$download_target,
- "zone-v4=s" => \$v4_zone,
- "zone-v6=s" => \$v6_zone,
- "file-v4=s" => \$v4_file,
- "file-v6=s" => \$v6_file,
- "ns-server=s@" => \$ns_servers,
- "help|?" => \$help,
- "man" => \$man
+ "download-asn" => \$download_asn,
+ "download-bgp" => \$download_bgp,
+ "4!" => \$v4,
+ "6!" => \$v6,
+ "parse!" => \$parse,
+ "target=s" => \$download_target,
+ "zone-v4=s" => \$v4_zone,
+ "zone-v6=s" => \$v6_zone,
+ "file-v4=s" => \$v4_file,
+ "file-v6=s" => \$v6_file,
+ "ns-server=s@" => \$ns_servers,
+ "help|?" => \$help,
+ "man" => \$man
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;
sub download_file {
- my ($u) = @_;
+ my ($u) = @_;
- print "Fetching $u\n";
- my $ff = File::Fetch->new( uri => $u );
- my $where = $ff->fetch( to => $download_target ) or die $ff->error;
+ print "Fetching $u\n";
+ my $ff = File::Fetch->new( uri => $u );
+ my $where = $ff->fetch( to => $download_target ) or die $ff->error;
- return $where;
+ return $where;
}
if ($download_asn) {
- foreach my $u ( @{ $config{'asn_sources'} } ) {
- download_file($u);
- }
+ foreach my $u ( @{ $config{'asn_sources'} } ) {
+ download_file($u);
+ }
}
if ($download_bgp) {
- foreach my $u ( @{ $config{'bgp_sources'} } ) {
- download_file($u);
- }
+ foreach my $u ( @{ $config{'bgp_sources'} } ) {
+ download_file($u);
+ }
}
if ( !$parse ) {
- exit 0;
+ exit 0;
}
my $v4_fh;
my $v6_fh;
if ($v4) {
- open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!";
- print $v4_fh
- "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
- foreach my $ns (@{$ns_servers}) {
- print $v4_fh "\$NS 43200 $ns\n";
- }
+ open( $v4_fh, ">", $v4_file ) or die "Cannot open $v4_file for writing: $!";
+ print $v4_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
+ foreach my $ns ( @{$ns_servers} ) {
+ print $v4_fh "\$NS 43200 $ns\n";
+ }
}
if ($v6) {
- open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!";
- print $v6_fh
- "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
- foreach my $ns (@{$ns_servers}) {
- print $v6_fh "\$NS 43200 $ns\n";
- }
+ open( $v6_fh, ">", $v6_file ) or die "Cannot open $v6_file for writing: $!";
+ print $v6_fh "\$SOA 43200 $ns_servers->[0] support.rspamd.com 0 600 300 86400 300\n";
+ foreach my $ns ( @{$ns_servers} ) {
+ print $v6_fh "\$NS 43200 $ns\n";
+ }
}
# Now load BGP data
my $networks = {};
foreach my $u ( @{ $config{'bgp_sources'} } ) {
- my $parsed = URI->new($u);
- my $fname = $download_target . '/' . basename( $parsed->path );
- open( my $fh, "<:gzip", $fname )
- or die "Cannot open $fname: $!";
-
- while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) {
- if ( $dd->{'prefix'} && $dd->{'bits'} ) {
- next if $dd->{'subtype'} == 2 and !$v4;
- next if $dd->{'subtype'} == 4 and !$v6;
- my $entry = $dd->{'entries'}->[0];
- my $net = $dd->{'prefix'} . '/' . $dd->{'bits'};
- if ( $entry && $entry->{'AS_PATH'} ) {
- my $as = $entry->{'AS_PATH'}->[-1];
- if (ref($as) eq "ARRAY") {
- $as = @{$as}[0];
+ my $parsed = URI->new($u);
+ my $fname = $download_target . '/' . basename( $parsed->path );
+ open( my $fh, "<:gzip", $fname )
+ or die "Cannot open $fname: $!";
+
+ while ( my $dd = eval { Net::MRT::mrt_read_next($fh) } ) {
+ if ( $dd->{'prefix'} && $dd->{'bits'} ) {
+ next if $dd->{'subtype'} == 2 and !$v4;
+ next if $dd->{'subtype'} == 4 and !$v6;
+ my $entry = $dd->{'entries'}->[0];
+ my $net = $dd->{'prefix'} . '/' . $dd->{'bits'};
+ if ( $entry && $entry->{'AS_PATH'} ) {
+ my $as = $entry->{'AS_PATH'}->[-1];
+ if ( ref($as) eq "ARRAY" ) {
+ $as = @{$as}[0];
+ }
+
+ if ( !$networks->{$as} ) {
+ if ( $dd->{'subtype'} == 2 ) {
+ $networks->{$as} = { nets_v4 => [$net], nets_v6 => [] };
+ }
+ else {
+ $networks->{$as} = { nets_v6 => [$net], nets_v4 => [] };
+ }
+ }
+ else {
+ if ( $dd->{'subtype'} == 2 ) {
+ push @{ $networks->{$as}->{'nets_v4'} }, $net;
+ }
+ else {
+ push @{ $networks->{$as}->{'nets_v6'} }, $net;
+ }
+ }
+ }
}
-
- if ( !$networks->{$as} ) {
- if ( $dd->{'subtype'} == 2 ) {
- $networks->{$as} = { nets_v4 => [$net], nets_v6 => [] };
- }
- else {
- $networks->{$as} = { nets_v6 => [$net], nets_v4 => [] };
- }
- }
- else {
- if ( $dd->{'subtype'} == 2 ) {
- push @{ $networks->{$as}->{'nets_v4'} }, $net;
- }
- else {
- push @{ $networks->{$as}->{'nets_v6'} }, $net;
- }
- }
- }
}
- }
}
# Now roughly detect countries
foreach my $u ( @{ $config{'asn_sources'} } ) {
- my $parsed = URI->new($u);
- my $fname = $download_target . '/' . basename( $parsed->path );
- open( my $fh, "<", $fname ) or die "Cannot open $fname: $!";
-
- while (<$fh>) {
- next if /^\#/;
- chomp;
- my @elts = split /\|/;
-
- if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) {
- my $as_start = int( $elts[3] );
- my $as_end = $as_start + int( $elts[4] );
-
- for ( my $as = $as_start ; $as < $as_end ; $as++ ) {
- my $real_as = $as;
-
- if (ref($as) eq "ARRAY") {
- $real_as = @{$as}[0];
+ my $parsed = URI->new($u);
+ my $fname = $download_target . '/' . basename( $parsed->path );
+ open( my $fh, "<", $fname ) or die "Cannot open $fname: $!";
+
+ while (<$fh>) {
+ next if /^\#/;
+ chomp;
+ my @elts = split /\|/;
+
+ if ( $elts[2] eq 'asn' && $elts[3] ne '*' ) {
+ my $as_start = int( $elts[3] );
+ my $as_end = $as_start + int( $elts[4] );
+
+ for ( my $as = $as_start ; $as < $as_end ; $as++ ) {
+ my $real_as = $as;
+
+ if ( ref($as) eq "ARRAY" ) {
+ $real_as = @{$as}[0];
+ }
+
+ if ( $networks->{"$real_as"} ) {
+ $networks->{"$real_as"}->{'country'} = $elts[1];
+ $networks->{"$real_as"}->{'rir'} = $elts[0];
+ }
+ }
}
-
- if ( $networks->{"$real_as"} ) {
- $networks->{"$real_as"}->{'country'} = $elts[1];
- $networks->{"$real_as"}->{'rir'} = $elts[0];
- }
- }
}
- }
}
while ( my ( $k, $v ) = each( %{$networks} ) ) {
- if ($v4) {
- foreach my $n ( @{ $v->{'nets_v4'} } ) {
-
- # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
- if ( $v->{'country'} ) {
- printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
- }
- else {
- printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
- }
+ if ($v4) {
+ foreach my $n ( @{ $v->{'nets_v4'} } ) {
+
+ # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
+ if ( $v->{'country'} ) {
+ printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
+ }
+ else {
+ printf $v4_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
+ }
+ }
}
- }
- if ($v6) {
- foreach my $n ( @{ $v->{'nets_v6'} } ) {
-
- # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
- if ( $v->{'country'} ) {
- printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
- }
- else {
- printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
- }
+ if ($v6) {
+ foreach my $n ( @{ $v->{'nets_v6'} } ) {
+
+ # "15169 | 8.8.8.0/24 | US | arin |" for 8.8.8.8
+ if ( $v->{'country'} ) {
+ printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, $v->{'country'}, $v->{'rir'};
+ }
+ else {
+ printf $v6_fh "%s %s|%s|%s|%s|\n", $n, $k, $n, 'UN', 'UN';
+ }
+ }
}
- }
}
__END__
diff --git a/utils/cgp_rspamd.pl b/utils/cgp_rspamd.pl
index b1d30b905..e55ac5791 100644
--- a/utils/cgp_rspamd.pl
+++ b/utils/cgp_rspamd.pl
@@ -359,6 +359,4 @@ protocol. On scan requests, this filter can query Rspamd to process a message.
B<cgp_rspamd> can tell CGP to add header or reject SPAM messages depending on
Rspamd scan result.
-=back
-
=cut
diff --git a/utils/classifier_test.pl b/utils/classifier_test.pl
index 2dbb4e903..08febe257 100644
--- a/utils/classifier_test.pl
+++ b/utils/classifier_test.pl
@@ -30,438 +30,428 @@ my $man;
my $help;
GetOptions(
- "spam|s=s" => \$spam_dir,
- "ham|h=s" => \$ham_dir,
- "spam-symbol=s" => \$spam_symbol,
- "ham-symbol=s" => \$ham_symbol,
- "classifier|c=s" => \$classifier,
- "timeout|t=f" => \$timeout,
- "parallel|p=i" => \$parallel,
- "train-fraction|t=f" => \$train_fraction,
- "bogofilter|b" => \$use_bogofilter,
- "dspam|d" => \$use_dspam,
- "check-only" => \$check_only,
- "help|?" => \$help,
- "man" => \$man
+ "spam|s=s" => \$spam_dir,
+ "ham|h=s" => \$ham_dir,
+ "spam-symbol=s" => \$spam_symbol,
+ "ham-symbol=s" => \$ham_symbol,
+ "classifier|c=s" => \$classifier,
+ "timeout|t=f" => \$timeout,
+ "parallel|p=i" => \$parallel,
+ "train-fraction|t=f" => \$train_fraction,
+ "bogofilter|b" => \$use_bogofilter,
+ "dspam|d" => \$use_dspam,
+ "check-only" => \$check_only,
+ "help|?" => \$help,
+ "man" => \$man
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage( -exitval => 0, -verbose => 2 ) if $man;
sub read_dir_files {
- my ( $dir, $target ) = @_;
- opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
- while ( my $file = readdir $dh ) {
- if ( -f "$dir/$file" ) {
- push @{$target}, "$dir/$file";
+ my ( $dir, $target ) = @_;
+ opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
+ while ( my $file = readdir $dh ) {
+ if ( -f "$dir/$file" ) {
+ push @{$target}, "$dir/$file";
+ }
}
- }
}
sub shuffle_array {
- my ($ar) = @_;
+ my ($ar) = @_;
- for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
- if ( $i > 1 ) {
- my $sel = int( rand( $i - 1 ) );
- ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
+ for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
+ if ( $i > 1 ) {
+ my $sel = int( rand( $i - 1 ) );
+ ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
+ }
}
- }
}
sub learn_rspamc {
- my ( $files, $spam ) = @_;
- my $processed = 0;
-
- my $cmd = $spam ? "learn_spam" : "learn_ham";
- my $args_quoted = shell_quote @{$files};
- open(
- my $p,
-"$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |"
- ) or die "cannot spawn $rspamc: $!";
-
- while (<$p>) {
- my $res = eval('decode_json($_)');
- if ( $res && $res->{'success'} ) {
- $processed++;
+ my ( $files, $spam ) = @_;
+ my $processed = 0;
+
+ my $cmd = $spam ? "learn_spam" : "learn_ham";
+ my $args_quoted = shell_quote @{$files};
+ open( my $p, "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |" )
+ or die "cannot spawn $rspamc: $!";
+
+ while (<$p>) {
+ my $res = eval('decode_json($_)');
+ if ( $res && $res->{'success'} ) {
+ $processed++;
+ }
}
- }
- return $processed;
+ return $processed;
}
sub learn_bogofilter {
- my ( $files, $spam ) = @_;
- my $processed = 0;
-
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
- my $fl = $spam ? "-s" : "-n";
- `$bogofilter -I $args_quoted $fl`;
- if ( $? == 0 ) {
- $processed++;
+ my ( $files, $spam ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+ my $fl = $spam ? "-s" : "-n";
+ `$bogofilter -I $args_quoted $fl`;
+ if ( $? == 0 ) {
+ $processed++;
+ }
}
- }
- return $processed;
+ return $processed;
}
sub learn_dspam {
- my ( $files, $spam ) = @_;
- my $processed = 0;
-
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
- my $fl = $spam ? "--class=spam" : "--class=innocent";
- open( my $p,
- "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
- or die "cannot run $dspam: $!";
-
- open( my $inp, "< $f" );
- while (<$inp>) {
- print $p $_;
+ my ( $files, $spam ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+ my $fl = $spam ? "--class=spam" : "--class=innocent";
+ open( my $p, "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
+ or die "cannot run $dspam: $!";
+
+ open( my $inp, "< $f" );
+ while (<$inp>) {
+ print $p $_;
+ }
}
- }
- return $processed;
+ return $processed;
}
sub learn_samples {
- my ( $ar_ham, $ar_spam ) = @_;
- my $len;
- my $processed = 0;
- my $total = 0;
- my $learn_func;
-
- my @files_spam;
- my @files_ham;
-
- if ($use_dspam) {
- $learn_func = \&learn_dspam;
- }
- elsif ($use_bogofilter) {
- $learn_func = \&learn_bogofilter;
- }
- else {
- $learn_func = \&learn_rspamc;
- }
-
- $len = int( scalar @{$ar_ham} * $train_fraction );
- my @cur_vec;
-
- # Shuffle spam and ham samples
- for ( my $i = 0 ; $i < $len ; $i++ ) {
- if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
- push @cur_vec, @{$ar_ham}[$i];
- push @files_ham, [@cur_vec];
- @cur_vec = ();
- $total++;
- }
- else {
- push @cur_vec, @{$ar_ham}[$i];
- }
- }
-
- $len = int( scalar @{$ar_spam} * $train_fraction );
- @cur_vec = ();
- for ( my $i = 0 ; $i < $len ; $i++ ) {
- if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
- push @cur_vec, @{$ar_spam}[$i];
- push @files_spam, [@cur_vec];
- @cur_vec = ();
- $total++;
- }
- else {
- push @cur_vec, @{$ar_spam}[$i];
+ my ( $ar_ham, $ar_spam ) = @_;
+ my $len;
+ my $processed = 0;
+ my $total = 0;
+ my $learn_func;
+
+ my @files_spam;
+ my @files_ham;
+
+ if ($use_dspam) {
+ $learn_func = \&learn_dspam;
}
- }
-
- for ( my $i = 0 ; $i < $total ; $i++ ) {
- my $args;
- my $spam;
-
- if ( $i % 2 == 0 ) {
- $args = pop @files_spam;
-
- if ( !$args ) {
- $args = pop @files_ham;
- $spam = 0;
- }
- else {
- $spam = 1;
- }
+ elsif ($use_bogofilter) {
+ $learn_func = \&learn_bogofilter;
}
else {
- $args = pop @files_ham;
- if ( !$args ) {
- $args = pop @files_spam;
- $spam = 1;
- }
- else {
- $spam = 0;
- }
+ $learn_func = \&learn_rspamc;
}
- my $r = $learn_func->( $args, $spam );
- if ($r) {
- $processed += $r;
+ $len = int( scalar @{$ar_ham} * $train_fraction );
+ my @cur_vec;
+
+ # Shuffle spam and ham samples
+ for ( my $i = 0 ; $i < $len ; $i++ ) {
+ if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
+ push @cur_vec, @{$ar_ham}[$i];
+ push @files_ham, [@cur_vec];
+ @cur_vec = ();
+ $total++;
+ }
+ else {
+ push @cur_vec, @{$ar_ham}[$i];
+ }
}
- }
- return $processed;
-}
+ $len = int( scalar @{$ar_spam} * $train_fraction );
+ @cur_vec = ();
+ for ( my $i = 0 ; $i < $len ; $i++ ) {
+ if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
+ push @cur_vec, @{$ar_spam}[$i];
+ push @files_spam, [@cur_vec];
+ @cur_vec = ();
+ $total++;
+ }
+ else {
+ push @cur_vec, @{$ar_spam}[$i];
+ }
+ }
-sub check_rspamc {
- my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
+ for ( my $i = 0 ; $i < $total ; $i++ ) {
+ my $args;
+ my $spam;
- my $args_quoted = shell_quote @{$files};
- my $processed = 0;
+ if ( $i % 2 == 0 ) {
+ $args = pop @files_spam;
- open(
- my $p,
-"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
- ) or die "cannot spawn $rspamc: $!";
-
- while (<$p>) {
- my $res = eval('decode_json($_)');
- if ( $res && $res->{'default'} ) {
- $processed++;
-
- if ($spam) {
- if ( $res->{'default'}->{$ham_symbol} ) {
- my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
- if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
- my $percentage = int($1);
- if ( $percentage >= $rspamc_prob_trigger ) {
- $$fp_cnt++;
+ if ( !$args ) {
+ $args = pop @files_ham;
+ $spam = 0;
+ }
+ else {
+ $spam = 1;
}
- }
- else {
- $$fp_cnt++;
- }
- }
- elsif ( !$res->{'default'}->{$spam_symbol} ) {
- $$fn_cnt++;
}
else {
- $$detected_cnt++;
- }
- }
- else {
- if ( $res->{'default'}->{$spam_symbol} ) {
- my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
- if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
-
- my $percentage = int($1);
- if ( $percentage >= $rspamc_prob_trigger ) {
- $$fp_cnt++;
+ $args = pop @files_ham;
+ if ( !$args ) {
+ $args = pop @files_spam;
+ $spam = 1;
+ }
+ else {
+ $spam = 0;
}
- }
- else {
- $$fp_cnt++;
- }
- }
- elsif ( !$res->{'default'}->{$ham_symbol} ) {
- $$fn_cnt++;
}
- else {
- $$detected_cnt++;
+
+ my $r = $learn_func->( $args, $spam );
+ if ($r) {
+ $processed += $r;
}
- }
}
- }
- return $processed;
+ return $processed;
}
-sub check_bogofilter {
- my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
- my $processed = 0;
+sub check_rspamc {
+ my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
+ my $args_quoted = shell_quote @{$files};
+ my $processed = 0;
- open( my $p, "$bogofilter -t -I $args_quoted |" )
- or die "cannot spawn $bogofilter: $!";
+ open(
+ my $p,
+"$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
+ ) or die "cannot spawn $rspamc: $!";
while (<$p>) {
- if ( $_ =~ /^([SHU])\s+.*$/ ) {
- $processed++;
-
- if ($spam) {
- if ( $1 eq 'H' ) {
- $$fp_cnt++;
- }
- elsif ( $1 eq 'U' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
- }
- else {
- if ( $1 eq 'S' ) {
- $$fp_cnt++;
- }
- elsif ( $1 eq 'U' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
+ my $res = eval('decode_json($_)');
+ if ( $res && $res->{'default'} ) {
+ $processed++;
+
+ if ($spam) {
+ if ( $res->{'default'}->{$ham_symbol} ) {
+ my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
+ if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
+ my $percentage = int($1);
+ if ( $percentage >= $rspamc_prob_trigger ) {
+ $$fp_cnt++;
+ }
+ }
+ else {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( !$res->{'default'}->{$spam_symbol} ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ else {
+ if ( $res->{'default'}->{$spam_symbol} ) {
+ my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
+ if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
+
+ my $percentage = int($1);
+ if ( $percentage >= $rspamc_prob_trigger ) {
+ $$fp_cnt++;
+ }
+ }
+ else {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( !$res->{'default'}->{$ham_symbol} ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
}
- }
}
- }
- return $processed;
+ return $processed;
}
-sub check_dspam {
- my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
- my $processed = 0;
+sub check_bogofilter {
+ my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
+ my $processed = 0;
+
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
+
+ open( my $p, "$bogofilter -t -I $args_quoted |" )
+ or die "cannot spawn $bogofilter: $!";
+
+ while (<$p>) {
+ if ( $_ =~ /^([SHU])\s+.*$/ ) {
+ $processed++;
+
+ if ($spam) {
+ if ( $1 eq 'H' ) {
+ $$fp_cnt++;
+ }
+ elsif ( $1 eq 'U' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ else {
+ if ( $1 eq 'S' ) {
+ $$fp_cnt++;
+ }
+ elsif ( $1 eq 'U' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ }
+ }
+ }
- foreach my $f ( @{$files} ) {
- my $args_quoted = shell_quote $f;
+ return $processed;
+}
- my $pid = open2( *Reader, *Writer,
- "$dspam --user nobody --classify --stdout --mode=notrain" );
- open( my $inp, "< $f" );
- while (<$inp>) {
- print Writer $_;
- }
- close Writer;
+sub check_dspam {
+ my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
+ my $processed = 0;
- while (<Reader>) {
- if ( $_ =~
-qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$)
- )
- {
- $processed++;
- my $percentage = int($2 * 100.0);
+ foreach my $f ( @{$files} ) {
+ my $args_quoted = shell_quote $f;
- if ($spam) {
- if ( $1 eq 'Innocent') {
- if ( $percentage <= (100 - $rspamc_prob_trigger) ) {
- $$fp_cnt++;
- }
- }
- elsif ( $1 ne 'Spam' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
+ my $pid = open2( *Reader, *Writer, "$dspam --user nobody --classify --stdout --mode=notrain" );
+ open( my $inp, "< $f" );
+ while (<$inp>) {
+ print Writer $_;
}
- else {
- if ( $1 eq 'Spam' ) {
- if ( $percentage >= $rspamc_prob_trigger ) {
- $$fp_cnt++;
+ close Writer;
+
+ while (<Reader>) {
+ if ( $_ =~ qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$) ) {
+ $processed++;
+ my $percentage = int( $2 * 100.0 );
+
+ if ($spam) {
+ if ( $1 eq 'Innocent' ) {
+ if ( $percentage <= ( 100 - $rspamc_prob_trigger ) ) {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( $1 ne 'Spam' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
+ else {
+ if ( $1 eq 'Spam' ) {
+ if ( $percentage >= $rspamc_prob_trigger ) {
+ $$fp_cnt++;
+ }
+ }
+ elsif ( $1 ne 'Innocent' ) {
+ $$fn_cnt++;
+ }
+ else {
+ $$detected_cnt++;
+ }
+ }
}
- }
- elsif ( $1 ne 'Innocent' ) {
- $$fn_cnt++;
- }
- else {
- $$detected_cnt++;
- }
}
- }
+ close Reader;
+ waitpid( $pid, 0 );
}
- close Reader;
- waitpid( $pid, 0 );
- }
- return $processed;
+ return $processed;
}
sub cross_validate {
- my ($hr) = @_;
- my $args = "";
- my $processed = 0;
- my $fp_spam = 0;
- my $fn_spam = 0;
- my $fp_ham = 0;
- my $fn_ham = 0;
- my $total_spam = 0;
- my $total_ham = 0;
- my $detected_spam = 0;
- my $detected_ham = 0;
- my $i = 0;
- my $len = scalar keys %{$hr};
- my @files_spam;
- my @files_ham;
- my @cur_spam;
- my @cur_ham;
- my $check_func;
-
- if ($use_dspam) {
- $check_func = \&check_dspam;
- }
- elsif ($use_bogofilter) {
- $check_func = \&check_bogofilter;
- }
- else {
- $check_func = \&check_rspamc;
- }
-
- while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
- if ($spam) {
- if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
- push @cur_spam, $fn;
- push @files_spam, [@cur_spam];
- @cur_spam = ();
- }
- else {
- push @cur_spam, $fn;
- }
+ my ($hr) = @_;
+ my $args = "";
+ my $processed = 0;
+ my $fp_spam = 0;
+ my $fn_spam = 0;
+ my $fp_ham = 0;
+ my $fn_ham = 0;
+ my $total_spam = 0;
+ my $total_ham = 0;
+ my $detected_spam = 0;
+ my $detected_ham = 0;
+ my $i = 0;
+ my $len = scalar keys %{$hr};
+ my @files_spam;
+ my @files_ham;
+ my @cur_spam;
+ my @cur_ham;
+ my $check_func;
+
+ if ($use_dspam) {
+ $check_func = \&check_dspam;
+ }
+ elsif ($use_bogofilter) {
+ $check_func = \&check_bogofilter;
}
else {
- if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
- push @cur_ham, $fn;
- push @files_ham, [@cur_ham];
- @cur_ham = ();
- }
- else {
- push @cur_ham, $fn;
- }
+ $check_func = \&check_rspamc;
+ }
+
+ while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
+ if ($spam) {
+ if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
+ push @cur_spam, $fn;
+ push @files_spam, [@cur_spam];
+ @cur_spam = ();
+ }
+ else {
+ push @cur_spam, $fn;
+ }
+ }
+ else {
+ if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
+ push @cur_ham, $fn;
+ push @files_ham, [@cur_ham];
+ @cur_ham = ();
+ }
+ else {
+ push @cur_ham, $fn;
+ }
+ }
}
- }
- shuffle_array( \@files_spam );
+ shuffle_array( \@files_spam );
- foreach my $fn (@files_spam) {
- my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
- $total_spam += $r;
- $processed += $r;
- }
+ foreach my $fn (@files_spam) {
+ my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
+ $total_spam += $r;
+ $processed += $r;
+ }
- shuffle_array( \@files_ham );
+ shuffle_array( \@files_ham );
- foreach my $fn (@files_ham) {
- my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
- $total_ham += $r;
- $processed += $r;
- }
+ foreach my $fn (@files_ham) {
+ my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
+ $total_ham += $r;
+ $processed += $r;
+ }
- printf "Scanned %d messages
+ printf "Scanned %d messages
%d spam messages (%d detected)
-%d ham messages (%d detected)\n",
- $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
+%d ham messages (%d detected)\n", $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
- printf "\nHam FP rate: %.2f%% (%d messages)
-Ham FN rate: %.2f%% (%d messages)\n",
- $fp_ham / $total_ham * 100.0, $fp_ham,
- $fn_ham / $total_ham * 100.0, $fn_ham;
+ printf "\nHam FP rate: %.2f%% (%d messages)
+Ham FN rate: %.2f%% (%d messages)\n", $fp_ham / $total_ham * 100.0, $fp_ham, $fn_ham / $total_ham * 100.0, $fn_ham;
- printf "\nSpam FP rate: %.2f%% (%d messages)
+ printf "\nSpam FP rate: %.2f%% (%d messages)
Spam FN rate: %.2f%% (%d messages)\n",
- $fp_spam / $total_spam * 100.0, $fp_spam,
- $fn_spam / $total_spam * 100.0, $fn_spam;
+ $fp_spam / $total_spam * 100.0, $fp_spam,
+ $fn_spam / $total_spam * 100.0, $fn_spam;
}
if ( !$spam_dir || !$ham_dir ) {
- die "spam or/and ham directories are not specified";
+ die "spam or/and ham directories are not specified";
}
my @spam_samples;
@@ -473,24 +463,23 @@ shuffle_array( \@spam_samples );
shuffle_array( \@ham_samples );
if ( !$check_only ) {
- my $learned = 0;
- my $t0 = [gettimeofday];
- $learned = learn_samples( \@ham_samples, \@spam_samples );
- my $t1 = [gettimeofday];
+ my $learned = 0;
+ my $t0 = [gettimeofday];
+ $learned = learn_samples( \@ham_samples, \@spam_samples );
+ my $t1 = [gettimeofday];
- printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
- $learned, tv_interval( $t0, $t1 );
+ printf "Learned classifier, %d items processed, %.2f seconds elapsed\n", $learned, tv_interval( $t0, $t1 );
}
my %validation_set;
my $len = int( scalar @spam_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
- $validation_set{ $spam_samples[$i] } = 1;
+ $validation_set{ $spam_samples[$i] } = 1;
}
$len = int( scalar @ham_samples * $train_fraction );
for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
- $validation_set{ $ham_samples[$i] } = 0;
+ $validation_set{ $ham_samples[$i] } = 0;
}
cross_validate( \%validation_set );
diff --git a/utils/fann_train.pl b/utils/fann_train.pl
index 46b539489..2ce422eb4 100755
--- a/utils/fann_train.pl
+++ b/utils/fann_train.pl
@@ -8,28 +8,28 @@ use warnings FATAL => 'all';
use AI::FANN qw(:all);
use Getopt::Std;
-my %sym_idx; # Symbols by index
-my %sym_names; # Symbols by name
-my $num = 1; # Number of symbols
+my %sym_idx; # Symbols by index
+my %sym_names; # Symbols by name
+my $num = 1; # Number of symbols
my @spam;
my @ham;
-my $max_samples = -1;
-my $split = 1;
-my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN>
-my $score_spam = 12;
-my $score_ham = -6;
+my $max_samples = -1;
+my $split = 1;
+my $preprocessed = 0; # output is in format <score>:<0|1>:<SYM1,...SYMN>
+my $score_spam = 12;
+my $score_ham = -6;
sub process {
- my ($input, $spam, $ham) = @_;
+ my ( $input, $spam, $ham ) = @_;
my $samples = 0;
- while(<$input>) {
- if (!$preprocessed) {
+ while (<$input>) {
+ if ( !$preprocessed ) {
if (/^.*rspamd_task_write_log.*: \[(-?\d+\.?\d*)\/(\d+\.?\d*)\]\s*\[(.+)\].*$/) {
- if ($1 > $score_spam) {
+ if ( $1 > $score_spam ) {
$_ = "$1:1: $3";
}
- elsif ($1 < $score_ham) {
+ elsif ( $1 < $score_ham ) {
$_ = "$1:0: $3\n";
}
else {
@@ -47,7 +47,7 @@ sub process {
my $is_spam = 0;
- if ($2 == 1) {
+ if ( $2 == 1 ) {
$is_spam = 1;
}
@@ -56,13 +56,13 @@ sub process {
foreach my $sym (@ar) {
chomp $sym;
- if (!$sym_idx{$sym}) {
- $sym_idx{$sym} = $num;
+ if ( !$sym_idx{$sym} ) {
+ $sym_idx{$sym} = $num;
$sym_names{$num} = $sym;
$num++;
}
- $sample{$sym_idx{$sym}} = 1;
+ $sample{ $sym_idx{$sym} } = 1;
}
if ($is_spam) {
@@ -73,32 +73,31 @@ sub process {
}
$samples++;
- if ($max_samples > 0 && $samples > $max_samples) {
+ if ( $max_samples > 0 && $samples > $max_samples ) {
return;
}
}
}
# Shuffle array
-sub fisher_yates_shuffle
-{
+sub fisher_yates_shuffle {
my $array = shift;
- my $i = @$array;
+ my $i = @$array;
while ( --$i ) {
my $j = int rand( $i + 1 );
- @$array[$i, $j] = @$array[$j, $i];
+ @$array[ $i, $j ] = @$array[ $j, $i ];
}
}
# Train network
sub train {
- my ($ann, $sample, $result) = @_;
+ my ( $ann, $sample, $result ) = @_;
my @row;
- for (my $i = 1; $i < $num; $i++) {
- if ($sample->{$i}) {
+ for ( my $i = 1 ; $i < $num ; $i++ ) {
+ if ( $sample->{$i} ) {
push @row, 1;
}
else {
@@ -108,16 +107,16 @@ sub train {
#print "@row -> @{$result}\n";
- $ann->train(\@row, \@{$result});
+ $ann->train( \@row, \@{$result} );
}
sub test {
- my ($ann, $sample) = @_;
+ my ( $ann, $sample ) = @_;
my @row;
- for (my $i = 1; $i < $num; $i++) {
- if ($sample->{$i}) {
+ for ( my $i = 1 ; $i < $num ; $i++ ) {
+ if ( $sample->{$i} ) {
push @row, 1;
}
else {
@@ -125,117 +124,120 @@ sub test {
}
}
- my $ret = $ann->run(\@row);
+ my $ret = $ann->run( \@row );
return $ret;
}
my %opts;
-getopts('o:i:s:n:t:hpS:H:', \%opts);
+getopts( 'o:i:s:n:t:hpS:H:', \%opts );
-if ($opts{'h'}) {
+if ( $opts{'h'} ) {
print "$0 [-i input] [-o output] [-s scores] [-n max_samples] [-S spam_score] [-H ham_score] [-ph]\n";
exit;
}
my $input = *STDIN;
-if ($opts{'i'}) {
- open($input, '<', $opts{'i'}) or die "cannot open $opts{i}";
+if ( $opts{'i'} ) {
+ open( $input, '<', $opts{'i'} ) or die "cannot open $opts{i}";
}
-if ($opts{'n'}) {
+if ( $opts{'n'} ) {
$max_samples = $opts{'n'};
}
-if ($opts{'t'}) {
+if ( $opts{'t'} ) {
+
# Test split
$split = $opts{'t'};
}
-if ($opts{'p'}) {
+if ( $opts{'p'} ) {
$preprocessed = 1;
}
-if ($opts{'H'}) {
+if ( $opts{'H'} ) {
$score_ham = $opts{'H'};
}
-if ($opts{'S'}) {
+if ( $opts{'S'} ) {
$score_spam = $opts{'S'};
}
# ham_prob, spam_prob
my @spam_out = (1);
-my @ham_out = (0);
+my @ham_out = (0);
-process($input, \@spam, \@ham);
-fisher_yates_shuffle(\@spam);
-fisher_yates_shuffle(\@ham);
+process( $input, \@spam, \@ham );
+fisher_yates_shuffle( \@spam );
+fisher_yates_shuffle( \@ham );
-my $nspam = int(scalar(@spam) / $split);
-my $nham = int(scalar(@ham) / $split);
+my $nspam = int( scalar(@spam) / $split );
+my $nham = int( scalar(@ham) / $split );
-my $ann = AI::FANN->new_standard($num - 1, ($num + 2) / 2, 1);
+my $ann = AI::FANN->new_standard( $num - 1, ( $num + 2 ) / 2, 1 );
my @train_data;
+
# Train ANN
-for (my $i = 0; $i < $nham; $i++) {
+for ( my $i = 0 ; $i < $nham ; $i++ ) {
push @train_data, [ $ham[$i], \@ham_out ];
}
-for (my $i = 0; $i < $nspam; $i++) {
+for ( my $i = 0 ; $i < $nspam ; $i++ ) {
push @train_data, [ $spam[$i], \@spam_out ];
}
-fisher_yates_shuffle(\@train_data);
+fisher_yates_shuffle( \@train_data );
foreach my $train_row (@train_data) {
- train($ann, @{$train_row}[0], @{$train_row}[1]);
+ train( $ann, @{$train_row}[0], @{$train_row}[1] );
}
print "Trained $nspam SPAM and $nham HAM samples\n";
# Now run fann
-if ($split > 1) {
- my $sample = 0.0;
+if ( $split > 1 ) {
+ my $sample = 0.0;
my $correct = 0.0;
- for (my $i = $nham; $i < $nham * $split; $i++) {
- my $ret = test($ann, $ham[$i]);
+ for ( my $i = $nham ; $i < $nham * $split ; $i++ ) {
+ my $ret = test( $ann, $ham[$i] );
+
#print "@{$ret}\n";
- if (@{$ret}[0] < 0.5) {
+ if ( @{$ret}[0] < 0.5 ) {
$correct++;
}
$sample++;
}
- print "Tested $sample HAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n";
+ print "Tested $sample HAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
- $sample = 0.0;
+ $sample = 0.0;
$correct = 0.0;
- for (my $i = $nspam; $i < $nspam * $split; $i++) {
- my $ret = test($ann, $spam[$i]);
+ for ( my $i = $nspam ; $i < $nspam * $split ; $i++ ) {
+ my $ret = test( $ann, $spam[$i] );
+
#print "@{$ret}\n";
- if (@{$ret}[0] > 0.5) {
+ if ( @{$ret}[0] > 0.5 ) {
$correct++;
}
$sample++;
}
- print "Tested $sample SPAM samples, correct matched: $correct, rate: ".($correct / $sample)."\n";
+ print "Tested $sample SPAM samples, correct matched: $correct, rate: " . ( $correct / $sample ) . "\n";
}
-if ($opts{'o'}) {
- $ann->save($opts{'o'}) or die "cannot save ann into $opts{o}";
+if ( $opts{'o'} ) {
+ $ann->save( $opts{'o'} ) or die "cannot save ann into $opts{o}";
}
-if ($opts{'s'}) {
- open(my $scores, '>',
- $opts{'s'}) or die "cannot open score file $opts{'s'}";
+if ( $opts{'s'} ) {
+ open( my $scores, '>', $opts{'s'} ) or die "cannot open score file $opts{'s'}";
print $scores "{";
- for (my $i = 1; $i < $num; $i++) {
+ for ( my $i = 1 ; $i < $num ; $i++ ) {
my $n = $i - 1;
- if ($i != $num - 1) {
+ if ( $i != $num - 1 ) {
print $scores "\"$sym_names{$i}\":$n,";
}
else {
diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl
index ac7b1349f..f97e35188 100755
--- a/utils/rspamd_stats.pl
+++ b/utils/rspamd_stats.pl
@@ -15,20 +15,20 @@ my @symbols_bidirectional;
my @symbols_groups;
my @symbols_ignored;
my %groups;
-my $reject_score = 15.0;
-my $junk_score = 6.0;
-my $diff_alpha = 0.1;
-my $correlations = 0;
-my $nrelated = 10;
-my $log_file = "";
+my $reject_score = 15.0;
+my $junk_score = 6.0;
+my $diff_alpha = 0.1;
+my $correlations = 0;
+my $nrelated = 10;
+my $log_file = "";
my $search_pattern = "";
-my $startTime="";
+my $startTime = "";
my $endTime;
my $num_logs;
my $exclude_logs = 0;
-my $man = 0;
-my $json = 0;
-my $help = 0;
+my $man = 0;
+my $json = 0;
+my $help = 0;
# Associate file extensions with decompressors
my %decompressor = (
@@ -39,44 +39,43 @@ my %decompressor = (
);
GetOptions(
- "reject-score|r=f" => \$reject_score,
- "junk-score|j=f" => \$junk_score,
- "symbol|s=s@" => \@symbols_search,
- "symbol-bidir|S=s@" => \@symbols_bidirectional,
- "exclude|X=s@" => \@symbols_exclude,
- "ignore=s@" => \@symbols_ignored,
- "group|g=s@" => \@symbols_groups,
- "log|l=s" => \$log_file,
- "alpha-score|alpha|a=f" => \$diff_alpha,
- "correlations|c" => \$correlations,
- "nrelated=i" => \$nrelated,
- "search-pattern=s" => \$search_pattern,
- "start=s" => \$startTime,
- "end=s" => \$endTime,
- "num-logs|n=i" => \$num_logs,
- "exclude-logs|x=i" => \$exclude_logs,
- "json|j" => \$json,
- "help|?" => \$help,
- "man" => \$man
+ "reject-score|r=f" => \$reject_score,
+ "junk-score|j=f" => \$junk_score,
+ "symbol|s=s@" => \@symbols_search,
+ "symbol-bidir|S=s@" => \@symbols_bidirectional,
+ "exclude|X=s@" => \@symbols_exclude,
+ "ignore=s@" => \@symbols_ignored,
+ "group|g=s@" => \@symbols_groups,
+ "log|l=s" => \$log_file,
+ "alpha-score|alpha|a=f" => \$diff_alpha,
+ "correlations|c" => \$correlations,
+ "nrelated=i" => \$nrelated,
+ "search-pattern=s" => \$search_pattern,
+ "start=s" => \$startTime,
+ "end=s" => \$endTime,
+ "num-logs|n=i" => \$num_logs,
+ "exclude-logs|x=i" => \$exclude_logs,
+ "json|j" => \$json,
+ "help|?" => \$help,
+ "man" => \$man
) or pod2usage(2);
pod2usage(1) if $help;
-pod2usage(-exitval => 0, -verbose => 2) if $man;
-
+pod2usage( -exitval => 0, -verbose => 2 ) if $man;
# Global vars
-my $total = 0;
-my $total_spam = 0;
-my $total_junk = 0;
-my $junk_symbols = 0;
-my $spam_symbols = 0;
-my $ham_symbols = 0;
+my $total = 0;
+my $total_spam = 0;
+my $total_junk = 0;
+my $junk_symbols = 0;
+my $spam_symbols = 0;
+my $ham_symbols = 0;
my $ham_spam_change = 0;
my $ham_junk_change = 0;
my %sym_res;
my $rspamd_log;
-my $enabled = 0;
-my $log_file_num = 1;
+my $enabled = 0;
+my $log_file_num = 1;
my $spinner_update_time = 0;
my %action;
@@ -91,688 +90,689 @@ foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) }
# Convert bidirectional symbols
foreach my $s (@symbols_bidirectional) {
- $bidir_match{$s} = {
- spam => "${s}_SPAM",
- ham => "${s}_HAM",
- };
- push @symbols_search, $s unless grep /^$s$/, @symbols_search;
+ $bidir_match{$s} = {
+ spam => "${s}_SPAM",
+ ham => "${s}_HAM",
+ };
+ push @symbols_search, $s unless grep /^$s$/, @symbols_search;
}
# Deal with groups
my $group_id = 0;
foreach my $g (@symbols_groups) {
- my @symbols = split /,/,$g;
- my $group_name = "group$group_id";
+ my @symbols = split /,/, $g;
+ my $group_name = "group$group_id";
- foreach my $s (@symbols) {
- $groups{$s} = $group_name;
- push @symbols_search, $s unless grep /^$s$/, @symbols_search;
- }
+ foreach my $s (@symbols) {
+ $groups{$s} = $group_name;
+ push @symbols_search, $s unless grep /^$s$/, @symbols_search;
+ }
}
@symbols_search = '.*'
unless @symbols_search;
-if ($log_file eq '-' || $log_file eq '') {
- $rspamd_log = \*STDIN;
- &ProcessLog();
+if ( $log_file eq '-' || $log_file eq '' ) {
+ $rspamd_log = \*STDIN;
+ &ProcessLog();
}
elsif ( -d "$log_file" ) {
- my $log_dir = "$log_file";
+ my $log_dir = "$log_file";
- my @logs = &GetLogfilesList($log_dir);
+ my @logs = &GetLogfilesList($log_dir);
- # Process logs
- foreach (@logs) {
- my $ext = (/[^.]+\.?([^.]*?)$/)[0];
- my $dc = $decompressor{$ext} || 'cat';
+ # Process logs
+ foreach (@logs) {
+ my $ext = (/[^.]+\.?([^.]*?)$/)[0];
+ my $dc = $decompressor{$ext} || 'cat';
- open( $rspamd_log, "-|", "$dc $log_dir/$_" )
- or die "cannot execute $dc $log_dir/$_ : $!";
+ open( $rspamd_log, "-|", "$dc $log_dir/$_" )
+ or die "cannot execute $dc $log_dir/$_ : $!";
- printf {interactive(*STDERR)} "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, $_;
- $spinner_update_time = 0; # Force spinner update
- &spinner;
+ printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs,
+ $_;
+ $spinner_update_time = 0; # Force spinner update
+ &spinner;
- &ProcessLog;
+ &ProcessLog;
- close($rspamd_log)
- or warn "cannot close $dc $log_dir/$_: $!";
- }
- print {interactive(*STDERR)} "\033[J\033[G"; # Progress indicator clean-up
+ close($rspamd_log)
+ or warn "cannot close $dc $log_dir/$_: $!";
+ }
+ print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up
}
else {
- my $ext = ($log_file =~ /[^.]+\.?([^.]*?)$/)[0];
- my $dc = $decompressor{$ext} || 'cat';
- open( $rspamd_log, "-|", "$dc $log_file" )
- or die "cannot execute $dc $log_file : $!";
- $spinner_update_time = 0; # Force spinner update
- &spinner;
- &ProcessLog();
+ my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0];
+ my $dc = $decompressor{$ext} || 'cat';
+ open( $rspamd_log, "-|", "$dc $log_file" )
+ or die "cannot execute $dc $log_file : $!";
+ $spinner_update_time = 0; # Force spinner update
+ &spinner;
+ &ProcessLog();
}
-my $total_ham = $total - ($total_spam + $total_junk);
+my $total_ham = $total - ( $total_spam + $total_junk );
if ($json) {
- print "{";
- &Summary();
- print '"symbols":{';
- &SymbolsStat();
- print "}}\n";
+ print "{";
+ &Summary();
+ print '"symbols":{';
+ &SymbolsStat();
+ print "}}\n";
}
else {
- &SymbolsStat();
- &Summary();
+ &SymbolsStat();
+ &Summary();
}
exit;
sub IsIgnored {
- my ($sym) = @_;
+ my ($sym) = @_;
- foreach my $ex (@symbols_ignored) {
- if ($sym =~ /^$ex$/) {
- return 1;
+ foreach my $ex (@symbols_ignored) {
+ if ( $sym =~ /^$ex$/ ) {
+ return 1;
+ }
}
- }
- return 0;
+ return 0;
}
sub GenRelated {
- my ($htb, $target_sym) = @_;
-
- my @result;
- my $i = 0;
- foreach my $sym (sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb}) {
- if ($sym ne $target_sym) {
- my @elt = ($sym, $htb->{$sym});
- push @result, \@elt;
- $i ++;
- }
+ my ( $htb, $target_sym ) = @_;
+
+ my @result;
+ my $i = 0;
+ foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) {
+ if ( $sym ne $target_sym ) {
+ my @elt = ( $sym, $htb->{$sym} );
+ push @result, \@elt;
+ $i++;
+ }
- last if $i > $nrelated;
- }
+ last if $i > $nrelated;
+ }
- return \@result;
+ return \@result;
}
sub StringifyRelated {
- my ($ar, $total) = @_;
- return join("\n", (map { sprintf "\t%s(%s: %.1f%%)",
- $_->[0], $_->[1], $_->[1] / ($total * 1.0) * 100.0 } @{$ar}));
+ my ( $ar, $total ) = @_;
+ return
+ join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) );
}
sub SymbolsStat {
- if ($total > 0) {
- my $has_comma = 0;
- while (my ($s, $r) = each(%sym_res)) {
- if ($r->{hits} > 0) {
- my $th = $r->{hits};
- my $sh = $r->{spam_hits};
- my $jh = $r->{junk_hits};
- my $hh = $r->{hits} - $sh - $jh;
- my $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
- my $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
- my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
-
- if ($json) {
- if ($has_comma) {
- print ",";
- }
- else {
- $has_comma = 1;
- }
- print "\"$s\":{";
- JsonObjectElt("avg_weight", $r->{'weight'},"%.4f");
- print ",";
- JsonObjectElt("hits", $th, "%d");
- print ",";
- JsonObjectElt("hits_percentage", $th/$total, "%.4f");
- print ",";
- JsonObjectElt("spam_hits", $sh, "%d");
- print ",";
- JsonObjectElt("spam_to_total", $sh/$th, "%.4f");
- print ",";
- JsonObjectElt("spam_percentage", $stp/100.0 || 0, "%.4f");
- print ",";
- JsonObjectElt("ham_hits", $hh, "%d");
- print ",";
- JsonObjectElt("ham_to_total", $hh/$th, "%.4f");
- print ",";
- JsonObjectElt("ham_percentage", $htp/100.0 || 0, "%.4f");
- print ",";
- JsonObjectElt("junk_hits", $jh, "%d");
- print ",";
- JsonObjectElt("junk_to_total", $jh/$th, "%.4f");
- print ",";
- JsonObjectElt("junk_percentage", $jtp/100.0 || 0, "%.4f");
- }
- else {
- printf "%s avg. weight %.3f, hits %d(%.3f%%):
+ if ( $total > 0 ) {
+ my $has_comma = 0;
+ while ( my ( $s, $r ) = each(%sym_res) ) {
+ if ( $r->{hits} > 0 ) {
+ my $th = $r->{hits};
+ my $sh = $r->{spam_hits};
+ my $jh = $r->{junk_hits};
+ my $hh = $r->{hits} - $sh - $jh;
+ my $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
+ my $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
+ my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
+
+ if ($json) {
+ if ($has_comma) {
+ print ",";
+ }
+ else {
+ $has_comma = 1;
+ }
+ print "\"$s\":{";
+ JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" );
+ print ",";
+ JsonObjectElt( "hits", $th, "%d" );
+ print ",";
+ JsonObjectElt( "hits_percentage", $th / $total, "%.4f" );
+ print ",";
+ JsonObjectElt( "spam_hits", $sh, "%d" );
+ print ",";
+ JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" );
+ print ",";
+ JsonObjectElt( "ham_hits", $hh, "%d" );
+ print ",";
+ JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_hits", $jh, "%d" );
+ print ",";
+ JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" );
+ }
+ else {
+ printf "%s avg. weight %.3f, hits %d(%.3f%%):
Ham %7.3f%%, %6d/%-6d (%7.3f%%)
Spam %7.3f%%, %6d/%-6d (%7.3f%%)
Junk %7.3f%%, %6d/%-6d (%7.3f%%)
-",
- $s, $r->{weight} / $r->{hits}, $th, ($th / $total * 100),
- ($hh / $th * 100), $hh, $total_ham, ($htp or 0),
- ($sh / $th * 100), $sh, $total_spam, ($stp or 0),
- ($jh / $th * 100), $jh, $total_junk, ($jtp or 0);
- }
+", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ),
+ ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ),
+ ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ),
+ ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 );
+ }
- my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
- my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
+ my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
+ my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
- if ($r->{weight} != 0) {
- if (!$json) {
- if ($r->{weight} > 0) {
- printf "
+ if ( $r->{weight} != 0 ) {
+ if ( !$json ) {
+ if ( $r->{weight} > 0 ) {
+ printf "
Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%)
Spam changes / total spam hits: %6d/%-6d (%7.3f%%)
Junk changes (ham -> junk): %6d/%-6d (%7.3f%%)
Junk changes / total junk hits: %6d/%-6d (%7.3f%%)
",
- $r->{spam_change}, $th, ($r->{spam_change} / $th * 100),
- $r->{spam_change}, $total_spam, ($schp or 0),
- $r->{junk_change}, $th, ($r->{junk_change} / $th * 100),
- $r->{junk_change}, $total_junk, ($jchp or 0);
- }
- else {
- printf "
+ $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
+ $r->{spam_change}, $total_spam, ( $schp or 0 ),
+ $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
+ $r->{junk_change}, $total_junk, ( $jchp or 0 );
+ }
+ else {
+ printf "
Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%)
Spam changes / total spam hits : %6d/%-6d (%7.3f%%)
Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%)
Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
",
- $r->{spam_change}, $th, ($r->{spam_change} / $th * 100),
- $r->{spam_change}, $total_spam, ($schp or 0),
- $r->{junk_change}, $th, ($r->{junk_change} / $th * 100),
- $r->{junk_change}, $total_junk, ($jchp or 0);
- }
- }
- else {
- print ",";
- JsonObjectElt("spam_change", $r->{spam_change}, "%.4f");
- print ",";
- JsonObjectElt("junk_change", $r->{junk_change}, "%.4f");
- }
- }
-
- if ($correlations) {
-
- my $spam_related = GenRelated($r->{symbols_met_spam}, $s);
- my $junk_related = GenRelated($r->{symbols_met_junk}, $s);
- my $ham_related = GenRelated($r->{symbols_met_ham}, $s);
-
- if (!$json) {
- print "Correlations report:\n";
-
- while (my ($cs, $hits) = each %{$r->{corr}}) {
- my $corr_prob = $r->{'hits'} / $total;
- my $merged_hits = 0;
- if($r->{symbols_met_spam}->{$cs}) {
- $merged_hits += $r->{symbols_met_spam}->{$cs};
- }
- if($r->{symbols_met_junk}->{$cs}) {
- $merged_hits += $r->{symbols_met_junk}->{$cs};
- }
- if($r->{symbols_met_ham}->{$cs}) {
- $merged_hits += $r->{symbols_met_ham}->{$cs};
- }
-
- if ($merged_hits > 0) {
- printf "Probability of %s when %s fires: %.3f\n", $cs, $s,
- (($merged_hits / $total) / $corr_prob);
- }
- }
+ $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
+ $r->{spam_change}, $total_spam, ( $schp or 0 ),
+ $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
+ $r->{junk_change}, $total_junk, ( $jchp or 0 );
+ }
+ }
+ else {
+ print ",";
+ JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" );
+ print ",";
+ JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" );
+ }
+ }
- print "Related symbols report:\n";
- printf "Top related in spam:\n %s\n", StringifyRelated($spam_related,
- $r->{spam_hits});
- printf "Top related in junk:\n %s\n", StringifyRelated($junk_related,
- $r->{junk_hits});
- printf "Top related in ham:\n %s\n", StringifyRelated($ham_related,
- $r->{hits} - $r->{spam_hits} - $r->{junk_hits});
- }
- else {
- print ",";
- print "\"correllations\":{";
+ if ($correlations) {
+
+ my $spam_related = GenRelated( $r->{symbols_met_spam}, $s );
+ my $junk_related = GenRelated( $r->{symbols_met_junk}, $s );
+ my $ham_related = GenRelated( $r->{symbols_met_ham}, $s );
+
+ if ( !$json ) {
+ print "Correlations report:\n";
+
+ while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
+ my $corr_prob = $r->{'hits'} / $total;
+ my $merged_hits = 0;
+ if ( $r->{symbols_met_spam}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_spam}->{$cs};
+ }
+ if ( $r->{symbols_met_junk}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_junk}->{$cs};
+ }
+ if ( $r->{symbols_met_ham}->{$cs} ) {
+ $merged_hits += $r->{symbols_met_ham}->{$cs};
+ }
+
+ if ( $merged_hits > 0 ) {
+ printf "Probability of %s when %s fires: %.3f\n", $cs, $s,
+ ( ( $merged_hits / $total ) / $corr_prob );
+ }
+ }
+
+ print "Related symbols report:\n";
+ printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} );
+ printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} );
+ printf "Top related in ham:\n %s\n",
+ StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} );
+ }
+ else {
+ print ",";
+ print "\"correllations\":{";
+
+ my $has_comma_ = 0;
+ while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
+ if ($has_comma_) {
+ print ",";
+ }
+ else {
+ $has_comma_ = 1;
+ }
+ my $corr_prob = $hits / $total;
+ my $sym_prob = $r->{hits} / $total;
+ JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" );
+ }
+
+ print "}";
+ }
+ }
- my $has_comma_ = 0;
- while (my ($cs, $hits) = each %{$r->{corr}}) {
- if ($has_comma_) {
- print ",";
- }
- else {
- $has_comma_ = 1;
- }
- my $corr_prob = $hits / $total;
- my $sym_prob = $r->{hits} / $total;
- JsonObjectElt($cs, ($corr_prob / $sym_prob) ,"%.4f");
+ print "}" if $json;
+ }
+ else {
+ print "Symbol $s has not been met\n" if !$json;
}
- print "}";
- }
+ print '-' x 80 . "\n" if !$json;
}
-
- print "}" if $json;
- }
- else {
- print "Symbol $s has not been met\n" if !$json;
- }
-
- print '-' x 80 . "\n" if !$json;
}
- }
}
sub Summary() {
- if (!$json) {
- print "
+ if ( !$json ) {
+ print "
=== Summary ", '=' x 68, "
Messages scanned: $total";
- printf " [ %s / %s ]
+ printf " [ %s / %s ]
", $timeStamp{'start'}, $timeStamp{'end'}
- if defined $timeStamp{'start'};
- say '';
- printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_}
- for sort keys %action;
- say '';
- printf "scan time min/avg/max = %.2f/%.2f/%.2f s
-", $scanTime{'min'} / 1000,
- ($total) ? $scanTime{'total'} / $total / 1000 : undef,
- $scanTime{'max'} / 1000
- if exists $scanTime{'min'};
- say '=' x 80;
- }
- else {
- JsonObjectElt("total", $total, "%d");
- print ",";
-
- if (defined $timeStamp{'start'}) {
- JsonObjectElt("start", $timeStamp{'start'});
- print ",";
+ if defined $timeStamp{'start'};
+ say '';
+ printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action;
+ say '';
+ printf "scan time min/avg/max = %.2f/%.2f/%.2f s
+", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000
+ if exists $scanTime{'min'};
+ say '=' x 80;
}
+ else {
+ JsonObjectElt( "total", $total, "%d" );
+ print ",";
- if (defined $timeStamp{'end'}) {
- JsonObjectElt("end", $timeStamp{'end'});
- print ",";
- }
+ if ( defined $timeStamp{'start'} ) {
+ JsonObjectElt( "start", $timeStamp{'start'} );
+ print ",";
+ }
- print "\"actions\":{";
+ if ( defined $timeStamp{'end'} ) {
+ JsonObjectElt( "end", $timeStamp{'end'} );
+ print ",";
+ }
- my $has_comma = 0;
- foreach my $a (sort keys %action) {
- if ($has_comma) {
- print ",";
- }
- else {
- $has_comma = 1;
- }
- JsonObjectElt($a, $action{$a}, "%d");
+ print "\"actions\":{";
+
+ my $has_comma = 0;
+ foreach my $a ( sort keys %action ) {
+ if ($has_comma) {
+ print ",";
+ }
+ else {
+ $has_comma = 1;
+ }
+ JsonObjectElt( $a, $action{$a}, "%d" );
+ }
+ print "},";
}
- print "},";
- }
}
sub ProcessRelated {
- my ($symbols, $target, $source) = @_;
+ my ( $symbols, $target, $source ) = @_;
- foreach my $s (@{$symbols}) {
- $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
- my $sym_name = $1;
- my $sym_score = 0;
+ foreach my $s ( @{$symbols} ) {
+ $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
+ my $sym_name = $1;
+ my $sym_score = 0;
- if ($groups{$sym_name}) {
- $sym_name = $groups{$sym_name};
- }
+ if ( $groups{$sym_name} ) {
+ $sym_name = $groups{$sym_name};
+ }
- next if ($source eq $sym_name);
+ next if ( $source eq $sym_name );
- next if IsIgnored($sym_name);
+ next if IsIgnored($sym_name);
- if ($2) {
- $sym_score = $3 * 1.0;
+ if ($2) {
+ $sym_score = $3 * 1.0;
- if (abs($sym_score) < $diff_alpha) {
- next;
- }
+ if ( abs($sym_score) < $diff_alpha ) {
+ next;
+ }
+
+ my $bm = $bidir_match{$sym_name};
+ if ($bm) {
+ if ( $sym_score >= 0 ) {
+ $sym_name = $bm->{'spam'};
+ }
+ else {
+ $sym_name = $bm->{'ham'};
+ }
+ }
+ }
- my $bm = $bidir_match{$sym_name};
- if ($bm) {
- if ($sym_score >= 0) {
- $sym_name = $bm->{'spam'};
+ if ( exists( $target->{$sym_name} ) ) {
+ $target->{$sym_name}++;
}
else {
- $sym_name = $bm->{'ham'};
+ $target->{$sym_name} = 1;
}
- }
- }
-
- if (exists($target->{$sym_name})) {
- $target->{$sym_name} ++;
- }
- else {
- $target->{$sym_name} = 1;
}
- }
}
sub ProcessLog {
- my ( $ts_format, @line ) = &log_time_format($rspamd_log);
+ my ( $ts_format, @line ) = &log_time_format($rspamd_log);
- while() {
- last if eof $rspamd_log;
- $_ = (@line) ? shift @line : <$rspamd_log>;
+ while () {
+ last if eof $rspamd_log;
+ $_ = (@line) ? shift @line : <$rspamd_log>;
- if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) {
- $enabled = 1;
- }
-
- next if !$enabled;
-
- if (/^.*rspamd_task_write_log.*$/) {
- &spinner;
- my $ts;
- if ( $ts_format eq 'syslog' ) {
- $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] );
- } elsif ( $ts_format eq 'syslog5424' ) {
- /^([0-9-]+)T([0-9:]+)/;
- $ts = "$1 $2";
- } else {
- $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ];
- }
-
- next if ( $ts lt $startTime );
- next if ( defined $endTime && $ts gt $endTime );
-
- if ($_ !~ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/) {
- #print "BAD: $_\n";
- next;
- }
-
- my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
- my $scan_time = $5;
- my $act = $1;
- my $score = $2 * 1.0;
- my $skip = 0;
-
- foreach my $ex (@symbols_exclude) {
- my @found = grep {/^$ex/} @symbols;
-
- if (scalar(@found) > 0) {
- $skip = 1;
- last;
+ if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) {
+ $enabled = 1;
}
- }
-
- next if ( $skip != 0 );
-
- if (defined($timeStamp{'end'})) {
- $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} );
- }
- else {
- $timeStamp{'end'} = $ts;
- }
-
- if (defined($timeStamp{'start'})) {
- $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} );
- }
- else {
- $timeStamp{'start'} = $ts;
- }
-
- $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
- $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time );
- $scanTime{'total'} += $scan_time;
-
- $action{$act}++;
- $total ++;
- if ($score >= $reject_score) {
- $total_spam ++;
- }
- elsif ($score >= $junk_score) {
- $total_junk ++;
- }
+ next if !$enabled;
- my @sym_names;
-
- foreach my $s (@symbols_search) {
- my @selected = grep /$s/, @symbols;
+ if (/^.*rspamd_task_write_log.*$/) {
+ &spinner;
+ my $ts;
+ if ( $ts_format eq 'syslog' ) {
+ $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] );
+ }
+ elsif ( $ts_format eq 'syslog5424' ) {
+ /^([0-9-]+)T([0-9:]+)/;
+ $ts = "$1 $2";
+ }
+ else {
+ $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ];
+ }
- if (scalar(@selected) > 0) {
+ next if ( $ts lt $startTime );
+ next if ( defined $endTime && $ts gt $endTime );
- foreach my $sym (@selected) {
- $sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
- my $sym_name = $1;
- my $sym_score = 0;
- my $orig_name = $sym_name;
+ if ( $_ !~
+ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/ )
+ {
+ #print "BAD: $_\n";
+ next;
+ }
- if ($2) {
- $sym_score = $3 * 1.0;
+ my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
+ my $scan_time = $5;
+ my $act = $1;
+ my $score = $2 * 1.0;
+ my $skip = 0;
- if (abs($sym_score) < $diff_alpha) {
- next;
- }
+ foreach my $ex (@symbols_exclude) {
+ my @found = grep { /^$ex/ } @symbols;
- my $bm = $bidir_match{$sym_name};
- if ($bm) {
- if ($sym_score >= 0) {
- $sym_name = $bm->{'spam'};
- }
- else {
- $sym_name = $bm->{'ham'};
+ if ( scalar(@found) > 0 ) {
+ $skip = 1;
+ last;
}
- }
}
- next if $orig_name !~ /^$s/;
+ next if ( $skip != 0 );
- if ($groups{$s}) {
- # Replace with group
- $sym_name = $groups{$s};
+ if ( defined( $timeStamp{'end'} ) ) {
+ $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} );
+ }
+ else {
+ $timeStamp{'end'} = $ts;
}
- push @sym_names, $sym_name;
-
- if (!$sym_res{$sym_name}) {
- $sym_res{$sym_name} = {
- hits => 0,
- spam_hits => 0,
- junk_hits => 0,
- spam_change => 0,
- junk_change => 0,
- weight => 0,
- corr => {},
- symbols_met_spam => {},
- symbols_met_ham => {},
- symbols_met_junk => {},
- };
+ if ( defined( $timeStamp{'start'} ) ) {
+ $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} );
+ }
+ else {
+ $timeStamp{'start'} = $ts;
}
- my $r = $sym_res{$sym_name};
+ $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
+ $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time );
+ $scanTime{'total'} += $scan_time;
- $r->{hits} ++;
- $r->{weight} += $sym_score;
- my $is_spam = 0;
- my $is_junk = 0;
+ $action{$act}++;
+ $total++;
- if ($score >= $reject_score) {
- $is_spam = 1;
- $r->{spam_hits} ++;
- if ($correlations) {
- ProcessRelated(\@symbols, $r->{symbols_met_spam}, $sym_name);
- }
+ if ( $score >= $reject_score ) {
+ $total_spam++;
}
- elsif ($score >= $junk_score) {
- $is_junk = 1;
- $r->{junk_hits} ++;
- if ($correlations) {
- ProcessRelated(\@symbols, $r->{symbols_met_junk}, $sym_name);
- }
- }
- else {
- if ($correlations) {
- ProcessRelated(\@symbols, $r->{symbols_met_ham}, $sym_name);
- }
+ elsif ( $score >= $junk_score ) {
+ $total_junk++;
}
- if ($sym_score != 0) {
- my $score_without = $score - $sym_score;
-
- if ($sym_score > 0) {
- if ($is_spam && $score_without < $reject_score) {
- $r->{spam_change} ++;
- }
- if ($is_junk && $score_without < $junk_score) {
- $r->{junk_change} ++;
+ my @sym_names;
+
+ foreach my $s (@symbols_search) {
+ my @selected = grep /$s/, @symbols;
+
+ if ( scalar(@selected) > 0 ) {
+
+ foreach my $sym (@selected) {
+ $sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
+ my $sym_name = $1;
+ my $sym_score = 0;
+ my $orig_name = $sym_name;
+
+ if ($2) {
+ $sym_score = $3 * 1.0;
+
+ if ( abs($sym_score) < $diff_alpha ) {
+ next;
+ }
+
+ my $bm = $bidir_match{$sym_name};
+ if ($bm) {
+ if ( $sym_score >= 0 ) {
+ $sym_name = $bm->{'spam'};
+ }
+ else {
+ $sym_name = $bm->{'ham'};
+ }
+ }
+ }
+
+ next if $orig_name !~ /^$s/;
+
+ if ( $groups{$s} ) {
+
+ # Replace with group
+ $sym_name = $groups{$s};
+ }
+
+ push @sym_names, $sym_name;
+
+ if ( !$sym_res{$sym_name} ) {
+ $sym_res{$sym_name} = {
+ hits => 0,
+ spam_hits => 0,
+ junk_hits => 0,
+ spam_change => 0,
+ junk_change => 0,
+ weight => 0,
+ corr => {},
+ symbols_met_spam => {},
+ symbols_met_ham => {},
+ symbols_met_junk => {},
+ };
+ }
+
+ my $r = $sym_res{$sym_name};
+
+ $r->{hits}++;
+ $r->{weight} += $sym_score;
+ my $is_spam = 0;
+ my $is_junk = 0;
+
+ if ( $score >= $reject_score ) {
+ $is_spam = 1;
+ $r->{spam_hits}++;
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name );
+ }
+ }
+ elsif ( $score >= $junk_score ) {
+ $is_junk = 1;
+ $r->{junk_hits}++;
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name );
+ }
+ }
+ else {
+ if ($correlations) {
+ ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name );
+ }
+ }
+
+ if ( $sym_score != 0 ) {
+ my $score_without = $score - $sym_score;
+
+ if ( $sym_score > 0 ) {
+ if ( $is_spam && $score_without < $reject_score ) {
+ $r->{spam_change}++;
+ }
+ if ( $is_junk && $score_without < $junk_score ) {
+ $r->{junk_change}++;
+ }
+ }
+ else {
+ if ( !$is_spam && $score_without >= $reject_score ) {
+ $r->{spam_change}++;
+ }
+ if ( !$is_junk && $score_without >= $junk_score ) {
+ $r->{junk_change}++;
+ }
+ }
+ }
+ } # End foreach symbols selected
}
- }
- else {
- if (!$is_spam && $score_without >= $reject_score) {
- $r->{spam_change} ++;
- }
- if (!$is_junk && $score_without >= $junk_score) {
- $r->{junk_change} ++;
- }
- }
}
- } # End foreach symbols selected
- }
- }
-
- if ($correlations) {
- foreach my $sym (@sym_names) {
- next if IsIgnored($sym);
- my $r = $sym_res{$sym};
-
- foreach my $corr_sym (@sym_names) {
- if ($corr_sym ne $sym) {
- if ($r->{'corr'}->{$corr_sym}) {
- $r->{'corr'}->{$corr_sym} ++;
- }
- else {
- $r->{'corr'}->{$corr_sym} = 1;
- }
+
+ if ($correlations) {
+ foreach my $sym (@sym_names) {
+ next if IsIgnored($sym);
+ my $r = $sym_res{$sym};
+
+ foreach my $corr_sym (@sym_names) {
+ if ( $corr_sym ne $sym ) {
+ if ( $r->{'corr'}->{$corr_sym} ) {
+ $r->{'corr'}->{$corr_sym}++;
+ }
+ else {
+ $r->{'corr'}->{$corr_sym} = 1;
+ }
+ }
+ }
+ } # End of correlations check
}
- }
- } # End of correlations check
- }
+ }
}
- }
}
sub JsonObjectElt() {
- my ($k, $v) = @_;
- my $f = defined $_[2] ? $_[2] : '%s';
+ my ( $k, $v ) = @_;
+ my $f = defined $_[2] ? $_[2] : '%s';
- if ($f eq "%s") {
- $f = "\"%s\"";
- }
+ if ( $f eq "%s" ) {
+ $f = "\"%s\"";
+ }
- printf "\"%s\":$f", $k, $v;
+ printf "\"%s\":$f", $k, $v;
}
sub GetLogfilesList {
- my ($dir) = @_;
- opendir( DIR, $dir ) or die $!;
+ my ($dir) = @_;
+ opendir( DIR, $dir ) or die $!;
- my $pattern = join( '|', keys %decompressor );
- my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
+ my $pattern = join( '|', keys %decompressor );
+ my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
- # Add unnumbered logs first
- my @logs =
- grep { -f "$dir/$_" && !/$re/ } readdir(DIR);
+ # Add unnumbered logs first
+ my @logs =
+ grep { -f "$dir/$_" && !/$re/ } readdir(DIR);
- # Add numbered logs
- rewinddir(DIR);
- push( @logs,
- ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) );
+ # Add numbered logs
+ rewinddir(DIR);
+ push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) );
- closedir(DIR);
+ closedir(DIR);
- # Select required logs and revers their order
- @logs =
- reverse
- splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
+ # Select required logs and revers their order
+ @logs =
+ reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
- # Loop through array printing out filenames
- print {interactive(*STDERR)} "\nLog files to process:\n";
- foreach my $file (@logs) {
- print {interactive(*STDERR)} " $file\n";
- }
- print {interactive(*STDERR)} "\n";
+ # Loop through array printing out filenames
+ print { interactive(*STDERR) } "\nLog files to process:\n";
+ foreach my $file (@logs) {
+ print { interactive(*STDERR) } " $file\n";
+ }
+ print { interactive(*STDERR) } "\n";
- return @logs;
+ return @logs;
}
sub log_time_format {
- my $fh = shift;
- my ( $format, $line );
- while (<$fh>) {
- $line = $_;
-
- # 2017-08-08 00:00:01 #66984(
- # 2017-08-08 00:00:01.001 #66984(
- if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3})? #\d+\(/) {
- $format = 'rspamd';
- last;
- }
+ my $fh = shift;
+ my ( $format, $line );
+ while (<$fh>) {
+ $line = $_;
+
+ # 2017-08-08 00:00:01 #66984(
+ # 2017-08-08 00:00:01.001 #66984(
+ if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3})? #\d+\(/) {
+ $format = 'rspamd';
+ last;
+ }
- # Aug 8 00:02:50 #66986(
- elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) {
- $format = 'syslog';
- last;
- }
+ # Aug 8 00:02:50 #66986(
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) {
+ $format = 'syslog';
+ last;
+ }
- # Aug 8 00:02:50 hostname rspamd[66986]
- elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) {
- $format = 'syslog';
- last;
- }
+ # Aug 8 00:02:50 hostname rspamd[66986]
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) {
+ $format = 'syslog';
+ last;
+ }
- # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968]
- elsif(/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) {
- $format = 'syslog5424';
- last;
- }
+ # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968]
+ elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) {
+ $format = 'syslog5424';
+ last;
+ }
- # Skip newsyslog messages
- # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over
- elsif ( /^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/ ) {
- next;
- }
- # Skip journalctl messages
- # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. --
- elsif ( /^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/ ) {
- next;
- }
- else {
- print "Unknown log format\n";
- exit 1;
+ # Skip newsyslog messages
+ # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over
+ elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) {
+ next;
+ }
+
+ # Skip journalctl messages
+ # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. --
+ elsif (
+/^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/
+ )
+ {
+ next;
+ }
+ else {
+ print "Unknown log format\n";
+ exit 1;
+ }
}
- }
- return ( $format, $line );
+ return ( $format, $line );
}
sub normalized_time {
- return undef
- if !defined( $_ = shift );
+ return undef
+ if !defined( $_ = shift );
- /^\d\d(?::\d\d){0,2}$/
- ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4],
- (localtime)[3], $_
- : $_;
+ /^\d\d(?::\d\d){0,2}$/
+ ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_
+ : $_;
}
sub numeric {
- $a =~ /\.(\d+)\./;
- my $a_num = $1;
- $b =~ /\.(\d+)\./;
- my $b_num = $1;
+ $a =~ /\.(\d+)\./;
+ my $a_num = $1;
+ $b =~ /\.(\d+)\./;
+ my $b_num = $1;
- $a_num <=> $b_num;
+ $a_num <=> $b_num;
}
sub spinner {
@@ -780,7 +780,7 @@ sub spinner {
return
if ( ( time - $spinner_update_time ) < 1 );
$spinner_update_time = time;
- printf {interactive(*STDERR)} "%s\r", $spinner[ $spinner_update_time % @spinner ];
+ printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ];
select()->flush();
}
@@ -788,33 +788,30 @@ sub spinner {
# using current year as syslog does not record the year (nor the timezone)
# or the last year if the guessed time is in the future.
sub syslog2iso {
- my %month_map;
- @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11;
-
- my ( $month, @t ) =
- $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x;
- my $epoch =
- timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] );
- sprintf '%04d-%02d-%02d %02d:%02d:%02d',
- 1900 + (localtime)[5] - ( $epoch > time ),
- $month_map{$month} + 1, @t;
+ my %month_map;
+ @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11;
+
+ my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x;
+ my $epoch =
+ timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] );
+ sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t;
}
### Imported from IO::Interactive 1.022 Perl module
sub is_interactive {
- my ($out_handle) = (@_, select); # Default to default output handle
+ my ($out_handle) = ( @_, select ); # Default to default output handle
# Not interactive if output is not to terminal...
return 0 if not -t $out_handle;
# If *ARGV is opened, we're interactive if...
- if ( tied(*ARGV) or defined(fileno(ARGV)) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to
+ if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to
# ...it's currently opened to the magic '-' file
return -t *STDIN if defined $ARGV && $ARGV eq '-';
# ...it's at end-of-file and the next file is the magic '-' file
- return @ARGV>0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV;
+ return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV;
# ...it's directly attached to the terminal
return -t *ARGV;
@@ -828,17 +825,18 @@ sub is_interactive {
}
### Imported from IO::Interactive 1.022 Perl module
-local (*DEV_NULL, *DEV_NULL2);
+local ( *DEV_NULL, *DEV_NULL2 );
my $dev_null;
+
BEGIN {
pipe *DEV_NULL, *DEV_NULL2
- or die "Internal error: can't create null filehandle";
+ or die "Internal error: can't create null filehandle";
$dev_null = \*DEV_NULL;
}
### Imported from IO::Interactive 1.022 Perl module
sub interactive {
- my ($out_handle) = (@_, \*STDOUT); # Default to STDOUT
+ my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT
return &is_interactive ? $out_handle : $dev_null;
}