mirror of
https://github.com/rspamd/rspamd.git
synced 2024-08-07 21:05:57 +02:00
932 lines
25 KiB
Perl
Executable File
932 lines
25 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
|
|
use 5.010;
|
|
use Data::Dumper;
|
|
use Getopt::Long;
|
|
use Pod::Usage;
|
|
use Time::Local;
|
|
use IO::Handle;
|
|
use warnings;
|
|
use strict;
|
|
|
|
my @symbols_search;
|
|
my @symbols_exclude;
|
|
my @symbols_bidirectional;
|
|
my $reject_score = 15.0;
|
|
my $junk_score = 6.0;
|
|
my $diff_alpha = 0.1;
|
|
my $correlations = 0;
|
|
my $nrelated = 10;
|
|
my $log_file = "";
|
|
my $search_pattern = "";
|
|
my $startTime="";
|
|
my $endTime;
|
|
my $num_logs;
|
|
my $exclude_logs = 0;
|
|
my $man = 0;
|
|
my $json = 0;
|
|
my $help = 0;
|
|
|
|
# Associate file extensions with decompressors
|
|
my %decompressor = (
|
|
'bz2' => 'bzip2 -cd',
|
|
'gz' => 'gzip -cd',
|
|
'xz' => 'xz -cd',
|
|
'zst' => 'zstd -cd',
|
|
);
|
|
|
|
GetOptions(
|
|
"reject-score|r=f" => \$reject_score,
|
|
"junk-score|j=f" => \$junk_score,
|
|
"symbol|s=s@" => \@symbols_search,
|
|
"symbol-bidir|S=s@" => \@symbols_bidirectional,
|
|
"exclude|X=s@" => \@symbols_exclude,
|
|
"log|l=s" => \$log_file,
|
|
"alpha-score|alpha|a=f" => \$diff_alpha,
|
|
"correlations|c" => \$correlations,
|
|
"nrelated=i" => \$nrelated,
|
|
"search-pattern=s" => \$search_pattern,
|
|
"start=s" => \$startTime,
|
|
"end=s" => \$endTime,
|
|
"num-logs|n=i" => \$num_logs,
|
|
"exclude-logs|x=i" => \$exclude_logs,
|
|
"json|j" => \$json,
|
|
"help|?" => \$help,
|
|
"man" => \$man
|
|
) or pod2usage(2);
|
|
|
|
pod2usage(1) if $help;
|
|
pod2usage(-exitval => 0, -verbose => 2) if $man;
|
|
|
|
|
|
# Global vars
|
|
my $total = 0;
|
|
my $total_spam = 0;
|
|
my $total_junk = 0;
|
|
my $junk_symbols = 0;
|
|
my $spam_symbols = 0;
|
|
my $ham_symbols = 0;
|
|
my $ham_spam_change = 0;
|
|
my $ham_junk_change = 0;
|
|
my %sym_res;
|
|
my $rspamd_log;
|
|
my $enabled = 0;
|
|
my $log_file_num = 1;
|
|
my $spinner_update_time = 0;
|
|
|
|
my %action;
|
|
my %timeStamp;
|
|
my %scanTime = (
|
|
max => 0,
|
|
total => 0,
|
|
);
|
|
my %bidir_match;
|
|
|
|
foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) }
|
|
|
|
# Convert bidirectional symbols
|
|
foreach my $s (@symbols_bidirectional) {
|
|
$bidir_match{$s} = {
|
|
spam => "${s}_SPAM",
|
|
ham => "${s}_HAM",
|
|
};
|
|
push @symbols_search, $s unless grep /^$s$/, @symbols_search;
|
|
}
|
|
|
|
@symbols_search = '.*'
|
|
unless @symbols_search;
|
|
|
|
if ($log_file eq '-' || $log_file eq '') {
|
|
$rspamd_log = \*STDIN;
|
|
&ProcessLog();
|
|
}
|
|
elsif ( -d "$log_file" ) {
|
|
my $log_dir = "$log_file";
|
|
|
|
my @logs = &GetLogfilesList($log_dir);
|
|
|
|
# Process logs
|
|
foreach (@logs) {
|
|
my $ext = (/[^.]+\.?([^.]*?)$/)[0];
|
|
my $dc = $decompressor{$ext} || 'cat';
|
|
|
|
open( $rspamd_log, "-|", "$dc $log_dir/$_" )
|
|
or die "cannot execute $dc $log_dir/$_ : $!";
|
|
|
|
printf {interactive(*STDERR)} "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, $_;
|
|
$spinner_update_time = 0; # Force spinner update
|
|
&spinner;
|
|
|
|
&ProcessLog;
|
|
|
|
close($rspamd_log)
|
|
or warn "cannot close $dc $log_dir/$_: $!";
|
|
}
|
|
print {interactive(*STDERR)} "\033[J\033[G"; # Progress indicator clean-up
|
|
}
|
|
else {
|
|
my $ext = ($log_file =~ /[^.]+\.?([^.]*?)$/)[0];
|
|
my $dc = $decompressor{$ext} || 'cat';
|
|
open( $rspamd_log, "-|", "$dc $log_file" )
|
|
or die "cannot execute $dc $log_file : $!";
|
|
$spinner_update_time = 0; # Force spinner update
|
|
&spinner;
|
|
&ProcessLog();
|
|
}
|
|
|
|
my $total_ham = $total - ($total_spam + $total_junk);
|
|
|
|
if ($json) {
|
|
print "{";
|
|
&Summary();
|
|
print '"symbols":{';
|
|
&SymbolsStat();
|
|
print "}}\n";
|
|
}
|
|
else {
|
|
&SymbolsStat();
|
|
&Summary();
|
|
}
|
|
|
|
exit;
|
|
|
|
sub GenRelated {
|
|
my ($htb, $target_sym) = @_;
|
|
|
|
my @result;
|
|
my $i = 0;
|
|
foreach my $sym (sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb}) {
|
|
if ($sym ne $target_sym) {
|
|
my @elt = ($sym, $htb->{$sym});
|
|
push @result, \@elt;
|
|
$i ++;
|
|
}
|
|
|
|
last if $i > $nrelated;
|
|
}
|
|
|
|
return \@result;
|
|
}
|
|
|
|
sub StringifyRelated {
|
|
my ($ar, $total) = @_;
|
|
return join("\n", (map { sprintf "\t%s(%s: %.1f%%)",
|
|
$_->[0], $_->[1], $_->[1] / ($total * 1.0) * 100.0 } @{$ar}));
|
|
}
|
|
|
|
sub SymbolsStat {
|
|
if ($total > 0) {
|
|
my $has_comma = 0;
|
|
while (my ($s, $r) = each(%sym_res)) {
|
|
if ($r->{hits} > 0) {
|
|
my $th = $r->{hits};
|
|
my $sh = $r->{spam_hits};
|
|
my $jh = $r->{junk_hits};
|
|
my $hh = $r->{hits} - $sh - $jh;
|
|
my $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
|
|
my $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
|
|
my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
|
|
|
|
if ($json) {
|
|
if ($has_comma) {
|
|
print ",";
|
|
}
|
|
else {
|
|
$has_comma = 1;
|
|
}
|
|
print "\"$s\":{";
|
|
JsonObjectElt("avg_weight", $r->{'weight'},"%.4f");
|
|
print ",";
|
|
JsonObjectElt("hits", $th, "%d");
|
|
print ",";
|
|
JsonObjectElt("hits_percentage", $th/$total, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("spam_hits", $sh, "%d");
|
|
print ",";
|
|
JsonObjectElt("spam_to_total", $sh/$th, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("spam_percentage", $stp/100.0 || 0, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("ham_hits", $hh, "%d");
|
|
print ",";
|
|
JsonObjectElt("ham_to_total", $hh/$th, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("ham_percentage", $htp/100.0 || 0, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("junk_hits", $jh, "%d");
|
|
print ",";
|
|
JsonObjectElt("junk_to_total", $jh/$th, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("junk_percentage", $jtp/100.0 || 0, "%.4f");
|
|
}
|
|
else {
|
|
printf "%s avg. weight %.3f, hits %d(%.3f%%):
|
|
Ham %7.3f%%, %6d/%-6d (%7.3f%%)
|
|
Spam %7.3f%%, %6d/%-6d (%7.3f%%)
|
|
Junk %7.3f%%, %6d/%-6d (%7.3f%%)
|
|
",
|
|
$s, $r->{weight} / $r->{hits}, $th, ($th / $total * 100),
|
|
($hh / $th * 100), $hh, $total_ham, ($htp or 0),
|
|
($sh / $th * 100), $sh, $total_spam, ($stp or 0),
|
|
($jh / $th * 100), $jh, $total_junk, ($jtp or 0);
|
|
}
|
|
|
|
my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
|
|
my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
|
|
|
|
if ($r->{weight} != 0) {
|
|
if (!$json) {
|
|
if ($r->{weight} > 0) {
|
|
printf "
|
|
Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%)
|
|
Spam changes / total spam hits: %6d/%-6d (%7.3f%%)
|
|
Junk changes (ham -> junk): %6d/%-6d (%7.3f%%)
|
|
Junk changes / total junk hits: %6d/%-6d (%7.3f%%)
|
|
",
|
|
$r->{spam_change}, $th, ($r->{spam_change} / $th * 100),
|
|
$r->{spam_change}, $total_spam, ($schp or 0),
|
|
$r->{junk_change}, $th, ($r->{junk_change} / $th * 100),
|
|
$r->{junk_change}, $total_junk, ($jchp or 0);
|
|
}
|
|
else {
|
|
printf "
|
|
Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%)
|
|
Spam changes / total spam hits : %6d/%-6d (%7.3f%%)
|
|
Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%)
|
|
Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
|
|
",
|
|
$r->{spam_change}, $th, ($r->{spam_change} / $th * 100),
|
|
$r->{spam_change}, $total_spam, ($schp or 0),
|
|
$r->{junk_change}, $th, ($r->{junk_change} / $th * 100),
|
|
$r->{junk_change}, $total_junk, ($jchp or 0);
|
|
}
|
|
}
|
|
else {
|
|
print ",";
|
|
JsonObjectElt("spam_change", $r->{spam_change}, "%.4f");
|
|
print ",";
|
|
JsonObjectElt("junk_change", $r->{junk_change}, "%.4f");
|
|
}
|
|
}
|
|
|
|
if ($correlations) {
|
|
|
|
my $spam_related = GenRelated($r->{symbols_met_spam}, $s);
|
|
my $junk_related = GenRelated($r->{symbols_met_junk}, $s);
|
|
my $ham_related = GenRelated($r->{symbols_met_ham}, $s);
|
|
|
|
if (!$json) {
|
|
print "Correlations report:\n";
|
|
|
|
while (my ($cs, $hits) = each %{$r->{corr}}) {
|
|
my $corr_prob = $hits / $total;
|
|
my $sym_prob = $r->{hits} / $total;
|
|
printf "Probability of %s when %s fires: %.3f\n", $s, $cs,
|
|
($corr_prob / $sym_prob);
|
|
}
|
|
|
|
print "Related symbols report:\n";
|
|
printf "Top related in spam:\n %s\n", StringifyRelated($spam_related,
|
|
$r->{spam_hits});
|
|
printf "Top related in junk:\n %s\n", StringifyRelated($junk_related,
|
|
$r->{junk_hits});
|
|
printf "Top related in ham:\n %s\n", StringifyRelated($ham_related,
|
|
$r->{hits} - $r->{spam_hits} - $r->{junk_hits});
|
|
}
|
|
else {
|
|
print ",";
|
|
print "\"correllations\":{";
|
|
|
|
my $has_comma_ = 0;
|
|
while (my ($cs, $hits) = each %{$r->{corr}}) {
|
|
if ($has_comma_) {
|
|
print ",";
|
|
}
|
|
else {
|
|
$has_comma_ = 1;
|
|
}
|
|
my $corr_prob = $hits / $total;
|
|
my $sym_prob = $r->{hits} / $total;
|
|
JsonObjectElt($cs, ($corr_prob / $sym_prob) ,"%.4f");
|
|
}
|
|
|
|
print "}";
|
|
}
|
|
}
|
|
|
|
print "}" if $json;
|
|
}
|
|
else {
|
|
print "Symbol $s has not been met\n" if !$json;
|
|
}
|
|
|
|
print '-' x 80 . "\n" if !$json;
|
|
}
|
|
}
|
|
}
|
|
|
|
sub Summary() {
|
|
if (!$json) {
|
|
print "
|
|
=== Summary ", '=' x 68, "
|
|
Messages scanned: $total";
|
|
printf " [ %s / %s ]
|
|
", $timeStamp{'start'}, $timeStamp{'end'}
|
|
if defined $timeStamp{'start'};
|
|
say '';
|
|
printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_}
|
|
for sort keys %action;
|
|
say '';
|
|
printf "scan time min/avg/max = %.2f/%.2f/%.2f s
|
|
", $scanTime{'min'} / 1000,
|
|
($total) ? $scanTime{'total'} / $total / 1000 : undef,
|
|
$scanTime{'max'} / 1000
|
|
if exists $scanTime{'min'};
|
|
say '=' x 80;
|
|
}
|
|
else {
|
|
JsonObjectElt("total", $total, "%d");
|
|
print ",";
|
|
|
|
if (defined $timeStamp{'start'}) {
|
|
JsonObjectElt("start", $timeStamp{'start'});
|
|
print ",";
|
|
}
|
|
|
|
if (defined $timeStamp{'end'}) {
|
|
JsonObjectElt("end", $timeStamp{'end'});
|
|
print ",";
|
|
}
|
|
|
|
print "\"actions\":{";
|
|
|
|
my $has_comma = 0;
|
|
foreach my $a (sort keys %action) {
|
|
if ($has_comma) {
|
|
print ",";
|
|
}
|
|
else {
|
|
$has_comma = 1;
|
|
}
|
|
JsonObjectElt($a, $action{$a}, "%d");
|
|
}
|
|
print "},";
|
|
}
|
|
}
|
|
|
|
sub ProcessRelated {
|
|
my ($symbols, $target) = @_;
|
|
|
|
foreach my $s (@{$symbols}) {
|
|
$s =~ /^([^\(]+)(\(([^\)]+)\))?/;
|
|
my $sym_name = $1;
|
|
my $sym_score = 0;
|
|
|
|
if ($2) {
|
|
$sym_score = $3 * 1.0;
|
|
|
|
if (abs($sym_score) < $diff_alpha) {
|
|
next;
|
|
}
|
|
|
|
my $bm = $bidir_match{$sym_name};
|
|
if ($bm) {
|
|
if ($sym_score >= 0) {
|
|
$sym_name = $bm->{'spam'};
|
|
}
|
|
else {
|
|
$sym_name = $bm->{'ham'};
|
|
}
|
|
}
|
|
}
|
|
|
|
if (exists($target->{$sym_name})) {
|
|
$target->{$sym_name} ++;
|
|
}
|
|
else {
|
|
$target->{$sym_name} = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
sub ProcessLog {
|
|
my ( $ts_format, @line ) = &log_time_format($rspamd_log);
|
|
my $is_syslog = defined $ts_format && $ts_format eq 'syslog';
|
|
|
|
while() {
|
|
last if eof $rspamd_log;
|
|
$_ = (@line) ? shift @line : <$rspamd_log>;
|
|
|
|
if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) {
|
|
$enabled = 1;
|
|
}
|
|
|
|
next if !$enabled;
|
|
|
|
if (/^.*rspamd_task_write_log.*$/) {
|
|
&spinner;
|
|
my $ts =
|
|
($is_syslog)
|
|
? syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] )
|
|
: join ' ', ( split /\s+/ )[ 0 .. 1 ];
|
|
|
|
next if ( $ts lt $startTime );
|
|
next if ( defined $endTime && $ts gt $endTime );
|
|
|
|
if ($_ !~ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/) {
|
|
#print "BAD: $_\n";
|
|
next;
|
|
}
|
|
|
|
my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
|
|
my $scan_time = $5;
|
|
my $act = $1;
|
|
my $score = $2 * 1.0;
|
|
my $skip = 0;
|
|
|
|
foreach my $ex (@symbols_exclude) {
|
|
my @found = grep {/^$ex/} @symbols;
|
|
|
|
if (scalar(@found) > 0) {
|
|
$skip = 1;
|
|
last;
|
|
}
|
|
}
|
|
|
|
next if ( $skip != 0 );
|
|
|
|
$timeStamp{'end'} = $ts;
|
|
$timeStamp{'start'} //= $timeStamp{'end'};
|
|
$scanTime{'min'} = $scan_time
|
|
if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
|
|
$scanTime{'max'} = $scan_time
|
|
if ( $scanTime{'max'} < $scan_time );
|
|
$scanTime{'total'} += $scan_time;
|
|
|
|
$action{$act}++;
|
|
$total ++;
|
|
|
|
if ($score >= $reject_score) {
|
|
$total_spam ++;
|
|
}
|
|
elsif ($score >= $junk_score) {
|
|
$total_junk ++;
|
|
}
|
|
|
|
my @sym_names;
|
|
|
|
foreach my $s (@symbols_search) {
|
|
my @selected = grep /$s/, @symbols;
|
|
|
|
if (scalar(@selected) > 0) {
|
|
|
|
foreach my $sym (@selected) {
|
|
$sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
|
|
my $sym_name = $1;
|
|
my $sym_score = 0;
|
|
my $orig_name = $sym_name;
|
|
|
|
if ($2) {
|
|
$sym_score = $3 * 1.0;
|
|
|
|
if (abs($sym_score) < $diff_alpha) {
|
|
next;
|
|
}
|
|
|
|
my $bm = $bidir_match{$sym_name};
|
|
if ($bm) {
|
|
if ($sym_score >= 0) {
|
|
$sym_name = $bm->{'spam'};
|
|
}
|
|
else {
|
|
$sym_name = $bm->{'ham'};
|
|
}
|
|
}
|
|
}
|
|
|
|
next if $orig_name !~ /^$s/;
|
|
|
|
push @sym_names, $sym_name;
|
|
|
|
if (!$sym_res{$sym_name}) {
|
|
$sym_res{$sym_name} = {
|
|
hits => 0,
|
|
spam_hits => 0,
|
|
junk_hits => 0,
|
|
spam_change => 0,
|
|
junk_change => 0,
|
|
weight => 0,
|
|
corr => {},
|
|
symbols_met_spam => {},
|
|
symbols_met_ham => {},
|
|
symbols_met_junk => {},
|
|
};
|
|
}
|
|
|
|
my $r = $sym_res{$sym_name};
|
|
|
|
$r->{hits} ++;
|
|
$r->{weight} += $sym_score;
|
|
my $is_spam = 0;
|
|
my $is_junk = 0;
|
|
|
|
if ($score >= $reject_score) {
|
|
$is_spam = 1;
|
|
$r->{spam_hits} ++;
|
|
if ($correlations) {
|
|
ProcessRelated(\@symbols, $r->{symbols_met_spam});
|
|
}
|
|
}
|
|
elsif ($score >= $junk_score) {
|
|
$is_junk = 1;
|
|
$r->{junk_hits} ++;
|
|
if ($correlations) {
|
|
ProcessRelated(\@symbols, $r->{symbols_met_junk});
|
|
}
|
|
}
|
|
else {
|
|
if ($correlations) {
|
|
ProcessRelated(\@symbols, $r->{symbols_met_ham});
|
|
}
|
|
}
|
|
|
|
if ($sym_score != 0) {
|
|
my $score_without = $score - $sym_score;
|
|
|
|
if ($sym_score > 0) {
|
|
if ($is_spam && $score_without < $reject_score) {
|
|
$r->{spam_change} ++;
|
|
}
|
|
if ($is_junk && $score_without < $junk_score) {
|
|
$r->{junk_change} ++;
|
|
}
|
|
}
|
|
else {
|
|
if (!$is_spam && $score_without >= $reject_score) {
|
|
$r->{spam_change} ++;
|
|
}
|
|
if (!$is_junk && $score_without >= $junk_score) {
|
|
$r->{junk_change} ++;
|
|
}
|
|
}
|
|
}
|
|
} # End foreach symbols selected
|
|
}
|
|
}
|
|
|
|
if ($correlations) {
|
|
foreach my $sym (@sym_names) {
|
|
my $r = $sym_res{$sym};
|
|
|
|
foreach my $corr_sym (@sym_names) {
|
|
if ($corr_sym ne $sym) {
|
|
if ($r->{'corr'}->{$corr_sym}) {
|
|
$r->{'corr'}->{$corr_sym} ++;
|
|
}
|
|
else {
|
|
$r->{'corr'}->{$corr_sym} = 1;
|
|
}
|
|
}
|
|
}
|
|
} # End of correlations check
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
sub JsonObjectElt() {
|
|
my ($k, $v) = @_;
|
|
my $f = defined $_[2] ? $_[2] : '%s';
|
|
|
|
if ($f eq "%s") {
|
|
$f = "\"%s\"";
|
|
}
|
|
|
|
printf "\"%s\":$f", $k, $v;
|
|
}
|
|
|
|
sub GetLogfilesList {
|
|
my ($dir) = @_;
|
|
opendir( DIR, $dir ) or die $!;
|
|
|
|
my $pattern = join( '|', keys %decompressor );
|
|
my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
|
|
|
|
# Add unnumbered logs first
|
|
my @logs =
|
|
grep { -f "$dir/$_" && !/$re/ } readdir(DIR);
|
|
|
|
# Add numbered logs
|
|
rewinddir(DIR);
|
|
push( @logs,
|
|
( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) );
|
|
|
|
closedir(DIR);
|
|
|
|
# Select required logs and revers their order
|
|
@logs =
|
|
reverse
|
|
splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
|
|
|
|
# Loop through array printing out filenames
|
|
print {interactive(*STDERR)} "\nLog files to process:\n";
|
|
foreach my $file (@logs) {
|
|
print {interactive(*STDERR)} " $file\n";
|
|
}
|
|
print {interactive(*STDERR)} "\n";
|
|
|
|
return @logs;
|
|
}
|
|
|
|
sub log_time_format {
|
|
my $fh = shift;
|
|
my ( $format, $line );
|
|
while (<$fh>) {
|
|
$line = $_;
|
|
|
|
# 2017-08-08 00:00:01 #66984(
|
|
# 2017-08-08 00:00:01.001 #66984(
|
|
if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3})? #\d+\(/) {
|
|
$format = 'rspamd';
|
|
last;
|
|
}
|
|
|
|
# Aug 8 00:02:50 #66986(
|
|
elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) {
|
|
$format = 'syslog';
|
|
last;
|
|
}
|
|
|
|
# Aug 8 00:02:50 hostname rspamd[66986]
|
|
elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) {
|
|
$format = 'syslog';
|
|
last;
|
|
}
|
|
|
|
# Skip newsyslog messages
|
|
# Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over
|
|
elsif ( /^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/ ) {
|
|
next;
|
|
}
|
|
else {
|
|
print "Unknown log format\n";
|
|
exit 1;
|
|
}
|
|
}
|
|
return ( $format, $line );
|
|
}
|
|
|
|
sub normalized_time {
|
|
return undef
|
|
if !defined( $_ = shift );
|
|
|
|
/^\d\d(?::\d\d){0,2}$/
|
|
? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4],
|
|
(localtime)[3], $_
|
|
: $_;
|
|
}
|
|
|
|
sub numeric {
|
|
$a =~ /\.(\d+)\./;
|
|
my $a_num = $1;
|
|
$b =~ /\.(\d+)\./;
|
|
my $b_num = $1;
|
|
|
|
$a_num <=> $b_num;
|
|
}
|
|
|
|
sub spinner {
|
|
my @spinner = qw{/ - \ |};
|
|
return
|
|
if ( ( time - $spinner_update_time ) < 1 );
|
|
$spinner_update_time = time;
|
|
printf {interactive(*STDERR)} "%s\r", $spinner[ $spinner_update_time % @spinner ];
|
|
select()->flush();
|
|
}
|
|
|
|
# Convert syslog timestamp to "ISO 8601 like" format
|
|
# using current year as syslog does not record the year (nor the timezone)
|
|
# or the last year if the guessed time is in the future.
|
|
sub syslog2iso {
|
|
my %month_map;
|
|
@month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11;
|
|
|
|
my ( $month, @t ) =
|
|
$_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x;
|
|
my $epoch =
|
|
timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] );
|
|
sprintf '%04d-%02d-%02d %02d:%02d:%02d',
|
|
1900 + (localtime)[5] - ( $epoch > time ),
|
|
$month_map{$month} + 1, @t;
|
|
}
|
|
|
|
### Imported from IO::Interactive 1.022 Perl module
|
|
sub is_interactive {
|
|
my ($out_handle) = (@_, select); # Default to default output handle
|
|
|
|
# Not interactive if output is not to terminal...
|
|
return 0 if not -t $out_handle;
|
|
|
|
# If *ARGV is opened, we're interactive if...
|
|
if ( tied(*ARGV) or defined(fileno(ARGV)) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to
|
|
|
|
# ...it's currently opened to the magic '-' file
|
|
return -t *STDIN if defined $ARGV && $ARGV eq '-';
|
|
|
|
# ...it's at end-of-file and the next file is the magic '-' file
|
|
return @ARGV>0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV;
|
|
|
|
# ...it's directly attached to the terminal
|
|
return -t *ARGV;
|
|
}
|
|
|
|
# If *ARGV isn't opened, it will be interactive if *STDIN is attached
|
|
# to a terminal.
|
|
else {
|
|
return -t *STDIN;
|
|
}
|
|
}
|
|
|
|
### Imported from IO::Interactive 1.022 Perl module
|
|
local (*DEV_NULL, *DEV_NULL2);
|
|
my $dev_null;
|
|
BEGIN {
|
|
pipe *DEV_NULL, *DEV_NULL2
|
|
or die "Internal error: can't create null filehandle";
|
|
$dev_null = \*DEV_NULL;
|
|
}
|
|
|
|
### Imported from IO::Interactive 1.022 Perl module
|
|
sub interactive {
|
|
my ($out_handle) = (@_, \*STDOUT); # Default to STDOUT
|
|
return &is_interactive ? $out_handle : $dev_null;
|
|
}
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
rspamd_stats - analyze Rspamd rules by parsing log files
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file]
|
|
|
|
Options:
|
|
--log=file log file or directory to read (stdin by default)
|
|
--reject-score=score set reject threshold (15 by default)
|
|
--junk-score=score set junk score (6.0 by default)
|
|
--symbol=sym check specified symbol (perl regexps, '.*' by default)
|
|
--alpha-score=score set ignore score for symbols (0.1 by default)
|
|
--correlations enable correlations report
|
|
--nrelated=integer show that amount of related symbols (10 by default)
|
|
--search-pattern do not process input unless the desired pattern is found
|
|
--start starting time (oldest) for log parsing
|
|
--end ending time (newest) for log parsing
|
|
--num-logs=integer number of recent logfiles to analyze (all files in the directory by default)
|
|
--exclude-logs=integer number of latest logs to exclude (0 by default)
|
|
--json print json output instead of human readable
|
|
--help brief help message
|
|
--man full documentation
|
|
|
|
=head1 OPTIONS
|
|
|
|
=over 8
|
|
|
|
=item B<--log>
|
|
|
|
Specifies log file or directory to read data from.
|
|
If a directory is specified B<rspamd_stats> analyses files in the directory
|
|
including known compressed file types. Number of log files can be limited using
|
|
B<--num-logs> and B<--exclude-logs> options. This assumes that files in the log
|
|
directory have B<newsyslog(8)>- or B<logrotate(8)>-like name format with numeric
|
|
indexes. Files without indexes (generally it is merely one file) are considered
|
|
the most recent and files with lower indexes are considered newer.
|
|
|
|
=item B<--reject-score>
|
|
|
|
Specifies the reject (spam) threshold.
|
|
|
|
=item B<--junk-score>
|
|
|
|
Specifies the junk (add header or rewrite subject) threshold.
|
|
|
|
=item B<--alpha-score>
|
|
|
|
Specifies the minimum score for a symbol to be considered by this script.
|
|
|
|
=item B<--symbol>
|
|
|
|
Add symbol or pattern (pcre format) to analyze.
|
|
|
|
=item B<--num-logs>
|
|
|
|
If set, limits number of analyzed logfiles in the directory to the specified value.
|
|
|
|
=item B<--exclude-logs>
|
|
|
|
Number of latest logs to exclude (0 by default).
|
|
|
|
=item B<--correlations>
|
|
|
|
Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations between symbols.
|
|
|
|
=item B<--search-pattern>
|
|
|
|
Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position.
|
|
|
|
=item B<--exclude>
|
|
|
|
Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple symbols.
|
|
|
|
=item B<--start>
|
|
|
|
Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be
|
|
truncated to any desired accuracy). If used with B<--end> select entries between
|
|
B<--start> and B<--end>. The omitted date defaults to the current date if you
|
|
supply the time.
|
|
|
|
=item B<--end>
|
|
|
|
Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be
|
|
truncated to any desired accuracy). If used with B<--start> select entries between
|
|
B<--start> and B<--end>. The omitted date defaults to the current date if you
|
|
supply the time.
|
|
|
|
=item B<--help>
|
|
|
|
Print a brief help message and exits.
|
|
|
|
=item B<--man>
|
|
|
|
Prints the manual page and exits.
|
|
|
|
=back
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols:
|
|
|
|
Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%)
|
|
Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%)
|
|
Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%)
|
|
Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%)
|
|
Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102%
|
|
Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540%
|
|
|
|
Where there are the following attributes:
|
|
|
|
=over 4
|
|
|
|
=item *
|
|
|
|
B<Weight>: average score for a symbols
|
|
|
|
=item *
|
|
|
|
B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages
|
|
|
|
=item *
|
|
|
|
B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right):
|
|
|
|
=over 4
|
|
|
|
=item 1.
|
|
|
|
B<total symbol hits>: number of messages that has this symbol and are B<HAM>
|
|
|
|
=item 2.
|
|
|
|
B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count
|
|
|
|
=item 3.
|
|
|
|
B<total ham hits>: overall number of B<HAM> messages
|
|
|
|
=item 4.
|
|
|
|
B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total number of B<HAM> messages.
|
|
|
|
=back
|
|
|
|
=item *
|
|
|
|
B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class.
|
|
|
|
=item *
|
|
|
|
B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class.
|
|
|
|
=item *
|
|
|
|
B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight.
|
|
|
|
=item *
|
|
|
|
B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight.
|
|
|
|
=back
|
|
|
|
=cut
|