#!/usr/bin/env perl use 5.010; use Data::Dumper; use Getopt::Long; use Pod::Usage; use Time::Local; use warnings; use strict; my @symbols_search; my @symbols_exclude; my @symbols_bidirectional; my $reject_score = 15.0; my $junk_score = 6.0; my $diff_alpha = 0.1; my $correlations = 0; my $nrelated = 10; my $log_file = ""; my $search_pattern = ""; my $startTime=""; my $endTime; my $num_logs; my $exclude_logs = 0; my $man = 0; my $json = 0; my $help = 0; # Associate file extensions with decompressors my %decompressor = ( 'bz2' => 'bzip2 -cd', 'gz' => 'gzip -cd', 'xz' => 'xz -cd', 'zst' => 'zstd -cd', ); GetOptions( "reject-score|r=f" => \$reject_score, "junk-score|j=f" => \$junk_score, "symbol|s=s@" => \@symbols_search, "symbol-bidir|S=s@" => \@symbols_bidirectional, "exclude|X=s@" => \@symbols_exclude, "log|l=s" => \$log_file, "alpha-score|alpha|a=f" => \$diff_alpha, "correlations|c" => \$correlations, "nrelated" => \$nrelated, "search-pattern=s" => \$search_pattern, "start=s" => \$startTime, "end=s" => \$endTime, "num-logs|n=i" => \$num_logs, "exclude-logs|x=i" => \$exclude_logs, "json|j" => \$json, "help|?" => \$help, "man" => \$man ) or pod2usage(2); pod2usage(1) if $help; pod2usage(-exitval => 0, -verbose => 2) if $man; # Global vars my $total = 0; my $total_spam = 0; my $total_junk = 0; my $junk_symbols = 0; my $spam_symbols = 0; my $ham_symbols = 0; my $ham_spam_change = 0; my $ham_junk_change = 0; my %sym_res; my $rspamd_log; my $enabled = 0; my $log_file_num = 1; my $spinner_update_time = 0; my %action; my %timeStamp; my %scanTime = ( max => 0, total => 0, ); my %bidir_match; foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) } # Convert bidirectional symbols foreach my $s (@symbols_bidirectional) { $bidir_match{$s} = { spam => "${s}_SPAM", ham => "${s}_HAM", }; push @symbols_search, $s unless grep /^$s$/, @symbols_search; } @symbols_search = '.*' unless @symbols_search; if ($log_file eq '-' || $log_file eq '') { $rspamd_log = \*STDIN; &ProcessLog(); } elsif ( -d "$log_file" ) { my $log_dir = "$log_file"; my @logs = &GetLogfilesList($log_dir); # Process logs foreach (@logs) { my $ext = (/[^.]+\.?([^.]*?)$/)[0]; my $dc = $decompressor{$ext} || 'cat'; open( $rspamd_log, "-|", "$dc $log_dir/$_" ) or die "cannot execute $dc $log_dir/$_ : $!"; printf "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs, $_; $spinner_update_time = 0; # Force spinner update &spinner; &ProcessLog; close($rspamd_log) or warn "cannot close $dc $log_dir/$_: $!"; } } else { open($rspamd_log, '<', $log_file) or die "cannot open $log_file"; &ProcessLog(); } my $total_ham = $total - ($total_spam + $total_junk); if ($json) { print "{"; &Summary(); print '"symbols":{'; &SymbolsStat(); print "}}\n"; } else { &SymbolsStat(); &Summary(); } exit; sub GenRelated { my ($htb, $target_sym) = @_; my @result; my $i = 0; foreach my $sym (sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb}) { if ($sym ne $target_sym) { my @elt = ($sym, $htb->{$sym}); push @result, \@elt; $i ++; } last if $i > $nrelated; } return \@result; } sub StringifyRelated { my ($ar, $total) = @_; return join("\n", (map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ($total * 1.0) * 100.0 } @{$ar})); } sub SymbolsStat { if ($total > 0) { my $has_comma = 0; while (my ($s, $r) = each(%sym_res)) { if ($r->{hits} > 0) { my $th = $r->{hits}; my $sh = $r->{spam_hits}; my $jh = $r->{junk_hits}; my $hh = $r->{hits} - $sh - $jh; my $htp = $hh * 100.0 / $total_ham if $total_ham != 0; my $stp = $sh * 100.0 / $total_spam if $total_spam != 0; my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0; if ($json) { if ($has_comma) { print ","; } else { $has_comma = 1; } print "\"$s\":{"; JsonObjectElt("avg_weight", $r->{'weight'},"%.4f"); print ","; JsonObjectElt("hits", $th, "%d"); print ","; JsonObjectElt("hits_percentage", $th/$total, "%.4f"); print ","; JsonObjectElt("spam_hits", $sh, "%d"); print ","; JsonObjectElt("spam_to_total", $sh/$th, "%.4f"); print ","; JsonObjectElt("spam_percentage", $stp/100.0 || 0, "%.4f"); print ","; JsonObjectElt("ham_hits", $hh, "%d"); print ","; JsonObjectElt("ham_to_total", $hh/$th, "%.4f"); print ","; JsonObjectElt("ham_percentage", $htp/100.0 || 0, "%.4f"); print ","; JsonObjectElt("junk_hits", $jh, "%d"); print ","; JsonObjectElt("junk_to_total", $jh/$th, "%.4f"); print ","; JsonObjectElt("junk_percentage", $jtp/100.0 || 0, "%.4f"); } else { printf "%s avg. weight %.3f, hits %d(%.3f%%): Ham %7.3f%%, %6d/%-6d (%7.3f%%) Spam %7.3f%%, %6d/%-6d (%7.3f%%) Junk %7.3f%%, %6d/%-6d (%7.3f%%) ", $s, $r->{weight} / $r->{hits}, $th, ($th / $total * 100), ($hh / $th * 100), $hh, $total_ham, ($htp or 0), ($sh / $th * 100), $sh, $total_spam, ($stp or 0), ($jh / $th * 100), $jh, $total_junk, ($jtp or 0); } my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam; my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk; if ($r->{weight} != 0) { if (!$json) { if ($r->{weight} > 0) { printf " Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%) Spam changes / total spam hits: %6d/%-6d (%7.3f%%) Junk changes (ham -> junk): %6d/%-6d (%7.3f%%) Junk changes / total junk hits: %6d/%-6d (%7.3f%%) ", $r->{spam_change}, $th, ($r->{spam_change} / $th * 100), $r->{spam_change}, $total_spam, ($schp or 0), $r->{junk_change}, $th, ($r->{junk_change} / $th * 100), $r->{junk_change}, $total_junk, ($jchp or 0); } else { printf " Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%) Spam changes / total spam hits : %6d/%-6d (%7.3f%%) Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%) Junk changes / total junk hits : %6d/%-6d (%7.3f%%) ", $r->{spam_change}, $th, ($r->{spam_change} / $th * 100), $r->{spam_change}, $total_spam, ($schp or 0), $r->{junk_change}, $th, ($r->{junk_change} / $th * 100), $r->{junk_change}, $total_junk, ($jchp or 0); } } else { print ","; JsonObjectElt("spam_change", $r->{spam_change}, "%.4f"); print ","; JsonObjectElt("junk_change", $r->{junk_change}, "%.4f"); } } if ($correlations) { my $spam_related = GenRelated($r->{symbols_met_spam}, $s); my $junk_related = GenRelated($r->{symbols_met_junk}, $s); my $ham_related = GenRelated($r->{symbols_met_ham}, $s); if (!$json) { print "Correlations report:\n"; while (my ($cs, $hits) = each %{$r->{corr}}) { my $corr_prob = $hits / $total; my $sym_prob = $r->{hits} / $total; printf "Probability of %s when %s fires: %.3f\n", $s, $cs, ($corr_prob / $sym_prob); } print "Related symbols report:\n"; printf "Top related in spam:\n %s\n", StringifyRelated($spam_related, $r->{spam_hits}); printf "Top related in junk:\n %s\n", StringifyRelated($junk_related, $r->{junk_hits}); printf "Top related in ham:\n %s\n", StringifyRelated($ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits}); } else { print ","; print "\"correllations\":{"; my $has_comma_ = 0; while (my ($cs, $hits) = each %{$r->{corr}}) { if ($has_comma_) { print ","; } else { $has_comma_ = 1; } my $corr_prob = $hits / $total; my $sym_prob = $r->{hits} / $total; JsonObjectElt($cs, ($corr_prob / $sym_prob) ,"%.4f"); } print "}"; } } print "}" if $json; } else { print "Symbol $s has not been met\n" if !$json; } print '-' x 80 . "\n" if !$json; } } } sub Summary() { if (!$json) { print " === Summary ", '=' x 68, " Messages scanned: $total"; printf " [ %s / %s ] ", $timeStamp{'start'}, $timeStamp{'end'} if defined $timeStamp{'start'}; say ''; printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action; say ''; printf "scan time min/avg/max = %.2f/%.2f/%.2f s ", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000 if exists $scanTime{'min'}; say '=' x 80; } else { JsonObjectElt("total", $total, "%d"); print ","; if (defined $timeStamp{'start'}) { JsonObjectElt("start", $timeStamp{'start'}); print ","; } if (defined $timeStamp{'end'}) { JsonObjectElt("end", $timeStamp{'end'}); print ","; } print "\"actions\":{"; my $has_comma = 0; foreach my $a (sort keys %action) { if ($has_comma) { print ","; } else { $has_comma = 1; } JsonObjectElt($a, $action{$a}, "%d"); } print "},"; } } sub ProcessRelated { my ($symbols, $target) = @_; foreach my $s (@{$symbols}) { $s =~ /^([^\(]+)(\(([^\)]+)\))?/; my $sym_name = $1; my $sym_score = 0; if ($2) { $sym_score = $3 * 1.0; if (abs($sym_score) < $diff_alpha) { next; } my $bm = $bidir_match{$sym_name}; if ($bm) { if ($sym_score >= 0) { $sym_name = $bm->{'spam'}; } else { $sym_name = $bm->{'ham'}; } } } if (exists($target->{$sym_name})) { $target->{$sym_name} ++; } else { $target->{$sym_name} = 1; } } } sub ProcessLog { my ( $ts_format, @line ) = &log_time_format($rspamd_log); my $is_syslog = defined $ts_format && $ts_format eq 'syslog'; while() { last if eof $rspamd_log; $_ = (@line) ? shift @line : <$rspamd_log>; if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) { $enabled = 1; } next if !$enabled; if (/^.*rspamd_task_write_log.*$/) { &spinner; my $ts = ($is_syslog) ? syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] ) : join ' ', ( split /\s+/ )[ 0 .. 1 ]; next if ( $ts lt $startTime ); next if ( defined $endTime && $ts gt $endTime ); if ($_ !~ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/) { #print "BAD: $_\n"; next; } my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4; my $scan_time = $5; my $act = $1; my $score = $2 * 1.0; my $skip = 0; foreach my $ex (@symbols_exclude) { my @found = grep {/^$ex/} @symbols; if (scalar(@found) > 0) { $skip = 1; last; } } next if ( $skip != 0 ); $timeStamp{'end'} = $ts; $timeStamp{'start'} //= $timeStamp{'end'}; $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time ); $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time ); $scanTime{'total'} += $scan_time; $action{$act}++; $total ++; if ($score >= $reject_score) { $total_spam ++; } elsif ($score >= $junk_score) { $total_junk ++; } my @sym_names; foreach my $s (@symbols_search) { my @selected = grep /$s/, @symbols; if (scalar(@selected) > 0) { foreach my $sym (@selected) { $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; my $sym_name = $1; my $sym_score = 0; my $orig_name = $sym_name; if ($2) { $sym_score = $3 * 1.0; if (abs($sym_score) < $diff_alpha) { next; } my $bm = $bidir_match{$sym_name}; if ($bm) { if ($sym_score >= 0) { $sym_name = $bm->{'spam'}; } else { $sym_name = $bm->{'ham'}; } } } next if $orig_name !~ /^$s/; push @sym_names, $sym_name; if (!$sym_res{$sym_name}) { $sym_res{$sym_name} = { hits => 0, spam_hits => 0, junk_hits => 0, spam_change => 0, junk_change => 0, weight => 0, corr => {}, symbols_met_spam => {}, symbols_met_ham => {}, symbols_met_junk => {}, }; } my $r = $sym_res{$sym_name}; $r->{hits} ++; $r->{weight} += $sym_score; my $is_spam = 0; my $is_junk = 0; if ($score >= $reject_score) { $is_spam = 1; $r->{spam_hits} ++; if ($correlations) { ProcessRelated(\@symbols, $r->{symbols_met_spam}); } } elsif ($score >= $junk_score) { $is_junk = 1; $r->{junk_hits} ++; if ($correlations) { ProcessRelated(\@symbols, $r->{symbols_met_junk}); } } else { if ($correlations) { ProcessRelated(\@symbols, $r->{symbols_met_ham}); } } if ($sym_score != 0) { my $score_without = $score - $sym_score; if ($sym_score > 0) { if ($is_spam && $score_without < $reject_score) { $r->{spam_change} ++; } if ($is_junk && $score_without < $junk_score) { $r->{junk_change} ++; } } else { if (!$is_spam && $score_without >= $reject_score) { $r->{spam_change} ++; } if (!$is_junk && $score_without >= $junk_score) { $r->{junk_change} ++; } } } } # End foreach symbols selected } } if ($correlations) { foreach my $sym (@sym_names) { my $r = $sym_res{$sym}; foreach my $corr_sym (@sym_names) { if ($corr_sym ne $sym) { if ($r->{'corr'}->{$corr_sym}) { $r->{'corr'}->{$corr_sym} ++; } else { $r->{'corr'}->{$corr_sym} = 1; } } } } # End of correlations check } } } } sub JsonObjectElt() { my ($k, $v) = @_; my $f = defined $_[2] ? $_[2] : '%s'; if ($f eq "%s") { $f = "\"%s\""; } printf "\"%s\":$f", $k, $v; } sub GetLogfilesList { my ($dir) = @_; opendir( DIR, $dir ) or die $!; my $pattern = join( '|', keys %decompressor ); my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/; # Add unnumbered logs first my @logs = grep { -f "$dir/$_" && !/$re/ } readdir(DIR); # Add numbered logs rewinddir(DIR); push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) ); closedir(DIR); # Select required logs and revers their order @logs = reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs ); # Loop through array printing out filenames if (!$json) { print "\nLog files to process:\n"; foreach my $file (@logs) { print " $file\n"; } print "\n"; } return @logs; } sub log_time_format { my $fh = shift; my ( $format, $line ); while (<$fh>) { $line = $_; # 2017-08-08 00:00:01 #66984( # 2017-08-08 00:00:01.001 #66984( if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3})? #\d+\(/) { $format = 'rspamd'; last; } # Aug 8 00:02:50 #66986( elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) { $format = 'syslog'; last; } # Aug 8 00:02:50 hostname rspamd[66986] elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) { $format = 'syslog'; last; } # Skip newsyslog messages # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over elsif ( /^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/ ) { next; } else { print "Unknown log format\n"; exit 1; } } return ( $format, $line ); } sub normalized_time { return undef if !defined( $_ = shift ); /^\d\d(?::\d\d){0,2}$/ ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_ : $_; } sub numeric { $a =~ /\.(\d+)\./; my $a_num = $1; $b =~ /\.(\d+)\./; my $b_num = $1; $a_num <=> $b_num; } sub spinner { my @spinner = qw{/ - \ |}; return if ( $json || ( time - $spinner_update_time ) < 1 ); printf "%s\r", $spinner[ $spinner_update_time % @spinner ]; $spinner_update_time = time; select()->flush(); } # Convert syslog timestamp to "ISO 8601 like" format # using current year as syslog does not record the year (nor the timezone) # or the last year if the guessed time is in the future. sub syslog2iso { my %month_map; @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11; my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x; my $epoch = timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] ); sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t; } __END__ =head1 NAME rspamd_stats - analyze Rspamd rules by parsing log files =head1 SYNOPSIS rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file] Options: --log=file log file or directory to read (stdin by default) --reject-score=score set reject threshold (15 by default) --junk-score=score set junk score (6.0 by default) --symbol=sym check specified symbol (perl regexps, '.*' by default) --alpha-score=score set ignore score for symbols (0.1 by default) --correlations enable correlations report --search-pattern do not process input unless the desired pattern is found --start starting time (oldest) for log parsing --end ending time (newest) for log parsing --num-logs=integer number of recent logfiles to analyze (all files in the directory by default) --exclude-logs=integer number of latest logs to exclude (0 by default) --json print json output instead of human readable --help brief help message --man full documentation =head1 OPTIONS =over 8 =item B<--log> Specifies log file or directory to read data from. If a directory is specified B analyses files in the directory including known compressed file types. Number of log files can be limited using B<--num-logs> and B<--exclude-logs> options. This assumes that files in the log directory have B- or B-like name format with numeric indexes. Files without indexes (generally it is merely one file) are considered the most recent and files with lower indexes are considered newer. =item B<--reject-score> Specifies the reject (spam) threshold. =item B<--junk-score> Specifies the junk (add header or rewrite subject) threshold. =item B<--alpha-score> Specifies the minimum score for a symbol to be considered by this script. =item B<--symbol> Add symbol or pattern (pcre format) to analyze. =item B<--num-logs> If set, limits number of analyzed logfiles in the directory to the specified value. =item B<--exclude-logs> Number of latest logs to exclude (0 by default). =item B<--correlations> Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations between symbols. =item B<--search-pattern> Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position. =item B<--exclude> Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple symbols. =item B<--start> Select log entries after this time. Format: C (can be truncated to any desired accuracy). If used with B<--end> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you supply the time. =item B<--end> Select log entries before this time. Format: C (can be truncated to any desired accuracy). If used with B<--start> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you supply the time. =item B<--help> Print a brief help message and exits. =item B<--man> Prints the manual page and exits. =back =head1 DESCRIPTION B will read the given log file (or standard input) and provide statistics for the specified symbols: Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%) Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%) Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%) Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%) Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102% Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540% Where there are the following attributes: =over 4 =item * B: average score for a symbols =item * B: total number of hits and percentage of symbol hits divided by total number of messages =item * B: provides the following information about B messages with the specified symbol (from left to right): =over 4 =item 1. B: number of messages that has this symbol and are B =item 2. B: number of symbol hits divided by overall B messages count =item 3. B: overall number of B messages =item 4. B: percentage of number of hits with specified symbol in B messages divided by total number of B messages. =back =item * B: provides the following information about B messages - same as previous but for B class. =item * B: provides the following information about B messages - same as previous but for B class. =item * B: displays data about how much messages switched their class because of the specific symbol weight. =item * B: displays data about how much messages switched their class because of the specific symbol weight. =back =cut