From e50e7ed4875e750862a410e8c25dedd5ea7488d9 Mon Sep 17 00:00:00 2001 From: Alexander Moisseev Date: Thu, 11 Aug 2016 09:31:25 +0300 Subject: [Minor] rspamd_stats: place parser code in a subroutine --- utils/rspamd_stats.pl | 246 +++++++++++++++++++++++++------------------------- 1 file changed, 125 insertions(+), 121 deletions(-) (limited to 'utils') diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl index 8aeddade6..fe1542ccb 100644 --- a/utils/rspamd_stats.pl +++ b/utils/rspamd_stats.pl @@ -49,130 +49,11 @@ my $enabled = 0; if ($log_file eq '-' || $log_file eq '') { $rspamd_log = \*STDIN; + &ProcessLog(); } else { open($rspamd_log, '<', $log_file) or die "cannot open $log_file"; -} - -while(<$rspamd_log>) { - if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) { - $enabled = 1; - } - - next if !$enabled; - - if (/^.*rspamd_task_write_log.*$/) { - my @elts = split /\s+/; - my $ts = $elts[0] . ' ' . $elts[1]; - - if ($_ !~ /\[(-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\]/) { - #print "BAD: $_\n"; - next; - } - - $total ++; - my $score = $1 * 1.0; - - if ($score >= $reject_score) { - $total_spam ++; - } - elsif ($score >= $junk_score) { - $total_junk ++; - } - - # Symbols - my @symbols = split /,/, $3; - my @sym_names; - - foreach my $s (@symbols_search) { - my @selected = grep /$s/, @symbols; - - if (scalar(@selected) > 0) { - - foreach my $sym (@selected) { - $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; - my $sym_name = $1; - my $sym_score = 0; - if ($2) { - $sym_score = $3 * 1.0; - - if (abs($sym_score) < $diff_alpha) { - next; - } - } - next if $sym_name !~ /^$s/; - - push @sym_names, $sym_name; - - if (!$sym_res{$sym_name}) { - $sym_res{$sym_name} = { - hits => 0, - spam_hits => 0, - junk_hits => 0, - spam_change => 0, - junk_change => 0, - weight => 0, - corr => {}, - }; - } - - my $r = $sym_res{$sym_name}; - - $r->{hits} ++; - $r->{weight} += $sym_score; - my $is_spam = 0; - my $is_junk = 0; - - if ($score >= $reject_score) { - $is_spam = 
1; - $r->{spam_hits} ++; - } - elsif ($score >= $junk_score) { - $is_junk = 1; - $r->{junk_hits} ++; - } - - if ($sym_score != 0) { - my $score_without = $score - $sym_score; - - if ($sym_score > 0) { - if ($is_spam && $score_without < $reject_score) { - $r->{spam_change} ++; - } - if ($is_junk && $score_without < $junk_score) { - $r->{junk_change} ++; - } - } - else { - if (!$is_spam && $score_without >= $reject_score) { - $r->{spam_change} ++; - } - if (!$is_junk && $score_without >= $junk_score) { - $r->{junk_change} ++; - } - } - } - } # End foreach symbols selected - } - } - - if ($correlations) { - foreach my $sym (@sym_names) { - my $r = $sym_res{$sym}; - - foreach my $corr_sym (@sym_names) { - if ($corr_sym ne $sym) { - if ($r->{'corr'}->{$corr_sym}) { - $r->{'corr'}->{$corr_sym} ++; - } - else { - $r->{'corr'}->{$corr_sym} = 1; - } - } - } - } # End of correlations check - } - } + &ProcessLog(); } my $total_ham = $total - ($total_spam + $total_junk); @@ -247,6 +128,129 @@ Junk changes / total junk hits : %6d/%-6d (%7.3f%%) } } +sub ProcessLog { + while(<$rspamd_log>) { + if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) { + $enabled = 1; + } + + next if !$enabled; + + if (/^.*rspamd_task_write_log.*$/) { + my @elts = split /\s+/; + my $ts = $elts[0] . ' ' . 
$elts[1]; + + if ($_ !~ /\[(-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\]/) { + #print "BAD: $_\n"; + next; + } + + $total ++; + my $score = $1 * 1.0; + + if ($score >= $reject_score) { + $total_spam ++; + } + elsif ($score >= $junk_score) { + $total_junk ++; + } + + # Symbols + my @symbols = split /,/, $3; + my @sym_names; + + foreach my $s (@symbols_search) { + my @selected = grep /$s/, @symbols; + + if (scalar(@selected) > 0) { + + foreach my $sym (@selected) { + $sym =~ /^([^\(]+)(\(([^\)]+)\))?/; + my $sym_name = $1; + my $sym_score = 0; + if ($2) { + $sym_score = $3 * 1.0; + + if (abs($sym_score) < $diff_alpha) { + next; + } + } + next if $sym_name !~ /^$s/; + + push @sym_names, $sym_name; + + if (!$sym_res{$sym_name}) { + $sym_res{$sym_name} = { + hits => 0, + spam_hits => 0, + junk_hits => 0, + spam_change => 0, + junk_change => 0, + weight => 0, + corr => {}, + }; + } + + my $r = $sym_res{$sym_name}; + + $r->{hits} ++; + $r->{weight} += $sym_score; + my $is_spam = 0; + my $is_junk = 0; + + if ($score >= $reject_score) { + $is_spam = 1; + $r->{spam_hits} ++; + } + elsif ($score >= $junk_score) { + $is_junk = 1; + $r->{junk_hits} ++; + } + + if ($sym_score != 0) { + my $score_without = $score - $sym_score; + + if ($sym_score > 0) { + if ($is_spam && $score_without < $reject_score) { + $r->{spam_change} ++; + } + if ($is_junk && $score_without < $junk_score) { + $r->{junk_change} ++; + } + } + else { + if (!$is_spam && $score_without >= $reject_score) { + $r->{spam_change} ++; + } + if (!$is_junk && $score_without >= $junk_score) { + $r->{junk_change} ++; + } + } + } + } # End foreach symbols selected + } + } + + if ($correlations) { + foreach my $sym (@sym_names) { + my $r = $sym_res{$sym}; + + foreach my $corr_sym (@sym_names) { + if ($corr_sym ne $sym) { + if ($r->{'corr'}->{$corr_sym}) { + $r->{'corr'}->{$corr_sym} ++; + } + else { + $r->{'corr'}->{$corr_sym} = 1; + } + } + } + } # End of correlations check + } + } + } +} + __END__ =head1 NAME -- 
cgit v1.2.3 From 68fc43a8f0f681ea36cf7e7fe89126c4f349ebca Mon Sep 17 00:00:00 2001 From: Alexander Moisseev Date: Thu, 11 Aug 2016 09:39:13 +0300 Subject: [Minor] rspamd_stats: fix typo Reported by: @dehnli via IRC --- utils/rspamd_stats.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'utils') diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl index fe1542ccb..a12c8a3c3 100644 --- a/utils/rspamd_stats.pl +++ b/utils/rspamd_stats.pl @@ -339,7 +339,7 @@ B: total number of hits and percentage of symbol hits divided by tot =item * -B: [rovides the following information about B messages with the specified symbol (from left to right): +B: provides the following information about B messages with the specified symbol (from left to right): =over 4 -- cgit v1.2.3 From 7d3b279aa3e78c72e8c1a8f022ac0cd84293df5b Mon Sep 17 00:00:00 2001 From: Alexander Moisseev Date: Thu, 11 Aug 2016 10:11:10 +0300 Subject: [Feature] rspamd_stats: support log directory reading rspamd_stats will read (and decompress) multiple log files in the specified directory. --- utils/rspamd_stats.pl | 94 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 92 insertions(+), 2 deletions(-) (limited to 'utils') diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl index a12c8a3c3..0d9571b42 100644 --- a/utils/rspamd_stats.pl +++ b/utils/rspamd_stats.pl @@ -13,9 +13,18 @@ my $diff_alpha = 0.1; my $correlations = 0; my $log_file = ""; my $search_pattern = ""; +my $num_logs; +my $exclude_logs = 0; my $man = 0; my $help = 0; +# Associate file extensions with decompressors +my %decompressor = ( + 'bz2' => 'bzcat', + 'gz' => 'zcat', + 'xz' => 'xzcat', +); + GetOptions( "reject-score|r=f" => \$reject_score, "junk-score|j=f" => \$junk_score, @@ -24,6 +33,8 @@ GetOptions( "alpha|a=f" => \$diff_alpha, "correlations|c" => \$correlations, "search-pattern=s" => \$search_pattern, + "num-logs|n=i" => \$num_logs, + "exclude-logs|x=i" => \$exclude_logs, "help|?" 
=> \$help, "man" => \$man ) or pod2usage(2); @@ -51,6 +62,25 @@ if ($log_file eq '-' || $log_file eq '') { $rspamd_log = \*STDIN; &ProcessLog(); } +elsif ( -d "$log_file" ) { + my $log_dir = "$log_file"; + + my @logs = &GetLogfilesList($log_dir); + + # Process logs + foreach (@logs) { + my $ext = (/[^.]+\.?([^.]*?)$/)[0]; + my $dc = $decompressor{$ext} || 'cat'; + + open( $rspamd_log, "-|", "$dc $log_dir/$_" ) + or die "cannot execute $dc $log_dir/$_ : $!"; + + &ProcessLog; + + close($rspamd_log) + or warn "cannot close $dc $log_dir/$_: $!"; + } +} else { open($rspamd_log, '<', $log_file) or die "cannot open $log_file"; &ProcessLog(); @@ -128,6 +158,8 @@ Junk changes / total junk hits : %6d/%-6d (%7.3f%%) } } +exit; + sub ProcessLog { while(<$rspamd_log>) { if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) { @@ -251,6 +283,48 @@ sub ProcessLog { } } +sub GetLogfilesList { + my ($dir) = @_; + opendir( DIR, $dir ) or die $!; + + my $pattern = join( '|', keys %decompressor ); + my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/; + + # Add unnumbered logs first + my @logs = + grep { -f "$dir/$_" && !/$re/ } readdir(DIR); + + # Add numbered logs + rewinddir(DIR); + push( @logs, + ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) ); + + closedir(DIR); + + # Select required logs and revers their order + @logs = + reverse + splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs ); + + # Loop through array printing out filenames + print "\nParsing log files:\n"; + foreach my $file (@logs) { + print " $file\n"; + } + print "\n"; + + return @logs; +} + +sub numeric { + $a =~ /\.(\d+)\./; + my $a_num = $1; + $b =~ /\.(\d+)\./; + my $b_num = $1; + + $a_num <=> $b_num; +} + __END__ =head1 NAME @@ -262,13 +336,15 @@ rspamd_stats - analyze Rspamd rules by parsing log files rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file] Options: - --log=file log file to read (stdin by default) + --log=file log file or directory to read (stdin by 
default) --reject-score=score set reject threshold (15 by default) --junk-score=score set junk score (6.0 by default) --symbol=sym check specified symbol (perl regexps, '.*' by default) --alpha=value set ignore score for symbols (0.1 by default) --correlations enable correlations report --search-pattern do not process input unless the desired pattern is found + --num-logs=integer number of recent logfiles to analyze (all files in the directory by default) + --exclude-logs=integer number of latest logs to exclude (0 by default) --help brief help message --man full documentation @@ -278,7 +354,13 @@ rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file] =item B<--log> -Specifies log file to read data from. +Specifies log file or directory to read data from. +If a directory is specified B<rspamd_stats> analyses files in the directory +including known compressed file types. Number of log files can be limited using +B<--num-logs> and B<--exclude-logs> options. This assumes that files in the log +directory have B<newsyslog(8)>- or B<logrotate(8)>-like name format with numeric +indexes. Files without indexes (generally it is merely one file) are considered +the most recent and files with lower indexes are considered newer. =item B<--reject-score> @@ -296,6 +378,14 @@ Specifies the minimum score for a symbol to be considered by this script. Add symbol or pattern (pcre format) to analyze. +=item B<--num-logs> + +If set, limits number of analyzed logfiles in the directory to the specified value. + +=item B<--exclude-logs> + +Number of latest logs to exclude (0 by default). + =item B<--correlations> Additionaly print correlation rate for each symbol displayed. This routine calculates merely paired correlations between symbols. -- cgit v1.2.3