summary | refs | log | tree | commit | diff | stats
path: root/utils/rspamd_stats.pl
diff options
context:
space:
mode:
Diffstat (limited to 'utils/rspamd_stats.pl')
-rwxr-xr-x utils/rspamd_stats.pl 140
1 files changed, 114 insertions, 26 deletions
diff --git a/utils/rspamd_stats.pl b/utils/rspamd_stats.pl
index 8f2cc60e8..3af8c7bc8 100755
--- a/utils/rspamd_stats.pl
+++ b/utils/rspamd_stats.pl
@@ -15,6 +15,7 @@ my $reject_score = 15.0;
my $junk_score = 6.0;
my $diff_alpha = 0.1; # ProcessRelated ignores scored symbols whose |score| is below this
my $correlations = 0;
+my $nrelated = 10; # default for --nrelated; GenRelated uses it to bound each related-symbols list
my $log_file = "";
my $search_pattern = "";
my $startTime="";
@@ -34,22 +35,23 @@ my %decompressor = (
);
GetOptions(
- "reject-score|r=f" => \$reject_score,
- "junk-score|j=f" => \$junk_score,
- "symbol|s=s@" => \@symbols_search,
- "symbol-bidir|S=s@" => \@symbols_bidirectional,
- "exclude|X=s@" => \@symbols_exclude,
- "log|l=s" => \$log_file,
+ "reject-score|r=f" => \$reject_score,
+ "junk-score|j=f" => \$junk_score,
+ "symbol|s=s@" => \@symbols_search,
+ "symbol-bidir|S=s@" => \@symbols_bidirectional,
+ "exclude|X=s@" => \@symbols_exclude,
+ "log|l=s" => \$log_file,
"alpha-score|alpha|a=f" => \$diff_alpha,
- "correlations|c" => \$correlations,
- "search-pattern=s" => \$search_pattern,
- "start=s" => \$startTime,
- "end=s" => \$endTime,
- "num-logs|n=i" => \$num_logs,
- "exclude-logs|x=i" => \$exclude_logs,
- "json|j" => \$json,
- "help|?" => \$help,
- "man" => \$man
+ "correlations|c" => \$correlations,
+ # "=i" makes --nrelated consume an integer value; a bare "nrelated" spec
+ # is a boolean flag that would only ever set $nrelated to 1.
+ "nrelated=i" => \$nrelated,
+ "search-pattern=s" => \$search_pattern,
+ "start=s" => \$startTime,
+ "end=s" => \$endTime,
+ "num-logs|n=i" => \$num_logs,
+ "exclude-logs|x=i" => \$exclude_logs,
+ # NOTE(review): -j is also declared by "junk-score|j=f" above; confirm
+ # which option is meant to own the short name.
+ "json|j" => \$json,
+ "help|?" => \$help,
+ "man" => \$man
) or pod2usage(2);
pod2usage(1) if $help;
@@ -141,7 +143,31 @@ else {
exit;
-sub SymbolsStat() {
+# Rank the symbols counted in $htb (hashref: symbol name => count) by
+# descending count, leaving out $target_sym itself.
+# Returns an arrayref of at most $nrelated [name, count] pairs.
+sub GenRelated {
+  my ($htb, $target_sym) = @_;
+
+  my @result;
+  foreach my $sym (sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb}) {
+    # Check the cap before pushing, so the list never grows past
+    # $nrelated entries.
+    last if @result >= $nrelated;
+    next if $sym eq $target_sym;
+    push @result, [$sym, $htb->{$sym}];
+  }
+
+  return \@result;
+}
+
+# Render related-symbol pairs for the text report.
+#   $ar    - arrayref of [symbol, count] pairs (as built by GenRelated)
+#   $total - denominator used to express each count as a percentage
+# Returns one tab-indented "symbol(count: pct%)" entry per pair, joined by
+# newlines; an empty list yields an empty string.
+sub StringifyRelated {
+  my ($ar, $total) = @_;
+
+  my @rows;
+  for my $pair (@{$ar}) {
+    my ($name, $count) = @{$pair};
+    my $share = $count / ($total * 1.0) * 100.0;
+    push @rows, sprintf("\t%s(%s: %.1f%%)", $name, $count, $share);
+  }
+
+  return join("\n", @rows);
+}
+
+sub SymbolsStat {
if ($total > 0) {
my $has_comma = 0;
while (my ($s, $r) = each(%sym_res)) {
@@ -237,6 +263,11 @@ Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
}
if ($correlations) {
+
+ my $spam_related = GenRelated($r->{symbols_met_spam}, $s);
+ my $junk_related = GenRelated($r->{symbols_met_junk}, $s);
+ my $ham_related = GenRelated($r->{symbols_met_ham}, $s);
+
if (!$json) {
print "Correlations report:\n";
@@ -246,18 +277,26 @@ Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
printf "Probability of %s when %s fires: %.3f\n", $s, $cs,
($corr_prob / $sym_prob);
}
+
+ print "Related symbols report:\n";
+ printf "Top related in spam:\n %s\n", StringifyRelated($spam_related,
+ $r->{spam_hits});
+ printf "Top related in junk:\n %s\n", StringifyRelated($junk_related,
+ $r->{junk_hits});
+ printf "Top related in ham:\n %s\n", StringifyRelated($ham_related,
+ $r->{hits} - $r->{spam_hits} - $r->{junk_hits});
}
else {
print ",";
print "\"correllations\":{";
- my $has_comma = 0;
+ my $has_comma_ = 0;
while (my ($cs, $hits) = each %{$r->{corr}}) {
- if ($has_comma) {
+ if ($has_comma_) {
print ",";
}
else {
- $has_comma = 1;
+ $has_comma_ = 1;
}
my $corr_prob = $hits / $total;
my $sym_prob = $r->{hits} / $total;
@@ -328,6 +367,41 @@ Messages scanned: $total";
}
}
+# Tally the symbols of one scanned message into a counting table.
+#   $symbols - arrayref of symbol tokens, either "NAME" or "NAME(score)"
+#   $target  - hashref (symbol name => count), updated in place
+# Scored symbols whose absolute score is below $diff_alpha are skipped, and
+# names present in %bidir_match are replaced by that entry's 'spam' or 'ham'
+# name according to the sign of the score.
+sub ProcessRelated {
+  my ($symbols, $target) = @_;
+
+  foreach my $s (@{$symbols}) {
+    # Skip tokens that do not parse: after a failed match $1..$3 still hold
+    # the captures of an earlier successful match, so the wrong symbol
+    # would be counted.
+    next unless $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
+    my ($sym_name, $scored, $raw_score) = ($1, $2, $3);
+
+    if (defined $scored) {
+      my $sym_score = $raw_score * 1.0;
+
+      next if abs($sym_score) < $diff_alpha;
+
+      my $bm = $bidir_match{$sym_name};
+      if ($bm) {
+        $sym_name = $sym_score >= 0 ? $bm->{'spam'} : $bm->{'ham'};
+      }
+    }
+
+    # ++ on a missing key starts from zero, so no exists() check is needed.
+    $target->{$sym_name}++;
+  }
+
+  return;
+}
+
sub ProcessLog {
my ( $ts_format, @line ) = &log_time_format($rspamd_log);
my $is_syslog = defined $ts_format && $ts_format eq 'syslog';
@@ -429,13 +503,16 @@ sub ProcessLog {
if (!$sym_res{$sym_name}) {
$sym_res{$sym_name} = {
- hits => 0,
- spam_hits => 0,
- junk_hits => 0,
- spam_change => 0,
- junk_change => 0,
- weight => 0,
- corr => {},
+ hits => 0,
+ spam_hits => 0,
+ junk_hits => 0,
+ spam_change => 0,
+ junk_change => 0,
+ weight => 0,
+ corr => {},
+ symbols_met_spam => {},
+ symbols_met_ham => {},
+ symbols_met_junk => {},
};
}
@@ -449,10 +526,21 @@ sub ProcessLog {
if ($score >= $reject_score) {
$is_spam = 1;
$r->{spam_hits} ++;
+ if ($correlations) {
+ ProcessRelated(\@symbols, $r->{symbols_met_spam});
+ }
}
elsif ($score >= $junk_score) {
$is_junk = 1;
$r->{junk_hits} ++;
+ if ($correlations) {
+ ProcessRelated(\@symbols, $r->{symbols_met_junk});
+ }
+ }
+ else {
+ if ($correlations) {
+ ProcessRelated(\@symbols, $r->{symbols_met_ham});
+ }
}
if ($sym_score != 0) {