]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Add related symbols analysis to rspamd_stats
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 13 Mar 2018 12:56:22 +0000 (12:56 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 13 Mar 2018 12:58:41 +0000 (12:58 +0000)
utils/rspamd_stats.pl

index 8f2cc60e889de6c57599140125f655eab68250f4..3af8c7bc8bd5766c378fb149b7c8b72b2bba0953 100755 (executable)
@@ -15,6 +15,7 @@ my $reject_score = 15.0;
 my $junk_score = 6.0;
 my $diff_alpha = 0.1;
 my $correlations = 0;
+my $nrelated = 10; # limit used by GenRelated when collecting the top related symbols
 my $log_file = "";
 my $search_pattern = "";
 my $startTime="";
@@ -34,22 +35,23 @@ my %decompressor = (
 );
 
 GetOptions(
-  "reject-score|r=f" => \$reject_score,
-  "junk-score|j=f" => \$junk_score,
-  "symbol|s=s@" => \@symbols_search,
-  "symbol-bidir|S=s@" => \@symbols_bidirectional,
-  "exclude|X=s@" => \@symbols_exclude,
-  "log|l=s" => \$log_file,
+  "reject-score|r=f"      => \$reject_score,
+  "junk-score|j=f"        => \$junk_score,
+  "symbol|s=s@"           => \@symbols_search,
+  "symbol-bidir|S=s@"     => \@symbols_bidirectional,
+  "exclude|X=s@"          => \@symbols_exclude,
+  "log|l=s"               => \$log_file,
   "alpha-score|alpha|a=f" => \$diff_alpha,
-  "correlations|c" => \$correlations,
-  "search-pattern=s" => \$search_pattern,
-  "start=s" => \$startTime,
-  "end=s" => \$endTime,
-  "num-logs|n=i" => \$num_logs,
-  "exclude-logs|x=i" => \$exclude_logs,
-  "json|j" => \$json,
-  "help|?" => \$help,
-  "man" => \$man
+  "correlations|c"        => \$correlations,
+  "nrelated"              => \$nrelated,
+  "search-pattern=s"      => \$search_pattern,
+  "start=s"               => \$startTime,
+  "end=s"                 => \$endTime,
+  "num-logs|n=i"          => \$num_logs,
+  "exclude-logs|x=i"      => \$exclude_logs,
+  "json|j"                => \$json,
+  "help|?"                => \$help,
+  "man"                   => \$man
 ) or pod2usage(2);
 
 pod2usage(1) if $help;
@@ -141,7 +143,31 @@ else {
 
 exit;
 
-sub SymbolsStat() {
+# Build the list of symbols that most often co-occurred with $target_sym.
+#
+# $htb        - hashref mapping symbol name => co-occurrence count
+# $target_sym - symbol being reported; it is excluded from the result
+#
+# Returns an arrayref of [ $name, $count ] pairs ordered by descending
+# count (ties broken by name), holding at most $nrelated entries.
+sub GenRelated {
+  my ($htb, $target_sym) = @_;
+
+  my @result;
+  foreach my $sym (sort { $htb->{$b} <=> $htb->{$a} || $a cmp $b } keys %{$htb}) {
+    next if $sym eq $target_sym;
+
+    # Test the cap before pushing so exactly $nrelated entries are returned
+    # (not $nrelated + 1), and none at all when the limit is 0.
+    last if @result >= $nrelated;
+
+    push @result, [ $sym, $htb->{$sym} ];
+  }
+
+  return \@result;
+}
+
+# Render a list of related symbols as human-readable text.
+#
+# $ar    - arrayref of [ $name, $count ] pairs
+# $total - number the percentages are computed against
+#
+# Returns one tab-indented "name(count: pct%)" entry per pair, joined with
+# newlines; an empty list yields an empty string.
+sub StringifyRelated {
+  my ($ar, $total) = @_;
+
+  return join("\n", map {
+    my ($name, $count) = @{$_};
+    # A zero or undefined total would make the division die; show 0% instead.
+    my $pct = $total ? $count / ($total * 1.0) * 100.0 : 0.0;
+    sprintf("\t%s(%s: %.1f%%)", $name, $count, $pct);
+  } @{$ar});
+}
+
+sub SymbolsStat {
   if ($total > 0) {
     my $has_comma = 0;
     while (my ($s, $r) = each(%sym_res)) {
@@ -237,6 +263,11 @@ Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
         }
 
         if ($correlations) {
+
+          my $spam_related = GenRelated($r->{symbols_met_spam}, $s);
+          my $junk_related = GenRelated($r->{symbols_met_junk}, $s);
+          my $ham_related = GenRelated($r->{symbols_met_ham}, $s);
+
           if (!$json) {
             print "Correlations report:\n";
 
@@ -246,18 +277,26 @@ Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
               printf "Probability of %s when %s fires: %.3f\n", $s, $cs,
                 ($corr_prob / $sym_prob);
             }
+
+            print "Related symbols report:\n";
+            printf "Top related in spam:\n %s\n", StringifyRelated($spam_related,
+              $r->{spam_hits});
+            printf "Top related in junk:\n %s\n", StringifyRelated($junk_related,
+              $r->{junk_hits});
+            printf "Top related in ham:\n %s\n", StringifyRelated($ham_related,
+              $r->{hits} - $r->{spam_hits} - $r->{junk_hits});
           }
           else {
             print ",";
             print "\"correllations\":{";
 
-            my $has_comma = 0;
+            my $has_comma_ = 0;
             while (my ($cs, $hits) = each %{$r->{corr}}) {
-              if ($has_comma) {
+              if ($has_comma_) {
                 print ",";
               }
               else {
-                $has_comma = 1;
+                $has_comma_ = 1;
               }
               my $corr_prob = $hits / $total;
               my $sym_prob = $r->{hits} / $total;
@@ -328,6 +367,41 @@ Messages scanned: $total";
   }
 }
 
+# Count every symbol of one scanned message into a co-occurrence table.
+#
+# $symbols - arrayref of symbol strings shaped like "NAME" or "NAME(score)"
+# $target  - hashref (symbol name => hit count), updated in place
+#
+# Scored symbols whose absolute score is below $diff_alpha are skipped.
+# Names with an entry in %bidir_match are replaced by that entry's 'spam'
+# or 'ham' name depending on the sign of the score.
+sub ProcessRelated {
+  my ($symbols, $target) = @_;
+
+  foreach my $s (@{$symbols}) {
+    # Skip entries that do not parse: otherwise $1..$3 would still hold the
+    # captures of the previous successful match and that symbol would be
+    # counted a second time.
+    next unless $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
+    my ($sym_name, $score_text) = ($1, $3);
+
+    if (defined $score_text) {
+      my $sym_score = $score_text * 1.0;
+
+      next if abs($sym_score) < $diff_alpha;
+
+      if (my $bm = $bidir_match{$sym_name}) {
+        $sym_name = $sym_score >= 0 ? $bm->{'spam'} : $bm->{'ham'};
+      }
+    }
+
+    # ++ on a missing key starts from 0, so no exists() check is needed.
+    $target->{$sym_name}++;
+  }
+}
+
 sub ProcessLog {
   my ( $ts_format, @line ) = &log_time_format($rspamd_log);
   my $is_syslog = defined $ts_format && $ts_format eq 'syslog';
@@ -429,13 +503,16 @@ sub ProcessLog {
 
             if (!$sym_res{$sym_name}) {
               $sym_res{$sym_name} = {
-                hits => 0,
-                spam_hits => 0,
-                junk_hits => 0,
-                spam_change => 0,
-                junk_change => 0,
-                weight => 0,
-                corr => {},
+                hits             => 0,
+                spam_hits        => 0,
+                junk_hits        => 0,
+                spam_change      => 0,
+                junk_change      => 0,
+                weight           => 0,
+                corr             => {},
+                symbols_met_spam => {},
+                symbols_met_ham  => {},
+                symbols_met_junk => {},
               };
             }
 
@@ -449,10 +526,21 @@ sub ProcessLog {
             if ($score >= $reject_score) {
               $is_spam = 1;
               $r->{spam_hits} ++;
+              if ($correlations) {
+                ProcessRelated(\@symbols, $r->{symbols_met_spam});
+              }
             }
             elsif ($score >= $junk_score) {
               $is_junk = 1;
               $r->{junk_hits} ++;
+              if ($correlations) {
+                ProcessRelated(\@symbols, $r->{symbols_met_junk});
+              }
+            }
+            else {
+              if ($correlations) {
+                ProcessRelated(\@symbols, $r->{symbols_met_ham});
+              }
             }
 
             if ($sym_score != 0) {