You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

rspamd_stats.pl 15KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546
  1. #!/usr/bin/env perl
  2. use 5.010;
  3. use Data::Dumper;
  4. use Getopt::Long;
  5. use Pod::Usage;
  6. use warnings;
  7. use strict;
  8. my @symbols_search;
  9. my @symbols_exclude;
  10. my $reject_score = 15.0;
  11. my $junk_score = 6.0;
  12. my $diff_alpha = 0.1;
  13. my $correlations = 0;
  14. my $log_file = "";
  15. my $search_pattern = "";
  16. my $startTime="";
  17. my $endTime;
  18. my $num_logs;
  19. my $exclude_logs = 0;
  20. my $man = 0;
  21. my $help = 0;
  22. # Associate file extensions with decompressors
  23. my %decompressor = (
  24. 'bz2' => 'bzip2 -cd',
  25. 'gz' => 'gzip -cd',
  26. 'xz' => 'xz -cd',
  27. );
  28. GetOptions(
  29. "reject-score|r=f" => \$reject_score,
  30. "junk-score|j=f" => \$junk_score,
  31. "symbol|s=s@" => \@symbols_search,
  32. "exclude|s=s@" => \@symbols_exclude,
  33. "log|l=s" => \$log_file,
  34. "alpha|a=f" => \$diff_alpha,
  35. "correlations|c" => \$correlations,
  36. "search-pattern=s" => \$search_pattern,
  37. "start=s" => \$startTime,
  38. "end=s" => \$endTime,
  39. "num-logs|n=i" => \$num_logs,
  40. "exclude-logs|x=i" => \$exclude_logs,
  41. "help|?" => \$help,
  42. "man" => \$man
  43. ) or pod2usage(2);
  44. pod2usage(1) if $help;
  45. pod2usage(-exitval => 0, -verbose => 2) if $man;
  46. @symbols_search = '.*'
  47. unless @symbols_search;
  48. # Global vars
  49. my $total = 0;
  50. my $total_spam = 0;
  51. my $total_junk = 0;
  52. my $junk_symbols = 0;
  53. my $spam_symbols = 0;
  54. my $ham_symbols = 0;
  55. my $ham_spam_change = 0;
  56. my $ham_junk_change = 0;
  57. my %sym_res;
  58. my $rspamd_log;
  59. my $enabled = 0;
  60. my %action;
  61. my %timeStamp;
  62. my %scanTime = (
  63. max => 0,
  64. total => 0,
  65. );
  66. if ($log_file eq '-' || $log_file eq '') {
  67. $rspamd_log = \*STDIN;
  68. &ProcessLog();
  69. }
  70. elsif ( -d "$log_file" ) {
  71. my $log_dir = "$log_file";
  72. my @logs = &GetLogfilesList($log_dir);
  73. # Process logs
  74. foreach (@logs) {
  75. my $ext = (/[^.]+\.?([^.]*?)$/)[0];
  76. my $dc = $decompressor{$ext} || 'cat';
  77. open( $rspamd_log, "-|", "$dc $log_dir/$_" )
  78. or die "cannot execute $dc $log_dir/$_ : $!";
  79. &ProcessLog;
  80. close($rspamd_log)
  81. or warn "cannot close $dc $log_dir/$_: $!";
  82. }
  83. }
  84. else {
  85. open($rspamd_log, '<', $log_file) or die "cannot open $log_file";
  86. &ProcessLog();
  87. }
  88. my $total_ham = $total - ($total_spam + $total_junk);
  89. if ($total > 0) {
  90. while (my ($s, $r) = each(%sym_res)) {
  91. if ($r->{hits} > 0) {
  92. my $th = $r->{hits};
  93. my $sh = $r->{spam_hits};
  94. my $jh = $r->{junk_hits};
  95. my $hh = $r->{hits} - $sh - $jh;
  96. my $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
  97. my $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
  98. my $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
  99. printf "%s avg. weight %.3f, hits %d(%.3f%%):
  100. Ham %7.3f%%, %6d/%-6d (%7.3f%%)
  101. Spam %7.3f%%, %6d/%-6d (%7.3f%%)
  102. Junk %7.3f%%, %6d/%-6d (%7.3f%%)
  103. ",
  104. $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ),
  105. ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ),
  106. ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ),
  107. ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 );
  108. my $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
  109. my $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
  110. if ($r->{weight} != 0) {
  111. if ($r->{weight} > 0) {
  112. printf "
  113. Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%)
  114. Spam changes / total spam hits: %6d/%-6d (%7.3f%%)
  115. Junk changes (ham -> junk): %6d/%-6d (%7.3f%%)
  116. Junk changes / total junk hits: %6d/%-6d (%7.3f%%)
  117. ",
  118. $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
  119. $r->{spam_change}, $total_spam, ( $schp or 0 ),
  120. $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
  121. $r->{junk_change}, $total_junk, ( $jchp or 0 );
  122. }
  123. else {
  124. printf "
  125. Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%)
  126. Spam changes / total spam hits : %6d/%-6d (%7.3f%%)
  127. Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%)
  128. Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
  129. ",
  130. $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
  131. $r->{spam_change}, $total_spam, ( $schp or 0 ),
  132. $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
  133. $r->{junk_change}, $total_junk, ( $jchp or 0 );
  134. }
  135. }
  136. if ($correlations) {
  137. print "Correlations report:\n";
  138. while (my ($cs,$hits) = each %{$r->{corr}}) {
  139. my $corr_prob = $hits / $total;
  140. my $sym_prob = $r->{hits} / $total;
  141. printf "Probability of %s when %s fires: %.3f\n", $s, $cs, ($corr_prob / $sym_prob);
  142. }
  143. }
  144. }
  145. else {
  146. print "Symbol $s has not been met\n";
  147. }
  148. print '-' x 80 . "\n";
  149. }
  150. }
  151. print "
  152. === Summary ", '=' x 68, "
  153. Messages scanned: $total";
  154. printf " [ %s / %s ]
  155. ", $timeStamp{'start'}, $timeStamp{'end'}
  156. if defined $timeStamp{'start'};
  157. say '';
  158. printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_}
  159. for sort keys %action;
  160. say '';
  161. printf "scan time min/avg/max = %.2f/%.2f/%.2f s
  162. ", $scanTime{'min'} / 1000,
  163. ($total) ? $scanTime{'total'} / $total / 1000 : undef,
  164. $scanTime{'max'} / 1000
  165. if exists $scanTime{'min'};
  166. say '=' x 80;
  167. exit;
  168. sub ProcessLog {
  169. while(<$rspamd_log>) {
  170. if (!$enabled && ($search_pattern eq "" || /$search_pattern/)) {
  171. $enabled = 1;
  172. }
  173. next if !$enabled;
  174. if (/^.*rspamd_task_write_log.*$/) {
  175. my $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ];
  176. next if ( $ts lt $startTime );
  177. next if ( defined $endTime && $ts gt $endTime );
  178. if ($_ !~ /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms real/) {
  179. #print "BAD: $_\n";
  180. next;
  181. }
  182. my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
  183. my $scan_time = $5;
  184. my $act = $1;
  185. my $score = $2 * 1.0;
  186. my $skip = 0;
  187. foreach my $ex (@symbols_exclude) {
  188. my @found = grep {/^$ex/} @symbols;
  189. if (scalar(@found) > 0) {
  190. $skip = 1;
  191. last;
  192. }
  193. }
  194. next if ( $skip != 0 );
  195. $timeStamp{'end'} = $ts;
  196. $timeStamp{'start'} //= $timeStamp{'end'};
  197. $scanTime{'min'} = $scan_time
  198. if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
  199. $scanTime{'max'} = $scan_time
  200. if ( $scanTime{'max'} < $scan_time );
  201. $scanTime{'total'} += $scan_time;
  202. $action{$act}++;
  203. $total ++;
  204. if ($score >= $reject_score) {
  205. $total_spam ++;
  206. }
  207. elsif ($score >= $junk_score) {
  208. $total_junk ++;
  209. }
  210. my @sym_names;
  211. foreach my $s (@symbols_search) {
  212. my @selected = grep /$s/, @symbols;
  213. if (scalar(@selected) > 0) {
  214. foreach my $sym (@selected) {
  215. $sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
  216. my $sym_name = $1;
  217. my $sym_score = 0;
  218. if ($2) {
  219. $sym_score = $3 * 1.0;
  220. if (abs($sym_score) < $diff_alpha) {
  221. next;
  222. }
  223. }
  224. next if $sym_name !~ /^$s/;
  225. push @sym_names, $sym_name;
  226. if (!$sym_res{$sym_name}) {
  227. $sym_res{$sym_name} = {
  228. hits => 0,
  229. spam_hits => 0,
  230. junk_hits => 0,
  231. spam_change => 0,
  232. junk_change => 0,
  233. weight => 0,
  234. corr => {},
  235. };
  236. }
  237. my $r = $sym_res{$sym_name};
  238. $r->{hits} ++;
  239. $r->{weight} += $sym_score;
  240. my $is_spam = 0;
  241. my $is_junk = 0;
  242. if ($score >= $reject_score) {
  243. $is_spam = 1;
  244. $r->{spam_hits} ++;
  245. }
  246. elsif ($score >= $junk_score) {
  247. $is_junk = 1;
  248. $r->{junk_hits} ++;
  249. }
  250. if ($sym_score != 0) {
  251. my $score_without = $score - $sym_score;
  252. if ($sym_score > 0) {
  253. if ($is_spam && $score_without < $reject_score) {
  254. $r->{spam_change} ++;
  255. }
  256. if ($is_junk && $score_without < $junk_score) {
  257. $r->{junk_change} ++;
  258. }
  259. }
  260. else {
  261. if (!$is_spam && $score_without >= $reject_score) {
  262. $r->{spam_change} ++;
  263. }
  264. if (!$is_junk && $score_without >= $junk_score) {
  265. $r->{junk_change} ++;
  266. }
  267. }
  268. }
  269. } # End foreach symbols selected
  270. }
  271. }
  272. if ($correlations) {
  273. foreach my $sym (@sym_names) {
  274. my $r = $sym_res{$sym};
  275. foreach my $corr_sym (@sym_names) {
  276. if ($corr_sym ne $sym) {
  277. if ($r->{'corr'}->{$corr_sym}) {
  278. $r->{'corr'}->{$corr_sym} ++;
  279. }
  280. else {
  281. $r->{'corr'}->{$corr_sym} = 1;
  282. }
  283. }
  284. }
  285. } # End of correlations check
  286. }
  287. }
  288. }
  289. }
  290. sub GetLogfilesList {
  291. my ($dir) = @_;
  292. opendir( DIR, $dir ) or die $!;
  293. my $pattern = join( '|', keys %decompressor );
  294. my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
  295. # Add unnumbered logs first
  296. my @logs =
  297. grep { -f "$dir/$_" && !/$re/ } readdir(DIR);
  298. # Add numbered logs
  299. rewinddir(DIR);
  300. push( @logs,
  301. ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir(DIR) ) ) );
  302. closedir(DIR);
  303. # Select required logs and revers their order
  304. @logs =
  305. reverse
  306. splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
  307. # Loop through array printing out filenames
  308. print "\nParsing log files:\n";
  309. foreach my $file (@logs) {
  310. print " $file\n";
  311. }
  312. print "\n";
  313. return @logs;
  314. }
  315. sub numeric {
  316. $a =~ /\.(\d+)\./;
  317. my $a_num = $1;
  318. $b =~ /\.(\d+)\./;
  319. my $b_num = $1;
  320. $a_num <=> $b_num;
  321. }
  322. __END__
  323. =head1 NAME
  324. rspamd_stats - analyze Rspamd rules by parsing log files
  325. =head1 SYNOPSIS
  326. rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file]
  327. Options:
  328. --log=file log file or directory to read (stdin by default)
  329. --reject-score=score set reject threshold (15 by default)
  330. --junk-score=score set junk score (6.0 by default)
  331. --symbol=sym check specified symbol (perl regexps, '.*' by default)
  332. --alpha=value set ignore score for symbols (0.1 by default)
  333. --correlations enable correlations report
  334. --search-pattern do not process input unless the desired pattern is found
  335. --start starting time (oldest) for log parsing
  336. --end ending time (newest) for log parsing
  337. --num-logs=integer number of recent logfiles to analyze (all files in the directory by default)
  338. --exclude-logs=integer number of latest logs to exclude (0 by default)
  339. --help brief help message
  340. --man full documentation
  341. =head1 OPTIONS
  342. =over 8
  343. =item B<--log>
  344. Specifies log file or directory to read data from.
  345. If a directory is specified B<rspamd_stats> analyses files in the directory
  346. including known compressed file types. Number of log files can be limited using
  347. B<--num-logs> and B<--exclude-logs> options. This assumes that files in the log
  348. directory have B<newsyslog(8)>- or B<logrotate(8)>-like name format with numeric
  349. indexes. Files without indexes (generally it is merely one file) are considered
  350. the most recent and files with lower indexes are considered newer.
  351. =item B<--reject-score>
  352. Specifies the reject (spam) threshold.
  353. =item B<--junk-score>
  354. Specifies the junk (add header or rewrite subject) threshold.
  355. =item B<--alpha-score>
  356. Specifies the minimum score for a symbol to be considered by this script.
  357. =item B<--symbol>
  358. Add symbol or pattern (pcre format) to analyze.
  359. =item B<--num-logs>
  360. If set, limits number of analyzed logfiles in the directory to the specified value.
  361. =item B<--exclude-logs>
  362. Number of latest logs to exclude (0 by default).
  363. =item B<--correlations>
  364. Additionaly print correlation rate for each symbol displayed. This routine calculates merely paired correlations between symbols.
  365. =item B<--search-pattern>
  366. Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position.
  367. =item B<--exclude>
  368. Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple symbols.
  369. =item B<--start>
  370. Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be
  371. truncated to any desired accuracy). If used with B<--end> select entries between
  372. B<--start> and B<--end>.
  373. =item B<--end>
  374. Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be
  375. truncated to any desired accuracy). If used with B<--start> select entries between
  376. B<--start> and B<--end>.
  377. =item B<--help>
  378. Print a brief help message and exits.
  379. =item B<--man>
  380. Prints the manual page and exits.
  381. =back
  382. =head1 DESCRIPTION
  383. B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols:
  384. Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%)
  385. Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%)
  386. Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%)
  387. Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%)
  388. Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102%
  389. Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540%
  390. Where there are the following attributes:
  391. =over 4
  392. =item *
  393. B<Weight>: average score for a symbols
  394. =item *
  395. B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages
  396. =item *
  397. B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right):
  398. =over 4
  399. =item 1.
  400. B<total symbol hits>: number of messages that has this symbol and are B<HAM>
  401. =item 2.
  402. B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count
  403. =item 3.
  404. B<total ham hits>: overall number of B<HAM> messages
  405. =item 4.
  406. B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total number of B<HAM> messages.
  407. =back
  408. =item *
  409. B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class.
  410. =item *
  411. B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class.
  412. =item *
  413. B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight.
  414. =item *
  415. B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight.
  416. =back
  417. =cut