You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

rspamd_stats.pl 32KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018
  1. #!/usr/bin/env perl
  2. use 5.010;
  3. use Data::Dumper;
  4. use Getopt::Long;
  5. use Pod::Usage;
  6. use Time::Local;
  7. use IO::Handle;
  8. use warnings;
  9. use strict;
  10. my @symbols_search;
  11. my @symbols_exclude;
  12. my @symbols_bidirectional;
  13. my @symbols_groups;
  14. my @symbols_ignored;
  15. my %symbols_mult;
  16. my %groups;
  17. my $reject_score = 15.0;
  18. my $junk_score = 6.0;
  19. my $diff_alpha = 0.1;
  20. my $correlations = 0;
  21. my $nrelated = 10;
  22. my $log_file = "";
  23. my $search_pattern = "";
  24. my $startTime = "";
  25. my $endTime;
  26. my $num_logs;
  27. my $exclude_logs = 0;
  28. my $man = 0;
  29. my $json = 0;
  30. my $help = 0;
  31. # Associate file extensions with decompressors
  32. my %decompressor = (
  33. 'bz2' => 'bzip2 -cd',
  34. 'gz' => 'gzip -cd',
  35. 'xz' => 'xz -cd',
  36. 'zst' => 'zstd -cd',
  37. );
  38. GetOptions(
  39. "reject-score|r=f" => \$reject_score,
  40. "junk-score|j=f" => \$junk_score,
  41. "symbol|s=s@" => \@symbols_search,
  42. "symbol-bidir|S=s@" => \@symbols_bidirectional,
  43. "exclude|X=s@" => \@symbols_exclude,
  44. "ignore=s@" => \@symbols_ignored,
  45. "group|g=s@" => \@symbols_groups,
  46. "log|l=s" => \$log_file,
  47. "mult=s" => \%symbols_mult,
  48. "alpha-score|alpha|a=f" => \$diff_alpha,
  49. "correlations|c" => \$correlations,
  50. "nrelated=i" => \$nrelated,
  51. "search-pattern=s" => \$search_pattern,
  52. "start=s" => \$startTime,
  53. "end=s" => \$endTime,
  54. "num-logs|n=i" => \$num_logs,
  55. "exclude-logs|x=i" => \$exclude_logs,
  56. "json|j" => \$json,
  57. "help|?" => \$help,
  58. "man" => \$man
  59. ) or pod2usage(2);
  60. pod2usage(1) if $help;
  61. pod2usage( -exitval => 0, -verbose => 2 ) if $man;
  62. # Global vars
  63. my $total = 0;
  64. my $total_spam = 0;
  65. my $total_junk = 0;
  66. my $junk_symbols = 0;
  67. my $spam_symbols = 0;
  68. my $ham_symbols = 0;
  69. my $ham_spam_change = 0;
  70. my $ham_junk_change = 0;
  71. my %sym_res;
  72. my $rspamd_log;
  73. my $enabled = 0;
  74. my $log_file_num = 1;
  75. my $spinner_update_time = 0;
  76. my %action;
  77. my %timeStamp;
  78. my %scanTime = (
  79. max => 0,
  80. total => 0,
  81. );
  82. my %bidir_match;
  83. foreach ( $startTime, $endTime ) { $_ = &normalized_time($_) }
  84. # Convert bidirectional symbols
  85. foreach my $s (@symbols_bidirectional) {
  86. $bidir_match{$s} = {
  87. spam => "${s}_SPAM",
  88. ham => "${s}_HAM",
  89. };
  90. push @symbols_search, $s unless grep /^$s$/, @symbols_search;
  91. }
  92. # Deal with groups
  93. my $group_id = 0;
  94. foreach my $g (@symbols_groups) {
  95. my @symbols = split /,/, $g;
  96. my $group_name = "group$group_id";
  97. foreach my $s (@symbols) {
  98. $groups{$s} = $group_name;
  99. push @symbols_search, $s unless grep /^$s$/, @symbols_search;
  100. }
  101. }
  102. @symbols_search = '.*'
  103. unless @symbols_search;
  104. if ( $log_file eq '-' || $log_file eq '' ) {
  105. $rspamd_log = \*STDIN;
  106. &ProcessLog();
  107. }
  108. elsif ( -d "$log_file" ) {
  109. my $log_dir = "$log_file";
  110. my @logs = &GetLogfilesList($log_dir);
  111. # Process logs
  112. foreach (@logs) {
  113. my $ext = (/[^.]+\.?([^.]*?)$/)[0];
  114. my $dc = $decompressor{$ext} || 'cat';
  115. open( $rspamd_log, "-|", "$dc $log_dir/$_" )
  116. or die "cannot execute $dc $log_dir/$_ : $!";
  117. printf { interactive(*STDERR) } "\033[J Parsing log files: [%d/%d] %s\033[G", $log_file_num++, scalar @logs,
  118. $_;
  119. $spinner_update_time = 0; # Force spinner update
  120. &spinner;
  121. &ProcessLog;
  122. close($rspamd_log)
  123. or warn "cannot close $dc $log_dir/$_: $!";
  124. }
  125. print { interactive(*STDERR) } "\033[J\033[G"; # Progress indicator clean-up
  126. }
  127. else {
  128. my $ext = ( $log_file =~ /[^.]+\.?([^.]*?)$/ )[0];
  129. my $dc = $decompressor{$ext} || 'cat';
  130. open( $rspamd_log, "-|", "$dc $log_file" )
  131. or die "cannot execute $dc $log_file : $!";
  132. $spinner_update_time = 0; # Force spinner update
  133. &spinner;
  134. &ProcessLog();
  135. }
  136. my $total_ham = $total - ( $total_spam + $total_junk );
  137. if ($json) {
  138. print "{";
  139. &Summary();
  140. print '"symbols":{';
  141. &SymbolsStat();
  142. print "}}\n";
  143. }
  144. else {
  145. &SymbolsStat();
  146. &Summary();
  147. }
  148. exit;
  149. sub IsIgnored {
  150. my ($sym) = @_;
  151. foreach my $ex (@symbols_ignored) {
  152. if ( $sym =~ /^$ex$/ ) {
  153. return 1;
  154. }
  155. }
  156. return 0;
  157. }
  158. sub GenRelated {
  159. my ( $htb, $target_sym ) = @_;
  160. my @result;
  161. my $i = 0;
  162. foreach my $sym ( sort { $htb->{$b} <=> $htb->{$a} } keys %{$htb} ) {
  163. if ( $sym ne $target_sym ) {
  164. my @elt = ( $sym, $htb->{$sym} );
  165. push @result, \@elt;
  166. $i++;
  167. }
  168. last if $i > $nrelated;
  169. }
  170. return \@result;
  171. }
  172. sub StringifyRelated {
  173. my ( $ar, $total ) = @_;
  174. return
  175. join( "\n", ( map { sprintf "\t%s(%s: %.1f%%)", $_->[0], $_->[1], $_->[1] / ( $total * 1.0 ) * 100.0 } @{$ar} ) );
  176. }
  177. sub SymbolsStat {
  178. if ( $total > 0 ) {
  179. my $has_comma = 0;
  180. while ( my ( $s, $r ) = each(%sym_res) ) {
  181. if ( $r->{hits} > 0 ) {
  182. my $th = $r->{hits};
  183. my $sh = $r->{spam_hits};
  184. my $jh = $r->{junk_hits};
  185. my $hh = $r->{hits} - $sh - $jh;
  186. my ( $htp, $stp, $jtp );
  187. $htp = $hh * 100.0 / $total_ham if $total_ham != 0;
  188. $stp = $sh * 100.0 / $total_spam if $total_spam != 0;
  189. $jtp = $jh * 100.0 / $total_junk if $total_junk != 0;
  190. if ($json) {
  191. if ($has_comma) {
  192. print ",";
  193. }
  194. else {
  195. $has_comma = 1;
  196. }
  197. print "\"$s\":{";
  198. JsonObjectElt( "avg_weight", $r->{'weight'}, "%.4f" );
  199. print ",";
  200. JsonObjectElt( "hits", $th, "%d" );
  201. print ",";
  202. JsonObjectElt( "hits_percentage", $th / $total, "%.4f" );
  203. print ",";
  204. JsonObjectElt( "spam_hits", $sh, "%d" );
  205. print ",";
  206. JsonObjectElt( "spam_to_total", $sh / $th, "%.4f" );
  207. print ",";
  208. JsonObjectElt( "spam_percentage", $stp / 100.0 || 0, "%.4f" );
  209. print ",";
  210. JsonObjectElt( "ham_hits", $hh, "%d" );
  211. print ",";
  212. JsonObjectElt( "ham_to_total", $hh / $th, "%.4f" );
  213. print ",";
  214. JsonObjectElt( "ham_percentage", $htp / 100.0 || 0, "%.4f" );
  215. print ",";
  216. JsonObjectElt( "junk_hits", $jh, "%d" );
  217. print ",";
  218. JsonObjectElt( "junk_to_total", $jh / $th, "%.4f" );
  219. print ",";
  220. JsonObjectElt( "junk_percentage", $jtp / 100.0 || 0, "%.4f" );
  221. }
  222. else {
  223. printf "%s avg. weight %.3f, hits %d(%.3f%%):
  224. Ham %7.3f%%, %6d/%-6d (%7.3f%%)
  225. Spam %7.3f%%, %6d/%-6d (%7.3f%%)
  226. Junk %7.3f%%, %6d/%-6d (%7.3f%%)
  227. ", $s, $r->{weight} / $r->{hits}, $th, ( $th / $total * 100 ),
  228. ( $hh / $th * 100 ), $hh, $total_ham, ( $htp or 0 ),
  229. ( $sh / $th * 100 ), $sh, $total_spam, ( $stp or 0 ),
  230. ( $jh / $th * 100 ), $jh, $total_junk, ( $jtp or 0 );
  231. }
  232. my ( $schp, $jchp );
  233. $schp = $r->{spam_change} / $total_spam * 100.0 if $total_spam;
  234. $jchp = $r->{junk_change} / $total_junk * 100.0 if $total_junk;
  235. if ( $r->{weight} != 0 ) {
  236. if ( !$json ) {
  237. if ( $r->{weight} > 0 ) {
  238. printf "
  239. Spam changes (ham/junk -> spam): %6d/%-6d (%7.3f%%)
  240. Spam changes / total spam hits: %6d/%-6d (%7.3f%%)
  241. Junk changes (ham -> junk): %6d/%-6d (%7.3f%%)
  242. Junk changes / total junk hits: %6d/%-6d (%7.3f%%)
  243. ",
  244. $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
  245. $r->{spam_change}, $total_spam, ( $schp or 0 ),
  246. $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
  247. $r->{junk_change}, $total_junk, ( $jchp or 0 );
  248. }
  249. else {
  250. printf "
  251. Spam changes (spam -> junk/ham): %6d/%-6d (%7.3f%%)
  252. Spam changes / total spam hits : %6d/%-6d (%7.3f%%)
  253. Junk changes (junk -> ham) : %6d/%-6d (%7.3f%%)
  254. Junk changes / total junk hits : %6d/%-6d (%7.3f%%)
  255. ",
  256. $r->{spam_change}, $th, ( $r->{spam_change} / $th * 100 ),
  257. $r->{spam_change}, $total_spam, ( $schp or 0 ),
  258. $r->{junk_change}, $th, ( $r->{junk_change} / $th * 100 ),
  259. $r->{junk_change}, $total_junk, ( $jchp or 0 );
  260. }
  261. }
  262. else {
  263. print ",";
  264. JsonObjectElt( "spam_change", $r->{spam_change}, "%.4f" );
  265. print ",";
  266. JsonObjectElt( "junk_change", $r->{junk_change}, "%.4f" );
  267. }
  268. }
  269. if ($correlations) {
  270. my $spam_related = GenRelated( $r->{symbols_met_spam}, $s );
  271. my $junk_related = GenRelated( $r->{symbols_met_junk}, $s );
  272. my $ham_related = GenRelated( $r->{symbols_met_ham}, $s );
  273. if ( !$json ) {
  274. print "Correlations report:\n";
  275. while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
  276. my $corr_prob = $r->{'hits'} / $total;
  277. my $merged_hits = 0;
  278. if ( $r->{symbols_met_spam}->{$cs} ) {
  279. $merged_hits += $r->{symbols_met_spam}->{$cs};
  280. }
  281. if ( $r->{symbols_met_junk}->{$cs} ) {
  282. $merged_hits += $r->{symbols_met_junk}->{$cs};
  283. }
  284. if ( $r->{symbols_met_ham}->{$cs} ) {
  285. $merged_hits += $r->{symbols_met_ham}->{$cs};
  286. }
  287. if ( $merged_hits > 0 ) {
  288. printf "Probability of %s when %s fires: %.3f\n", $cs, $s,
  289. ( ( $merged_hits / $total ) / $corr_prob );
  290. }
  291. }
  292. print "Related symbols report:\n";
  293. printf "Top related in spam:\n %s\n", StringifyRelated( $spam_related, $r->{spam_hits} );
  294. printf "Top related in junk:\n %s\n", StringifyRelated( $junk_related, $r->{junk_hits} );
  295. printf "Top related in ham:\n %s\n",
  296. StringifyRelated( $ham_related, $r->{hits} - $r->{spam_hits} - $r->{junk_hits} );
  297. }
  298. else {
  299. print ",";
  300. print "\"correllations\":{";
  301. my $has_comma_ = 0;
  302. while ( my ( $cs, $hits ) = each %{ $r->{corr} } ) {
  303. if ($has_comma_) {
  304. print ",";
  305. }
  306. else {
  307. $has_comma_ = 1;
  308. }
  309. my $corr_prob = $hits / $total;
  310. my $sym_prob = $r->{hits} / $total;
  311. JsonObjectElt( $cs, ( $corr_prob / $sym_prob ), "%.4f" );
  312. }
  313. print "}";
  314. }
  315. }
  316. print "}" if $json;
  317. }
  318. else {
  319. print "Symbol $s has not been met\n" if !$json;
  320. }
  321. print '-' x 80 . "\n" if !$json;
  322. }
  323. }
  324. }
  325. sub Summary() {
  326. if ( !$json ) {
  327. print "
  328. === Summary ", '=' x 68, "
  329. Messages scanned: $total";
  330. printf " [ %s / %s ]
  331. ", $timeStamp{'start'}, $timeStamp{'end'}
  332. if defined $timeStamp{'start'};
  333. say '';
  334. printf "%11s: %6.2f%%, %d\n", $_, 100 * $action{$_} / $total, $action{$_} for sort keys %action;
  335. say '';
  336. printf "scan time min/avg/max = %.2f/%.2f/%.2f s
  337. ", $scanTime{'min'} / 1000, ($total) ? $scanTime{'total'} / $total / 1000 : undef, $scanTime{'max'} / 1000
  338. if exists $scanTime{'min'};
  339. say '=' x 80;
  340. }
  341. else {
  342. JsonObjectElt( "total", $total, "%d" );
  343. print ",";
  344. if ( defined $timeStamp{'start'} ) {
  345. JsonObjectElt( "start", $timeStamp{'start'} );
  346. print ",";
  347. }
  348. if ( defined $timeStamp{'end'} ) {
  349. JsonObjectElt( "end", $timeStamp{'end'} );
  350. print ",";
  351. }
  352. print "\"actions\":{";
  353. my $has_comma = 0;
  354. foreach my $a ( sort keys %action ) {
  355. if ($has_comma) {
  356. print ",";
  357. }
  358. else {
  359. $has_comma = 1;
  360. }
  361. JsonObjectElt( $a, $action{$a}, "%d" );
  362. }
  363. print "},";
  364. }
  365. }
  366. sub ProcessRelated {
  367. my ( $symbols, $target, $source ) = @_;
  368. foreach my $s ( @{$symbols} ) {
  369. $s =~ /^([^\(]+)(\(([^\)]+)\))?/;
  370. my $sym_name = $1;
  371. my $sym_score = 0;
  372. if ( $groups{$sym_name} ) {
  373. $sym_name = $groups{$sym_name};
  374. }
  375. next if ( $source eq $sym_name );
  376. next if IsIgnored($sym_name);
  377. if ($2) {
  378. $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0);
  379. if ( abs($sym_score) < $diff_alpha ) {
  380. next;
  381. }
  382. my $bm = $bidir_match{$sym_name};
  383. if ($bm) {
  384. if ( $sym_score >= 0 ) {
  385. $sym_name = $bm->{'spam'};
  386. }
  387. else {
  388. $sym_name = $bm->{'ham'};
  389. }
  390. }
  391. }
  392. if ( exists( $target->{$sym_name} ) ) {
  393. $target->{$sym_name}++;
  394. }
  395. else {
  396. $target->{$sym_name} = 1;
  397. }
  398. }
  399. }
  400. sub ProcessLog {
  401. my ( $ts_format, @line ) = &log_time_format($rspamd_log);
  402. while () {
  403. last if eof $rspamd_log;
  404. $_ = (@line) ? shift @line : <$rspamd_log>;
  405. if ( !$enabled && ( $search_pattern eq "" || /$search_pattern/ ) ) {
  406. $enabled = 1;
  407. }
  408. next if !$enabled;
  409. if (/^.*rspamd_task_write_log.*$/) {
  410. &spinner;
  411. my $ts;
  412. if ( $ts_format eq 'syslog' ) {
  413. $ts = syslog2iso( join ' ', ( split /\s+/ )[ 0 .. 2 ] );
  414. }
  415. elsif ( $ts_format eq 'syslog5424' ) {
  416. /^([0-9-]+)T([0-9:]+)/;
  417. $ts = "$1 $2";
  418. }
  419. else {
  420. $ts = join ' ', ( split /\s+/ )[ 0 .. 1 ];
  421. }
  422. next if ( $ts lt $startTime );
  423. next if ( defined $endTime && $ts gt $endTime );
  424. if ( $_ !~
  425. /\(([^()]+)\): \[(NaN|-?\d+(?:\.\d+)?)\/(-?\d+(?:\.\d+)?)\]\s+\[([^\]]+)\].+? time: (\d+\.\d+)ms/ )
  426. {
  427. #print "BAD: $_\n";
  428. next;
  429. }
  430. my @symbols = split /(?:\{[^}]*\})?(?:$|,)/, $4;
  431. my $scan_time = $5;
  432. my $act = $1;
  433. my $score = $2 * 1.0;
  434. my $skip = 0;
  435. foreach my $ex (@symbols_exclude) {
  436. my @found = grep { /^$ex/ } @symbols;
  437. if ( scalar(@found) > 0 ) {
  438. $skip = 1;
  439. last;
  440. }
  441. }
  442. next if ( $skip != 0 );
  443. if ( defined( $timeStamp{'end'} ) ) {
  444. $timeStamp{'end'} = $ts if ( $ts gt $timeStamp{'end'} );
  445. }
  446. else {
  447. $timeStamp{'end'} = $ts;
  448. }
  449. if ( defined( $timeStamp{'start'} ) ) {
  450. $timeStamp{'start'} = $ts if ( $ts lt $timeStamp{'start'} );
  451. }
  452. else {
  453. $timeStamp{'start'} = $ts;
  454. }
  455. $scanTime{'min'} = $scan_time if ( !exists $scanTime{'min'} || $scanTime{'min'} > $scan_time );
  456. $scanTime{'max'} = $scan_time if ( $scanTime{'max'} < $scan_time );
  457. $scanTime{'total'} += $scan_time;
  458. $action{$act}++;
  459. $total++;
  460. if ( $score >= $reject_score ) {
  461. $total_spam++;
  462. }
  463. elsif ( $score >= $junk_score ) {
  464. $total_junk++;
  465. }
  466. my @sym_names;
  467. foreach my $s (@symbols_search) {
  468. my @selected = grep /$s/, @symbols;
  469. if ( scalar(@selected) > 0 ) {
  470. foreach my $sym (@selected) {
  471. $sym =~ /^([^\(]+)(\(([^\)]+)\))?/;
  472. my $sym_name = $1;
  473. my $sym_score = 0;
  474. my $orig_name = $sym_name;
  475. if ($2) {
  476. $sym_score = $3 * ($symbols_mult{$sym_name} or 1.0);
  477. if ( abs($sym_score) < $diff_alpha ) {
  478. next;
  479. }
  480. my $bm = $bidir_match{$sym_name};
  481. if ($bm) {
  482. if ( $sym_score >= 0 ) {
  483. $sym_name = $bm->{'spam'};
  484. }
  485. else {
  486. $sym_name = $bm->{'ham'};
  487. }
  488. }
  489. }
  490. next if $orig_name !~ /^$s/;
  491. if ( $groups{$s} ) {
  492. # Replace with group
  493. $sym_name = $groups{$s};
  494. }
  495. push @sym_names, $sym_name;
  496. if ( !$sym_res{$sym_name} ) {
  497. $sym_res{$sym_name} = {
  498. hits => 0,
  499. spam_hits => 0,
  500. junk_hits => 0,
  501. spam_change => 0,
  502. junk_change => 0,
  503. weight => 0,
  504. corr => {},
  505. symbols_met_spam => {},
  506. symbols_met_ham => {},
  507. symbols_met_junk => {},
  508. };
  509. }
  510. my $r = $sym_res{$sym_name};
  511. $r->{hits}++;
  512. $r->{weight} += $sym_score;
  513. my $is_spam = 0;
  514. my $is_junk = 0;
  515. if ( $score >= $reject_score ) {
  516. $is_spam = 1;
  517. $r->{spam_hits}++;
  518. if ($correlations) {
  519. ProcessRelated( \@symbols, $r->{symbols_met_spam}, $sym_name );
  520. }
  521. }
  522. elsif ( $score >= $junk_score ) {
  523. $is_junk = 1;
  524. $r->{junk_hits}++;
  525. if ($correlations) {
  526. ProcessRelated( \@symbols, $r->{symbols_met_junk}, $sym_name );
  527. }
  528. }
  529. else {
  530. if ($correlations) {
  531. ProcessRelated( \@symbols, $r->{symbols_met_ham}, $sym_name );
  532. }
  533. }
  534. if ( $sym_score != 0 ) {
  535. my $score_without = $score - $sym_score;
  536. if ( $sym_score > 0 ) {
  537. if ( $is_spam && $score_without < $reject_score ) {
  538. $r->{spam_change}++;
  539. }
  540. if ( $is_junk && $score_without < $junk_score ) {
  541. $r->{junk_change}++;
  542. }
  543. }
  544. else {
  545. if ( !$is_spam && $score_without >= $reject_score ) {
  546. $r->{spam_change}++;
  547. }
  548. if ( !$is_junk && $score_without >= $junk_score ) {
  549. $r->{junk_change}++;
  550. }
  551. }
  552. }
  553. } # End foreach symbols selected
  554. }
  555. }
  556. if ($correlations) {
  557. foreach my $sym (@sym_names) {
  558. next if IsIgnored($sym);
  559. my $r = $sym_res{$sym};
  560. foreach my $corr_sym (@sym_names) {
  561. if ( $corr_sym ne $sym ) {
  562. if ( $r->{'corr'}->{$corr_sym} ) {
  563. $r->{'corr'}->{$corr_sym}++;
  564. }
  565. else {
  566. $r->{'corr'}->{$corr_sym} = 1;
  567. }
  568. }
  569. }
  570. } # End of correlations check
  571. }
  572. }
  573. }
  574. }
  575. sub JsonObjectElt() {
  576. my ( $k, $v ) = @_;
  577. my $f = defined $_[2] ? $_[2] : '%s';
  578. if ( $f eq "%s" ) {
  579. $f = "\"%s\"";
  580. }
  581. printf "\"%s\":$f", $k, $v;
  582. }
  583. sub GetLogfilesList {
  584. my ($dir) = @_;
  585. opendir( my $fh, $dir ) or die $!;
  586. my $pattern = join( '|', keys %decompressor );
  587. my $re = qr/\.[0-9]+(?:\.(?:$pattern))?/;
  588. # Add unnumbered logs first
  589. my @logs =
  590. grep { -f "$dir/$_" && !/$re/ } readdir($fh);
  591. # Add numbered logs
  592. rewinddir($fh);
  593. push( @logs, ( sort numeric ( grep { -f "$dir/$_" && /$re/ } readdir($fh) ) ) );
  594. closedir($fh);
  595. # Select required logs and revers their order
  596. @logs =
  597. reverse splice( @logs, $exclude_logs, $num_logs ||= @logs - $exclude_logs );
  598. # Loop through array printing out filenames
  599. print { interactive(*STDERR) } "\nLog files to process:\n";
  600. foreach my $file (@logs) {
  601. print { interactive(*STDERR) } " $file\n";
  602. }
  603. print { interactive(*STDERR) } "\n";
  604. return @logs;
  605. }
  606. sub log_time_format {
  607. my $fh = shift;
  608. my ( $format, $line );
  609. while (<$fh>) {
  610. $line = $_;
  611. # 2017-08-08 00:00:01 #66984(
  612. # 2017-08-08 00:00:01.001 #66984(
  613. if (/^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d(\.\d{3,5})? #\d+\(/) {
  614. $format = 'rspamd';
  615. last;
  616. }
  617. # Aug 8 00:02:50 #66986(
  618. elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d #\d+\(/) {
  619. $format = 'syslog';
  620. last;
  621. }
  622. # Aug 8 00:02:50 hostname rspamd[66986]
  623. elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d \S+ rspamd\[\d+\]/) {
  624. $format = 'syslog';
  625. last;
  626. }
  627. # 2018-04-16T06:25:46.012590+02:00 rspamd rspamd[12968]
  628. elsif (/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d{1,6})?(Z|[-+]\d{2}:\d{2}) \S+ rspamd\[\d+\]/) {
  629. $format = 'syslog5424';
  630. last;
  631. }
  632. # Skip newsyslog messages
  633. # Aug 8 00:00:00 hostname newsyslog[63284]: logfile turned over
  634. elsif (/^\w{3} (?:\s?\d|\d\d) \d\d:\d\d:\d\d\ \S+ newsyslog\[\d+\]: logfile turned over$/) {
  635. next;
  636. }
  637. # Skip journalctl messages
  638. # -- Logs begin at Mon 2018-01-15 11:16:24 MSK, end at Fri 2018-04-27 09:10:30 MSK. --
  639. elsif (
  640. /^-- Logs begin at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}, end at \w{3} \d{4}-\d\d-\d\d \d\d:\d\d:\d\d [A-Z]{3}\. --$/
  641. )
  642. {
  643. next;
  644. }
  645. else {
  646. print "Unknown log format\n";
  647. exit 1;
  648. }
  649. }
  650. return ( $format, $line );
  651. }
  652. sub normalized_time {
  653. return
  654. if !defined( $_ = shift );
  655. /^\d\d(?::\d\d){0,2}$/
  656. ? sprintf '%04d-%02d-%02d %s', 1900 + (localtime)[5], 1 + (localtime)[4], (localtime)[3], $_
  657. : $_;
  658. }
  659. sub numeric {
  660. $a =~ /\.(\d+)\./;
  661. my $a_num = $1;
  662. $b =~ /\.(\d+)\./;
  663. my $b_num = $1;
  664. $a_num <=> $b_num;
  665. }
  666. sub spinner {
  667. my @spinner = qw{/ - \ |};
  668. return
  669. if ( ( time - $spinner_update_time ) < 1 );
  670. $spinner_update_time = time;
  671. printf { interactive(*STDERR) } "%s\r", $spinner[ $spinner_update_time % @spinner ];
  672. select()->flush();
  673. }
  674. # Convert syslog timestamp to "ISO 8601 like" format
  675. # using current year as syslog does not record the year (nor the timezone)
  676. # or the last year if the guessed time is in the future.
  677. sub syslog2iso {
  678. my %month_map;
  679. @month_map{qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)} = 0 .. 11;
  680. my ( $month, @t ) = $_[0] =~ m/^(\w{3}) \s\s? (\d\d?) \s (\d\d):(\d\d):(\d\d)/x;
  681. my $epoch =
  682. timelocal( ( reverse @t ), $month_map{$month}, 1900 + (localtime)[5] );
  683. sprintf '%04d-%02d-%02d %02d:%02d:%02d', 1900 + (localtime)[5] - ( $epoch > time ), $month_map{$month} + 1, @t;
  684. }
  685. ### Imported from IO::Interactive 1.022 Perl module
  686. sub is_interactive {
  687. ## no critic (ProhibitInteractiveTest)
  688. my ($out_handle) = ( @_, select ); # Default to default output handle
  689. # Not interactive if output is not to terminal...
  690. return 0 if not -t $out_handle;
  691. # If *ARGV is opened, we're interactive if...
  692. if ( tied(*ARGV) or defined( fileno(ARGV) ) ) { # this is what 'Scalar::Util::openhandle *ARGV' boils down to
  693. # ...it's currently opened to the magic '-' file
  694. return -t *STDIN if defined $ARGV && $ARGV eq '-';
  695. # ...it's at end-of-file and the next file is the magic '-' file
  696. return @ARGV > 0 && $ARGV[0] eq '-' && -t *STDIN if eof *ARGV;
  697. # ...it's directly attached to the terminal
  698. return -t *ARGV;
  699. }
  700. # If *ARGV isn't opened, it will be interactive if *STDIN is attached
  701. # to a terminal.
  702. else {
  703. return -t *STDIN;
  704. }
  705. }
  706. ### Imported from IO::Interactive 1.022 Perl module
  707. local ( *DEV_NULL, *DEV_NULL2 );
  708. my $dev_null;
  709. BEGIN {
  710. pipe *DEV_NULL, *DEV_NULL2
  711. or die "Internal error: can't create null filehandle";
  712. $dev_null = \*DEV_NULL;
  713. }
  714. ### Imported from IO::Interactive 1.022 Perl module
  715. sub interactive {
  716. my ($out_handle) = ( @_, \*STDOUT ); # Default to STDOUT
  717. return &is_interactive ? $out_handle : $dev_null;
  718. }
  719. __END__
  720. =head1 NAME
  721. rspamd_stats - analyze Rspamd rules by parsing log files
  722. =head1 SYNOPSIS
  723. rspamd_stats [options] [--symbol=SYM1 [--symbol=SYM2...]] [--log file]
  724. Options:
  725. --log=file log file or directory to read (stdin by default)
  726. --reject-score=score set reject threshold (15 by default)
  727. --junk-score=score set junk score (6.0 by default)
  728. --symbol=sym check specified symbol (perl regexps, '.*' by default)
  729. --alpha-score=score set ignore score for symbols (0.1 by default)
  730. --correlations enable correlations report
  731. --nrelated=integer show that amount of related symbols (10 by default)
  732. --search-pattern do not process input unless the desired pattern is found
  733. --start starting time (oldest) for log parsing
  734. --end ending time (newest) for log parsing
  735. --num-logs=integer number of recent logfiles to analyze (all files in the directory by default)
  736. --exclude-logs=integer number of latest logs to exclude (0 by default)
  737. --json print json output instead of human readable
  738. --help brief help message
  739. --mult=sym=number multiply symbol score
  740. --man full documentation
  741. =head1 OPTIONS
  742. =over 8
  743. =item B<--log>
  744. Specifies log file or directory to read data from. If a directory is specified B<rspamd_stats> analyses files in the
  745. directory including known compressed file types. Number of log files can be limited using B<--num-logs> and
  746. B<--exclude-logs> options. This assumes that files in the log directory have B<newsyslog(8)>- or B<logrotate(8)>-like
  747. name format with numeric indexes. Files without indexes (generally it is merely one file) are considered the most
  748. recent and files with lower indexes are considered newer.
  749. =item B<--reject-score>
  750. Specifies the reject (spam) threshold.
  751. =item B<--junk-score>
  752. Specifies the junk (add header or rewrite subject) threshold.
  753. =item B<--alpha-score>
  754. Specifies the minimum score for a symbol to be considered by this script.
  755. =item B<--symbol>
  756. Add symbol or pattern (pcre format) to analyze.
  757. =item B<--num-logs>
  758. If set, limits number of analyzed logfiles in the directory to the specified value.
  759. =item B<--exclude-logs>
  760. Number of latest logs to exclude (0 by default).
  761. =item B<--correlations>
  762. Additionally print correlation rate for each symbol displayed. This routine calculates merely paired correlations
  763. between symbols.
  764. =item B<--search-pattern>
  765. Do not process input unless finding the specified regular expression. Useful to skip logs to a certain position.
  766. =item B<--exclude>
  767. Exclude log lines if certain symbols are fired (e.g. GTUBE). You may specify this option multiple time to skip multiple
  768. symbols.
  769. =item B<--start>
  770. Select log entries after this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used
  771. with B<--end> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you
  772. supply the time.
  773. =item B<--end>
  774. Select log entries before this time. Format: C<YYYY-MM-DD HH:MM:SS> (can be truncated to any desired accuracy). If used
  775. with B<--start> select entries between B<--start> and B<--end>. The omitted date defaults to the current date if you
  776. supply the time.
  777. =item B<--mult=symbol=number>
  778. Multiplies score for the named symbol by the provided multiplier.
  779. =item B<--help>
  780. Print a brief help message and exits.
  781. =item B<--man>
  782. Prints the manual page and exits.
  783. =back
  784. =head1 DESCRIPTION
  785. B<rspamd_stats> will read the given log file (or standard input) and provide statistics for the specified symbols:
  786. Symbol: BAYES_SPAM (weight 3.763) (381985 hits, 26.827%)
  787. Ham hits: 184557 (48.315%), total ham: 1095487 (ham with BAYES_SPAM: 16.847%)
  788. Spam hits: 15134 (3.962%), total spam: 16688 (spam with BAYES_SPAM: 90.688%)
  789. Junk hits: 182294 (47.723%), total junk: 311699 (junk with BAYES_SPAM: 58.484%)
  790. Spam changes (ham/junk -> spam): 7026 (1.839%), total percentage (changes / spam hits): 42.102%
  791. Junk changes (ham -> junk): 95192 (24.920%), total percentage (changes / junk hits): 30.540%
  792. Where there are the following attributes:
  793. =over 4
  794. =item *
  795. B<Weight>: average score for a symbols
  796. =item *
  797. B<Total hits>: total number of hits and percentage of symbol hits divided by total number of messages
  798. =item *
  799. B<HAM hits>: provides the following information about B<HAM> messages with the specified symbol (from left to right):
  800. =over 4
  801. =item 1.
  802. B<total symbol hits>: number of messages that has this symbol and are B<HAM>
  803. =item 2.
  804. B<ham percentage>: number of symbol hits divided by overall B<HAM> messages count
  805. =item 3.
  806. B<total ham hits>: overall number of B<HAM> messages
  807. =item 4.
  808. B<ham with symbol percentage>: percentage of number of hits with specified symbol in B<HAM> messages divided by total
  809. number of B<HAM> messages.
  810. =back
  811. =item *
  812. B<SPAM hits>: provides the following information about B<SPAM> messages - same as previous but for B<SPAM> class.
  813. =item *
  814. B<Junk hits>: provides the following information about B<Junk> messages - same as previous but for B<JUNK> class.
  815. =item *
  816. B<Spam changes>: displays data about how much messages switched their class because of the specific symbol weight.
  817. =item *
  818. B<Junk changes>: displays data about how much messages switched their class because of the specific symbol weight.
  819. =back
  820. =cut