You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

classifier_test.pl 12KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551
  1. #!/usr/bin/env perl
  2. use warnings;
  3. use strict;
  4. use Pod::Usage;
  5. use Getopt::Long;
  6. use Time::HiRes qw(gettimeofday tv_interval);
  7. use JSON::XS;
  8. use String::ShellQuote;
  9. use FileHandle;
  10. use IPC::Open2;
  11. use Data::Dumper;
  12. my $spam_dir;
  13. my $ham_dir;
  14. my $parallel = 1;
  15. my $classifier = "bayes";
  16. my $spam_symbol = "BAYES_SPAM";
  17. my $ham_symbol = "BAYES_HAM";
  18. my $timeout = 10;
  19. my $rspamc = $ENV{'RSPAMC'} || "rspamc";
  20. my $bogofilter = $ENV{'BOGOFILTER'} || "bogofilter";
  21. my $dspam = $ENV{'DSPAM'} || "dspam";
  22. my $train_fraction = 0.5;
  23. my $use_bogofilter = 0;
  24. my $use_dspam = 0;
  25. my $check_only = 0;
  26. my $rspamc_prob_trigger = 95;
  27. my $man;
  28. my $help;
  29. GetOptions(
  30. "spam|s=s" => \$spam_dir,
  31. "ham|h=s" => \$ham_dir,
  32. "spam-symbol=s" => \$spam_symbol,
  33. "ham-symbol=s" => \$ham_symbol,
  34. "classifier|c=s" => \$classifier,
  35. "timeout|t=f" => \$timeout,
  36. "parallel|p=i" => \$parallel,
  37. "train-fraction|t=f" => \$train_fraction,
  38. "bogofilter|b" => \$use_bogofilter,
  39. "dspam|d" => \$use_dspam,
  40. "check-only" => \$check_only,
  41. "help|?" => \$help,
  42. "man" => \$man
  43. ) or pod2usage(2);
  44. pod2usage(1) if $help;
  45. pod2usage( -exitval => 0, -verbose => 2 ) if $man;
  46. sub read_dir_files {
  47. my ( $dir, $target ) = @_;
  48. opendir( my $dh, $dir ) or die "cannot open dir $dir: $!";
  49. while ( my $file = readdir $dh ) {
  50. if ( -f "$dir/$file" ) {
  51. push @{$target}, "$dir/$file";
  52. }
  53. }
  54. }
  55. sub shuffle_array {
  56. my ($ar) = @_;
  57. for ( my $i = 0 ; $i < scalar @{$ar} ; $i++ ) {
  58. if ( $i > 1 ) {
  59. my $sel = int( rand( $i - 1 ) );
  60. ( @{$ar}[$i], @{$ar}[$sel] ) = ( @{$ar}[$sel], @{$ar}[$i] );
  61. }
  62. }
  63. }
  64. sub learn_rspamc {
  65. my ( $files, $spam ) = @_;
  66. my $processed = 0;
  67. my $cmd = $spam ? "learn_spam" : "learn_ham";
  68. my $args_quoted = shell_quote @{$files};
  69. open(
  70. my $p,
  71. "$rspamc -t $timeout -c $classifier --compact -j -n $parallel $cmd $args_quoted |"
  72. ) or die "cannot spawn $rspamc: $!";
  73. while (<$p>) {
  74. my $res = eval('decode_json($_)');
  75. if ( $res && $res->{'success'} ) {
  76. $processed++;
  77. }
  78. }
  79. return $processed;
  80. }
  81. sub learn_bogofilter {
  82. my ( $files, $spam ) = @_;
  83. my $processed = 0;
  84. foreach my $f ( @{$files} ) {
  85. my $args_quoted = shell_quote $f;
  86. my $fl = $spam ? "-s" : "-n";
  87. `$bogofilter -I $args_quoted $fl`;
  88. if ( $? == 0 ) {
  89. $processed++;
  90. }
  91. }
  92. return $processed;
  93. }
  94. sub learn_dspam {
  95. my ( $files, $spam ) = @_;
  96. my $processed = 0;
  97. foreach my $f ( @{$files} ) {
  98. my $args_quoted = shell_quote $f;
  99. my $fl = $spam ? "--class=spam" : "--class=innocent";
  100. open( my $p,
  101. "|$dspam --user nobody --source=corpus --stdout --mode=toe $fl" )
  102. or die "cannot run $dspam: $!";
  103. open( my $inp, "< $f" );
  104. while (<$inp>) {
  105. print $p $_;
  106. }
  107. }
  108. return $processed;
  109. }
  110. sub learn_samples {
  111. my ( $ar_ham, $ar_spam ) = @_;
  112. my $len;
  113. my $processed = 0;
  114. my $total = 0;
  115. my $learn_func;
  116. my @files_spam;
  117. my @files_ham;
  118. if ($use_dspam) {
  119. $learn_func = \&learn_dspam;
  120. }
  121. elsif ($use_bogofilter) {
  122. $learn_func = \&learn_bogofilter;
  123. }
  124. else {
  125. $learn_func = \&learn_rspamc;
  126. }
  127. $len = int( scalar @{$ar_ham} * $train_fraction );
  128. my @cur_vec;
  129. # Shuffle spam and ham samples
  130. for ( my $i = 0 ; $i < $len ; $i++ ) {
  131. if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
  132. push @cur_vec, @{$ar_ham}[$i];
  133. push @files_ham, [@cur_vec];
  134. @cur_vec = ();
  135. $total++;
  136. }
  137. else {
  138. push @cur_vec, @{$ar_ham}[$i];
  139. }
  140. }
  141. $len = int( scalar @{$ar_spam} * $train_fraction );
  142. @cur_vec = ();
  143. for ( my $i = 0 ; $i < $len ; $i++ ) {
  144. if ( $i > 0 && ( $i % $parallel == 0 || $i == $len - 1 ) ) {
  145. push @cur_vec, @{$ar_spam}[$i];
  146. push @files_spam, [@cur_vec];
  147. @cur_vec = ();
  148. $total++;
  149. }
  150. else {
  151. push @cur_vec, @{$ar_spam}[$i];
  152. }
  153. }
  154. for ( my $i = 0 ; $i < $total ; $i++ ) {
  155. my $args;
  156. my $spam;
  157. if ( $i % 2 == 0 ) {
  158. $args = pop @files_spam;
  159. if ( !$args ) {
  160. $args = pop @files_ham;
  161. $spam = 0;
  162. }
  163. else {
  164. $spam = 1;
  165. }
  166. }
  167. else {
  168. $args = pop @files_ham;
  169. if ( !$args ) {
  170. $args = pop @files_spam;
  171. $spam = 1;
  172. }
  173. else {
  174. $spam = 0;
  175. }
  176. }
  177. my $r = $learn_func->( $args, $spam );
  178. if ($r) {
  179. $processed += $r;
  180. }
  181. }
  182. return $processed;
  183. }
  184. sub check_rspamc {
  185. my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
  186. my $args_quoted = shell_quote @{$files};
  187. my $processed = 0;
  188. open(
  189. my $p,
  190. "$rspamc -t $timeout -n $parallel --header=\"Settings: {symbols_enabled=[BAYES_SPAM]}\" --compact -j $args_quoted |"
  191. ) or die "cannot spawn $rspamc: $!";
  192. while (<$p>) {
  193. my $res = eval('decode_json($_)');
  194. if ( $res && $res->{'default'} ) {
  195. $processed++;
  196. if ($spam) {
  197. if ( $res->{'default'}->{$ham_symbol} ) {
  198. my $m = $res->{'default'}->{$ham_symbol}->{'options'}->[0];
  199. if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
  200. my $percentage = int($1);
  201. if ( $percentage >= $rspamc_prob_trigger ) {
  202. $$fp_cnt++;
  203. }
  204. }
  205. else {
  206. $$fp_cnt++;
  207. }
  208. }
  209. elsif ( !$res->{'default'}->{$spam_symbol} ) {
  210. $$fn_cnt++;
  211. }
  212. else {
  213. $$detected_cnt++;
  214. }
  215. }
  216. else {
  217. if ( $res->{'default'}->{$spam_symbol} ) {
  218. my $m = $res->{'default'}->{$spam_symbol}->{'options'}->[0];
  219. if ( $m && $m =~ /^(\d+(?:\.\d+)?)%$/ ) {
  220. my $percentage = int($1);
  221. if ( $percentage >= $rspamc_prob_trigger ) {
  222. $$fp_cnt++;
  223. }
  224. }
  225. else {
  226. $$fp_cnt++;
  227. }
  228. }
  229. elsif ( !$res->{'default'}->{$ham_symbol} ) {
  230. $$fn_cnt++;
  231. }
  232. else {
  233. $$detected_cnt++;
  234. }
  235. }
  236. }
  237. }
  238. return $processed;
  239. }
  240. sub check_bogofilter {
  241. my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
  242. my $processed = 0;
  243. foreach my $f ( @{$files} ) {
  244. my $args_quoted = shell_quote $f;
  245. open( my $p, "$bogofilter -t -I $args_quoted |" )
  246. or die "cannot spawn $bogofilter: $!";
  247. while (<$p>) {
  248. if ( $_ =~ /^([SHU])\s+.*$/ ) {
  249. $processed++;
  250. if ($spam) {
  251. if ( $1 eq 'H' ) {
  252. $$fp_cnt++;
  253. }
  254. elsif ( $1 eq 'U' ) {
  255. $$fn_cnt++;
  256. }
  257. else {
  258. $$detected_cnt++;
  259. }
  260. }
  261. else {
  262. if ( $1 eq 'S' ) {
  263. $$fp_cnt++;
  264. }
  265. elsif ( $1 eq 'U' ) {
  266. $$fn_cnt++;
  267. }
  268. else {
  269. $$detected_cnt++;
  270. }
  271. }
  272. }
  273. }
  274. }
  275. return $processed;
  276. }
  277. sub check_dspam {
  278. my ( $files, $spam, $fp_cnt, $fn_cnt, $detected_cnt ) = @_;
  279. my $processed = 0;
  280. foreach my $f ( @{$files} ) {
  281. my $args_quoted = shell_quote $f;
  282. my $pid = open2( *Reader, *Writer,
  283. "$dspam --user nobody --classify --stdout --mode=notrain" );
  284. open( my $inp, "< $f" );
  285. while (<$inp>) {
  286. print Writer $_;
  287. }
  288. close Writer;
  289. while (<Reader>) {
  290. if ( $_ =~
  291. qr(^X-DSPAM-Result: nobody; result="([^"]+)"; class="[^"]+"; probability=(\d+(?:\.\d+)?).*$)
  292. )
  293. {
  294. $processed++;
  295. my $percentage = int($2 * 100.0);
  296. if ($spam) {
  297. if ( $1 eq 'Innocent') {
  298. if ( $percentage <= (100 - $rspamc_prob_trigger) ) {
  299. $$fp_cnt++;
  300. }
  301. }
  302. elsif ( $1 ne 'Spam' ) {
  303. $$fn_cnt++;
  304. }
  305. else {
  306. $$detected_cnt++;
  307. }
  308. }
  309. else {
  310. if ( $1 eq 'Spam' ) {
  311. if ( $percentage >= $rspamc_prob_trigger ) {
  312. $$fp_cnt++;
  313. }
  314. }
  315. elsif ( $1 ne 'Innocent' ) {
  316. $$fn_cnt++;
  317. }
  318. else {
  319. $$detected_cnt++;
  320. }
  321. }
  322. }
  323. }
  324. close Reader;
  325. waitpid( $pid, 0 );
  326. }
  327. return $processed;
  328. }
  329. sub cross_validate {
  330. my ($hr) = @_;
  331. my $args = "";
  332. my $processed = 0;
  333. my $fp_spam = 0;
  334. my $fn_spam = 0;
  335. my $fp_ham = 0;
  336. my $fn_ham = 0;
  337. my $total_spam = 0;
  338. my $total_ham = 0;
  339. my $detected_spam = 0;
  340. my $detected_ham = 0;
  341. my $i = 0;
  342. my $len = scalar keys %{$hr};
  343. my @files_spam;
  344. my @files_ham;
  345. my @cur_spam;
  346. my @cur_ham;
  347. my $check_func;
  348. if ($use_dspam) {
  349. $check_func = \&check_dspam;
  350. }
  351. elsif ($use_bogofilter) {
  352. $check_func = \&check_bogofilter;
  353. }
  354. else {
  355. $check_func = \&check_rspamc;
  356. }
  357. while ( my ( $fn, $spam ) = each( %{$hr} ) ) {
  358. if ($spam) {
  359. if ( scalar @cur_spam >= $parallel || $i == $len - 1 ) {
  360. push @cur_spam, $fn;
  361. push @files_spam, [@cur_spam];
  362. @cur_spam = ();
  363. }
  364. else {
  365. push @cur_spam, $fn;
  366. }
  367. }
  368. else {
  369. if ( scalar @cur_ham >= $parallel || $i == $len - 1 ) {
  370. push @cur_ham, $fn;
  371. push @files_ham, [@cur_ham];
  372. @cur_ham = ();
  373. }
  374. else {
  375. push @cur_ham, $fn;
  376. }
  377. }
  378. }
  379. shuffle_array( \@files_spam );
  380. foreach my $fn (@files_spam) {
  381. my $r = $check_func->( $fn, 1, \$fp_ham, \$fn_spam, \$detected_spam );
  382. $total_spam += $r;
  383. $processed += $r;
  384. }
  385. shuffle_array( \@files_ham );
  386. foreach my $fn (@files_ham) {
  387. my $r = $check_func->( $fn, 0, \$fp_spam, \$fn_ham, \$detected_ham );
  388. $total_ham += $r;
  389. $processed += $r;
  390. }
  391. printf "Scanned %d messages
  392. %d spam messages (%d detected)
  393. %d ham messages (%d detected)\n",
  394. $processed, $total_spam, $detected_spam, $total_ham, $detected_ham;
  395. printf "\nHam FP rate: %.2f%% (%d messages)
  396. Ham FN rate: %.2f%% (%d messages)\n",
  397. $fp_ham / $total_ham * 100.0, $fp_ham,
  398. $fn_ham / $total_ham * 100.0, $fn_ham;
  399. printf "\nSpam FP rate: %.2f%% (%d messages)
  400. Spam FN rate: %.2f%% (%d messages)\n",
  401. $fp_spam / $total_spam * 100.0, $fp_spam,
  402. $fn_spam / $total_spam * 100.0, $fn_spam;
  403. }
  404. if ( !$spam_dir || !$ham_dir ) {
  405. die "spam or/and ham directories are not specified";
  406. }
  407. my @spam_samples;
  408. my @ham_samples;
  409. read_dir_files( $spam_dir, \@spam_samples );
  410. read_dir_files( $ham_dir, \@ham_samples );
  411. shuffle_array( \@spam_samples );
  412. shuffle_array( \@ham_samples );
  413. if ( !$check_only ) {
  414. my $learned = 0;
  415. my $t0 = [gettimeofday];
  416. $learned = learn_samples( \@ham_samples, \@spam_samples );
  417. my $t1 = [gettimeofday];
  418. printf "Learned classifier, %d items processed, %.2f seconds elapsed\n",
  419. $learned, tv_interval( $t0, $t1 );
  420. }
  421. my %validation_set;
  422. my $len = int( scalar @spam_samples * $train_fraction );
  423. for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
  424. $validation_set{ $spam_samples[$i] } = 1;
  425. }
  426. $len = int( scalar @ham_samples * $train_fraction );
  427. for ( my $i = $len ; $i < scalar @spam_samples ; $i++ ) {
  428. $validation_set{ $ham_samples[$i] } = 0;
  429. }
  430. cross_validate( \%validation_set );
  431. __END__
  432. =head1 NAME
  433. classifier_test.pl - test various parameters for a classifier
  434. =head1 SYNOPSIS
  435. classifier_test.pl [options]
  436. Options:
  437. --spam Directory with spam files
  438. --ham Directory with ham files
  439. --spam-symbol Symbol for spam (default: BAYES_SPAM)
  440. --ham-symbol Symbol for ham (default: BAYES_HAM)
  441. --classifier Classifier to test (default: bayes)
  442. --timeout Timeout for rspamc (default: 10)
  443. --parallel Parallel execution (default: 1)
  444. --help Brief help message
  445. --man Full documentation
  446. =head1 OPTIONS
  447. =over 8
  448. =item B<--spam>
  449. Directory with spam files.
  450. =item B<--ham>
  451. Directory with ham files.
  452. =item B<--classifier>
  453. Specifies classifier name to test.
  454. =item B<--help>
  455. Print a brief help message and exits.
  456. =item B<--man>
  457. Prints the manual page and exits.
  458. =back
  459. =head1 DESCRIPTION
  460. B<classifier_test.pl> is intended to test Rspamd classifier for false positives,
  461. false negatives and other parameters. It uses half of the corpus for training
  462. and half for cross-validation.
  463. =cut