[Rework] Set token data as uint64_t instead of a char array

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-04 16:49:44 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-04-04 16:49:44 +0100
commit: e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11 (patch)
tree: ba2ee3b7d49e603476dbe15e52d4a8c93c30474b /src/libstat/classifiers/bayes.c
parent: 90f44e8ee59515936df340d5bace8ce68f515870 (diff)
download: rspamd-e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11.tar.gz rspamd-e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11.zip
1 files changed, 51 insertions, 12 deletions
diff --git a/src/libstat/classifiers/bayes.c b/src/libstat/classifiers/bayes.c
index 95bd1f5ea..c9faae6bd 100644
--- a/src/libstat/classifiers/bayes.c
+++ b/src/libstat/classifiers/bayes.c
@@ -159,11 +159,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 		cl->processed_tokens ++;
 
 		if (tok->t1 && tok->t2) {
-			msg_debug_bayes ("token <%*s:%*s>: weight: %f, total_count: %L, "
+			msg_debug_bayes ("token %uL <%*s:%*s>: weight: %f, total_count: %L, "
 					"spam_count: %L, ham_count: %L,"
 					"spam_prob: %.3f, ham_prob: %.3f, "
 					"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
 					"current spam prob: %.3f, current ham prob: %.3f",
+					tok->data,
 					(int) tok->t1->len, tok->t1->begin,
 					(int) tok->t2->len, tok->t2->begin,
 					fw, total_count, spam_count, ham_count,
@@ -172,11 +173,12 @@ bayes_classify_token (struct rspamd_classifier *ctx,
 					cl->spam_prob, cl->ham_prob);
 		}
 		else {
-			msg_debug_bayes ("token <?:?>: weight: %f, total_count: %L, "
+			msg_debug_bayes ("token %uL <?:?>: weight: %f, total_count: %L, "
 					"spam_count: %L, ham_count: %L,"
 					"spam_prob: %.3f, ham_prob: %.3f, "
 					"bayes_spam_prob: %.3f, bayes_ham_prob: %.3f, "
 					"current spam prob: %.3f, current ham prob: %.3f",
+					tok->data,
 					fw, total_count, spam_count, ham_count,
 					spam_prob, ham_prob,
 					bayes_spam_prob, bayes_ham_prob,
@@ -324,7 +326,7 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
 		gboolean unlearn,
 		GError **err)
 {
-	guint i, j;
+	guint i, j, total_cnt, spam_cnt, ham_cnt;
 	gint id;
 	struct rspamd_statfile *st;
 	rspamd_token_t *tok;
@@ -336,6 +338,9 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
 	incrementing = ctx->cfg->flags & RSPAMD_FLAG_CLASSIFIER_INCREMENTING_BACKEND;
 
 	for (i = 0; i < tokens->len; i++) {
+		total_cnt = 0;
+		spam_cnt = 0;
+		ham_cnt = 0;
 		tok = g_ptr_array_index (tokens, i);
 
 		for (j = 0; j < ctx->statfiles_ids->len; j++) {
@@ -350,21 +355,55 @@ bayes_learn_spam (struct rspamd_classifier * ctx,
 				else {
 					tok->values[id]++;
 				}
-			}
-			else if (tok->values[id] > 0 && unlearn) {
-				/* Unlearning */
-				if (incrementing) {
-					tok->values[id] = -1;
+
+				total_cnt += tok->values[id];
+
+				if (st->stcf->is_spam) {
+					spam_cnt += tok->values[id];
 				}
 				else {
-					tok->values[id]--;
+					ham_cnt += tok->values[id];
 				}
 			}
-			else if (incrementing) {
-				tok->values[id] = 0;
+			else {
+				if (tok->values[id] > 0 && unlearn) {
+					/* Unlearning */
+					if (incrementing) {
+						tok->values[id] = -1;
+					}
+					else {
+						tok->values[id]--;
+					}
+
+					if (st->stcf->is_spam) {
+						spam_cnt += tok->values[id];
+					}
+					else {
+						ham_cnt += tok->values[id];
+					}
+					total_cnt += tok->values[id];
+				}
+				else if (incrementing) {
+					tok->values[id] = 0;
+				}
 			}
 		}
+
+		if (tok->t1 && tok->t2) {
+			msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
+					"spam_count: %d, ham_count: %d",
+					tok->data,
+					(int) tok->t1->len, tok->t1->begin,
+					(int) tok->t2->len, tok->t2->begin,
+					tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+		}
+		else {
+			msg_debug_bayes ("token %uL <?:?>: window: %d, total_count: %d, "
+					"spam_count: %d, ham_count: %d",
+					tok->data,
+					tok->window_idx, total_cnt, spam_cnt, ham_cnt);
+		}
 	}
 
 	return TRUE;
-}
+}
+\ No newline at end of file
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2017-04-04 16:49:44 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2017-04-04 16:49:44 +0100
commit	e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11 (patch)
tree	ba2ee3b7d49e603476dbe15e52d4a8c93c30474b /src/libstat/classifiers/bayes.c
parent	90f44e8ee59515936df340d5bace8ce68f515870 (diff)
download	rspamd-e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11.tar.gz rspamd-e737e9942cc0c0cbd18dcbc9a1feb0a4b1c48a11.zip