summaryrefslogtreecommitdiffstats
path: root/src/filter.c
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-06-28 19:07:26 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-06-28 19:07:26 +0400
commitb3c36d4946f675619b81c9223f5ac1a86c55c55c (patch)
tree6cdd79cae18ce387f6c00f8ce23aef65b4a5c02b /src/filter.c
parent0e6a4235b1794a61d12fcde33cffaf8dd83c51f0 (diff)
downloadrspamd-b3c36d4946f675619b81c9223f5ac1a86c55c55c.tar.gz
rspamd-b3c36d4946f675619b81c9223f5ac1a86c55c55c.zip
* Add correcting factor to statistics.
Now learning increments version of a statfile. Avoid learning and classifying of similar text parts if a message has 2 text parts. Several fixes to statistics.
Diffstat (limited to 'src/filter.c')
-rw-r--r--src/filter.c30
1 files changed, 27 insertions, 3 deletions
diff --git a/src/filter.c b/src/filter.c
index 797b4f6fe..b625e4c72 100644
--- a/src/filter.c
+++ b/src/filter.c
@@ -40,6 +40,8 @@
# include "lua/lua_common.h"
#endif
+#define COMMON_PART_FACTOR 80
+
static inline GQuark
filter_error_quark (void)
{
@@ -593,7 +595,8 @@ classifiers_callback (gpointer value, void *arg)
GTree *tokens = NULL;
GList *cur;
f_str_t c;
- gchar *header = NULL;
+ gchar *header = NULL;
+ gboolean is_twopart = FALSE;
if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) {
cur = message_get_header (task->task_pool, task->message, header, FALSE);
@@ -603,6 +606,9 @@ classifiers_callback (gpointer value, void *arg)
}
else {
cur = g_list_first (task->text_parts);
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
+ }
}
ctx = cl->classifier->init_func (task->task_pool, cl);
@@ -624,10 +630,18 @@ classifiers_callback (gpointer value, void *arg)
cur = g_list_next (cur);
continue;
}
+ if (is_twopart && cur->next == NULL) {
+ /* Compare part's content */
+ if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
c.begin = text_part->content->data;
c.len = text_part->content->len;
/* Tree would be freed at task pool freeing */
- if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) {
+ if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens,
+ FALSE, text_part->is_utf, text_part->urls_offset)) {
msg_info ("cannot tokenize input");
return;
}
@@ -815,7 +829,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
stat_file_t *stf;
gdouble sum;
struct mime_text_part *part;
- gboolean is_utf = FALSE;
+ gboolean is_utf = FALSE, is_twopart = FALSE;
/* Load classifier by symbol */
cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile);
@@ -834,6 +848,9 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
else {
/* Classify message otherwise */
cur = g_list_first (task->text_parts);
+ if (cur != NULL && cur->next != NULL && cur->next->next == NULL) {
+ is_twopart = TRUE;
+ }
}
/* Get tokens from each element */
@@ -854,6 +871,13 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err)
c.len = part->content->len;
is_utf = part->is_utf;
ex = part->urls_offset;
+ if (is_twopart && cur->next == NULL) {
+ /* Compare part's content */
+ if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) {
+ msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id);
+ break;
+ }
+ }
}
/* Get tokens */
if (!cl->tokenizer->tokenize_func (