From b3c36d4946f675619b81c9223f5ac1a86c55c55c Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 28 Jun 2011 19:07:26 +0400 Subject: [PATCH] * Add correcting factor to statistics. Now learning increments version of a statfile. Avoid learning and classifying of similar text parts if a message has 2 text parts. Several fixes to statistics. --- src/classifiers/bayes.c | 48 +++++++++++++++++++++++++------------ src/filter.c | 30 ++++++++++++++++++++--- src/statfile.c | 24 +++++++++++++++++-- src/statfile.h | 8 +++++++ src/tokenizers/osb.c | 29 +++++++++++----------- src/tokenizers/tokenizers.c | 2 +- 6 files changed, 105 insertions(+), 36 deletions(-) diff --git a/src/classifiers/bayes.c b/src/classifiers/bayes.c index af79e0eaa..7363df522 100644 --- a/src/classifiers/bayes.c +++ b/src/classifiers/bayes.c @@ -47,7 +47,8 @@ struct bayes_statfile_data { guint64 total_hits; double local_probability; double post_probability; - guint64 value; + double corr; + double value; struct statfile *st; stat_file_t *file; }; @@ -60,6 +61,7 @@ struct bayes_callback_data { stat_file_t *file; struct bayes_statfile_data *statfiles; guint32 statfiles_num; + guint64 learned_tokens; }; static gboolean @@ -67,7 +69,8 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) { token_node_t *node = key; struct bayes_callback_data *cd = data; - gint v, c; + gint c; + guint64 v; c = (cd->in_class) ? 1 : -1; @@ -75,8 +78,9 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) v = statfile_pool_get_block (cd->pool, cd->file, node->h1, node->h2, cd->now); if (v == 0 && c > 0) { statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, c); + cd->learned_tokens ++; } - else { + else if (v != 0) { if (G_LIKELY (c > 0)) { v ++; } @@ -86,6 +90,7 @@ bayes_learn_callback (gpointer key, gpointer value, gpointer data) } } statfile_pool_set_block (cd->pool, cd->file, node->h1, node->h2, cd->now, v); + cd->learned_tokens ++; } return FALSE; @@ -102,24 +107,21 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) struct bayes_callback_data *cd = data; double renorm = 0; gint i; - guint64 local_hits = 0; + double local_hits = 0; struct bayes_statfile_data *cur; for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; - cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now); + cur->value = statfile_pool_get_block (cd->pool, cur->file, node->h1, node->h2, cd->now) * cur->corr; if (cur->value > 0) { - cur->total_hits += cur->value; + cur->total_hits ++; cur->hits = cur->value; local_hits += cur->value; } - else { - cur->value = 0; - } } for (i = 0; i < cd->statfiles_num; i ++) { cur = &cd->statfiles[i]; - cur->local_probability = 0.5 + ((double)cur->value - ((double)local_hits - cur->value)) / + cur->local_probability = 0.5 + (cur->value - (local_hits - cur->value)) / (LOCAL_PROB_DENOM * (1.0 + local_hits)); renorm += cur->post_probability * cur->local_probability; } @@ -145,7 +147,7 @@ bayes_classify_callback (gpointer key, gpointer value, gpointer data) cur->post_probability = G_MINDOUBLE * 100; } if (cd->ctx->debug) { - msg_info ("token: %s, statfile: %s, probability: %uL, post_probability: %.4f", + msg_info ("token: %s, statfile: %s, probability: %.4f, post_probability: %.4f", node->extra, cur->st->symbol, cur->value, cur->post_probability); } } @@ -169,8 +171,9 @@ gboolean bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, struct worker_task *task) { struct bayes_callback_data data; - char *value; - int nodes, minnodes, i, cnt, best_num = 0; + gchar *value; + gint nodes, minnodes, i = 0, cnt, best_num = 0; + guint64 rev, total_learns = 0; double best = 0; struct statfile *st; stat_file_t *file; @@ -198,7 +201,6 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, data.ctx = ctx; cur = ctx->cfg->statfiles; - i = 0; while (cur) { /* Select statfile to learn */ st = cur->data; @@ -214,11 +216,21 @@ bayes_classify (struct classifier_ctx* ctx, statfile_pool_t *pool, GTree *input, data.statfiles[i].st = st; data.statfiles[i].post_probability = 0.5; data.statfiles[i].local_probability = 0.5; - i ++; + statfile_get_revision (file, &rev, NULL); + total_learns += rev; + cur = g_list_next (cur); + i ++; } + cnt = i; + /* Calculate correction factor */ + for (i = 0; i < cnt; i ++) { + statfile_get_revision (data.statfiles[i].file, &rev, NULL); + data.statfiles[i].corr = ((double)rev / cnt) / (double)total_learns; + } + g_tree_foreach (input, bayes_classify_callback, &data); for (i = 0; i < cnt; i ++) { @@ -277,6 +289,7 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb data.in_class = in_class; data.now = time (NULL); data.ctx = ctx; + data.learned_tokens = 0; cur = ctx->cfg->statfiles; while (cur) { /* Select statfile to learn */ @@ -321,8 +334,13 @@ bayes_learn (struct classifier_ctx* ctx, statfile_pool_t *pool, const char *symb data.file = to_learn; statfile_pool_lock_file (pool, data.file); g_tree_foreach (input, bayes_learn_callback, &data); + statfile_inc_revision (to_learn); statfile_pool_unlock_file (pool, data.file); + if (sum != NULL) { + *sum = data.learned_tokens; + } + return TRUE; } diff --git a/src/filter.c b/src/filter.c index 797b4f6fe..b625e4c72 100644 --- a/src/filter.c +++ b/src/filter.c @@ -40,6 +40,8 @@ # include "lua/lua_common.h" #endif +#define COMMON_PART_FACTOR 80 + static inline GQuark filter_error_quark (void) { @@ -593,7 +595,8 @@ classifiers_callback (gpointer value, void *arg) GTree *tokens = NULL; GList *cur; f_str_t c; - gchar *header = NULL; + gchar *header = NULL; + gboolean is_twopart = FALSE; if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) { cur = message_get_header (task->task_pool, task->message, header, FALSE); @@ -603,6 +606,9 @@ classifiers_callback (gpointer value, void *arg) } else { cur = g_list_first (task->text_parts); + if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { + is_twopart = TRUE; + } } ctx = cl->classifier->init_func (task->task_pool, cl); @@ -624,10 +630,18 @@ classifiers_callback (gpointer value, void *arg) cur = g_list_next (cur); continue; } + if (is_twopart && cur->next == NULL) { + /* Compare part's content */ + if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) { + msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id); + break; + } + } c.begin = text_part->content->data; c.len = text_part->content->len; /* Tree would be freed at task pool freeing */ - if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) { + if (!cl->tokenizer->tokenize_func (cl->tokenizer, task->task_pool, &c, &tokens, + FALSE, text_part->is_utf, text_part->urls_offset)) { msg_info ("cannot tokenize input"); return; } @@ -815,7 +829,7 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) stat_file_t *stf; gdouble sum; struct mime_text_part *part; - gboolean is_utf = FALSE; + gboolean is_utf = FALSE, is_twopart = FALSE; /* Load classifier by symbol */ cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile); @@ -834,6 +848,9 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) else { /* Classify message otherwise */ cur = g_list_first (task->text_parts); + if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { + is_twopart = TRUE; + } } /* Get tokens from each element */ @@ -854,6 +871,13 @@ learn_task (const gchar *statfile, struct worker_task *task, GError **err) c.len = part->content->len; is_utf = part->is_utf; ex = part->urls_offset; + if (is_twopart && cur->next == NULL) { + /* Compare part's content */ + if (fuzzy_compare_parts (cur->data, cur->prev->data) >= COMMON_PART_FACTOR) { + msg_info ("message <%s> has two common text parts, ignore the last one", task->message_id); + break; + } + } } /* Get tokens */ if (!cl->tokenizer->tokenize_func ( diff --git a/src/statfile.c b/src/statfile.c index 0359c0c4d..65da15033 100644 --- a/src/statfile.c +++ b/src/statfile.c @@ -789,6 +789,22 @@ statfile_set_revision (stat_file_t *file, guint64 rev, time_t time) } gboolean +statfile_inc_revision (stat_file_t *file) +{ + struct stat_file_header *header; + + if (file == NULL || file->map == NULL) { + return FALSE; + } + + header = (struct stat_file_header *)file->map; + + header->revision ++; + + return TRUE; +} + +gboolean statfile_get_revision (stat_file_t *file, guint64 *rev, time_t *time) { struct stat_file_header *header; @@ -799,8 +815,12 @@ statfile_get_revision (stat_file_t *file, guint64 *rev, time_t *time) header = (struct stat_file_header *)file->map; - *rev = header->revision; - *time = header->rev_time; + if (rev != NULL) { + *rev = header->revision; + } + if (time != NULL) { + *time = header->rev_time; + } return TRUE; } diff --git a/src/statfile.h b/src/statfile.h index 571464196..e76aa0897 100644 --- a/src/statfile.h +++ b/src/statfile.h @@ -224,6 +224,14 @@ guint32 statfile_get_section_by_name (const gchar *name); */ gboolean statfile_set_revision (stat_file_t *file, guint64 rev, time_t time); +/** + * Increment statfile revision and revision time + * @param filename name of statfile + * @param time time of revision + * @return TRUE if revision was set + */ +gboolean statfile_inc_revision (stat_file_t *file); + /** * Set statfile revision and revision time * @param filename name of statfile diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index bc57255cb..790069d6a 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -41,7 +41,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * token_node_t *new = NULL; f_str_t token = { NULL, 0, 0 }; guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, k = 0, l; + gint i, l; gchar *res; if (*tree == NULL) { @@ -49,6 +49,8 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree); } + memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); + while ((res = tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { /* Skip small words */ if (is_utf) { @@ -68,23 +70,20 @@ osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t * pool, f_str_t * } hashpipe[0] = fstrhash_lowercase (&token, is_utf); - if (k > FEATURE_WINDOW_SIZE) { - for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { - h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; - h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; - new = memory_pool_alloc0 (pool, sizeof (token_node_t)); - new->h1 = h1; - new->h2 = h2; - if (save_token) { - new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); - } + for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { + h1 = hashpipe[0] * primes[0] + hashpipe[i] * primes[i << 1]; + h2 = hashpipe[0] * primes[1] + hashpipe[i] * primes[(i << 1) - 1]; + new = memory_pool_alloc0 (pool, sizeof (token_node_t)); + new->h1 = h1; + new->h2 = h2; + if (save_token) { + new->extra = (uintptr_t)memory_pool_fstrdup (pool, &token); + } - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } + if (g_tree_lookup (*tree, new) == NULL) { + g_tree_insert (*tree, new, new); } } - k ++; token.begin = res; } diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 16dc763ed..d5a820d1b 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -138,7 +138,7 @@ get_next_word (f_str_t * buf, f_str_t * token, GList **exceptions) token->len = 0; remain = buf->len - (token->begin - buf->begin); - if (remain <= 0) { + if (remain == 0) { return NULL; } pos = token->begin - buf->begin; -- 2.39.5