From: Vsevolod Stakhov Date: Tue, 23 Dec 2014 14:32:22 +0000 (+0000) Subject: Rework text tokenizaton. X-Git-Tag: 0.8.0~27 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=39b8dcb94620669ae369ab559175dde1a5c103b7;p=rspamd.git Rework text tokenizaton. --- diff --git a/src/libmime/filter.c b/src/libmime/filter.c index c6a4752fa..467f30b9c 100644 --- a/src/libmime/filter.c +++ b/src/libmime/filter.c @@ -746,88 +746,58 @@ classifiers_callback (gpointer value, void *arg) struct rspamd_statfile_config *st; GTree *tokens = NULL; GList *cur; - rspamd_fstring_t c; - gchar *header = NULL; gint *dist = NULL, diff; - gboolean is_twopart = FALSE, is_headers = FALSE; - struct raw_header *rh; + gboolean is_twopart = FALSE; task = cbdata->task; - if ((header = g_hash_table_lookup (cl->opts, "header")) != NULL) { - cur = - message_get_header (task, header, FALSE); - is_headers = TRUE; - } - else { - cur = g_list_first (task->text_parts); - dist = rspamd_mempool_get_variable (task->task_pool, "parts_distance"); - if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { - is_twopart = TRUE; - } + cur = g_list_first (task->text_parts); + dist = rspamd_mempool_get_variable (task->task_pool, "parts_distance"); + if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { + is_twopart = TRUE; } ctx = cl->classifier->init_func (task->task_pool, cl); if ((tokens = g_hash_table_lookup (task->tokens, cl->tokenizer)) == NULL) { while (cur != NULL) { - if (is_headers) { - rh = (struct raw_header *)cur->data; - if (rh->decoded == NULL) { - cur = g_list_next (cur); - continue; - } - c.len = strlen (rh->decoded); - if (c.len > 0) { - c.begin = rh->decoded; - if (!cl->tokenizer->tokenize_func (cl->tokenizer, - task->task_pool, &c, &tokens, FALSE, FALSE, NULL)) { - msg_info ("cannot tokenize input"); - return; - } - } + text_part = (struct mime_text_part *)cur->data; + if (text_part->is_empty) { + cur = g_list_next (cur); + continue; } - else { - text_part = (struct mime_text_part *)cur->data; - if (text_part->is_empty) { - cur = g_list_next (cur); - continue; - } - if (dist != NULL && cur->next == NULL) { - /* Compare part's content */ + if (dist != NULL && cur->next == NULL) { + /* Compare part's content */ - if (*dist >= COMMON_PART_FACTOR) { - msg_info ( + if (*dist >= COMMON_PART_FACTOR) { + msg_info ( "message <%s> has two common text parts, ignore the last one", task->message_id); - break; - } + break; } - else if (cur->next == NULL && is_twopart) { - p1 = cur->prev->data; - p2 = text_part; - if (p1->diff_str != NULL && p2->diff_str != NULL) { - diff = + } + else if (cur->next == NULL && is_twopart) { + p1 = cur->prev->data; + p2 = text_part; + if (p1->diff_str != NULL && p2->diff_str != NULL) { + diff = rspamd_diff_distance (p1->diff_str, p2->diff_str); - } - else { - diff = rspamd_fuzzy_compare_parts (p1, p2); - } - if (diff >= COMMON_PART_FACTOR) { - msg_info ( + } + else { + diff = rspamd_fuzzy_compare_parts (p1, p2); + } + if (diff >= COMMON_PART_FACTOR) { + msg_info ( "message <%s> has two common text parts, ignore the last one", task->message_id); - break; - } + break; } - c.begin = (gchar *)text_part->content->data; - c.len = text_part->content->len; - /* Tree would be freed at task pool freeing */ - if (!cl->tokenizer->tokenize_func (cl->tokenizer, - task->task_pool, &c, &tokens, + } + /* Tree would be freed at task pool freeing */ + if (!cl->tokenizer->tokenize_func (cl->tokenizer, + task->task_pool, text_part->words, &tokens, FALSE, text_part->is_utf, text_part->urls_offset)) { - msg_info ("cannot tokenize input"); - return; - } + msg_info ("cannot tokenize input"); + return; } cur = g_list_next (cur); } @@ -1108,151 +1078,6 @@ rspamd_check_action_metric (struct rspamd_task *task, return METRIC_ACTION_NOACTION; } -gboolean -rspamd_learn_task (const gchar *statfile, struct rspamd_task *task, GError **err) -{ - GList *cur, *ex; - struct rspamd_classifier_config *cl; - struct classifier_ctx *cls_ctx; - gchar *s; - rspamd_fstring_t c; - GTree *tokens = NULL; - struct rspamd_statfile_config *st; - stat_file_t *stf; - gdouble sum; - struct mime_text_part *part, *p1, *p2; - gboolean is_utf = FALSE, is_twopart = FALSE; - gint diff; - struct raw_header *rh; - - /* Load classifier by symbol */ - cl = g_hash_table_lookup (task->cfg->classifiers_symbols, statfile); - if (cl == NULL) { - g_set_error (err, - filter_error_quark (), 1, "Statfile %s is not configured in any classifier", - statfile); - return FALSE; - } - - /* If classifier has 'header' option just classify header of this type */ - if ((s = g_hash_table_lookup (cl->opts, "header")) != NULL) { - cur = message_get_header (task, s, FALSE); - } - else { - /* Classify message otherwise */ - cur = g_list_first (task->text_parts); - if (cur != NULL && cur->next != NULL && cur->next->next == NULL) { - is_twopart = TRUE; - } - } - - /* Get tokens from each element */ - while (cur) { - if (s != NULL) { - rh = (struct raw_header *)cur->data; - if (rh->decoded == NULL) { - cur = g_list_next (cur); - continue; - } - c.len = strlen (rh->decoded); - c.begin = rh->decoded; - ex = NULL; - } - else { - part = cur->data; - /* Skip empty parts */ - if (part->is_empty) { - cur = g_list_next (cur); - continue; - } - c.begin = (gchar *)part->content->data; - c.len = part->content->len; - is_utf = part->is_utf; - ex = part->urls_offset; - if (is_twopart && cur->next == NULL) { - /* Compare part's content */ - p1 = cur->prev->data; - p2 = part; - if (p1->diff_str != NULL && p2->diff_str != NULL) { - diff = rspamd_diff_distance (p1->diff_str, p2->diff_str); - } - else { - diff = rspamd_fuzzy_compare_parts (p1, p2); - } - if (diff >= COMMON_PART_FACTOR) { - msg_info ( - "message <%s> has two common text parts, ignore the last one", - task->message_id); - break; - } - } - } - /* Get tokens */ - if (!cl->tokenizer->tokenize_func ( - cl->tokenizer, task->task_pool, - &c, &tokens, FALSE, is_utf, ex)) { - g_set_error (err, - filter_error_quark (), 2, "Cannot tokenize message"); - return FALSE; - } - cur = g_list_next (cur); - } - - /* Handle messages without text */ - if (tokens == NULL) { - g_set_error (err, - filter_error_quark (), 3, "Cannot tokenize message, no text data"); - msg_info ("learn failed for message <%s>, no tokens to extract", - task->message_id); - return FALSE; - } - - /* Take care of subject */ - tokenize_subject (task, &tokens); - - /* Init classifier */ - cls_ctx = cl->classifier->init_func ( - task->task_pool, cl); - /* Get or create statfile */ - stf = get_statfile_by_symbol (task->worker->srv->statfile_pool, - cl, statfile, &st, TRUE); - - /* Learn */ - if (stf== NULL || !cl->classifier->learn_func ( - cls_ctx, task->worker->srv->statfile_pool, - statfile, tokens, TRUE, &sum, - 1.0, err)) { - if (*err) { - msg_info ("learn failed for message <%s>, learn error: %s", - task->message_id, - (*err)->message); - return FALSE; - } - else { - g_set_error (err, - filter_error_quark (), 4, - "Learn failed, unknown learn classifier error"); - msg_info ("learn failed for message <%s>, unknown learn error", - task->message_id); - return FALSE; - } - } - /* Increase statistics */ - task->worker->srv->stat->messages_learned++; - - maybe_write_binlog (cl, st, stf, tokens); - msg_info ( - "learn success for message <%s>, for statfile: %s, sum weight: %.2f", - task->message_id, - statfile, - sum); - statfile_pool_plan_invalidate (task->worker->srv->statfile_pool, - DEFAULT_STATFILE_INVALIDATE_TIME, - DEFAULT_STATFILE_INVALIDATE_JITTER); - - return TRUE; -} - gboolean rspamd_learn_task_spam (struct rspamd_classifier_config *cl, struct rspamd_task *task, @@ -1261,7 +1086,6 @@ rspamd_learn_task_spam (struct rspamd_classifier_config *cl, { GList *cur, *ex; struct classifier_ctx *cls_ctx; - rspamd_fstring_t c; GTree *tokens = NULL; struct mime_text_part *part, *p1, *p2; gboolean is_utf = FALSE, is_twopart = FALSE; @@ -1280,8 +1104,6 @@ rspamd_learn_task_spam (struct rspamd_classifier_config *cl, cur = g_list_next (cur); continue; } - c.begin = (gchar *)part->content->data; - c.len = part->content->len; is_utf = part->is_utf; ex = part->urls_offset; if (is_twopart && cur->next == NULL) { @@ -1307,7 +1129,7 @@ rspamd_learn_task_spam (struct rspamd_classifier_config *cl, /* Get tokens */ if (!cl->tokenizer->tokenize_func ( cl->tokenizer, task->task_pool, - &c, &tokens, FALSE, is_utf, ex)) { + part->words, &tokens, FALSE, is_utf, ex)) { g_set_error (err, filter_error_quark (), 2, "Cannot tokenize message"); return FALSE; diff --git a/src/libmime/filter.h b/src/libmime/filter.h index 3f01f2314..de324caf7 100644 --- a/src/libmime/filter.h +++ b/src/libmime/filter.h @@ -150,18 +150,8 @@ double rspamd_factor_consolidation_func (struct rspamd_task *task, const gchar *metric_name, const gchar *unused); -/* - * Learn specified statfile with message in a task - * @param statfile symbol of statfile - * @param task worker's task object - * @param err pointer to GError - * @return true if learn succeed - */ -gboolean rspamd_learn_task (const gchar *statfile, - struct rspamd_task *task, - GError **err); -/* +/** * Learn specified statfile with message in a task * @param statfile symbol of statfile * @param task worker's task object diff --git a/src/tokenizers/osb.c b/src/tokenizers/osb.c index 1ee251f81..b879237c4 100644 --- a/src/tokenizers/osb.c +++ b/src/tokenizers/osb.c @@ -37,17 +37,17 @@ extern const int primes[]; int osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t * pool, - rspamd_fstring_t * input, + GArray * input, GTree ** tree, gboolean save_token, gboolean is_utf, GList *exceptions) { token_node_t *new = NULL; - rspamd_fstring_t token = { NULL, 0, 0 }; + rspamd_fstring_t *token; guint32 hashpipe[FEATURE_WINDOW_SIZE], h1, h2; - gint i, l, processed = 0; - gchar *res; + gint i, processed = 0; + guint w; if (*tree == NULL) { *tree = g_tree_new (token_node_compare_func); @@ -58,31 +58,20 @@ osb_tokenize_text (struct tokenizer *tokenizer, memset (hashpipe, 0xfe, FEATURE_WINDOW_SIZE * sizeof (hashpipe[0])); - while ((res = - tokenizer->get_next_word (input, &token, &exceptions)) != NULL) { - /* Skip small words */ - if (is_utf) { - l = g_utf8_strlen (token.begin, token.len); - } - else { - l = token.len; - } - if (l < MIN_LEN) { - token.begin = res; - continue; - } + for (w = 0; w < input->len; w ++) { + token = &g_array_index (input, rspamd_fstring_t, w); if (processed < FEATURE_WINDOW_SIZE) { /* Just fill a hashpipe */ hashpipe[FEATURE_WINDOW_SIZE - ++processed] = - rspamd_fstrhash_lc (&token, is_utf); + rspamd_fstrhash_lc (token, is_utf); } else { /* Shift hashpipe */ for (i = FEATURE_WINDOW_SIZE - 1; i > 0; i--) { hashpipe[i] = hashpipe[i - 1]; } - hashpipe[0] = rspamd_fstrhash_lc (&token, is_utf); + hashpipe[0] = rspamd_fstrhash_lc (token, is_utf); processed++; for (i = 1; i < FEATURE_WINDOW_SIZE; i++) { @@ -94,7 +83,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, new->h2 = h2; if (save_token) { new->extra = - (uintptr_t)rspamd_mempool_fstrdup (pool, &token); + (uintptr_t)rspamd_mempool_fstrdup (pool, token); } if (g_tree_lookup (*tree, new) == NULL) { @@ -102,7 +91,6 @@ osb_tokenize_text (struct tokenizer *tokenizer, } } } - token.begin = res; } if (processed <= FEATURE_WINDOW_SIZE) { @@ -113,7 +101,7 @@ osb_tokenize_text (struct tokenizer *tokenizer, new->h1 = h1; new->h2 = h2; if (save_token) { - new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, &token); + new->extra = (uintptr_t)rspamd_mempool_fstrdup (pool, token); } if (g_tree_lookup (*tree, new) == NULL) { diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 01456a11b..8e423a211 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -114,7 +114,7 @@ rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GLi return NULL; } - if (*exceptions != NULL) { + if (exceptions != NULL && *exceptions != NULL) { ex = (*exceptions)->data; } @@ -220,9 +220,9 @@ rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, void tokenize_subject (struct rspamd_task *task, GTree ** tree) { - rspamd_fstring_t subject; - const gchar *sub; + gchar *sub; struct tokenizer *osb_tokenizer; + GArray *words; if (*tree == NULL) { *tree = g_tree_new (token_node_compare_func); @@ -234,26 +234,21 @@ tokenize_subject (struct rspamd_task *task, GTree ** tree) /* Try to use pre-defined subject */ if (task->subject != NULL) { - subject.begin = task->subject; - subject.len = strlen (task->subject); - osb_tokenizer->tokenize_func (osb_tokenizer, - task->task_pool, - &subject, - tree, - FALSE, - TRUE, - NULL); + sub = task->subject; + } + else { + sub = (gchar *)g_mime_message_get_subject (task->message); } - if ((sub = g_mime_message_get_subject (task->message)) != NULL) { - subject.begin = (gchar *)sub; - subject.len = strlen (sub); + + if (sub != NULL) { + words = rspamd_tokenize_text (sub, strlen (sub), TRUE, 0, NULL); osb_tokenizer->tokenize_func (osb_tokenizer, - task->task_pool, - &subject, - tree, - FALSE, - TRUE, - NULL); + task->task_pool, + words, + tree, + FALSE, + TRUE, + NULL); } } diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 51446b09d..ed47e0add 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -19,9 +19,13 @@ typedef struct token_node_s { /* Common tokenizer structure */ struct tokenizer { gchar *name; - gint (*tokenize_func)(struct tokenizer *tokenizer, rspamd_mempool_t *pool, - rspamd_fstring_t *input, - GTree **cur, gboolean save_token, gboolean is_utf, GList *exceptions); + gint (*tokenize_func)(struct tokenizer *tokenizer, + rspamd_mempool_t *pool, + GArray *words, + GTree **cur, + gboolean save_token, + gboolean is_utf, + GList *exceptions); gchar * (*get_next_word)(rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); }; @@ -42,7 +46,7 @@ GArray * rspamd_tokenize_text (gchar *text, gsize len, gboolean is_utf, /* OSB tokenize function */ int osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t *pool, - rspamd_fstring_t *input, + GArray *input, GTree **cur, gboolean save_token, gboolean is_utf,