From 8525f71340f4edaeb9cdf9f57766dd4a4a0f993a Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 17 Dec 2014 13:59:52 +0000 Subject: [PATCH] Make words from text parts. --- src/libmime/message.c | 20 ++++++++++++++++++-- src/libmime/message.h | 3 ++- src/libserver/task.c | 10 ++++++++++ src/tokenizers/tokenizers.c | 4 ++-- src/tokenizers/tokenizers.h | 3 ++- 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 0e33967ca..f465b1c06 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -30,6 +30,8 @@ #include "html.h" #include "images.h" #include "utlist.h" +#include "tokenizers/tokenizers.h" + #include #define RECURSION_LIMIT 30 @@ -1033,6 +1035,8 @@ process_text_part (struct rspamd_task *task, { struct mime_text_part *text_part; const gchar *cd; + gchar *pos; + rspamd_fstring_t token, buf; /* Skip attachements */ #ifndef GMIME24 @@ -1056,7 +1060,6 @@ process_text_part (struct rspamd_task *task, if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) { - debug_task ("got urls from text/html part"); text_part = rspamd_mempool_alloc0 (task->task_pool, @@ -1097,7 +1100,6 @@ process_text_part (struct rspamd_task *task, task->text_parts = g_list_prepend (task->text_parts, text_part); } else if (g_mime_content_type_is_type (type, "text", "*")) { - debug_task ("got urls from text/plain part"); text_part = rspamd_mempool_alloc0 (task->task_pool, @@ -1120,6 +1122,20 @@ process_text_part (struct rspamd_task *task, rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff); task->text_parts = g_list_prepend (task->text_parts, text_part); } + else { + return; + } + + /* Post process part */ + buf.begin = text_part->content->data; + buf.len = text_part->content->len; + buf.size = buf.len; + + text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); + while ((pos = rspamd_tokenizer_get_word (&buf, + &token, &text_part->urls_offset)) != NULL) { + g_array_append_val (text_part->words, token); + } } #ifdef GMIME24 diff --git a/src/libmime/message.h b/src/libmime/message.h index ddc5939fa..8287db9b0 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -30,12 +30,13 @@ struct mime_text_part { GByteArray *orig; GByteArray *content; GNode *html_nodes; - GList *urls_offset; /**< list of offsets of urls */ + GList *urls_offset; /**< list of offsets of urls */ rspamd_fuzzy_t *fuzzy; rspamd_fuzzy_t *double_fuzzy; GMimeObject *parent; GUnicodeScript script; rspamd_fstring_t *diff_str; + GArray *words; }; struct received_header { diff --git a/src/libserver/task.c b/src/libserver/task.c index b6a910a91..bfa9184fb 100644 --- a/src/libserver/task.c +++ b/src/libserver/task.c @@ -226,6 +226,7 @@ rspamd_task_free (struct rspamd_task *task, gboolean is_soft) { GList *part; struct mime_part *p; + struct mime_text_part *tp; if (task) { debug_task ("free pointer %p", task); @@ -236,6 +237,15 @@ rspamd_task_free (struct rspamd_task *task, gboolean is_soft) g_list_free_1 (part); } if (task->text_parts) { + part = task->text_parts; + while (part) { + tp = (struct mime_text_part *)part->data; + if (tp->words) { + g_array_free (tp->words, TRUE); + } + part = g_list_next (part); + } + g_list_free (task->text_parts); } if (task->images) { diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 4789d8f62..2adc86ff9 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -31,7 +31,7 @@ #include "tokenizers.h" struct tokenizer tokenizers[] = { - {"osb-text", osb_tokenize_text, get_next_word}, + {"osb-text", osb_tokenize_text, rspamd_tokenizer_get_word}, }; const int primes[] = { @@ -104,7 +104,7 @@ token_node_compare_func (gconstpointer a, gconstpointer b) /* Get next word from specified f_str_t buf */ gchar * -get_next_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) +rspamd_tokenizer_get_word (rspamd_fstring_t * buf, rspamd_fstring_t * token, GList **exceptions) { gsize remain, pos; guchar *p; diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index b10729614..efce5b307 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -30,7 +30,8 @@ int token_node_compare_func (gconstpointer a, gconstpointer b); /* Get tokenizer structure by name or return NULL if this name is not found */ struct tokenizer * get_tokenizer (const char *name); /* Get next word from specified f_str_t buf */ -gchar * get_next_word (rspamd_fstring_t *buf, rspamd_fstring_t *token, GList **exceptions); +gchar * rspamd_tokenizer_get_word (rspamd_fstring_t *buf, + rspamd_fstring_t *token, GList **exceptions); /* OSB tokenize function */ int osb_tokenize_text (struct tokenizer *tokenizer, rspamd_mempool_t *pool, -- 2.39.5