diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-17 13:59:52 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-17 13:59:52 +0000 |
commit | 8525f71340f4edaeb9cdf9f57766dd4a4a0f993a (patch) | |
tree | 162959bd7ac1ff30e672986a144317ca775ca1aa /src/libmime | |
parent | 5bcf9f0af263b25ab6c5b128e767cc8a2a39312a (diff) | |
download | rspamd-8525f71340f4edaeb9cdf9f57766dd4a4a0f993a.tar.gz rspamd-8525f71340f4edaeb9cdf9f57766dd4a4a0f993a.zip |
Make words from text parts.
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/message.c | 20 | ||||
-rw-r--r-- | src/libmime/message.h | 3 |
2 files changed, 20 insertions, 3 deletions
```diff
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 0e33967ca..f465b1c06 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -30,6 +30,8 @@
 #include "html.h"
 #include "images.h"
 #include "utlist.h"
+#include "tokenizers/tokenizers.h"
+
 #include <iconv.h>
 
 #define RECURSION_LIMIT 30
@@ -1033,6 +1035,8 @@ process_text_part (struct rspamd_task *task,
 {
 	struct mime_text_part *text_part;
 	const gchar *cd;
+	gchar *pos;
+	rspamd_fstring_t token, buf;
 
 	/* Skip attachements */
 #ifndef GMIME24
@@ -1056,7 +1060,6 @@ process_text_part (struct rspamd_task *task,
 
 	if (g_mime_content_type_is_type (type, "text", "html") ||
 		g_mime_content_type_is_type (type, "text", "xhtml")) {
-
 		debug_task ("got urls from text/html part");
 
 		text_part = rspamd_mempool_alloc0 (task->task_pool,
@@ -1097,7 +1100,6 @@ process_text_part (struct rspamd_task *task,
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
 	}
 	else if (g_mime_content_type_is_type (type, "text", "*")) {
-
 		debug_task ("got urls from text/plain part");
 
 		text_part = rspamd_mempool_alloc0 (task->task_pool,
@@ -1120,6 +1122,20 @@ process_text_part (struct rspamd_task *task,
 		rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
 		task->text_parts = g_list_prepend (task->text_parts, text_part);
 	}
+	else {
+		return;
+	}
+
+	/* Post process part */
+	buf.begin = text_part->content->data;
+	buf.len = text_part->content->len;
+	buf.size = buf.len;
+
+	text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+	while ((pos = rspamd_tokenizer_get_word (&buf,
+			&token, &text_part->urls_offset)) != NULL) {
+		g_array_append_val (text_part->words, token);
+	}
 }
 
 #ifdef GMIME24
diff --git a/src/libmime/message.h b/src/libmime/message.h
index ddc5939fa..8287db9b0 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -30,12 +30,13 @@ struct mime_text_part {
 	GByteArray *orig;
 	GByteArray *content;
 	GNode *html_nodes;
-	GList *urls_offset; /**< list of offsets of urls */
+	GList *urls_offset; /**< list of offsets of urls */
 	rspamd_fuzzy_t *fuzzy;
 	rspamd_fuzzy_t *double_fuzzy;
 	GMimeObject *parent;
 	GUnicodeScript script;
 	rspamd_fstring_t *diff_str;
+	GArray *words;
 };
 
 struct received_header {
```