about | summary | refs | log | tree | commit | diff | stats
path: root/src/libmime
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-17 13:59:52 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-17 13:59:52 +0000
commit: 8525f71340f4edaeb9cdf9f57766dd4a4a0f993a (patch)
tree: 162959bd7ac1ff30e672986a144317ca775ca1aa /src/libmime
parent: 5bcf9f0af263b25ab6c5b128e767cc8a2a39312a (diff)
download: rspamd-8525f71340f4edaeb9cdf9f57766dd4a4a0f993a.tar.gz
rspamd-8525f71340f4edaeb9cdf9f57766dd4a4a0f993a.zip
Make words from text parts.
Diffstat (limited to 'src/libmime')
-rw-r--r-- src/libmime/message.c 20
-rw-r--r-- src/libmime/message.h 3
2 files changed, 20 insertions, 3 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 0e33967ca..f465b1c06 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -30,6 +30,8 @@
#include "html.h"
#include "images.h"
#include "utlist.h"
+#include "tokenizers/tokenizers.h"
+
#include <iconv.h>
#define RECURSION_LIMIT 30
@@ -1033,6 +1035,8 @@ process_text_part (struct rspamd_task *task,
{
struct mime_text_part *text_part;
const gchar *cd;
+ gchar *pos;
+ rspamd_fstring_t token, buf;
/* Skip attachements */
#ifndef GMIME24
@@ -1056,7 +1060,6 @@ process_text_part (struct rspamd_task *task,
if (g_mime_content_type_is_type (type, "text",
"html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
- debug_task ("got urls from text/html part");
text_part =
rspamd_mempool_alloc0 (task->task_pool,
@@ -1097,7 +1100,6 @@ process_text_part (struct rspamd_task *task,
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
else if (g_mime_content_type_is_type (type, "text", "*")) {
- debug_task ("got urls from text/plain part");
text_part =
rspamd_mempool_alloc0 (task->task_pool,
@@ -1120,6 +1122,20 @@ process_text_part (struct rspamd_task *task,
rspamd_fuzzy_from_text_part (text_part, task->task_pool, task->cfg->max_diff);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
+ else {
+ return;
+ }
+
+ /* Post process part */
+ buf.begin = text_part->content->data;
+ buf.len = text_part->content->len;
+ buf.size = buf.len;
+
+ text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
+ while ((pos = rspamd_tokenizer_get_word (&buf,
+ &token, &text_part->urls_offset)) != NULL) {
+ g_array_append_val (text_part->words, token);
+ }
}
#ifdef GMIME24
diff --git a/src/libmime/message.h b/src/libmime/message.h
index ddc5939fa..8287db9b0 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -30,12 +30,13 @@ struct mime_text_part {
GByteArray *orig;
GByteArray *content;
GNode *html_nodes;
- GList *urls_offset; /**< list of offsets of urls */
+ GList *urls_offset; /**< list of offsets of urls */
rspamd_fuzzy_t *fuzzy;
rspamd_fuzzy_t *double_fuzzy;
GMimeObject *parent;
GUnicodeScript script;
rspamd_fstring_t *diff_str;
+ GArray *words;
};
struct received_header {