diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-23 14:28:47 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-02-23 14:28:47 +0000 |
commit | 53991167d09fb43907dcbb69e5147bc0c0011c15 (patch) | |
tree | b570f9609a757c49619197f5776f8f55f05acf11 /src | |
parent | 77570ec7f9d6865c29c55b1535c00fb7c1d5b4b3 (diff) | |
download | rspamd-53991167d09fb43907dcbb69e5147bc0c0011c15.tar.gz rspamd-53991167d09fb43907dcbb69e5147bc0c0011c15.zip |
Add routines to normalize text parts.
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/message.c | 50 | ||||
-rw-r--r-- | src/libmime/message.h | 1 | ||||
-rw-r--r-- | src/libserver/task.c | 3 |
3 files changed, 54 insertions, 0 deletions
```diff
diff --git a/src/libmime/message.c b/src/libmime/message.c
index bcdb86259..e6d27563b 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -31,6 +31,7 @@
 #include "images.h"
 #include "utlist.h"
 #include "tokenizers/tokenizers.h"
+#include "libstemmer.h"
 
 #include <iconv.h>
 
@@ -1170,6 +1171,54 @@ detect_text_language (struct mime_text_part *part)
 }
 
 static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+		struct mime_text_part *part)
+{
+	struct sb_stemmer *stem = NULL;
+	rspamd_fstring_t *w, stw;
+	const guchar *r;
+	guint i;
+
+	if (part->language && part->language[0] != '\0' && part->is_utf) {
+		stem = sb_stemmer_new (part->language, "UTF_8");
+		if (stem == NULL) {
+			msg_info ("<%s> cannot create lemmatizer for %s language",
+				task->message_id, part->language);
+		}
+	}
+
+	g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t),
+			part->words->len);
+	for (i = 0; i < part->words->len; i ++) {
+		w = &g_array_index (part->words, rspamd_fstring_t, i);
+		if (stem) {
+			r = sb_stemmer_stem (stem, w->begin, w->len);
+		}
+
+		if (stem == NULL || r == NULL) {
+			stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
+			stw.len = w->len;
+		}
+		else {
+			stw.begin = rspamd_mempool_strdup (task->task_pool, r);
+			stw.len = strlen (r);
+		}
+
+		if (part->is_utf) {
+			rspamd_str_lc_utf8 (stw.begin, stw.len);
+		}
+		else {
+			rspamd_str_lc (stw.begin, stw.len);
+		}
+		g_array_append_val (part->normalized_words, stw);
+	}
+
+	if (stem != NULL) {
+		sb_stemmer_delete (stem);
+	}
+}
+
+static void
 process_text_part (struct rspamd_task *task,
 	GByteArray *part_content,
 	GMimeContentType *type,
@@ -1273,6 +1322,7 @@ process_text_part (struct rspamd_task *task,
 	text_part->words = rspamd_tokenize_text (text_part->content->data,
 			text_part->content->len, text_part->is_utf, task->cfg->min_word_len,
 			&text_part->urls_offset);
+	rspamd_normalize_text_part (task, text_part);
 }
 
 #ifdef GMIME24
diff --git a/src/libmime/message.h b/src/libmime/message.h
index d418b6cf6..ef881ebd1 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -40,6 +40,7 @@ struct mime_text_part {
 	GMimeObject *parent;
 	rspamd_fstring_t *diff_str;
 	GArray *words;
+	GArray *normalized_words;
 };
 
 struct received_header {
diff --git a/src/libserver/task.c b/src/libserver/task.c
index 699e129ab..c442db8fe 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -243,6 +243,9 @@ rspamd_task_free (struct rspamd_task *task, gboolean is_soft)
 			if (tp->words) {
 				g_array_free (tp->words, TRUE);
 			}
+			if (tp->normalized_words) {
+				g_array_free (tp->normalized_words, TRUE);
+			}
 			part = g_list_next (part);
 		}
 
```