aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2015-02-23 14:28:47 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2015-02-23 14:28:47 +0000
commit53991167d09fb43907dcbb69e5147bc0c0011c15 (patch)
treeb570f9609a757c49619197f5776f8f55f05acf11 /src
parent77570ec7f9d6865c29c55b1535c00fb7c1d5b4b3 (diff)
downloadrspamd-53991167d09fb43907dcbb69e5147bc0c0011c15.tar.gz
rspamd-53991167d09fb43907dcbb69e5147bc0c0011c15.zip
Add routines to normalize text parts.
Diffstat (limited to 'src')
-rw-r--r--src/libmime/message.c50
-rw-r--r--src/libmime/message.h1
-rw-r--r--src/libserver/task.c3
3 files changed, 54 insertions, 0 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index bcdb86259..e6d27563b 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -31,6 +31,7 @@
#include "images.h"
#include "utlist.h"
#include "tokenizers/tokenizers.h"
+#include "libstemmer.h"
#include <iconv.h>
@@ -1170,6 +1171,54 @@ detect_text_language (struct mime_text_part *part)
}
+/**
+ * Build the normalized form of every word of a text part.
+ *
+ * Each entry of part->words is stemmed with libstemmer (only when the part
+ * has a non-empty language, is UTF-8 and a stemmer could be created for that
+ * language), copied into task->task_pool, lowercased and appended to
+ * part->normalized_words.
+ *
+ * @param task task providing the memory pool and the message id for logging
+ * @param part text part to process; part->normalized_words is created here
+ */
static void
+rspamd_normalize_text_part (struct rspamd_task *task,
+	struct mime_text_part *part)
+{
+	struct sb_stemmer *stem = NULL;
+	rspamd_fstring_t *w, stw;
+	const guchar *r = NULL;
+	guint i;
+
+	if (part->words == NULL) {
+		/* No word array to walk; leave normalized_words as it is */
+		return;
+	}
+
+	if (part->language && part->language[0] != '\0' && part->is_utf) {
+		stem = sb_stemmer_new (part->language, "UTF_8");
+		if (stem == NULL) {
+			msg_info ("<%s> cannot create lemmatizer for %s language",
+				task->message_id, part->language);
+		}
+	}
+
+	/*
+	 * The freshly created array has to be stored in the part: the loop
+	 * below appends to part->normalized_words.
+	 */
+	part->normalized_words = g_array_sized_new (FALSE, FALSE,
+		sizeof (rspamd_fstring_t), part->words->len);
+
+	for (i = 0; i < part->words->len; i ++) {
+		w = &g_array_index (part->words, rspamd_fstring_t, i);
+		r = NULL;
+		if (stem) {
+			r = sb_stemmer_stem (stem, w->begin, w->len);
+		}
+
+		if (r == NULL) {
+			/* No stemmer or stemming failed: keep the original word */
+			stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
+			stw.len = w->len;
+		}
+		else {
+			/* Copy the stem, as it is used after the stemmer is deleted */
+			stw.begin = rspamd_mempool_strdup (task->task_pool, r);
+			stw.len = strlen (r);
+		}
+
+		if (part->is_utf) {
+			rspamd_str_lc_utf8 (stw.begin, stw.len);
+		}
+		else {
+			rspamd_str_lc (stw.begin, stw.len);
+		}
+		g_array_append_val (part->normalized_words, stw);
+	}
+
+	if (stem != NULL) {
+		sb_stemmer_delete (stem);
+	}
+}
+
+
+static void
process_text_part (struct rspamd_task *task,
GByteArray *part_content,
GMimeContentType *type,
@@ -1273,6 +1322,7 @@ process_text_part (struct rspamd_task *task,
text_part->words = rspamd_tokenize_text (text_part->content->data,
text_part->content->len, text_part->is_utf, task->cfg->min_word_len,
&text_part->urls_offset);
+ rspamd_normalize_text_part (task, text_part);
}
#ifdef GMIME24
diff --git a/src/libmime/message.h b/src/libmime/message.h
index d418b6cf6..ef881ebd1 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -40,6 +40,7 @@ struct mime_text_part {
GMimeObject *parent;
rspamd_fstring_t *diff_str;
GArray *words;
+ GArray *normalized_words;
};
struct received_header {
diff --git a/src/libserver/task.c b/src/libserver/task.c
index 699e129ab..c442db8fe 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -243,6 +243,9 @@ rspamd_task_free (struct rspamd_task *task, gboolean is_soft)
if (tp->words) {
g_array_free (tp->words, TRUE);
}
+ if (tp->normalized_words) {
+ g_array_free (tp->normalized_words, TRUE);
+ }
part = g_list_next (part);
}