diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-04-01 17:48:23 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-04-01 17:48:50 +0100 |
commit | 02429b4f2358919d69a885256547dc366ee45af9 (patch) | |
tree | ad12c937699330b2280222061a0ad68dee2a9b5d /src | |
parent | 9a01296f34403c5949c167b0062943ff473d11d7 (diff) | |
download | rspamd-02429b4f2358919d69a885256547dc366ee45af9.tar.gz rspamd-02429b4f2358919d69a885256547dc366ee45af9.zip |
Fix words normalization.
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/message.c | 44 |
1 file changed, 24 insertions, 20 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index b963ac455..2eac86ed2 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1181,6 +1181,7 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 	rspamd_fstring_t *w, stw;
 	const guchar *r;
 	guint i;
+	GArray *tmp;
 
 	if (part->language && part->language[0] != '\0' && part->is_utf) {
 		stem = sb_stemmer_new (part->language, "UTF_8");
@@ -1191,32 +1192,35 @@ rspamd_normalize_text_part (struct rspamd_task *task,
 	}
 
 	/* Ugly workaround */
-	part->normalized_words = rspamd_tokenize_text (part->content->data,
+	tmp = rspamd_tokenize_text (part->content->data,
 			part->content->len, part->is_utf, task->cfg->min_word_len,
 			part->urls_offset, FALSE);
 
-	for (i = 0; i < part->words->len; i ++) {
-		w = &g_array_index (part->words, rspamd_fstring_t, i);
-		if (stem) {
-			r = sb_stemmer_stem (stem, w->begin, w->len);
-		}
+	if (tmp) {
+		for (i = 0; i < tmp->len; i ++) {
+			w = &g_array_index (tmp, rspamd_fstring_t, i);
+			if (stem) {
+				r = sb_stemmer_stem (stem, w->begin, w->len);
+			}
 
-		if (stem == NULL || r == NULL) {
-			stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
-			stw.len = w->len;
-		}
-		else {
-			stw.begin = rspamd_mempool_strdup (task->task_pool, r);
-			stw.len = strlen (r);
-		}
+			if (stem == NULL || r == NULL) {
+				stw.begin = rspamd_mempool_fstrdup (task->task_pool, w);
+				stw.len = w->len;
+			}
+			else {
+				stw.begin = rspamd_mempool_strdup (task->task_pool, r);
+				stw.len = strlen (r);
+			}
 
-		if (part->is_utf) {
-			rspamd_str_lc_utf8 (stw.begin, stw.len);
-		}
-		else {
-			rspamd_str_lc (stw.begin, stw.len);
+			if (part->is_utf) {
+				rspamd_str_lc_utf8 (stw.begin, stw.len);
+			}
+			else {
+				rspamd_str_lc (stw.begin, stw.len);
+			}
+			g_array_append_val (part->normalized_words, stw);
 		}
-		g_array_append_val (part->normalized_words, stw);
+		g_array_free (tmp, TRUE);
 	}
 
 	if (stem != NULL) {