diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-23 15:57:14 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-23 15:57:14 +0000 |
commit | a75c10a8b1042a6e59b053ba9ef1a11c0708a9fc (patch) | |
tree | c11a8a10598d26787993f570c5582514fe32e067 /src/plugins | |
parent | 3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (diff) | |
download | rspamd-a75c10a8b1042a6e59b053ba9ef1a11c0708a9fc.tar.gz rspamd-a75c10a8b1042a6e59b053ba9ef1a11c0708a9fc.zip |
Lemmatize words for fuzzy check.
Diffstat (limited to 'src/plugins')
-rw-r--r-- | src/plugins/fuzzy_check.c | 59 |
1 files changed, 55 insertions, 4 deletions
```diff
diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c
index bce8298ee..df2ffd9f5 100644
--- a/src/plugins/fuzzy_check.c
+++ b/src/plugins/fuzzy_check.c
@@ -49,6 +49,7 @@
 #include "main.h"
 #include "blake2.h"
 #include "ottery.h"
+#include "libstemmer.h"
 
 #define DEFAULT_SYMBOL "R_FUZZY_HASH"
 #define DEFAULT_UPSTREAM_ERROR_TIME 10
@@ -533,6 +534,52 @@ fuzzy_io_fin (void *ud)
 	close (session->fd);
 }
 
+static void
+fuzzy_g_array_destructor (gpointer a)
+{
+	GArray *ar = (GArray *)a;
+
+	g_array_free (ar, TRUE);
+}
+
+static GArray *
+fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool)
+{
+	GArray *res;
+	struct sb_stemmer *stem;
+	rspamd_fstring_t *w, stw;
+	const guchar *r;
+	guint i;
+
+	if (!part->is_utf || !part->language || part->language[0] == '\0') {
+		res = part->words;
+	}
+	else {
+		/* Lemmatize words */
+		stem = sb_stemmer_new (part->language, "UTF_8");
+		if (stem == NULL) {
+			msg_debug ("cannot lemmatize %s language", part->language);
+			res = part->words;
+		}
+		else {
+			res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t),
+					part->words->len);
+			for (i = 0; i < part->words->len; i ++) {
+				w = &g_array_index (part->words, rspamd_fstring_t, i);
+				r = sb_stemmer_stem (stem, w->begin, w->len);
+				if (r != NULL) {
+					stw.begin = rspamd_mempool_strdup (pool, r);
+					stw.len = strlen (r);
+					rspamd_str_lc (stw.begin, stw.len);
+					g_array_append_val (res, stw);
+				}
+			}
+			rspamd_mempool_add_destructor (pool, fuzzy_g_array_destructor, res);
+			sb_stemmer_delete (stem);
+		}
+	}
+	return res;
+}
 
 /*
  * Create fuzzy command from a text part
@@ -553,6 +600,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
 	guint i;
 	blake2b_state st;
 	rspamd_fstring_t *word;
+	GArray *words;
 
 	if (legacy || part->words == NULL || part->words->len == 0) {
 		cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd));
@@ -572,15 +620,18 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
 		 */
 		g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
 				rule->hash_key->len) != -1);
-		for (i = 0; i < part->words->len; i ++) {
-			word = &g_array_index (part->words, rspamd_fstring_t, i);
+		words = fuzzy_preprocess_words (part, pool);
+
+		for (i = 0; i < words->len; i ++) {
+			word = &g_array_index (words, rspamd_fstring_t, i);
 			blake2b_update (&st, word->begin, word->len);
 		}
 		blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));
 
 		msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
-		sh = rspamd_shingles_generate (part->words, rule->shingles_key->str,
-				pool, rspamd_shingles_default_filter, NULL);
+		sh = rspamd_shingles_generate (words,
+				rule->shingles_key->str, pool,
+				rspamd_shingles_default_filter, NULL);
 		if (sh != NULL) {
 			memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
 			shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;
```