From: Vsevolod Stakhov Date: Tue, 23 Dec 2014 15:57:14 +0000 (+0000) Subject: Lemmatize words for fuzzy check. X-Git-Tag: 0.8.0~25 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=a75c10a8b1042a6e59b053ba9ef1a11c0708a9fc;p=rspamd.git Lemmatize words for fuzzy check. --- diff --git a/contrib/snowball b/contrib/snowball index 558c36912..4bc9f365c 160000 --- a/contrib/snowball +++ b/contrib/snowball @@ -1 +1 @@ -Subproject commit 558c36912b3ca63e2917d7df504a92dbc78e1b75 +Subproject commit 4bc9f365c4b674d0dd0cfd14c314a866f00e0883 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2c7783acf..ced57d20b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -101,6 +101,7 @@ TARGET_LINK_LIBRARIES(rspamd rspamd-mime) TARGET_LINK_LIBRARIES(rspamd rspamd-server) TARGET_LINK_LIBRARIES(rspamd rspamd-util) TARGET_LINK_LIBRARIES(rspamd rspamd-lua) +TARGET_LINK_LIBRARIES(rspamd stemmer) TARGET_LINK_LIBRARIES(rspamd event) IF(HAVE_LIBEVENT2) diff --git a/src/plugins/fuzzy_check.c b/src/plugins/fuzzy_check.c index bce8298ee..df2ffd9f5 100644 --- a/src/plugins/fuzzy_check.c +++ b/src/plugins/fuzzy_check.c @@ -49,6 +49,7 @@ #include "main.h" #include "blake2.h" #include "ottery.h" +#include "libstemmer.h" #define DEFAULT_SYMBOL "R_FUZZY_HASH" #define DEFAULT_UPSTREAM_ERROR_TIME 10 @@ -533,6 +534,52 @@ fuzzy_io_fin (void *ud) close (session->fd); } +static void +fuzzy_g_array_destructor (gpointer a) +{ + GArray *ar = (GArray *)a; + + g_array_free (ar, TRUE); +} + +static GArray * +fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool) +{ + GArray *res; + struct sb_stemmer *stem; + rspamd_fstring_t *w, stw; + const guchar *r; + guint i; + + if (!part->is_utf || !part->language || part->language[0] == '\0') { + res = part->words; + } + else { + /* Lemmatize words */ + stem = sb_stemmer_new (part->language, "UTF_8"); + if (stem == NULL) { + msg_debug ("cannot lemmatize %s language", part->language); + res = part->words; + } + else { + res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t), + part->words->len); + for (i = 0; i < part->words->len; i ++) { + w = &g_array_index (part->words, rspamd_fstring_t, i); + r = sb_stemmer_stem (stem, w->begin, w->len); + if (r != NULL) { + stw.begin = rspamd_mempool_strdup (pool, r); + stw.len = strlen (r); + rspamd_str_lc (stw.begin, stw.len); + g_array_append_val (res, stw); + } + } + rspamd_mempool_add_destructor (pool, fuzzy_g_array_destructor, res); + sb_stemmer_delete (stem); + } + } + return res; +} /* * Create fuzzy command from a text part @@ -553,6 +600,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, guint i; blake2b_state st; rspamd_fstring_t *word; + GArray *words; if (legacy || part->words == NULL || part->words->len == 0) { cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd)); @@ -572,15 +620,18 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule, */ g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str, rule->hash_key->len) != -1); - for (i = 0; i < part->words->len; i ++) { - word = &g_array_index (part->words, rspamd_fstring_t, i); + words = fuzzy_preprocess_words (part, pool); + + for (i = 0; i < words->len; i ++) { + word = &g_array_index (words, rspamd_fstring_t, i); blake2b_update (&st, word->begin, word->len); } blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest)); msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str); - sh = rspamd_shingles_generate (part->words, rule->shingles_key->str, - pool, rspamd_shingles_default_filter, NULL); + sh = rspamd_shingles_generate (words, + rule->shingles_key->str, pool, + rspamd_shingles_default_filter, NULL); if (sh != NULL) { memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl)); shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;