source.dussan.org Git - rspamd.git/commitdiff
Lemmatize words for fuzzy check.
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 23 Dec 2014 15:57:14 +0000 (15:57 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 23 Dec 2014 15:57:14 +0000 (15:57 +0000)
contrib/snowball
src/CMakeLists.txt
src/plugins/fuzzy_check.c

index 558c36912b3ca63e2917d7df504a92dbc78e1b75..4bc9f365c4b674d0dd0cfd14c314a866f00e0883 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 558c36912b3ca63e2917d7df504a92dbc78e1b75
+Subproject commit 4bc9f365c4b674d0dd0cfd14c314a866f00e0883
index 2c7783acf9dc16d883d57c863081a3502748ecba..ced57d20b69acdd9af54c8dec37fa5be3e864664 100644 (file)
@@ -101,6 +101,7 @@ TARGET_LINK_LIBRARIES(rspamd rspamd-mime)
 TARGET_LINK_LIBRARIES(rspamd rspamd-server)
 TARGET_LINK_LIBRARIES(rspamd rspamd-util)
 TARGET_LINK_LIBRARIES(rspamd rspamd-lua)
+TARGET_LINK_LIBRARIES(rspamd stemmer)
 
 TARGET_LINK_LIBRARIES(rspamd event)
 IF(HAVE_LIBEVENT2)
index bce8298ee133e970905bc8b96fd2bbe0509ea463..df2ffd9f5d4d549eef652a6b9b525a33538e743b 100644 (file)
@@ -49,6 +49,7 @@
 #include "main.h"
 #include "blake2.h"
 #include "ottery.h"
+#include "libstemmer.h"
 
 #define DEFAULT_SYMBOL "R_FUZZY_HASH"
 #define DEFAULT_UPSTREAM_ERROR_TIME 10
@@ -533,6 +534,52 @@ fuzzy_io_fin (void *ud)
        close (session->fd);
 }
 
+/*
+ * Destructor callback taking an untyped pointer to a GArray: frees the
+ * array wrapper together with its element storage.
+ */
+static void
+fuzzy_g_array_destructor (gpointer a)
+{
+       g_array_free ((GArray *)a, TRUE);
+}
+
+/*
+ * Prepare the words of a text part for fuzzy hashing.
+ *
+ * For a UTF part with a non-empty language, every word is passed through
+ * the Snowball stemmer created for that language; each stem is copied into
+ * `pool`, lowercased with rspamd_str_lc and collected into a new array. The
+ * new array is released by a destructor registered on `pool`.
+ *
+ * In every other case (no words, non-UTF part, missing language, no stemmer
+ * for the language) part->words itself is returned. The caller must
+ * therefore never free the returned array.
+ *
+ * NOTE(review): words are lowercased after stemming, while the Snowball
+ * documentation says stemmers expect already lowercased input. Confirm that
+ * part->words is lowercased upstream, or reorder the two steps.
+ */
+static GArray *
+fuzzy_preprocess_words (struct mime_text_part *part, rspamd_mempool_t *pool)
+{
+       GArray *res;
+       struct sb_stemmer *stem;
+       rspamd_fstring_t *w, stw;
+       const guchar *r;
+       gint rlen;
+       guint i;
+
+       if (part->words == NULL || !part->is_utf || !part->language ||
+                       part->language[0] == '\0') {
+               /* Nothing to stem or no way to select a stemmer */
+               return part->words;
+       }
+
+       stem = sb_stemmer_new (part->language, "UTF_8");
+       if (stem == NULL) {
+               msg_debug ("cannot lemmatize %s language", part->language);
+               return part->words;
+       }
+
+       /* Zero the template once so that no field is appended uninitialised */
+       memset (&stw, 0, sizeof (stw));
+       res = g_array_sized_new (FALSE, FALSE, sizeof (rspamd_fstring_t),
+                       part->words->len);
+
+       for (i = 0; i < part->words->len; i ++) {
+               w = &g_array_index (part->words, rspamd_fstring_t, i);
+               r = sb_stemmer_stem (stem, (const guchar *)w->begin,
+                               (gint)w->len);
+               if (r == NULL) {
+                       /* The word is skipped when the stemmer yields no result */
+                       continue;
+               }
+
+               /*
+                * The stem is owned by the stemmer and is overwritten by the
+                * next call, so copy it out using the length the stemmer reports.
+                */
+               rlen = sb_stemmer_length (stem);
+               stw.begin = rspamd_mempool_alloc (pool, rlen + 1);
+               memcpy (stw.begin, r, rlen);
+               stw.begin[rlen] = '\0';
+               stw.len = rlen;
+               rspamd_str_lc (stw.begin, stw.len);
+               g_array_append_val (res, stw);
+       }
+
+       rspamd_mempool_add_destructor (pool, fuzzy_g_array_destructor, res);
+       sb_stemmer_delete (stem);
+
+       return res;
+}
 
 /*
  * Create fuzzy command from a text part
@@ -553,6 +600,7 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
        guint i;
        blake2b_state st;
        rspamd_fstring_t *word;
+       GArray *words;
 
        if (legacy || part->words == NULL || part->words->len == 0) {
                cmd = rspamd_mempool_alloc0 (pool, sizeof (*cmd));
@@ -572,15 +620,18 @@ fuzzy_cmd_from_text_part (struct fuzzy_rule *rule,
                 */
                g_assert (blake2b_init_key (&st, BLAKE2B_OUTBYTES, rule->hash_key->str,
                                rule->hash_key->len) != -1);
-               for (i = 0; i < part->words->len; i ++) {
-                       word = &g_array_index (part->words, rspamd_fstring_t, i);
+               words = fuzzy_preprocess_words (part, pool);
+
+               for (i = 0; i < words->len; i ++) {
+                       word = &g_array_index (words, rspamd_fstring_t, i);
                        blake2b_update (&st, word->begin, word->len);
                }
                blake2b_final (&st, shcmd->basic.digest, sizeof (shcmd->basic.digest));
 
                msg_debug ("loading shingles with key %*xs", 16, rule->shingles_key->str);
-               sh = rspamd_shingles_generate (part->words, rule->shingles_key->str,
-                               pool, rspamd_shingles_default_filter, NULL);
+               sh = rspamd_shingles_generate (words,
+                               rule->shingles_key->str, pool,
+                               rspamd_shingles_default_filter, NULL);
                if (sh != NULL) {
                        memcpy (&shcmd->sgl, sh, sizeof (shcmd->sgl));
                        shcmd->basic.shingles_count = RSPAMD_SHINGLE_SIZE;