]> source.dussan.org Git - rspamd.git/commitdiff
[Fix] More fixes to parts distance calculations
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 29 Apr 2016 11:34:12 +0000 (12:34 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 29 Apr 2016 11:34:12 +0000 (12:34 +0100)
- Use hashed words instead of full words for speed
- Improve levenstein distance calculations and penalise replaces
- Always return number from 0 to 1
- Use g_malloc instead of alloca

src/libmime/message.c
src/libmime/message.h
src/libserver/task.c

index be773d48020f75666de68104310736c149b09356..9c27bf5f5a58d3040877ae6f5638086d1ebd6204 100644 (file)
@@ -24,6 +24,7 @@
 #include "email_addr.h"
 #include "utlist.h"
 #include "tokenizers/tokenizers.h"
+#include "xxhash.h"
 
 #ifdef WITH_SNOWBALL
 #include "libstemmer.h"
@@ -1034,7 +1035,12 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                        NULL);
 
        if (part->normalized_words) {
+               part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
+                               sizeof (guint64), part->normalized_words->len);
+
                for (i = 0; i < part->normalized_words->len; i ++) {
+                       guint64 h;
+
                        w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
                        r = NULL;
 #ifdef WITH_SNOWBALL
@@ -1066,6 +1072,11 @@ rspamd_normalize_text_part (struct rspamd_task *task,
                                        w->begin = temp_word;
                                }
                        }
+
+                       if (w->len > 0) {
+                               h = XXH64 (w->begin, w->len, rspamd_hash_seed ());
+                               g_array_append_val (part->normalized_hashes, h);
+                       }
                }
        }
 #ifdef WITH_SNOWBALL
@@ -1082,21 +1093,21 @@ rspamd_words_levenshtein_distance (struct rspamd_task *task,
                GArray *w1, GArray *w2)
 {
        guint s1len, s2len, x, y, lastdiag, olddiag;
-       guint *column;
-       rspamd_ftok_t *s1, *s2;
+       guint *column, ret;
+       guint64 *h1, *h2;
        gint eq;
        static const guint max_words = 8192;
 
        s1len = w1->len;
        s2len = w2->len;
 
-       if (s1len > max_words) {
+       if (s1len + s2len > max_words) {
                msg_err_task ("cannot compare parts with more than %ud words: %ud",
                                max_words, s1len);
                return 0;
        }
 
-       column = g_alloca ((s1len + 1) * sizeof (guint));
+       column = g_malloc0 ((s1len + 1) * sizeof (guint));
 
        for (y = 1; y <= s1len; y++) {
                column[y] = y;
@@ -1107,16 +1118,23 @@ rspamd_words_levenshtein_distance (struct rspamd_task *task,
 
                for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
                        olddiag = column[y];
-                       s1 = &g_array_index (w1, rspamd_ftok_t, y - 1);
-                       s2 = &g_array_index (w2, rspamd_ftok_t, x - 1);
-                       eq = rspamd_ftok_cmp (s1, s2) == 0 ? 0 : 1;
+                       h1 = &g_array_index (w1, guint64, y - 1);
+                       h2 = &g_array_index (w2, guint64, x - 1);
+                       eq = h1 == h2;
+                       /*
+                        * Cost of replacement is twice higher than cost of add/delete
+                        * to calculate percentage properly
+                        */
                        column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
-                                       lastdiag + (eq));
+                                       lastdiag + (eq * 2));
                        lastdiag = olddiag;
                }
        }
 
-       return column[s1len];
+       ret = column[s1len];
+       g_free (column);
+
+       return ret;
 }
 
 static gboolean
@@ -1875,15 +1893,15 @@ rspamd_message_parse (struct rspamd_task *task)
                        }
                        else {
                                if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
-                                               p1->normalized_words && p2->normalized_words) {
+                                               p1->normalized_hashes && p2->normalized_hashes) {
 
-                                       tw = p1->normalized_words->len + p2->normalized_words->len;
+                                       tw = p1->normalized_hashes->len + p2->normalized_hashes->len;
 
                                        if (tw > 0) {
                                                dw = rspamd_words_levenshtein_distance (task,
-                                                               p1->normalized_words,
-                                                               p2->normalized_words);
-                                               diff = (2.0 * (gdouble)dw) / (gdouble)tw;
+                                                               p1->normalized_hashes,
+                                                               p2->normalized_hashes);
+                                               diff = dw / (gdouble)tw;
 
                                                msg_debug_task (
                                                                "different words: %d, total words: %d, "
index 59fa0b73c49f1bf14ef57a7f54f838a0d957a428..3994bd102bfff645237c2e7fe20c35074df49560 100644 (file)
@@ -49,6 +49,7 @@ struct mime_text_part {
        GMimeObject *parent;
        struct mime_part *mime_part;
        GArray *normalized_words;
+       GArray *normalized_hashes;
        guint nlines;
        guint64 hash;
 };
index 395d71686814e4052369b1410dfe2ca8df4e4c41..bf90dd7b359e2274b12716c97a89bfb6c4781796 100644 (file)
@@ -188,6 +188,9 @@ rspamd_task_free (struct rspamd_task *task)
                        if (tp->normalized_words) {
                                g_array_free (tp->normalized_words, TRUE);
                        }
+                       if (tp->normalized_hashes) {
+                               g_array_free (tp->normalized_hashes, TRUE);
+                       }
                }
 
                if (task->rcpt_envelope) {