Implement levenshtein distance for words.

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-14 19:07:49 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-07-14 19:07:49 +0100
commit: b98130a443131de8ed80cfdddfc70b3c4ccb9fc4 (patch)
tree: 3aa86b37cc0d789e03c9aab5fd0d471e971990b1
parent: f53b543ac76a3da9a72f8f6afab88318fdcea47b (diff)
download: rspamd-b98130a443131de8ed80cfdddfc70b3c4ccb9fc4.tar.gz
rspamd-b98130a443131de8ed80cfdddfc70b3c4ccb9fc4.zip
1 files changed, 41 insertions, 4 deletions
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index e64aa03e0..446493b4d 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -1165,6 +1165,43 @@ rspamd_header_exists (struct rspamd_task * task, GArray * args, void *unused)
 	return FALSE;
 }
 
+#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c)))
+
+static gint
+rspamd_words_levenshtein_distance (GArray *w1, GArray *w2)
+{
+	guint s1len, s2len, x, y, lastdiag, olddiag;
+	guint *column;
+	rspamd_fstring_t *s1, *s2;
+	gint eq;
+
+	s1len = w1->len;
+	s2len = w2->len;
+
+	column = g_alloca ((s1len + 1) * sizeof (guint));
+
+	for (y = 1; y <= s1len; y++) {
+		column[y] = y;
+	}
+
+	for (x = 1; x <= s2len; x++) {
+		column[0] = x;
+
+		for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
+			olddiag = column[y];
+			s1 = &g_array_index (w1, rspamd_fstring_t, y - 1);
+			s2 = &g_array_index (w1, rspamd_fstring_t, x - 1);
+			eq = rspamd_fstring_equal (s1, s2) ? 0 : 1;
+			column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
+					lastdiag + (eq));
+			lastdiag = olddiag;
+		}
+	}
+
+	return column[s1len];
+}
+
+
 /*
  * This function is designed to find difference between text/html and text/plain parts
  * It takes one argument: difference threshold, if we have two text parts, compare
@@ -1280,10 +1317,10 @@ rspamd_parts_distance (struct rspamd_task * task, GArray * args, void *unused)
 		if (!IS_PART_EMPTY (p1) && !IS_PART_EMPTY (p2) &&
 				p1->normalized_words && p2->normalized_words) {
 
-			tw = 0;
-			dw = 0;
-			diff = 100;
-			/* XXX: Need levenshtein distance for a set of words */
+			tw = MAX (p1->normalized_words->len, p2->normalized_words->len);
+			dw = rspamd_words_levenshtein_distance (p1->normalized_words,
+					p2->normalized_words);
+			diff = tw > 0 ? (100.0 * (gdouble)(tw - dw) / (gdouble)tw) : 100;
 
 			msg_debug (
 				"different words: %d, total words: %d, "
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-07-14 19:07:49 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-07-14 19:07:49 +0100
commit	b98130a443131de8ed80cfdddfc70b3c4ccb9fc4 (patch)
tree	3aa86b37cc0d789e03c9aab5fd0d471e971990b1
parent	f53b543ac76a3da9a72f8f6afab88318fdcea47b (diff)
download	rspamd-b98130a443131de8ed80cfdddfc70b3c4ccb9fc4.tar.gz rspamd-b98130a443131de8ed80cfdddfc70b3c4ccb9fc4.zip