source.dussan.org Git - rspamd.git/commitdiff
[Feature] Rework levenshtein distance computation
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 25 May 2018 11:41:26 +0000 (12:41 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 25 May 2018 11:41:26 +0000 (12:41 +0100)
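We now compute the Damerau-Levenshtein distance, i.e. transpositions of two adjacent characters are taken into account in addition to insertions, removals and replacements.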

src/libutil/str_util.c

index fc8d6637ba6301154ed776bc9b64391a9838be20..7c2a545c07bbcc3cd1ff9bb432eaae83ca6958f2 100644 (file)
@@ -866,9 +866,8 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
                const gchar *s2, gsize s2len,
                guint replace_cost)
 {
-       guint x, y, lastdiag, olddiag;
-       gchar c1, c2;
-       guint *column;
+       gchar c1, c2, last_c2, last_c1;
+       static GArray *current_row = NULL, *prev_row = NULL, *transp_row = NULL;
        gint eq;
        static const guint max_cmp = 8192;
        gint ret;
@@ -885,31 +884,79 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
 
        if (MAX(s1len, s2len) > max_cmp) {
                /* Cannot compare too many characters */
-               return 0;
+               return max_cmp;
+       }
+
+       if (s1len > s2len) {
+               /* Exchange s1 and s2 */
+               const gchar *tmp;
+               gsize tmplen;
+
+               tmp = s2;
+               s2 = s1;
+               s1 = tmp;
+
+               tmplen = s2len;
+               s2len = s1len;
+               s1len = tmplen;
+       }
+
+       /* Adjust static space */
+       if (current_row == NULL) {
+               current_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
+               prev_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
+               transp_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
+               g_array_set_size (current_row, s1len + 1);
+               g_array_set_size (prev_row, s1len + 1);
+               g_array_set_size (transp_row, s1len + 1);
+       }
+       else if (current_row->len < s1len + 1) {
+               g_array_set_size (current_row, s1len + 1);
+               g_array_set_size (prev_row, s1len + 1);
+               g_array_set_size (transp_row, s1len + 1);
        }
 
-       column = g_malloc0 ((s1len + 1) * sizeof (guint));
+       memset (current_row->data, 0, (s1len + 1) * sizeof (gint));
+       memset (transp_row->data, 0, (s1len + 1) * sizeof (gint));
 
-       for (y = 1; y <= s1len; y++) {
-               column[y] = y;
+       for (gint i = 0; i <= s1len; i++) {
+               g_array_index (prev_row, gint, i) = i;
        }
 
-       for (x = 1; x <= s2len; x++) {
-               column[0] = x;
+       last_c2 = '\0';
+
+       for (gint i = 1; i <= s2len; i++) {
+               c2 = s2[i - 1];
+               g_array_index (current_row, gint, 0) = i;
+               last_c1 = '\0';
 
-               for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
-                       olddiag = column[y];
-                       c1 = s1[y - 1];
-                       c2 = s2[x - 1];
-                       eq = (c1 == c2) ? 0 : replace_cost;
-                       column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
-                                       lastdiag + (eq));
-                       lastdiag = olddiag;
+               for (gint j = 1; j <= s1len; j++) {
+                       c1 = s1[j - 1];
+                       eq = c1 == c2 ? 0 : replace_cost;
+                       ret = MIN3 (g_array_index (current_row, gint, j - 1) + 1, /* Insert */
+                                       g_array_index (prev_row, gint, j) + 1, /* Remove */
+                                       g_array_index (prev_row, gint, j - 1) + eq /* Replace */);
+
+                       /* Take reordering into account */
+                       if (c1 == last_c2 && c2 == last_c1 && j >= 2) {
+                               ret = MIN (ret, g_array_index (transp_row, gint, j - 2) + eq);
+                       }
+
+                       g_array_index (current_row, gint, j) = ret;
+                       last_c1 = c1;
                }
+
+               last_c2 = c2;
+
+               /* Exchange pointers */
+               GArray *tmp;
+               tmp = transp_row;
+               transp_row = prev_row;
+               prev_row = current_row;
+               current_row = tmp;
        }
 
-       ret = column[s1len];
-       g_free (column);
+       ret = g_array_index (prev_row, gint, s1len);
 
        return ret;
 }