summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-05-25 12:41:26 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-05-25 12:41:26 +0100
commitf9e27d77aa4c223034ab3601449e5dbaf5f281ad (patch)
tree0d52998882e64b37d9f10af71af0828303491caa
parentd767512e0075444617fcea502affa2165c65df29 (diff)
downloadrspamd-f9e27d77aa4c223034ab3601449e5dbaf5f281ad.tar.gz
rspamd-f9e27d77aa4c223034ab3601449e5dbaf5f281ad.zip
[Feature] Rework levenshtein distance computation
We now consider Damerau-Levenshtein distance
-rw-r--r--src/libutil/str_util.c85
1 files changed, 66 insertions, 19 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index fc8d6637b..7c2a545c0 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -866,9 +866,8 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
const gchar *s2, gsize s2len,
guint replace_cost)
{
- guint x, y, lastdiag, olddiag;
- gchar c1, c2;
- guint *column;
+ gchar c1, c2, last_c2, last_c1;
+ static GArray *current_row = NULL, *prev_row = NULL, *transp_row = NULL;
gint eq;
static const guint max_cmp = 8192;
gint ret;
@@ -885,31 +884,79 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
if (MAX(s1len, s2len) > max_cmp) {
/* Cannot compare too many characters */
- return 0;
+ return max_cmp;
+ }
+
+ if (s1len > s2len) {
+ /* Exchange s1 and s2 */
+ const gchar *tmp;
+ gsize tmplen;
+
+ tmp = s2;
+ s2 = s1;
+ s1 = tmp;
+
+ tmplen = s2len;
+ s2len = s1len;
+ s1len = tmplen;
+ }
+
+ /* Adjust static space */
+ if (current_row == NULL) {
+ current_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
+ prev_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
+ transp_row = g_array_sized_new (FALSE, FALSE, sizeof (gint), s1len + 1);
+ g_array_set_size (current_row, s1len + 1);
+ g_array_set_size (prev_row, s1len + 1);
+ g_array_set_size (transp_row, s1len + 1);
+ }
+ else if (current_row->len < s1len + 1) {
+ g_array_set_size (current_row, s1len + 1);
+ g_array_set_size (prev_row, s1len + 1);
+ g_array_set_size (transp_row, s1len + 1);
}
- column = g_malloc0 ((s1len + 1) * sizeof (guint));
+ memset (current_row->data, 0, (s1len + 1) * sizeof (gint));
+ memset (transp_row->data, 0, (s1len + 1) * sizeof (gint));
- for (y = 1; y <= s1len; y++) {
- column[y] = y;
+ for (gint i = 0; i <= s1len; i++) {
+ g_array_index (prev_row, gint, i) = i;
}
- for (x = 1; x <= s2len; x++) {
- column[0] = x;
+ last_c2 = '\0';
+
+ for (gint i = 1; i <= s2len; i++) {
+ c2 = s2[i - 1];
+ g_array_index (current_row, gint, 0) = i;
+ last_c1 = '\0';
- for (y = 1, lastdiag = x - 1; y <= s1len; y++) {
- olddiag = column[y];
- c1 = s1[y - 1];
- c2 = s2[x - 1];
- eq = (c1 == c2) ? 0 : replace_cost;
- column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
- lastdiag + (eq));
- lastdiag = olddiag;
+ for (gint j = 1; j <= s1len; j++) {
+ c1 = s1[j - 1];
+ eq = c1 == c2 ? 0 : replace_cost;
+ ret = MIN3 (g_array_index (current_row, gint, j - 1) + 1, /* Insert */
+ g_array_index (prev_row, gint, j) + 1, /* Remove */
+ g_array_index (prev_row, gint, j - 1) + eq /* Replace */);
+
+ /* Take reordering into account */
+ if (c1 == last_c2 && c2 == last_c1 && j >= 2) {
+ ret = MIN (ret, g_array_index (transp_row, gint, j - 2) + eq);
+ }
+
+ g_array_index (current_row, gint, j) = ret;
+ last_c1 = c1;
}
+
+ last_c2 = c2;
+
+ /* Exchange pointers */
+ GArray *tmp;
+ tmp = transp_row;
+ transp_row = prev_row;
+ prev_row = current_row;
+ current_row = tmp;
}
- ret = column[s1len];
- g_free (column);
+ ret = g_array_index (prev_row, gint, s1len);
return ret;
}