From: Vsevolod Stakhov
Date: Wed, 4 May 2016 14:20:24 +0000 (+0100)
Subject: [Feature] Improve levenshtein distance function
X-Git-Tag: 1.3.0~553
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=798bd5e86f7941dd87884a4ea15c9a15179ac811;p=rspamd.git

[Feature] Improve levenshtein distance function

- Use g_malloc instead of alloca
- Allow to set variable replacement cost
- Update lua util.levenshtein_distance
---

diff --git a/src/libmime/filter.c b/src/libmime/filter.c
index f810f7508..e1a33f3e2 100644
--- a/src/libmime/filter.c
+++ b/src/libmime/filter.c
@@ -19,6 +19,7 @@
 #include "rspamd.h"
 #include "message.h"
 #include "lua/lua_common.h"
+#include "xxhash.h"
 #include
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 457e1fe5b..7d40b15fa 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -963,13 +963,15 @@ rspamd_decode_url (gchar *dst, const gchar *src, gsize size)
 
 gint
 rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
-		const gchar *s2, gsize s2len)
+		const gchar *s2, gsize s2len,
+		guint replace_cost)
 {
 	guint x, y, lastdiag, olddiag;
 	gchar c1, c2;
 	guint *column;
 	gint eq;
 	static const guint max_cmp = 8192;
+	gint ret;
 
 	g_assert (s1 != NULL);
 	g_assert (s2 != NULL);
@@ -986,7 +988,7 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
 		return 0;
 	}
 
-	column = g_alloca ((s1len + 1) * sizeof (guint));
+	column = g_malloc0 ((s1len + 1) * sizeof (guint));
 
 	for (y = 1; y <= s1len; y++) {
 		column[y] = y;
@@ -999,14 +1001,17 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
 			olddiag = column[y];
 			c1 = s1[y - 1];
 			c2 = s2[x - 1];
-			eq = (c1 == c2) ? 0 : 1;
+			eq = (c1 == c2) ? 0 : replace_cost;
 			column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
 					lastdiag + (eq));
 			lastdiag = olddiag;
 		}
 	}
 
-	return column[s1len];
+	ret = column[s1len];
+	g_free (column);
+
+	return ret;
 }
 
 GString *
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 68f84f7bc..a63b160dd 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -217,7 +217,7 @@ gsize rspamd_decode_url (gchar *dst, const gchar *src, gsize size);
  * @return
  */
 gint rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
-		const gchar *s2, gsize s2len);
+		const gchar *s2, gsize s2len, guint replace_cost);
 
 /**
  * Fold header using rfc822 rules, return new GString from the previous one
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1506676ea..05a9a4452 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -826,12 +826,18 @@ lua_util_levenshtein_distance (lua_State *L)
 	const gchar *s1, *s2;
 	gsize s1len, s2len;
 	gint dist = 0;
+	guint replace_cost = 1;
 
 	s1 = luaL_checklstring (L, 1, &s1len);
 	s2 = luaL_checklstring (L, 2, &s2len);
 
+	if (lua_isnumber (L, 3)) {
+		replace_cost = lua_tonumber (L, 3);
+	}
+
 	if (s1 && s2) {
-		dist = rspamd_strings_levenshtein_distance (s1, s1len, s2, s2len);
+		dist = rspamd_strings_levenshtein_distance (s1, s1len, s2, s2len,
+				replace_cost);
 	}
 
 	lua_pushnumber (L, dist);