about | summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-05-04 15:20:24 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-05-04 15:20:24 +0100
commit: 798bd5e86f7941dd87884a4ea15c9a15179ac811 (patch)
tree: 94823482c4b436b2e5cc4186b495c2545a4ee7ec /src
parent: 6bb2daddb07642bbd5acb6ae8e5070d7eba49352 (diff)
download: rspamd-798bd5e86f7941dd87884a4ea15c9a15179ac811.tar.gz
download: rspamd-798bd5e86f7941dd87884a4ea15c9a15179ac811.zip
[Feature] Improve levenshtein distance function
- Use g_malloc instead of alloca
- Allow setting a variable replacement cost
- Update Lua util.levenshtein_distance
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/filter.c 1
-rw-r--r-- src/libutil/str_util.c 13
-rw-r--r-- src/libutil/str_util.h 2
-rw-r--r-- src/lua/lua_util.c 8
4 files changed, 18 insertions, 6 deletions
diff --git a/src/libmime/filter.c b/src/libmime/filter.c
index f810f7508..e1a33f3e2 100644
--- a/src/libmime/filter.c
+++ b/src/libmime/filter.c
@@ -19,6 +19,7 @@
#include "rspamd.h"
#include "message.h"
#include "lua/lua_common.h"
+#include "xxhash.h"
#include <math.h>
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 457e1fe5b..7d40b15fa 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -963,13 +963,15 @@ rspamd_decode_url (gchar *dst, const gchar *src, gsize size)
gint
rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
- const gchar *s2, gsize s2len)
+ const gchar *s2, gsize s2len,
+ guint replace_cost)
{
guint x, y, lastdiag, olddiag;
gchar c1, c2;
guint *column;
gint eq;
static const guint max_cmp = 8192;
+ gint ret;
g_assert (s1 != NULL);
g_assert (s2 != NULL);
@@ -986,7 +988,7 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
return 0;
}
- column = g_alloca ((s1len + 1) * sizeof (guint));
+ column = g_malloc0 ((s1len + 1) * sizeof (guint));
for (y = 1; y <= s1len; y++) {
column[y] = y;
@@ -999,14 +1001,17 @@ rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
olddiag = column[y];
c1 = s1[y - 1];
c2 = s2[x - 1];
- eq = (c1 == c2) ? 0 : 1;
+ eq = (c1 == c2) ? 0 : replace_cost;
column[y] = MIN3 (column[y] + 1, column[y - 1] + 1,
lastdiag + (eq));
lastdiag = olddiag;
}
}
- return column[s1len];
+ ret = column[s1len];
+ g_free (column);
+
+ return ret;
}
GString *
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 68f84f7bc..a63b160dd 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -217,7 +217,7 @@ gsize rspamd_decode_url (gchar *dst, const gchar *src, gsize size);
* @return
*/
gint rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
- const gchar *s2, gsize s2len);
+ const gchar *s2, gsize s2len, guint replace_cost);
/**
* Fold header using rfc822 rules, return new GString from the previous one
diff --git a/src/lua/lua_util.c b/src/lua/lua_util.c
index 1506676ea..05a9a4452 100644
--- a/src/lua/lua_util.c
+++ b/src/lua/lua_util.c
@@ -826,12 +826,18 @@ lua_util_levenshtein_distance (lua_State *L)
const gchar *s1, *s2;
gsize s1len, s2len;
gint dist = 0;
+ guint replace_cost = 1;
s1 = luaL_checklstring (L, 1, &s1len);
s2 = luaL_checklstring (L, 2, &s2len);
+ if (lua_isnumber (L, 3)) {
+ replace_cost = lua_tonumber (L, 3);
+ }
+
if (s1 && s2) {
- dist = rspamd_strings_levenshtein_distance (s1, s1len, s2, s2len);
+ dist = rspamd_strings_levenshtein_distance (s1, s1len, s2, s2len,
+ replace_cost);
}
lua_pushnumber (L, dist);