From: Vsevolod Stakhov Date: Thu, 20 Aug 2015 13:19:55 +0000 (+0100) Subject: Add function to calculate lev distance between strings X-Git-Tag: 1.0.0~184 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=a696a5e1c89118c44ff88ee15c3a662d07015119;p=rspamd.git Add function to calculate lev distance between strings --- diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 780339e37..00ef4e4ed 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -795,3 +795,52 @@ rspamd_decode_url (gchar *dst, const gchar *src, gsize size) return (d - dst); } +#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) + +gint +rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len, + const gchar *s2, gsize s2len) +{ + guint x, y, lastdiag, olddiag; + gchar c1, c2; + guint *column; + gint eq; + static const guint max_cmp = 8192; + + g_assert (s1 != NULL); + g_assert (s2 != NULL); + + if (s1len == 0) { + s1len = strlen (s1); + } + if (s2len == 0) { + s2len = strlen (s2); + } + + if (MAX(s1len, s2len) > max_cmp) { + /* Cannot compare too many characters */ + return 0; + } + + column = g_alloca ((s1len + 1) * sizeof (guint)); + + for (y = 1; y <= s1len; y++) { + column[y] = y; + } + + for (x = 1; x <= s2len; x++) { + column[0] = x; + + for (y = 1, lastdiag = x - 1; y <= s1len; y++) { + olddiag = column[y]; + c1 = s1[y - 1]; + c2 = s2[x - 1]; + eq = (c1 == c2) ? 0 : 1; + column[y] = MIN3 (column[y] + 1, column[y - 1] + 1, + lastdiag + (eq)); + lastdiag = olddiag; + } + } + + return column[s1len]; +} diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 986fc7f03..e07c51701 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -141,4 +141,15 @@ gsize rspamd_decode_url (gchar *dst, const gchar *src, gsize size); # define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) #endif +/** + * Return levenstein distance between two strings + * @param s1 + * @param s1len + * @param s2 + * @param s2len + * @return + */ +gint rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len, + const gchar *s2, gsize s2len); + #endif /* SRC_LIBUTIL_STR_UTIL_H_ */