From 957b99e270c5a1cf278f410ebe62f398a3f8dc33 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 15 Apr 2016 14:13:44 +0100 Subject: [PATCH] [Feature] Add two way substring search algorithm --- src/libutil/str_util.c | 158 +++++++++++++++++++++++++++++++++++++++++ src/libutil/str_util.h | 11 +++ 2 files changed, 169 insertions(+) diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 513c00d85..384a3ffdb 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -1259,6 +1259,164 @@ rspamd_substring_search_caseless (const gchar *in, gsize inlen, return -1; } +/* Computing of the maximal suffix for <= */ +static inline gint +rspamd_two_way_max_suffix (const gchar *x, gint m, gint *p) +{ + gint ms, j, k; + gchar a, b; + + ms = -1; + j = 0; + k = *p = 1; + + while (j + k < m) { + a = x[j + k]; + b = x[ms + k]; + + if (a < b) { + j += k; + k = 1; + *p = j - ms; + } + else if (a == b) + if (k != *p) + ++k; + else { + j += *p; + k = 1; + } + else { /* a > b */ + ms = j; + j = ms + 1; + k = *p = 1; + } + } + + return (ms); +} + +/* Computing of the maximal suffix for >= */ +static inline gint +rspamd_two_way_max_suffix_tilde (const gchar *x, gint m, gint *p) +{ + gint ms, j, k; + gchar a, b; + + ms = -1; + j = 0; + k = *p = 1; + + while (j + k < m) { + a = x[j + k]; + b = x[ms + k]; + + if (a > b) { + j += k; + k = 1; + *p = j - ms; + } + else if (a == b) + if (k != *p) + ++k; + else { + j += *p; + k = 1; + } + else { /* a < b */ + ms = j; + j = ms + 1; + k = *p = 1; + } + } + return (ms); +} + +/* Two Way string matching algorithm. */ +goffset +rspamd_substring_search_twoway (const gchar *in, gint inlen, + const gchar *srch, gint srchlen) +{ + int i, j, ell, memory, p, per, q; + + /* Preprocessing */ + i = rspamd_two_way_max_suffix (in, inlen, &p); + j = rspamd_two_way_max_suffix_tilde (in, inlen, &q); + + if (i > j) { + ell = i; + per = p; + } + else { + ell = j; + per = q; + } + + /* Searching */ + if (memcmp (in, in + per, ell + 1) == 0) { + j = 0; + memory = -1; + + while (j <= srchlen - inlen) { + i = MAX (ell, memory) + 1; + + while (i < inlen && in[i] == srch[i + j]) { + i ++; + } + + if (i >= inlen) { + i = ell; + + while (i > memory && in[i] == srch[i + j]) { + i --; + } + + if (i <= memory) { + return j; + } + + j += per; + memory = inlen - per - 1; + } + else { + j += (i - ell); + memory = -1; + } + } + } + else { + per = MAX (ell + 1, inlen - ell - 1) + 1; + j = 0; + + while (j <= srchlen - inlen) { + i = ell + 1; + + while (i < inlen && in[i] == srch[i + j]) { + i ++; + } + + if (i >= inlen) { + i = ell; + + while (i >= 0 && in[i] == srch[i + j]) { + i --; + } + + if (i < 0) { + return j; + } + + j += per; + } + else + j += (i - ell); + } + } + + return -1; +} + + goffset rspamd_string_find_eoh (GString *input) { diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 04132a4d3..68f84f7bc 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -251,6 +251,17 @@ goffset rspamd_substring_search (const gchar *in, gsize inlen, goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen, const gchar *srch, gsize srchlen); +/** + * Search for a substring `srch` in the text `in` using 2-way algorithm: + * http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 + * @param in input + * @param inlen input len + * @param srch search string + * @param srchlen length of the search string + * @return position of the first substring match or (-1) if not found + */ +goffset rspamd_substring_search_twoway (const gchar *in, gint inlen, + const gchar *srch, gint srchlen); /** * Search for end-of-headers mark in the input string. Returns position just after -- 2.39.5