diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-04-07 14:42:58 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-04-07 14:43:15 +0100 |
commit | 3ca316cd22f55b484b0a00a5818617dd585325b4 (patch) | |
tree | bee7488fb1dac0cd71b89f69abfca38430f622c7 /src/libutil | |
parent | b9ef79d942ab8c71b2b05ca9d0d1adb01d1f0755 (diff) | |
download | rspamd-3ca316cd22f55b484b0a00a5818617dd585325b4.tar.gz rspamd-3ca316cd22f55b484b0a00a5818617dd585325b4.zip |
[Minor] Unify substring search routines
Diffstat (limited to 'src/libutil')
-rw-r--r-- | src/libutil/str_util.c | 293 | ||||
-rw-r--r-- | src/libutil/str_util.h | 18 |
2 files changed, 96 insertions, 215 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index fbf223020..b84d3e5f3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -1182,251 +1182,142 @@ rspamd_header_value_fold (const gchar *name, return res; } -#define RKHASH(a, b, h) ((((h) - (a)*d) << 1) + (b)) +static inline bool rspamd_substring_cmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE; +static inline bool rspamd_substring_cmp_func (guchar a, guchar b) { return a == b; } -goffset -rspamd_substring_search (const gchar *in, gsize inlen, - const gchar *srch, gsize srchlen) -{ - gint d, hash_srch, hash_in; - gsize i, j; - - if (inlen < srchlen) { - return -1; - } +static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE; +static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) { return lc_map[a] == lc_map[b]; } - /* Preprocessing */ - for (d = i = 1; i < srchlen; ++i) { - /* computes d = 2^(m-1) with the left-shift operator */ - d = (d << 1); - } +typedef bool (*rspamd_cmpchar_func_t) (guchar a, guchar b); - for (hash_in = hash_srch = i = 0; i < srchlen; ++i) { - hash_srch = ((hash_srch << 1) + srch[i]); - hash_in = ((hash_in << 1) + in[i]); - } +static inline void +rspamd_substring_preprocess (const gchar *pat, gsize len, goffset *fsm, + rspamd_cmpchar_func_t f) +{ + goffset i, j; - /* Searching */ - j = 0; - while (j <= inlen - srchlen) { + i = 0; + j = -1; + fsm[0] = -1; - if (hash_srch == hash_in && memcmp (srch, in + j, srchlen) == 0) { - return (goffset)j; + while (i < len) { + while (j > -1 && !f(pat[i], pat[j])) { + j = fsm[j]; } - hash_in = RKHASH (in[j], in[j + srchlen], hash_in); - ++j; - } - - return -1; -} - -goffset -rspamd_substring_search_caseless (const gchar *in, gsize inlen, - const gchar *srch, gsize srchlen) -{ - gint d, hash_srch, hash_in; - gsize i, j; - gchar c1, c2; + i++; + j++; - /* Searching */ - if (inlen > srchlen) { - /* Preprocessing */ - for (d = i = 1; i < srchlen; ++i) { - /* computes d = 2^(m-1) with the left-shift operator */ - d = (d << 1); + if (f(pat[i], pat[j])) { + fsm[i] = fsm[j]; } - - for (hash_in = hash_srch = i = 0; i < srchlen; ++i) { - hash_srch = ((hash_srch << 1) + g_ascii_tolower (srch[i])); - hash_in = ((hash_in << 1) + g_ascii_tolower (in[i])); + else { + fsm[i] = j; } + } +} - j = 0; - while (j <= inlen - srchlen) { - - if (hash_srch == hash_in && - rspamd_lc_cmp (srch, in + j, srchlen) == 0) { - return (goffset) j; - } +static goffset +rspamd_substring_seacrh_common (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen, rspamd_cmpchar_func_t f) +{ + goffset *fsm; + goffset i, j, k, ell, ret = -1; - c1 = g_ascii_tolower (in[j]); - c2 = g_ascii_tolower (in[j + srchlen]); - hash_in = RKHASH (c1, c2, hash_in); - ++j; - } - } - else if (inlen == srchlen) { - return rspamd_lc_cmp (srch, in, srchlen) == 0; + if (G_LIKELY (srchlen < 1024)) { + fsm = g_alloca ((srchlen + 1) * sizeof (*fsm)); } else { - return (-1); + fsm = g_malloc ((srchlen + 1) * sizeof (*fsm)); } - return (-1); -} + rspamd_substring_preprocess (srch, srchlen, fsm, f); -/* Computing of the maximal suffix for <= */ -static inline gint -rspamd_two_way_max_suffix (const gchar *srch, gint srchlen, gint *p) -{ - gint ms, j, k; - gchar a, b; - - ms = -1; - j = 0; - k = *p = 1; + for (ell = 1; f(srch[ell - 1], srch[ell]); ell++) {} + if (ell == srchlen) { + ell = 0; + } - while (j + k < srchlen) { - a = srch[j + k]; - b = srch[ms + k]; + /* Searching */ + i = ell; + j = k = 0; - if (a < b) { - j += k; - k = 1; - *p = j - ms; + while (j <= inlen - srchlen) { + while (i < srchlen && f(srch[i], in[i + j])) { + ++i; } - else if (a == b) - if (k != *p) { - k++; + + if (i >= srchlen) { + while (k < ell && f(srch[k], in[j + k])) { + ++k; } - else { - j += *p; - k = 1; + + if (k >= ell) { + ret = j; + goto out; } - else { /* a > b */ - ms = j; - j = ms + 1; - k = *p = 1; } - } - - return (ms); -} -/* Computing of the maximal suffix for >= */ -static inline gint -rspamd_two_way_max_suffix_tilde (const gchar *srch, gint srchlen, gint *p) -{ - gint ms, j, k; - gchar a, b; - - ms = -1; - j = 0; - k = *p = 1; + j += (i - fsm[i]); - while (j + k < srchlen) { - a = srch[j + k]; - b = srch[ms + k]; - - if (a > b) { - j += k; - k = 1; - *p = j - ms; + if (i == ell) { + k = MAX(0, k - 1); } - else if (a == b) - if (k != *p) { - k ++; - } - else { - j += *p; - k = 1; + else { + if (fsm[i] <= ell) { + k = MAX(0, fsm[i]); + i = ell; + } else { + k = ell; + i = fsm[i]; } - else { /* a < b */ - ms = j; - j = ms + 1; - k = *p = 1; } } - return (ms); + +out: + if (G_UNLIKELY (srchlen >= 1024)) { + g_free (fsm); + } + + return ret; } -/* Two Way string matching algorithm. */ goffset -rspamd_substring_search_twoway (const gchar *in, gint inlen, - const gchar *srch, gint srchlen) +rspamd_substring_search (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen) { - int i, j, ell, memory, p, per, q; - - /* Preprocessing */ - i = rspamd_two_way_max_suffix (srch, srchlen, &p); - j = rspamd_two_way_max_suffix_tilde (srch, srchlen, &q); - - if (i > j) { - ell = i; - per = p; + if (inlen > srchlen) { + return rspamd_substring_seacrh_common (in, inlen, srch, srchlen, + rspamd_substring_cmp_func); + } + else if (inlen == srchlen) { + return rspamd_lc_cmp (srch, in, srchlen) == 0; } else { - ell = j; - per = q; + return (-1); } - /* Searching */ - if (memcmp (srch, srch + per, ell + 1) == 0) { - j = 0; - memory = -1; - - while (j <= inlen - srchlen) { - i = MAX (ell, memory) + 1; - - while (i < srchlen && srch[i] == in[i + j]) { - i ++; - } - - if (i >= srchlen) { - i = ell; - - while (i > memory && srch[i] == in[i + j]) { - i --; - } - - if (i <= memory) { - return j; - } + return (-1); +} - j += per; - memory = srchlen - per - 1; - } - else { - j += (i - ell); - memory = -1; - } - } +goffset +rspamd_substring_search_caseless (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen) +{ + if (inlen > srchlen) { + return rspamd_substring_seacrh_common (in, inlen, srch, srchlen, + rspamd_substring_casecmp_func); + } + else if (inlen == srchlen) { + return rspamd_lc_cmp (srch, in, srchlen) == 0; } else { - per = MAX (ell + 1, srchlen - ell - 1) + 1; - j = 0; - - while (j <= inlen - srchlen) { - i = ell + 1; - - while (i < srchlen && srch[i] == in[i + j]) { - i ++; - } - - if (i >= srchlen) { - i = ell; - - while (i >= 0 && srch[i] == in[i + j]) { - i --; - } - - if (i < 0) { - return j; - } - - j += per; - } - else { - j += (i - ell); - } - } + return (-1); } - return -1; + return (-1); } - goffset rspamd_string_find_eoh (GString *input, goffset *body_start) { diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index ea3d97278..2a213d738 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -264,7 +264,8 @@ GString *rspamd_header_value_fold (const gchar *name, enum rspamd_newlines_type how); /** - * Search for a substring `srch` in the text `in` using Karp-Rabin algorithm + * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm + * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120 * @param in input * @param inlen input len * @param srch search string @@ -275,7 +276,8 @@ goffset rspamd_substring_search (const gchar *in, gsize inlen, const gchar *srch, gsize srchlen); /** - * Search for a substring `srch` in the text `in` using Karp-Rabin algorithm in caseless matter (ASCII only) + * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in caseless matter (ASCII only) + * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120 * @param in input * @param inlen input len * @param srch search string @@ -286,18 +288,6 @@ goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen, const gchar *srch, gsize srchlen); /** - * Search for a substring `srch` in the text `in` using 2-way algorithm: - * http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - * @param in input - * @param inlen input len - * @param srch search string - * @param srchlen length of the search string - * @return position of the first substring match or (-1) if not found - */ -goffset rspamd_substring_search_twoway (const gchar *in, gint inlen, - const gchar *srch, gint srchlen); - -/** * Search for end-of-headers mark in the input string. Returns position just after * the last header in message (but before the last newline character). * Hence, to obtain the real EOH position, it is also required to skip |