diff options
Diffstat (limited to 'src/libutil/str_util.c')
-rw-r--r-- | src/libutil/str_util.c | 293 |
1 files changed, 92 insertions, 201 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index fbf223020..b84d3e5f3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -1182,251 +1182,142 @@ rspamd_header_value_fold (const gchar *name, return res; } -#define RKHASH(a, b, h) ((((h) - (a)*d) << 1) + (b)) +static inline bool rspamd_substring_cmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE; +static inline bool rspamd_substring_cmp_func (guchar a, guchar b) { return a == b; } -goffset -rspamd_substring_search (const gchar *in, gsize inlen, - const gchar *srch, gsize srchlen) -{ - gint d, hash_srch, hash_in; - gsize i, j; - - if (inlen < srchlen) { - return -1; - } +static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE; +static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) { return lc_map[a] == lc_map[b]; } - /* Preprocessing */ - for (d = i = 1; i < srchlen; ++i) { - /* computes d = 2^(m-1) with the left-shift operator */ - d = (d << 1); - } +typedef bool (*rspamd_cmpchar_func_t) (guchar a, guchar b); - for (hash_in = hash_srch = i = 0; i < srchlen; ++i) { - hash_srch = ((hash_srch << 1) + srch[i]); - hash_in = ((hash_in << 1) + in[i]); - } +static inline void +rspamd_substring_preprocess (const gchar *pat, gsize len, goffset *fsm, + rspamd_cmpchar_func_t f) +{ + goffset i, j; - /* Searching */ - j = 0; - while (j <= inlen - srchlen) { + i = 0; + j = -1; + fsm[0] = -1; - if (hash_srch == hash_in && memcmp (srch, in + j, srchlen) == 0) { - return (goffset)j; + while (i < len) { + while (j > -1 && !f(pat[i], pat[j])) { + j = fsm[j]; } - hash_in = RKHASH (in[j], in[j + srchlen], hash_in); - ++j; - } - - return -1; -} - -goffset -rspamd_substring_search_caseless (const gchar *in, gsize inlen, - const gchar *srch, gsize srchlen) -{ - gint d, hash_srch, hash_in; - gsize i, j; - gchar c1, c2; + i++; + j++; - /* Searching */ - if (inlen > srchlen) { - /* Preprocessing */ - for (d = i = 1; i < srchlen; ++i) { - /* computes d = 2^(m-1) with the left-shift operator */ - d = (d << 1); + if (f(pat[i], pat[j])) { + fsm[i] = fsm[j]; } - - for (hash_in = hash_srch = i = 0; i < srchlen; ++i) { - hash_srch = ((hash_srch << 1) + g_ascii_tolower (srch[i])); - hash_in = ((hash_in << 1) + g_ascii_tolower (in[i])); + else { + fsm[i] = j; } + } +} - j = 0; - while (j <= inlen - srchlen) { - - if (hash_srch == hash_in && - rspamd_lc_cmp (srch, in + j, srchlen) == 0) { - return (goffset) j; - } +static goffset +rspamd_substring_seacrh_common (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen, rspamd_cmpchar_func_t f) +{ + goffset *fsm; + goffset i, j, k, ell, ret = -1; - c1 = g_ascii_tolower (in[j]); - c2 = g_ascii_tolower (in[j + srchlen]); - hash_in = RKHASH (c1, c2, hash_in); - ++j; - } - } - else if (inlen == srchlen) { - return rspamd_lc_cmp (srch, in, srchlen) == 0; + if (G_LIKELY (srchlen < 1024)) { + fsm = g_alloca ((srchlen + 1) * sizeof (*fsm)); } else { - return (-1); + fsm = g_malloc ((srchlen + 1) * sizeof (*fsm)); } - return (-1); -} + rspamd_substring_preprocess (srch, srchlen, fsm, f); -/* Computing of the maximal suffix for <= */ -static inline gint -rspamd_two_way_max_suffix (const gchar *srch, gint srchlen, gint *p) -{ - gint ms, j, k; - gchar a, b; - - ms = -1; - j = 0; - k = *p = 1; + for (ell = 1; f(srch[ell - 1], srch[ell]); ell++) {} + if (ell == srchlen) { + ell = 0; + } - while (j + k < srchlen) { - a = srch[j + k]; - b = srch[ms + k]; + /* Searching */ + i = ell; + j = k = 0; - if (a < b) { - j += k; - k = 1; - *p = j - ms; + while (j <= inlen - srchlen) { + while (i < srchlen && f(srch[i], in[i + j])) { + ++i; } - else if (a == b) - if (k != *p) { - k++; + + if (i >= srchlen) { + while (k < ell && f(srch[k], in[j + k])) { + ++k; } - else { - j += *p; - k = 1; + + if (k >= ell) { + ret = j; + goto out; } - else { /* a > b */ - ms = j; - j = ms + 1; - k = *p = 1; } - } - - return (ms); -} -/* Computing of the maximal suffix for >= */ -static inline gint -rspamd_two_way_max_suffix_tilde (const gchar *srch, gint srchlen, gint *p) -{ - gint ms, j, k; - gchar a, b; - - ms = -1; - j = 0; - k = *p = 1; + j += (i - fsm[i]); - while (j + k < srchlen) { - a = srch[j + k]; - b = srch[ms + k]; - - if (a > b) { - j += k; - k = 1; - *p = j - ms; + if (i == ell) { + k = MAX(0, k - 1); } - else if (a == b) - if (k != *p) { - k ++; - } - else { - j += *p; - k = 1; + else { + if (fsm[i] <= ell) { + k = MAX(0, fsm[i]); + i = ell; + } else { + k = ell; + i = fsm[i]; } - else { /* a < b */ - ms = j; - j = ms + 1; - k = *p = 1; } } - return (ms); + +out: + if (G_UNLIKELY (srchlen >= 1024)) { + g_free (fsm); + } + + return ret; } -/* Two Way string matching algorithm. */ goffset -rspamd_substring_search_twoway (const gchar *in, gint inlen, - const gchar *srch, gint srchlen) +rspamd_substring_search (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen) { - int i, j, ell, memory, p, per, q; - - /* Preprocessing */ - i = rspamd_two_way_max_suffix (srch, srchlen, &p); - j = rspamd_two_way_max_suffix_tilde (srch, srchlen, &q); - - if (i > j) { - ell = i; - per = p; + if (inlen > srchlen) { + return rspamd_substring_seacrh_common (in, inlen, srch, srchlen, + rspamd_substring_cmp_func); + } + else if (inlen == srchlen) { + return rspamd_lc_cmp (srch, in, srchlen) == 0; } else { - ell = j; - per = q; + return (-1); } - /* Searching */ - if (memcmp (srch, srch + per, ell + 1) == 0) { - j = 0; - memory = -1; - - while (j <= inlen - srchlen) { - i = MAX (ell, memory) + 1; - - while (i < srchlen && srch[i] == in[i + j]) { - i ++; - } - - if (i >= srchlen) { - i = ell; - - while (i > memory && srch[i] == in[i + j]) { - i --; - } - - if (i <= memory) { - return j; - } + return (-1); +} - j += per; - memory = srchlen - per - 1; - } - else { - j += (i - ell); - memory = -1; - } - } +goffset +rspamd_substring_search_caseless (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen) +{ + if (inlen > srchlen) { + return rspamd_substring_seacrh_common (in, inlen, srch, srchlen, + rspamd_substring_casecmp_func); + } + else if (inlen == srchlen) { + return rspamd_lc_cmp (srch, in, srchlen) == 0; } else { - per = MAX (ell + 1, srchlen - ell - 1) + 1; - j = 0; - - while (j <= inlen - srchlen) { - i = ell + 1; - - while (i < srchlen && srch[i] == in[i + j]) { - i ++; - } - - if (i >= srchlen) { - i = ell; - - while (i >= 0 && srch[i] == in[i + j]) { - i --; - } - - if (i < 0) { - return j; - } - - j += per; - } - else { - j += (i - ell); - } - } + return (-1); } - return -1; + return (-1); } - goffset rspamd_string_find_eoh (GString *input, goffset *body_start) { |