From 3ca316cd22f55b484b0a00a5818617dd585325b4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 7 Apr 2017 14:42:58 +0100 Subject: [PATCH] [Minor] Unify substring search routines --- config.h.in | 4 + src/libmime/message.c | 2 +- src/libserver/dkim.c | 2 +- src/libutil/str_util.c | 293 +++++++++++++---------------------------- src/libutil/str_util.h | 18 +-- 5 files changed, 102 insertions(+), 217 deletions(-) diff --git a/config.h.in b/config.h.in index 6b695aa4a..d9be4fedf 100644 --- a/config.h.in +++ b/config.h.in @@ -326,8 +326,11 @@ typedef off_t goffset; #ifndef RSPAMD_ALIGNED #if defined(_MSC_VER) # define RSPAMD_ALIGNED(x) __declspec(align(x)) +# define RSPAMD_OPTIMIZE(x) +# define RSPAMD_ALWAYS_INLINE #elif defined(__GNUC__) # define RSPAMD_ALIGNED(x) __attribute__((aligned(x))) +# define RSPAMD_ALWAYS_INLINE __attribute__((always_inline)) #ifndef __clang__ # define RSPAMD_OPTIMIZE(x) __attribute__((__optimize__ (x))) #else @@ -336,6 +339,7 @@ typedef off_t goffset; #else # define RSPAMD_ALIGNED(x) # define RSPAMD_OPTIMIZE(x) +# define RSPAMD_ALWAYS_INLINE #endif #endif diff --git a/src/libmime/message.c b/src/libmime/message.c index f6d23b686..0094afc6b 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -372,7 +372,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part if (part->content && part->content->len > sizeof (gtube_pattern) && part->content->len <= max_check_size) { - if (rspamd_substring_search_twoway (part->content->data, + if (rspamd_substring_search (part->content->data, part->content->len, gtube_pattern, sizeof (gtube_pattern) - 1) != -1) { task->flags |= RSPAMD_TASK_FLAG_SKIP; diff --git a/src/libserver/dkim.c b/src/libserver/dkim.c index 43b0f6268..fe7e5e8ea 100644 --- a/src/libserver/dkim.c +++ b/src/libserver/dkim.c @@ -1747,7 +1747,7 @@ rspamd_dkim_canonize_header (struct rspamd_dkim_common_ctx *ctx, } PTR_ARRAY_FOREACH (ar, i, rh) { - if (rspamd_substring_search_twoway (rh->raw_value, + if (rspamd_substring_search (rh->raw_value, rh->raw_len, dkim_domain, strlen (dkim_domain)) != -1) { rspamd_dkim_signature_update (ctx, rh->raw_value, diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index fbf223020..b84d3e5f3 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -1182,251 +1182,142 @@ rspamd_header_value_fold (const gchar *name, return res; } -#define RKHASH(a, b, h) ((((h) - (a)*d) << 1) + (b)) +static inline bool rspamd_substring_cmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE; +static inline bool rspamd_substring_cmp_func (guchar a, guchar b) { return a == b; } -goffset -rspamd_substring_search (const gchar *in, gsize inlen, - const gchar *srch, gsize srchlen) -{ - gint d, hash_srch, hash_in; - gsize i, j; - - if (inlen < srchlen) { - return -1; - } +static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE; +static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) { return lc_map[a] == lc_map[b]; } - /* Preprocessing */ - for (d = i = 1; i < srchlen; ++i) { - /* computes d = 2^(m-1) with the left-shift operator */ - d = (d << 1); - } +typedef bool (*rspamd_cmpchar_func_t) (guchar a, guchar b); - for (hash_in = hash_srch = i = 0; i < srchlen; ++i) { - hash_srch = ((hash_srch << 1) + srch[i]); - hash_in = ((hash_in << 1) + in[i]); - } +static inline void +rspamd_substring_preprocess (const gchar *pat, gsize len, goffset *fsm, + rspamd_cmpchar_func_t f) +{ + goffset i, j; - /* Searching */ - j = 0; - while (j <= inlen - srchlen) { + i = 0; + j = -1; + fsm[0] = -1; - if (hash_srch == hash_in && memcmp (srch, in + j, srchlen) == 0) { - return (goffset)j; + while (i < len) { + while (j > -1 && !f(pat[i], pat[j])) { + j = fsm[j]; } - hash_in = RKHASH (in[j], in[j + srchlen], hash_in); - ++j; - } - - return -1; -} - -goffset -rspamd_substring_search_caseless (const gchar *in, gsize inlen, - const gchar *srch, gsize srchlen) -{ - gint d, hash_srch, hash_in; - gsize i, j; - gchar c1, c2; + i++; + j++; - /* Searching */ - if (inlen > srchlen) { - /* Preprocessing */ - for (d = i = 1; i < srchlen; ++i) { - /* computes d = 2^(m-1) with the left-shift operator */ - d = (d << 1); + if (f(pat[i], pat[j])) { + fsm[i] = fsm[j]; } - - for (hash_in = hash_srch = i = 0; i < srchlen; ++i) { - hash_srch = ((hash_srch << 1) + g_ascii_tolower (srch[i])); - hash_in = ((hash_in << 1) + g_ascii_tolower (in[i])); + else { + fsm[i] = j; } + } +} - j = 0; - while (j <= inlen - srchlen) { - - if (hash_srch == hash_in && - rspamd_lc_cmp (srch, in + j, srchlen) == 0) { - return (goffset) j; - } +static goffset +rspamd_substring_seacrh_common (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen, rspamd_cmpchar_func_t f) +{ + goffset *fsm; + goffset i, j, k, ell, ret = -1; - c1 = g_ascii_tolower (in[j]); - c2 = g_ascii_tolower (in[j + srchlen]); - hash_in = RKHASH (c1, c2, hash_in); - ++j; - } - } - else if (inlen == srchlen) { - return rspamd_lc_cmp (srch, in, srchlen) == 0; + if (G_LIKELY (srchlen < 1024)) { + fsm = g_alloca ((srchlen + 1) * sizeof (*fsm)); } else { - return (-1); + fsm = g_malloc ((srchlen + 1) * sizeof (*fsm)); } - return (-1); -} + rspamd_substring_preprocess (srch, srchlen, fsm, f); -/* Computing of the maximal suffix for <= */ -static inline gint -rspamd_two_way_max_suffix (const gchar *srch, gint srchlen, gint *p) -{ - gint ms, j, k; - gchar a, b; - - ms = -1; - j = 0; - k = *p = 1; + for (ell = 1; f(srch[ell - 1], srch[ell]); ell++) {} + if (ell == srchlen) { + ell = 0; + } - while (j + k < srchlen) { - a = srch[j + k]; - b = srch[ms + k]; + /* Searching */ + i = ell; + j = k = 0; - if (a < b) { - j += k; - k = 1; - *p = j - ms; + while (j <= inlen - srchlen) { + while (i < srchlen && f(srch[i], in[i + j])) { + ++i; } - else if (a == b) - if (k != *p) { - k++; + + if (i >= srchlen) { + while (k < ell && f(srch[k], in[j + k])) { + ++k; } - else { - j += *p; - k = 1; + + if (k >= ell) { + ret = j; + goto out; } - else { /* a > b */ - ms = j; - j = ms + 1; - k = *p = 1; } - } - - return (ms); -} -/* Computing of the maximal suffix for >= */ -static inline gint -rspamd_two_way_max_suffix_tilde (const gchar *srch, gint srchlen, gint *p) -{ - gint ms, j, k; - gchar a, b; - - ms = -1; - j = 0; - k = *p = 1; + j += (i - fsm[i]); - while (j + k < srchlen) { - a = srch[j + k]; - b = srch[ms + k]; - - if (a > b) { - j += k; - k = 1; - *p = j - ms; + if (i == ell) { + k = MAX(0, k - 1); } - else if (a == b) - if (k != *p) { - k ++; - } - else { - j += *p; - k = 1; + else { + if (fsm[i] <= ell) { + k = MAX(0, fsm[i]); + i = ell; + } else { + k = ell; + i = fsm[i]; } - else { /* a < b */ - ms = j; - j = ms + 1; - k = *p = 1; } } - return (ms); + +out: + if (G_UNLIKELY (srchlen >= 1024)) { + g_free (fsm); + } + + return ret; } -/* Two Way string matching algorithm. */ goffset -rspamd_substring_search_twoway (const gchar *in, gint inlen, - const gchar *srch, gint srchlen) +rspamd_substring_search (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen) { - int i, j, ell, memory, p, per, q; - - /* Preprocessing */ - i = rspamd_two_way_max_suffix (srch, srchlen, &p); - j = rspamd_two_way_max_suffix_tilde (srch, srchlen, &q); - - if (i > j) { - ell = i; - per = p; + if (inlen > srchlen) { + return rspamd_substring_seacrh_common (in, inlen, srch, srchlen, + rspamd_substring_cmp_func); + } + else if (inlen == srchlen) { + return rspamd_lc_cmp (srch, in, srchlen) == 0; } else { - ell = j; - per = q; + return (-1); } - /* Searching */ - if (memcmp (srch, srch + per, ell + 1) == 0) { - j = 0; - memory = -1; - - while (j <= inlen - srchlen) { - i = MAX (ell, memory) + 1; - - while (i < srchlen && srch[i] == in[i + j]) { - i ++; - } - - if (i >= srchlen) { - i = ell; - - while (i > memory && srch[i] == in[i + j]) { - i --; - } - - if (i <= memory) { - return j; - } + return (-1); +} - j += per; - memory = srchlen - per - 1; - } - else { - j += (i - ell); - memory = -1; - } - } +goffset +rspamd_substring_search_caseless (const gchar *in, gsize inlen, + const gchar *srch, gsize srchlen) +{ + if (inlen > srchlen) { + return rspamd_substring_seacrh_common (in, inlen, srch, srchlen, + rspamd_substring_casecmp_func); + } + else if (inlen == srchlen) { + return rspamd_lc_cmp (srch, in, srchlen) == 0; } else { - per = MAX (ell + 1, srchlen - ell - 1) + 1; - j = 0; - - while (j <= inlen - srchlen) { - i = ell + 1; - - while (i < srchlen && srch[i] == in[i + j]) { - i ++; - } - - if (i >= srchlen) { - i = ell; - - while (i >= 0 && srch[i] == in[i + j]) { - i --; - } - - if (i < 0) { - return j; - } - - j += per; - } - else { - j += (i - ell); - } - } + return (-1); } - return -1; + return (-1); } - goffset rspamd_string_find_eoh (GString *input, goffset *body_start) { diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index ea3d97278..2a213d738 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -264,7 +264,8 @@ GString *rspamd_header_value_fold (const gchar *name, enum rspamd_newlines_type how); /** - * Search for a substring `srch` in the text `in` using Karp-Rabin algorithm + * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm + * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120 * @param in input * @param inlen input len * @param srch search string @@ -275,7 +276,8 @@ goffset rspamd_substring_search (const gchar *in, gsize inlen, const gchar *srch, gsize srchlen); /** - * Search for a substring `srch` in the text `in` using Karp-Rabin algorithm in caseless matter (ASCII only) + * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in caseless matter (ASCII only) + * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120 * @param in input * @param inlen input len * @param srch search string @@ -285,18 +287,6 @@ goffset rspamd_substring_search (const gchar *in, gsize inlen, goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen, const gchar *srch, gsize srchlen); -/** - * Search for a substring `srch` in the text `in` using 2-way algorithm: - * http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 - * @param in input - * @param inlen input len - * @param srch search string - * @param srchlen length of the search string - * @return position of the first substring match or (-1) if not found - */ -goffset rspamd_substring_search_twoway (const gchar *in, gint inlen, - const gchar *srch, gint srchlen); - /** * Search for end-of-headers mark in the input string. Returns position just after * the last header in message (but before the last newline character). -- 2.39.5