aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2017-04-07 14:42:58 +0100
committerVsevolod Stakhov <vsevolod@highsecure.ru>2017-04-07 14:43:15 +0100
commit3ca316cd22f55b484b0a00a5818617dd585325b4 (patch)
treebee7488fb1dac0cd71b89f69abfca38430f622c7
parentb9ef79d942ab8c71b2b05ca9d0d1adb01d1f0755 (diff)
downloadrspamd-3ca316cd22f55b484b0a00a5818617dd585325b4.tar.gz
rspamd-3ca316cd22f55b484b0a00a5818617dd585325b4.zip
[Minor] Unify substring search routines
-rw-r--r--config.h.in4
-rw-r--r--src/libmime/message.c2
-rw-r--r--src/libserver/dkim.c2
-rw-r--r--src/libutil/str_util.c293
-rw-r--r--src/libutil/str_util.h18
5 files changed, 102 insertions, 217 deletions
diff --git a/config.h.in b/config.h.in
index 6b695aa4a..d9be4fedf 100644
--- a/config.h.in
+++ b/config.h.in
@@ -326,8 +326,11 @@ typedef off_t goffset;
#ifndef RSPAMD_ALIGNED
#if defined(_MSC_VER)
# define RSPAMD_ALIGNED(x) __declspec(align(x))
+# define RSPAMD_OPTIMIZE(x)
+# define RSPAMD_ALWAYS_INLINE
#elif defined(__GNUC__)
# define RSPAMD_ALIGNED(x) __attribute__((aligned(x)))
+# define RSPAMD_ALWAYS_INLINE __attribute__((always_inline))
#ifndef __clang__
# define RSPAMD_OPTIMIZE(x) __attribute__((__optimize__ (x)))
#else
@@ -336,6 +339,7 @@ typedef off_t goffset;
#else
# define RSPAMD_ALIGNED(x)
# define RSPAMD_OPTIMIZE(x)
+# define RSPAMD_ALWAYS_INLINE
#endif
#endif
diff --git a/src/libmime/message.c b/src/libmime/message.c
index f6d23b686..0094afc6b 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -372,7 +372,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part
if (part->content && part->content->len > sizeof (gtube_pattern) &&
part->content->len <= max_check_size) {
- if (rspamd_substring_search_twoway (part->content->data,
+ if (rspamd_substring_search (part->content->data,
part->content->len,
gtube_pattern, sizeof (gtube_pattern) - 1) != -1) {
task->flags |= RSPAMD_TASK_FLAG_SKIP;
diff --git a/src/libserver/dkim.c b/src/libserver/dkim.c
index 43b0f6268..fe7e5e8ea 100644
--- a/src/libserver/dkim.c
+++ b/src/libserver/dkim.c
@@ -1747,7 +1747,7 @@ rspamd_dkim_canonize_header (struct rspamd_dkim_common_ctx *ctx,
}
PTR_ARRAY_FOREACH (ar, i, rh) {
- if (rspamd_substring_search_twoway (rh->raw_value,
+ if (rspamd_substring_search (rh->raw_value,
rh->raw_len, dkim_domain,
strlen (dkim_domain)) != -1) {
rspamd_dkim_signature_update (ctx, rh->raw_value,
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index fbf223020..b84d3e5f3 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -1182,251 +1182,142 @@ rspamd_header_value_fold (const gchar *name,
return res;
}
-#define RKHASH(a, b, h) ((((h) - (a)*d) << 1) + (b))
+static inline bool rspamd_substring_cmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE;
+static inline bool rspamd_substring_cmp_func (guchar a, guchar b) { return a == b; }
-goffset
-rspamd_substring_search (const gchar *in, gsize inlen,
- const gchar *srch, gsize srchlen)
-{
- gint d, hash_srch, hash_in;
- gsize i, j;
-
- if (inlen < srchlen) {
- return -1;
- }
+static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) RSPAMD_ALWAYS_INLINE;
+static inline bool rspamd_substring_casecmp_func (guchar a, guchar b) { return lc_map[a] == lc_map[b]; }
- /* Preprocessing */
- for (d = i = 1; i < srchlen; ++i) {
- /* computes d = 2^(m-1) with the left-shift operator */
- d = (d << 1);
- }
+typedef bool (*rspamd_cmpchar_func_t) (guchar a, guchar b);
- for (hash_in = hash_srch = i = 0; i < srchlen; ++i) {
- hash_srch = ((hash_srch << 1) + srch[i]);
- hash_in = ((hash_in << 1) + in[i]);
- }
+static inline void
+rspamd_substring_preprocess (const gchar *pat, gsize len, goffset *fsm,
+ rspamd_cmpchar_func_t f)
+{
+ goffset i, j;
- /* Searching */
- j = 0;
- while (j <= inlen - srchlen) {
+ i = 0;
+ j = -1;
+ fsm[0] = -1;
- if (hash_srch == hash_in && memcmp (srch, in + j, srchlen) == 0) {
- return (goffset)j;
+ while (i < len) {
+ while (j > -1 && !f(pat[i], pat[j])) {
+ j = fsm[j];
}
- hash_in = RKHASH (in[j], in[j + srchlen], hash_in);
- ++j;
- }
-
- return -1;
-}
-
-goffset
-rspamd_substring_search_caseless (const gchar *in, gsize inlen,
- const gchar *srch, gsize srchlen)
-{
- gint d, hash_srch, hash_in;
- gsize i, j;
- gchar c1, c2;
+ i++;
+ j++;
- /* Searching */
- if (inlen > srchlen) {
- /* Preprocessing */
- for (d = i = 1; i < srchlen; ++i) {
- /* computes d = 2^(m-1) with the left-shift operator */
- d = (d << 1);
+ if (f(pat[i], pat[j])) {
+ fsm[i] = fsm[j];
}
-
- for (hash_in = hash_srch = i = 0; i < srchlen; ++i) {
- hash_srch = ((hash_srch << 1) + g_ascii_tolower (srch[i]));
- hash_in = ((hash_in << 1) + g_ascii_tolower (in[i]));
+ else {
+ fsm[i] = j;
}
+ }
+}
- j = 0;
- while (j <= inlen - srchlen) {
-
- if (hash_srch == hash_in &&
- rspamd_lc_cmp (srch, in + j, srchlen) == 0) {
- return (goffset) j;
- }
+static goffset
+rspamd_substring_seacrh_common (const gchar *in, gsize inlen,
+ const gchar *srch, gsize srchlen, rspamd_cmpchar_func_t f)
+{
+ goffset *fsm;
+ goffset i, j, k, ell, ret = -1;
- c1 = g_ascii_tolower (in[j]);
- c2 = g_ascii_tolower (in[j + srchlen]);
- hash_in = RKHASH (c1, c2, hash_in);
- ++j;
- }
- }
- else if (inlen == srchlen) {
- return rspamd_lc_cmp (srch, in, srchlen) == 0;
+ if (G_LIKELY (srchlen < 1024)) {
+ fsm = g_alloca ((srchlen + 1) * sizeof (*fsm));
}
else {
- return (-1);
+ fsm = g_malloc ((srchlen + 1) * sizeof (*fsm));
}
- return (-1);
-}
+ rspamd_substring_preprocess (srch, srchlen, fsm, f);
-/* Computing of the maximal suffix for <= */
-static inline gint
-rspamd_two_way_max_suffix (const gchar *srch, gint srchlen, gint *p)
-{
- gint ms, j, k;
- gchar a, b;
-
- ms = -1;
- j = 0;
- k = *p = 1;
+ for (ell = 1; f(srch[ell - 1], srch[ell]); ell++) {}
+ if (ell == srchlen) {
+ ell = 0;
+ }
- while (j + k < srchlen) {
- a = srch[j + k];
- b = srch[ms + k];
+ /* Searching */
+ i = ell;
+ j = k = 0;
- if (a < b) {
- j += k;
- k = 1;
- *p = j - ms;
+ while (j <= inlen - srchlen) {
+ while (i < srchlen && f(srch[i], in[i + j])) {
+ ++i;
}
- else if (a == b)
- if (k != *p) {
- k++;
+
+ if (i >= srchlen) {
+ while (k < ell && f(srch[k], in[j + k])) {
+ ++k;
}
- else {
- j += *p;
- k = 1;
+
+ if (k >= ell) {
+ ret = j;
+ goto out;
}
- else { /* a > b */
- ms = j;
- j = ms + 1;
- k = *p = 1;
}
- }
-
- return (ms);
-}
-/* Computing of the maximal suffix for >= */
-static inline gint
-rspamd_two_way_max_suffix_tilde (const gchar *srch, gint srchlen, gint *p)
-{
- gint ms, j, k;
- gchar a, b;
-
- ms = -1;
- j = 0;
- k = *p = 1;
+ j += (i - fsm[i]);
- while (j + k < srchlen) {
- a = srch[j + k];
- b = srch[ms + k];
-
- if (a > b) {
- j += k;
- k = 1;
- *p = j - ms;
+ if (i == ell) {
+ k = MAX(0, k - 1);
}
- else if (a == b)
- if (k != *p) {
- k ++;
- }
- else {
- j += *p;
- k = 1;
+ else {
+ if (fsm[i] <= ell) {
+ k = MAX(0, fsm[i]);
+ i = ell;
+ } else {
+ k = ell;
+ i = fsm[i];
}
- else { /* a < b */
- ms = j;
- j = ms + 1;
- k = *p = 1;
}
}
- return (ms);
+
+out:
+ if (G_UNLIKELY (srchlen >= 1024)) {
+ g_free (fsm);
+ }
+
+ return ret;
}
-/* Two Way string matching algorithm. */
goffset
-rspamd_substring_search_twoway (const gchar *in, gint inlen,
- const gchar *srch, gint srchlen)
+rspamd_substring_search (const gchar *in, gsize inlen,
+ const gchar *srch, gsize srchlen)
{
- int i, j, ell, memory, p, per, q;
-
- /* Preprocessing */
- i = rspamd_two_way_max_suffix (srch, srchlen, &p);
- j = rspamd_two_way_max_suffix_tilde (srch, srchlen, &q);
-
- if (i > j) {
- ell = i;
- per = p;
+ if (inlen > srchlen) {
+ return rspamd_substring_seacrh_common (in, inlen, srch, srchlen,
+ rspamd_substring_cmp_func);
+ }
+ else if (inlen == srchlen) {
+ return rspamd_lc_cmp (srch, in, srchlen) == 0;
}
else {
- ell = j;
- per = q;
+ return (-1);
}
- /* Searching */
- if (memcmp (srch, srch + per, ell + 1) == 0) {
- j = 0;
- memory = -1;
-
- while (j <= inlen - srchlen) {
- i = MAX (ell, memory) + 1;
-
- while (i < srchlen && srch[i] == in[i + j]) {
- i ++;
- }
-
- if (i >= srchlen) {
- i = ell;
-
- while (i > memory && srch[i] == in[i + j]) {
- i --;
- }
-
- if (i <= memory) {
- return j;
- }
+ return (-1);
+}
- j += per;
- memory = srchlen - per - 1;
- }
- else {
- j += (i - ell);
- memory = -1;
- }
- }
+goffset
+rspamd_substring_search_caseless (const gchar *in, gsize inlen,
+ const gchar *srch, gsize srchlen)
+{
+ if (inlen > srchlen) {
+ return rspamd_substring_seacrh_common (in, inlen, srch, srchlen,
+ rspamd_substring_casecmp_func);
+ }
+ else if (inlen == srchlen) {
+ return rspamd_lc_cmp (srch, in, srchlen) == 0;
}
else {
- per = MAX (ell + 1, srchlen - ell - 1) + 1;
- j = 0;
-
- while (j <= inlen - srchlen) {
- i = ell + 1;
-
- while (i < srchlen && srch[i] == in[i + j]) {
- i ++;
- }
-
- if (i >= srchlen) {
- i = ell;
-
- while (i >= 0 && srch[i] == in[i + j]) {
- i --;
- }
-
- if (i < 0) {
- return j;
- }
-
- j += per;
- }
- else {
- j += (i - ell);
- }
- }
+ return (-1);
}
- return -1;
+ return (-1);
}
-
goffset
rspamd_string_find_eoh (GString *input, goffset *body_start)
{
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index ea3d97278..2a213d738 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -264,7 +264,8 @@ GString *rspamd_header_value_fold (const gchar *name,
enum rspamd_newlines_type how);
/**
- * Search for a substring `srch` in the text `in` using Karp-Rabin algorithm
+ * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm
+ * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
* @param in input
* @param inlen input len
* @param srch search string
@@ -275,7 +276,8 @@ goffset rspamd_substring_search (const gchar *in, gsize inlen,
const gchar *srch, gsize srchlen);
/**
- * Search for a substring `srch` in the text `in` using Karp-Rabin algorithm in caseless matter (ASCII only)
+ * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in caseless matter (ASCII only)
+ * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
* @param in input
* @param inlen input len
* @param srch search string
@@ -286,18 +288,6 @@ goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen,
const gchar *srch, gsize srchlen);
/**
- * Search for a substring `srch` in the text `in` using 2-way algorithm:
- * http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
- * @param in input
- * @param inlen input len
- * @param srch search string
- * @param srchlen length of the search string
- * @return position of the first substring match or (-1) if not found
- */
-goffset rspamd_substring_search_twoway (const gchar *in, gint inlen,
- const gchar *srch, gint srchlen);
-
-/**
* Search for end-of-headers mark in the input string. Returns position just after
* the last header in message (but before the last newline character).
* Hence, to obtain the real EOH position, it is also required to skip