From be58868e60036ef9ea4b45f0c5a874426a9f882d Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 9 Dec 2015 14:54:50 +0000 Subject: [PATCH] Use hits from the cache in re cache --- src/libmime/mime_expressions.c | 13 +++++--- src/libserver/re_cache.c | 58 ++++++++++++++++++---------------- src/libserver/re_cache.h | 4 +-- src/lua/lua_task.c | 10 +++--- 4 files changed, 45 insertions(+), 40 deletions(-) diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index 8d42bf9a8..bc13859c1 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -342,6 +342,13 @@ rspamd_mime_expr_parse_regexp_atom (rspamd_mempool_t * pool, const gchar *line) result->regexp = rspamd_regexp_new (dbegin, re_flags->str, &err); + if (result->is_multiple) { + rspamd_regexp_set_maxhits (result->regexp, 0); + } + else { + rspamd_regexp_set_maxhits (result->regexp, 1); + } + g_string_free (re_flags, TRUE); if (result->regexp == NULL || err != NULL) { @@ -687,8 +694,7 @@ rspamd_mime_expr_process_regexp (struct rspamd_regexp_atom *re, re->type, re->header, strlen (re->header), - re->is_strong, - re->is_multiple); + re->is_strong); } else { ret = rspamd_re_cache_process (task, @@ -697,8 +703,7 @@ rspamd_mime_expr_process_regexp (struct rspamd_regexp_atom *re, re->type, NULL, 0, - re->is_strong, - re->is_multiple); + re->is_strong); } if (re->is_test) { diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index c29bc7a31..c310c7fb8 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -399,10 +399,11 @@ rspamd_re_cache_runtime_new (struct rspamd_re_cache *cache) static guint rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt, rspamd_regexp_t *re, const guchar *in, gsize len, - gboolean is_raw, gboolean is_multiple) + gboolean is_raw) { guint r = 0; const gchar *start = NULL, *end = NULL; + guint max_hits = rspamd_regexp_get_maxhits (re); if (len == 0) { len = strlen (in); @@ -421,7 +422,7 @@ rspamd_re_cache_process_pcre (struct rspamd_re_runtime *rt, NULL)) { r++; - if (!is_multiple || r >= 0xFF) { + if (max_hits > 0 && r > max_hits) { break; } } @@ -446,29 +447,32 @@ rspamd_re_cache_hyperscan_cb (unsigned int id, struct rspamd_re_hyperscan_cbdata *cbdata = ud; struct rspamd_re_runtime *rt; struct rspamd_re_cache_elt *pcre_elt; - guint ret; + guint ret, maxhits; rt = cbdata->rt; + pcre_elt = g_ptr_array_index (rt->cache->re, id); if (flags & HS_FLAG_PREFILTER) { if (!isset (rt->checked, id)) { /* We need to match the corresponding pcre first */ - pcre_elt = g_ptr_array_index (rt->cache->re, id); ret = rspamd_re_cache_process_pcre (rt, pcre_elt->re, cbdata->in + from, to - from, - FALSE, - TRUE); + FALSE); setbit (rt->checked, id); rt->results[id] = ret; } } else { + maxhits = rspamd_regexp_get_maxhits (pcre_elt->re); setbit (rt->checked, id); - rt->results[id] ++; + + if (maxhits == 0 || rt->results[id] < maxhits) { + rt->results[id]++; + } } return 0; @@ -479,7 +483,7 @@ static guint rspamd_re_cache_process_regexp_data (struct rspamd_re_runtime *rt, rspamd_regexp_t *re, const guchar *in, gsize len, - gboolean is_raw, gboolean is_multiple) + gboolean is_raw) { struct rspamd_re_cache_elt *elt; struct rspamd_re_class *re_class; @@ -491,14 +495,14 @@ rspamd_re_cache_process_regexp_data (struct rspamd_re_runtime *rt, re_class = rspamd_regexp_get_class (re); #ifndef WITH_HYPERSCAN - ret = rspamd_re_cache_process_pcre (rt, re, in, len, is_raw, is_multiple); + ret = rspamd_re_cache_process_pcre (rt, re, in, len, is_raw); setbit (rt->checked, re_id); rt->results[re_id] = ret; #else struct rspamd_re_hyperscan_cbdata cbdata; if (elt->match_type == RSPAMD_RE_CACHE_PCRE) { - ret = rspamd_re_cache_process_pcre (rt, re, in, len, is_raw, is_multiple); + ret = rspamd_re_cache_process_pcre (rt, re, in, len, is_raw); setbit (rt->checked, re_id); rt->results[re_id] = ret; } @@ -560,8 +564,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, struct rspamd_re_runtime *rt, rspamd_regexp_t *re, struct rspamd_re_class *re_class, - gboolean is_strong, - gboolean is_multiple) + gboolean is_strong) { guint ret = 0, i; GList *cur, *headerlist; @@ -604,7 +607,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, /* Match re */ if (in) { ret += rspamd_re_cache_process_regexp_data (rt, re, in, - strlen (in), raw, is_multiple); + strlen (in), raw); debug_task ("checking header %s regexp: %s -> %d", re_class->type_data, rspamd_regexp_get_pattern (re), ret); @@ -619,7 +622,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, in = task->raw_headers_content.begin; len = task->raw_headers_content.len; ret = rspamd_re_cache_process_regexp_data (rt, re, in, - len, raw, is_multiple); + len, raw); debug_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; @@ -649,7 +652,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, if (len > 0) { ret += rspamd_re_cache_process_regexp_data (rt, re, in, - len, raw, is_multiple); + len, raw); debug_task ("checking mime regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); } @@ -665,7 +668,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, raw = FALSE; ret += rspamd_re_cache_process_regexp_data (rt, re, in, - len, raw, is_multiple); + len, raw); } g_hash_table_iter_init (&it, task->emails); @@ -677,7 +680,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, raw = FALSE; ret += rspamd_re_cache_process_regexp_data (rt, re, in, - len, raw, is_multiple); + len, raw); } debug_task ("checking url regexp: %s -> %d", @@ -689,7 +692,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, len = task->msg.len; ret = rspamd_re_cache_process_regexp_data (rt, re, in, - len, raw, is_multiple); + len, raw); debug_task ("checking rawbody regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; @@ -711,8 +714,7 @@ rspamd_re_cache_process (struct rspamd_task *task, enum rspamd_re_type type, gpointer type_data, gsize datalen, - gboolean is_strong, - gboolean is_multiple) + gboolean is_strong) { guint64 re_id; struct rspamd_re_class *re_class; @@ -733,12 +735,7 @@ rspamd_re_cache_process (struct rspamd_task *task, if (isset (rt->checked, re_id)) { /* Fast path */ - if (is_multiple) { - return rt->results[re_id]; - } - else { - return rt->results[re_id] ? 1 : 0; - } + return rt->results[re_id]; } else { /* Slow path */ @@ -751,7 +748,7 @@ rspamd_re_cache_process (struct rspamd_task *task, } return rspamd_re_cache_exec_re (task, rt, re, re_class, - is_strong, is_multiple); + is_strong); } return 0; @@ -999,12 +996,19 @@ rspamd_re_cache_compile_hyperscan (struct rspamd_re_cache *cache, hs_flags[i] = 0; pcre_flags = rspamd_regexp_get_pcre_flags (re); + if (pcre_flags & PCRE_UTF8) { hs_flags[i] |= HS_FLAG_UTF8; } if (pcre_flags & PCRE_CASELESS) { hs_flags[i] |= HS_FLAG_CASELESS; } + if (pcre_flags & PCRE_MULTILINE) { + hs_flags[i] |= HS_FLAG_MULTILINE; + } + if (rspamd_regexp_get_maxhits (re) == 1) { + hs_flags[i] |= HS_FLAG_SINGLEMATCH; + } if (hs_compile (rspamd_regexp_get_pattern (re), hs_flags[i], diff --git a/src/libserver/re_cache.h b/src/libserver/re_cache.h index 310056515..c812b8ef3 100644 --- a/src/libserver/re_cache.h +++ b/src/libserver/re_cache.h @@ -86,7 +86,6 @@ struct rspamd_re_runtime* rspamd_re_cache_runtime_new (struct rspamd_re_cache *c * @param type_data associated data with the type (e.g. header name) * @param datalen associated data length * @param is_strong use case sensitive match when looking for headers - * @param is_multiple return multiple possible occurrences of the specified re */ gint rspamd_re_cache_process (struct rspamd_task *task, struct rspamd_re_runtime *rt, @@ -94,8 +93,7 @@ gint rspamd_re_cache_process (struct rspamd_task *task, enum rspamd_re_type type, gpointer type_data, gsize datalen, - gboolean is_strong, - gboolean is_multiple); + gboolean is_strong); /** * Destroy runtime data diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 8246d921b..8529489f6 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -440,7 +440,6 @@ LUA_FUNCTION_DEF (task, set_settings); * + `url`: url regexp * - `header`: for header and rawheader regexp means the name of header * - `strong`: case sensitive match for headers - * - `multiple`: allow multiple matches * @return {number} number of regexp occurences in the task (limited by 255 so far) */ LUA_FUNCTION_DEF (task, process_regexp); @@ -2007,7 +2006,7 @@ lua_task_process_regexp (lua_State *L) { struct rspamd_task *task = lua_check_task (L, 1); struct rspamd_lua_regexp *re = NULL; - gboolean strong = FALSE, multiple = FALSE; + gboolean strong = FALSE; const gchar *type_str = NULL, *header_str = NULL; gsize header_len = 0; GError *err = NULL; @@ -2024,13 +2023,12 @@ lua_task_process_regexp (lua_State *L) * + `url`: url regexp * - `header`: for header and rawheader regexp means the name of header * - `strong`: case sensitive match for headers - * - `multiple`: allow multiple matches */ if (task != NULL) { if (!rspamd_lua_parse_table_arguments (L, 2, &err, - "*re=U{regexp};*type=S;header=V;strong=B;multiple=B", + "*re=U{regexp};*type=S;header=V;strong=B", &re, &type_str, &header_len, &header_str, - &strong, &multiple)) { + &strong)) { msg_err_task ("cannot get parameters list: %e", err); if (err) { @@ -2047,7 +2045,7 @@ lua_task_process_regexp (lua_State *L) } else { ret = rspamd_re_cache_process (task, task->re_rt, re->re, type, - (gpointer) header_str, header_len, strong, multiple); + (gpointer) header_str, header_len, strong); } } } -- 2.39.5