From 58ff3a43b71928263415a8a874943de9de158018 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 4 Mar 2015 21:59:48 +0000 Subject: [PATCH] Write skeleton of rspamd fast regexps. --- src/libmime/expressions.c | 8 +-- src/libmime/expressions.h | 8 +-- src/libserver/cfg_file.h | 2 +- src/libutil/CMakeLists.txt | 1 + src/libutil/regexp.c | 101 ++++++++++++++++++++++++++++++++++ src/libutil/regexp.h | 108 +++++++++++++++++++++++++++++++++++++ src/plugins/regexp.c | 28 +++++----- 7 files changed, 233 insertions(+), 23 deletions(-) create mode 100644 src/libutil/regexp.c create mode 100644 src/libutil/regexp.h diff --git a/src/libmime/expressions.c b/src/libmime/expressions.c index 07253a2bc..769b7dc14 100644 --- a/src/libmime/expressions.c +++ b/src/libmime/expressions.c @@ -647,12 +647,12 @@ parse_expression (rspamd_mempool_t * pool, gchar *line) /* * Rspamd regexp utility functions */ -struct rspamd_regexp * +struct rspamd_regexp_element * parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode) { const gchar *begin, *end, *p, *src, *start; gchar *dbegin, *dend; - struct rspamd_regexp *result, *check; + struct rspamd_regexp_element *result, *check; gint regexp_flags = G_REGEX_OPTIMIZE | G_REGEX_NO_AUTO_CAPTURE; GError *err = NULL; @@ -662,7 +662,7 @@ parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode) } src = line; - result = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_regexp)); + result = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_regexp_element)); /* Skip whitespaces */ while (g_ascii_isspace (*line)) { line++; @@ -815,7 +815,7 @@ parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode) /* Avoid multiply regexp structures for similar regexps */ if ((check = - (struct rspamd_regexp *)re_cache_check (result->regexp_text, + (struct rspamd_regexp_element *)re_cache_check (result->regexp_text, pool)) != NULL) { /* Additional check for headers */ if (result->type == REGEXP_HEADER || result->type == diff --git a/src/libmime/expressions.h b/src/libmime/expressions.h index 1ba02d956..469cc690d 100644 --- a/src/libmime/expressions.h +++ b/src/libmime/expressions.h @@ -10,7 +10,7 @@ #include struct rspamd_task; -struct rspamd_regexp; +struct rspamd_regexp_element; /** * Rspamd expression function @@ -60,7 +60,7 @@ typedef gboolean (*rspamd_internal_func_t)(struct rspamd_task *, GList *args, * @param line incoming line * @return regexp structure or NULL in case of error */ -struct rspamd_regexp * parse_regexp (rspamd_mempool_t *pool, +struct rspamd_regexp_element * parse_regexp (rspamd_mempool_t *pool, const gchar *line, gboolean raw_mode); @@ -119,7 +119,7 @@ void re_cache_del (const gchar *line, rspamd_mempool_t *pool); * @param result numeric result of this regexp */ void task_cache_add (struct rspamd_task *task, - struct rspamd_regexp *re, + struct rspamd_regexp_element *re, gint32 result); /** @@ -128,7 +128,7 @@ void task_cache_add (struct rspamd_task *task, * @param pointer regexp data * @return numeric result if value exists or -1 if not */ -gint32 task_cache_check (struct rspamd_task *task, struct rspamd_regexp *re); +gint32 task_cache_check (struct rspamd_task *task, struct rspamd_regexp_element *re); /** * Parse and return a single function argument for a function (may recurse) diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 8af3c542f..8c58a4941 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -65,7 +65,7 @@ enum rspamd_log_type { /** * Regexp structure */ -struct rspamd_regexp { +struct rspamd_regexp_element { enum rspamd_regexp_type type; /**< regexp type */ gchar *regexp_text; /**< regexp text representation */ GRegex *regexp; /**< glib regexp structure */ diff --git a/src/libutil/CMakeLists.txt b/src/libutil/CMakeLists.txt index e7ebe2a47..3e8fd87a7 100644 --- a/src/libutil/CMakeLists.txt +++ b/src/libutil/CMakeLists.txt @@ -14,6 +14,7 @@ SET(LIBRSPAMDUTILSRC ${CMAKE_CURRENT_SOURCE_DIR}/mem_pool.c ${CMAKE_CURRENT_SOURCE_DIR}/printf.c ${CMAKE_CURRENT_SOURCE_DIR}/radix.c + ${CMAKE_CURRENT_SOURCE_DIR}/regexp.c ${CMAKE_CURRENT_SOURCE_DIR}/rrd.c ${CMAKE_CURRENT_SOURCE_DIR}/shingles.c ${CMAKE_CURRENT_SOURCE_DIR}/trie.c diff --git a/src/libutil/regexp.c b/src/libutil/regexp.c new file mode 100644 index 000000000..e4fbdef12 --- /dev/null +++ b/src/libutil/regexp.c @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2015, Vsevolod Stakhov + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "config.h" +#include "regexp.h" +#include "blake2.h" +#include "ref.h" +#include + +struct rspamd_regexp_s { + gdouble exec_time; /**< average execution time */ + pcre *re; + pcre_extra *extra; + pcre *raw_re; + pcre_extra *raw_extra; + guchar id[BLAKE2B_OUTBYTES / 2]; + ref_entry_t ref; +}; + +struct rspamd_regexp_cache { + GHashTable *tbl; +}; + +static struct rspamd_regexp_cache *global_re_cache = NULL; + +rspamd_regexp_t* +rspamd_regexp_new (const gchar *pattern, const gchar *flags, + GError **err) +{ + return NULL; +} + +gboolean +rspamd_regexp_search (rspamd_regexp_t *re, const gchar *text, gsize len) +{ + return FALSE; +} + +gboolean +rspamd_regexp_match (rspamd_regexp_t *re, const gchar *text, gsize len) +{ + return FALSE; +} + +void +rspamd_regexp_unref (rspamd_regexp_t *re) +{ + REF_RELEASE (re); +} + +struct rspamd_regexp_cache* +rspamd_regexp_cache_new (void) +{ + return NULL; +} + + +rspamd_regexp_t* +rspamd_regexp_cache_query (struct rspamd_regexp_cache* cache, + const gchar *pattern, + const gchar *flags) +{ + return NULL; +} + + +rspamd_regexp_t* +rspamd_regexp_cache_create (struct rspamd_regexp_cache *cache, + const gchar *pattern, + const gchar *flags, GError **err) +{ + return NULL; +} + +void +rspamd_regexp_cache_destroy (struct rspamd_regexp_cache *cache) +{ + +} diff --git a/src/libutil/regexp.h b/src/libutil/regexp.h new file mode 100644 index 000000000..4ca4196a0 --- /dev/null +++ b/src/libutil/regexp.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2015, Vsevolod Stakhov + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef REGEXP_H_ +#define REGEXP_H_ + +#include "config.h" + +typedef struct rspamd_regexp_s rspamd_regexp_t; +struct rspamd_regexp_cache; + +/** + * Create new rspamd regexp + * @param pattern regexp pattern + * @param flags flags (may be enclosed inside pattern) + * @param err error pointer set if compilation failed + * @return new regexp object + */ +rspamd_regexp_t* rspamd_regexp_new (const gchar *pattern, const gchar *flags, + GError **err); + +/** + * Search the specified regexp in the text + * @param re + * @param text + * @param len + * @return + */ +gboolean rspamd_regexp_search (rspamd_regexp_t *re, const gchar *text, gsize len); + + +/** + * Exact match of the specified text against the regexp + * @param re + * @param text + * @param len + * @return + */ +gboolean rspamd_regexp_match (rspamd_regexp_t *re, const gchar *text, gsize len); + +/** + * Increase refcount for a regexp object + */ +rspamd_regexp_t* rspamd_regexp_ref (rspamd_regexp_t *re); + +/** + * Unref regexp object + * @param re + */ +void rspamd_regexp_unref (rspamd_regexp_t *re); + +/** + * Create new regexp cache + * @return + */ +struct rspamd_regexp_cache* rspamd_regexp_cache_new (void); + +/** + * Query rspamd cache for a specified regexp + * @param cache regexp cache. if NULL, the superglobal cache is used (*not* thread-safe) + * @param pattern + * @param flags + * @return + */ +rspamd_regexp_t* rspamd_regexp_cache_query (struct rspamd_regexp_cache* cache, + const gchar *pattern, + const gchar *flags); + +/** + * Create or get cached regexp from the specified cache + * @param cache regexp cache. if NULL, the superglobal cache is used (*not* thread-safe) + * @param pattern regexp pattern + * @param flags flags (may be enclosed inside pattern) + * @param err error pointer set if compilation failed + * @return new regexp object + */ +rspamd_regexp_t* rspamd_regexp_cache_create (struct rspamd_regexp_cache *cache, + const gchar *pattern, + const gchar *flags, GError **err); + +/** + * Destroy regexp cache and unref all elements inside it + * @param cache + */ +void rspamd_regexp_cache_destroy (struct rspamd_regexp_cache *cache); + +#endif /* REGEXP_H_ */ diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 47cdc649f..848fdfdb2 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -120,7 +120,7 @@ G_LOCK_DEFINE (task_cache_mtx); void task_cache_add (struct rspamd_task *task, - struct rspamd_regexp *re, + struct rspamd_regexp_element *re, gint32 result) { if (result == 0) { @@ -142,7 +142,7 @@ task_cache_add (struct rspamd_task *task, } gint32 -task_cache_check (struct rspamd_task *task, struct rspamd_regexp *re) +task_cache_check (struct rspamd_task *task, struct rspamd_regexp_element *re) { gpointer res; gint32 r; @@ -370,7 +370,7 @@ regexp_module_reconfig (struct rspamd_config *cfg) struct url_regexp_param { struct rspamd_task *task; GRegex *regexp; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; gboolean found; }; @@ -405,7 +405,7 @@ tree_url_callback (gpointer key, gpointer value, void *data) } static gsize -process_regexp (struct rspamd_regexp *re, +process_regexp (struct rspamd_regexp_element *re, struct rspamd_task *task, const gchar *additional, gint limit, @@ -855,7 +855,7 @@ process_regexp_expression (struct expression *expr, GQueue *stack; gsize cur, op1, op2; struct expression *it = expr; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; gboolean try_optimize = TRUE; stack = g_queue_new (); @@ -863,7 +863,7 @@ process_regexp_expression (struct expression *expr, while (it) { if (it->type == EXPR_REGEXP_PARSED) { /* Find corresponding symbol */ - cur = process_regexp ((struct rspamd_regexp *)it->content.operand, + cur = process_regexp ((struct rspamd_regexp_element *)it->content.operand, task, additional, 0, @@ -1202,7 +1202,7 @@ rspamd_regexp_occurs_number (struct rspamd_task *task, { gint limit; struct expression_argument *arg; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; gchar *param, *err_str, op; int_compare_func f = NULL; @@ -1289,7 +1289,7 @@ match_smtp_data (struct rspamd_task *task, const gchar *re_text, const gchar *what) { - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; gint r; if (*re_text == '/') { @@ -1471,7 +1471,7 @@ lua_regexp_match (lua_State *L) void *ud = luaL_checkudata (L, 1, "rspamd{task}"); struct rspamd_task *task; const gchar *re_text; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; gint r = 0; luaL_argcheck (L, ud != NULL, 1, "'task' expected"); @@ -1504,7 +1504,7 @@ rspamd_content_type_compare_param (struct rspamd_task * task, { gchar *param_name, *param_pattern; const gchar *param_data; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; struct expression_argument *arg, *arg1; GMimeObject *part; GMimeContentType *ct; @@ -1701,7 +1701,7 @@ rspamd_content_type_is_subtype (struct rspamd_task *task, void *unused) { gchar *param_pattern; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; struct expression_argument *arg, *arg1; GMimeObject *part; GMimeContentType *ct; @@ -1806,7 +1806,7 @@ rspamd_content_type_is_type (struct rspamd_task * task, void *unused) { gchar *param_pattern; - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; struct expression_argument *arg, *arg1; GMimeObject *part; GMimeContentType *ct; @@ -1909,7 +1909,7 @@ static gboolean compare_subtype (struct rspamd_task *task, GMimeContentType * ct, gchar *subtype) { - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; gint r; if (subtype == NULL || ct == NULL) { @@ -1974,7 +1974,7 @@ common_has_content_part (struct rspamd_task * task, gint min_len, gint max_len) { - struct rspamd_regexp *re; + struct rspamd_regexp_element *re; struct mime_part *part; GList *cur; GMimeContentType *ct; -- 2.39.5