From 908df97327a0e7e0c21053a67ac22e66610d30d1 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 7 Mar 2015 22:00:14 +0000 Subject: [PATCH] Start moving to the rspamd regexps. --- src/libmime/expressions.c | 83 ++++++++--------------------------- src/libserver/cfg_file.h | 5 +-- src/libutil/regexp.c | 4 ++ src/plugins/regexp.c | 91 +++++++++------------------------------ 4 files changed, 44 insertions(+), 139 deletions(-) diff --git a/src/libmime/expressions.c b/src/libmime/expressions.c index 769b7dc14..aab78df5c 100644 --- a/src/libmime/expressions.c +++ b/src/libmime/expressions.c @@ -652,9 +652,9 @@ parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode) { const gchar *begin, *end, *p, *src, *start; gchar *dbegin, *dend; - struct rspamd_regexp_element *result, *check; - gint regexp_flags = G_REGEX_OPTIMIZE | G_REGEX_NO_AUTO_CAPTURE; + struct rspamd_regexp_element *result; GError *err = NULL; + GString *re_flags; if (line == NULL) { msg_err ("cannot parse NULL line"); @@ -727,35 +727,20 @@ parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode) } /* Parse flags */ p = end + 1; + re_flags = g_string_sized_new (32); while (p != NULL) { switch (*p) { case 'i': - regexp_flags |= G_REGEX_CASELESS; - p++; - break; case 'm': - regexp_flags |= G_REGEX_MULTILINE; - p++; - break; case 's': - regexp_flags |= G_REGEX_DOTALL; - p++; - break; case 'x': - regexp_flags |= G_REGEX_EXTENDED; - p++; - break; case 'u': - regexp_flags |= G_REGEX_UNGREEDY; + case 'O': + case 'r': + g_string_append_c (re_flags, *p); p++; break; case 'o': - regexp_flags |= G_REGEX_OPTIMIZE; - p++; - break; - case 'r': - regexp_flags |= G_REGEX_RAW; - result->is_raw = TRUE; p++; break; /* Type flags */ @@ -810,61 +795,27 @@ parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode) *dend = '\0'; if (raw_mode) { - regexp_flags |= G_REGEX_RAW; - } - - /* Avoid multiply regexp structures for similar regexps */ - if ((check = - (struct rspamd_regexp_element *)re_cache_check (result->regexp_text, - pool)) != NULL) { - /* Additional check for headers */ - if (result->type == REGEXP_HEADER || result->type == - REGEXP_RAW_HEADER) { - if (result->header && check->header) { - if (strcmp (result->header, check->header) == 0) { - return check; - } - } - } - else { - return check; - } - } - result->regexp = g_regex_new (dbegin, regexp_flags, 0, &err); - if ((regexp_flags & G_REGEX_RAW) != 0) { - result->raw_regexp = result->regexp; - } - else { - result->raw_regexp = g_regex_new (dbegin, - regexp_flags | G_REGEX_RAW, - 0, - &err); - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t) g_regex_unref, - (void *)result->raw_regexp); + g_string_append_c (re_flags, 'r'); } - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t) g_regex_unref, - (void *)result->regexp); - *dend = '/'; + result->regexp = rspamd_regexp_cache_create (NULL, dbegin, re_flags->str, + &err); + + g_string_free (re_flags, TRUE); if (result->regexp == NULL || err != NULL) { msg_warn ("could not read regexp: %s while reading regexp %s", err ? err->message : "unknown error", - src); + src); return NULL; } - if (result->raw_regexp == NULL || err != NULL) { - msg_warn ("could not read raw regexp: %s while reading regexp %s", - err ? err->message : "unknown error", - src); - return NULL; - } + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t) rspamd_regexp_unref, + (void *)result->regexp); + + *dend = '/'; - /* Add to cache for further usage */ - re_cache_add (result->regexp_text, result, pool); return result; } diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 8c58a4941..44728afe8 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -12,6 +12,7 @@ #include "symbols_cache.h" #include "cfg_rcl.h" #include "ucl.h" +#include "regexp.h" #define DEFAULT_BIND_PORT 11333 #define DEFAULT_CONTROL_PORT 11334 @@ -68,11 +69,9 @@ enum rspamd_log_type { struct rspamd_regexp_element { enum rspamd_regexp_type type; /**< regexp type */ gchar *regexp_text; /**< regexp text representation */ - GRegex *regexp; /**< glib regexp structure */ - GRegex *raw_regexp; /**< glib regexp structure for raw matching */ + rspamd_regexp_t *regexp; /**< regexp structure */ gchar *header; /**< header name for header regexps */ gboolean is_test; /**< true if this expression must be tested */ - gboolean is_raw; /**< true if this regexp is done by raw matching */ gboolean is_strong; /**< true if headers search must be case sensitive */ }; diff --git a/src/libutil/regexp.c b/src/libutil/regexp.c index 6da0a663e..3b76d15e0 100644 --- a/src/libutil/regexp.c +++ b/src/libutil/regexp.c @@ -296,6 +296,10 @@ rspamd_regexp_search (rspamd_regexp_t *re, const gchar *text, gsize len, g_assert (re != NULL); g_assert (text != NULL); + if (len == 0) { + len = strlen (text); + } + if (end != NULL && *end != NULL) { /* Incremental search */ mt = (*end); diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 848fdfdb2..224d40b21 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -369,7 +369,7 @@ regexp_module_reconfig (struct rspamd_config *cfg) struct url_regexp_param { struct rspamd_task *task; - GRegex *regexp; + rspamd_regexp_t *regexp; struct rspamd_regexp_element *re; gboolean found; }; @@ -379,10 +379,9 @@ tree_url_callback (gpointer key, gpointer value, void *data) { struct url_regexp_param *param = data; struct rspamd_url *url = value; - GError *err = NULL; - if (g_regex_match_full (param->regexp, struri (url), -1, 0, 0, NULL, - &err) == TRUE) { + if (rspamd_regexp_search (param->regexp, struri (url), 0, NULL, NULL, FALSE) + == TRUE) { if (G_UNLIKELY (param->re->is_test)) { msg_info ("process test regexp %s for url %s returned TRUE", struri (url)); @@ -395,11 +394,6 @@ tree_url_callback (gpointer key, gpointer value, void *data) msg_info ("process test regexp %s for url %s returned FALSE", struri (url)); } - if (err != NULL) { - msg_info ("error occured while processing regexp \"%s\": %s", - param->re->regexp_text, - err->message); - } return FALSE; } @@ -413,14 +407,12 @@ process_regexp (struct rspamd_regexp_element *re, { guint8 *ct; gsize clen; - gint r, passed = 0, start, end, old; - gboolean matched = FALSE; - const gchar *in; + gint r, passed = 0; + gboolean matched = FALSE, raw = FALSE; + const gchar *in, *start, *end; GList *cur, *headerlist; - GRegex *regexp; - GMatchInfo *info; - GError *err = NULL; + rspamd_regexp_t *regexp; struct url_regexp_param callback_param = { .task = task, .re = re, @@ -449,8 +441,8 @@ process_regexp (struct rspamd_regexp_element *re, re->regexp_text, additional); } - if (g_regex_match_full (re->regexp, additional, strlen (additional), 0, - 0, NULL, NULL) == TRUE) { + if (rspamd_regexp_search (re->regexp, additional, 0, NULL, NULL, + FALSE) == TRUE) { if (G_UNLIKELY (re->is_test)) { msg_info ("result of regexp %s is true", re->regexp_text); } @@ -513,7 +505,7 @@ process_regexp (struct rspamd_regexp_element *re, re->header, rh->decoded); if (re->type == REGEXP_RAW_HEADER) { in = rh->value; - regexp = re->raw_regexp; + raw = TRUE; } else { in = rh->decoded; @@ -527,8 +519,7 @@ process_regexp (struct rspamd_regexp_element *re, /* Match re */ if (in && - g_regex_match_full (regexp, in, -1, 0, 0, NULL, - &err) == TRUE) { + rspamd_regexp_search (regexp, in, 0, NULL, NULL, raw)) { if (G_UNLIKELY (re->is_test)) { msg_info ( "process test regexp %s for header %s with value '%s' returned TRUE", @@ -555,12 +546,6 @@ process_regexp (struct rspamd_regexp_element *re, re->header, in); } - if (err != NULL) { - msg_info ( - "error occured while processing regexp \"%s\": %s", - re->regexp_text, - err->message); - } cur = g_list_next (cur); } task_cache_add (task, re, 0); @@ -589,14 +574,14 @@ process_regexp (struct rspamd_regexp_element *re, } /* Check raw flags */ if (part->is_raw) { - regexp = re->raw_regexp; + raw = TRUE; } else { /* This time there is no need to validate anything as conversion succeed only for valid characters */ regexp = re->regexp; } /* Select data for regexp */ - if (re->is_raw) { + if (raw) { ct = part->orig->data; clen = part->orig->len; } @@ -607,9 +592,10 @@ process_regexp (struct rspamd_regexp_element *re, /* If we have limit, apply regexp so much times as we can */ if (f != NULL && limit > 1) { end = 0; + start = NULL; + end = NULL; while ((matched = - g_regex_match_full (regexp, ct + end + 1, clen - end - 1, 0, - 0, &info, &err)) == TRUE) { + rspamd_regexp_search (regexp, ct, clen, &start, &end, raw))) { if (G_UNLIKELY (re->is_test)) { msg_info ( "process test regexp %s for mime part of length %d returned TRUE", @@ -621,22 +607,10 @@ process_regexp (struct rspamd_regexp_element *re, task_cache_add (task, re, 1); return 1; } - else { - /* Match not found, skip further cycles */ - old = end; - if (!g_match_info_fetch_pos (info, 0, &start, - &end) || end <= 0) { - break; - } - end += old; - } - g_match_info_free (info); } - g_match_info_free (info); } else { - if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, - &err) == TRUE) { + if (rspamd_regexp_search (regexp, ct, clen, NULL, NULL, raw)) { if (G_UNLIKELY (re->is_test)) { msg_info ( "process test regexp %s for mime part of length %d returned TRUE", @@ -654,18 +628,13 @@ process_regexp (struct rspamd_regexp_element *re, re->regexp_text, (gint)clen); } - if (err != NULL) { - msg_info ("error occured while processing regexp \"%s\": %s", - re->regexp_text, - err->message); - } cur = g_list_next (cur); } task_cache_add (task, re, 0); break; case REGEXP_MESSAGE: debug_task ("checking message regexp: %s", re->regexp_text); - regexp = re->raw_regexp; + raw = TRUE; ct = (guint8 *)task->msg.start; clen = task->msg.len; @@ -676,10 +645,9 @@ process_regexp (struct rspamd_regexp_element *re, } /* If we have limit, apply regexp so much times as we can */ if (f != NULL && limit > 1) { - end = 0; + start = end = NULL; while ((matched = - g_regex_match_full (regexp, ct + end + 1, clen - end - 1, 0, 0, - &info, &err)) == TRUE) { + rspamd_regexp_search (regexp, ct, clen, &start, &end, raw))) { if (G_UNLIKELY (re->is_test)) { msg_info ( "process test regexp %s for mime part of length %d returned TRUE", @@ -690,22 +658,10 @@ process_regexp (struct rspamd_regexp_element *re, task_cache_add (task, re, 1); return 1; } - else { - /* Match not found, skip further cycles */ - old = end; - if (!g_match_info_fetch_pos (info, 0, &start, - &end) || end <= 0) { - break; - } - old += end; - } - g_match_info_free (info); } - g_match_info_free (info); } else { - if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, - &err) == TRUE) { + if (rspamd_regexp_search (regexp, ct, clen, NULL, NULL, raw)) { if (G_UNLIKELY (re->is_test)) { msg_info ( "process test regexp %s for message part of length %d returned TRUE", @@ -723,11 +679,6 @@ process_regexp (struct rspamd_regexp_element *re, re->regexp_text, (gint)clen); } - if (err != NULL) { - msg_info ("error occured while processing regexp \"%s\": %s", - re->regexp_text, - err->message); - } task_cache_add (task, re, 0); break; case REGEXP_URL: -- 2.39.5