From 4a79d0e82a5e2040e8dd5d4b8fb12fbf4672d8ce Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Sun, 5 Apr 2009 20:12:44 +0400
Subject: [PATCH] * Add per-task regexp results cache, that would optimize
 regexp engine performance by avoiding multiple matches of the same regexp

Results are stored in the per-task hash table as result + 1.
g_hash_table_lookup () returns NULL both for a missing key and for a key
whose value is NULL, so a raw 0 ("not matched") could never be read back
as a cache hit and negative results would be recomputed every time.

---
Review notes (this section is ignored by git-am):

 - task_cache_add/task_cache_check: values are biased by one so that a
   cached 0 is distinguishable from "not in cache" (-1).
 - compare_subtype: the regexp is matched against ct->subtype again; the
   pattern string itself was being passed as the subject.
 - common_has_content_part: on a cache hit compare_subtype is only called
   when param_subtype is not NULL, mirroring the cache-miss path
   (compare_subtype dereferences its subtype argument).
 - NOTE(review): the cache is keyed by the regexp pointer only, while the
   content-type helpers match per-part data; confirm that reusing a result
   across different parts of one task is intended.
 - NOTE(review): in process_regexp the header-match "return 1" path does
   not call g_list_free (headerlist); this predates the patch.
 - NOTE(review): whitespace of context lines was reconstructed; apply with
   --ignore-whitespace if the hunks do not match exactly.

 src/expressions.c    | 92 +++++++++++++++++++++++++++++++++++++-------
 src/expressions.h    | 16 ++++++++
 src/main.h           |  1 +
 src/plugins/regexp.c | 20 ++++++++++
 src/worker.c         |  2 +
 5 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/src/expressions.c b/src/expressions.c
index e86ebb5ab..e3747c013 100644
--- a/src/expressions.c
+++ b/src/expressions.c
@@ -101,6 +101,23 @@ re_cache_add (char *line, void *pointer)
 	g_hash_table_insert (re_cache, line, pointer);
 }
 
+/* Task cache functions: values are stored as result + 1 so that a cached 0 is not a NULL pointer */
+void
+task_cache_add (struct worker_task *task, void *pointer, int32_t result)
+{
+	g_hash_table_insert (task->re_cache, pointer, GINT_TO_POINTER (result + 1));
+}
+
+int32_t
+task_cache_check (struct worker_task *task, void *pointer)
+{
+	gpointer res;
+	if ((res = g_hash_table_lookup (task->re_cache, pointer)) != NULL) {
+		return GPOINTER_TO_INT (res) - 1;
+	}
+	return -1;
+}
+
 /*
  * Functions for parsing expressions
  */
@@ -748,6 +765,7 @@ rspamd_content_type_compare_param (struct worker_task *task, GList *args)
 	struct expression_argument *arg;
 	GMimeObject *part;
 	const GMimeContentType *ct;
+	int r;
 
 	if (args == NULL) {
 		msg_warn ("rspamd_content_type_compare_param: no parameters to function");
@@ -781,8 +799,15 @@ rspamd_content_type_compare_param (struct worker_task *task, GList *args)
 				}
 				re_cache_add (param_pattern, re);
 			}
-			if (g_regex_match (re->regexp, param_data, 0, NULL) == TRUE) {
-				return TRUE;
+			if ((r = task_cache_check (task, re)) == -1) {
+				if (g_regex_match (re->regexp, param_data, 0, NULL) == TRUE) {
+					task_cache_add (task, re, 1);
+					return TRUE;
+				}
+				task_cache_add (task, re, 0);
+			}
+			else {
+				return r == 1;
 			}
 		}
 		else {
@@ -842,6 +867,7 @@ rspamd_content_type_is_subtype (struct worker_task *task, GList *args)
 	struct expression_argument *arg;
 	GMimeObject *part;
 	const localContentType *ct;
+	int r;
 
 	if (args == NULL) {
 		msg_warn ("rspamd_content_type_compare_param: no parameters to function");
@@ -869,8 +895,15 @@ rspamd_content_type_is_subtype (struct worker_task *task, GList *args)
 				}
 				re_cache_add (param_pattern, re);
 			}
-			if (g_regex_match (re->regexp, ct->subtype, 0, NULL) == TRUE) {
-				return TRUE;
+			if ((r = task_cache_check (task, re)) == -1) {
+				if (g_regex_match (re->regexp, ct->subtype, 0, NULL) == TRUE) {
+					task_cache_add (task, re, 1);
+					return TRUE;
+				}
+				task_cache_add (task, re, 0);
+			}
+			else {
+				return r == 1;
 			}
 		}
 		else {
@@ -892,6 +925,7 @@ rspamd_content_type_is_type (struct worker_task *task, GList *args)
 	GMimeObject *part;
 	const localContentType *ct;
 	struct expression_argument *arg;
+	int r;
 
 	if (args == NULL) {
 		msg_warn ("rspamd_content_type_compare_param: no parameters to function");
@@ -920,8 +954,15 @@ rspamd_content_type_is_type (struct worker_task *task, GList *args)
 				}
 				re_cache_add (param_pattern, re);
 			}
-			if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
-				return TRUE;
+			if ((r = task_cache_check (task, re)) == -1) {
+				if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
+					task_cache_add (task, re, 1);
+					return TRUE;
+				}
+				task_cache_add (task, re, 0);
+			}
+			else {
+				return r == 1;
 			}
 		}
 		else {
@@ -1085,6 +1126,7 @@ static inline gboolean
 compare_subtype (struct worker_task *task, const localContentType *ct, char *subtype)
 {
 	struct rspamd_regexp *re;
+	int r;
 
 	if (*subtype == '/') {
 		/* This is regexp, so compile and create g_regexp object */
@@ -1096,8 +1138,15 @@ compare_subtype (struct worker_task *task, const localContentType *ct, char *sub
 			}
 			re_cache_add (subtype, re);
 		}
-		if (g_regex_match (re->regexp, ct->subtype , 0, NULL) == TRUE) {
-			return TRUE;
+		if ((r = task_cache_check (task, re)) == -1) {
+			if (g_regex_match (re->regexp, ct->subtype, 0, NULL) == TRUE) {
+				task_cache_add (task, re, 1);
+				return TRUE;
+			}
+			task_cache_add (task, re, 0);
+		}
+		else {
+			return r == 1;
 		}
 	}
 	else {
@@ -1135,7 +1184,7 @@ common_has_content_part (struct worker_task *task, char *param_type, char *param
 	struct mime_part *part;
 	GList *cur;
 	const localContentType *ct;
-	
+	int r;
 
 	cur = g_list_first (task->parts);
 	while (cur) {
@@ -1157,17 +1206,32 @@ common_has_content_part (struct worker_task *task, char *param_type, char *param
 				}
 				re_cache_add (param_type, re);
 			}
-			if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
-				if (param_subtype) {
-					if (compare_subtype (task, ct, param_subtype)) {
+			if ((r = task_cache_check (task, re)) == -1) {
+				if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
+					if (param_subtype) {
+						if (compare_subtype (task, ct, param_subtype)) {
+							if (compare_len (part, min_len, max_len)) {
+								return TRUE;
+							}
+						}
+					}
+					else {
 						if (compare_len (part, min_len, max_len)) {
 							return TRUE;
 						}
 					}
+					task_cache_add (task, re, 1);
 				}
 				else {
-					if (compare_len (part, min_len, max_len)) {
-						return TRUE;
+					task_cache_add (task, re, 0);
+				}
+			}
+			else {
+				if (r == 1) {
+					if (param_subtype == NULL || compare_subtype (task, ct, param_subtype)) {
+						if (compare_len (part, min_len, max_len)) {
+							return TRUE;
+						}
 					}
 				}
 			}
diff --git a/src/expressions.h b/src/expressions.h
index 5c2a391eb..5e7e134e6 100644
--- a/src/expressions.h
+++ b/src/expressions.h
@@ -88,4 +88,20 @@ void re_cache_add (char *line, void *pointer);
  */
 void * re_cache_check (const char *line);
 
+/**
+ * Add regexp result to the per-task regexp cache
+ * @param task task object
+ * @param pointer regexp data, used as the cache key
+ * @param result numeric result of this regexp, must be non-negative
+ */
+void task_cache_add (struct worker_task *task, void *pointer, int32_t result);
+
+/**
+ * Check regexp in the per-task regexp cache
+ * @param task task object
+ * @param pointer regexp data, used as the cache key
+ * @return cached numeric result (0 included) or -1 if regexp is not cached
+ */
+int32_t task_cache_check (struct worker_task *task, void *pointer);
+
 #endif
diff --git a/src/main.h b/src/main.h
index ab785fa5a..f7ab2eda4 100644
--- a/src/main.h
+++ b/src/main.h
@@ -180,6 +180,7 @@ struct worker_task {
 	TAILQ_HEAD (uriq, uri) urls;								/**< list of parsed urls							*/
 	GHashTable *results;										/**< hash table of metric_result indexed by 
 																 *    metric's name									*/
+	GHashTable *re_cache;										/**< cache for matched or not matched regexps		*/
 	struct config_file *cfg;									/**< pointer to config object						*/
 	struct save_point save;										/**< save point for delayed processing				*/
 	char *last_error;											/**< last error										*/
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 8fca04bdd..573d370f9 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -158,11 +158,16 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 	struct mime_text_part *part;
 	GList *cur, *headerlist;
 	struct uri *url;
+	int r;
 
 	if (re == NULL) {
 		msg_info ("process_regexp: invalid regexp passed");
 		return 0;
 	}
+
+	if ((r = task_cache_check (task, re)) != -1) {
+		return r == 1;
+	}
 
 	switch (re->type) {
 		case REGEXP_NONE:
@@ -170,27 +175,32 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 		case REGEXP_HEADER:
 			if (re->header == NULL) {
 				msg_info ("process_regexp: header regexp without header name");
+				task_cache_add (task, re, 0);
 				return 0;
 			}
 			msg_debug ("process_regexp: checking header regexp: %s = /%s/", re->header, re->regexp_text);
 			headerlist = message_get_header (task->task_pool, task->message, re->header);
 			if (headerlist == NULL) {
+				task_cache_add (task, re, 0);
 				return 0;
 			}
 			else {
 				if (re->regexp == NULL) {
 					msg_debug ("process_regexp: regexp contains only header and it is found %s", re->header);
+					task_cache_add (task, re, 1);
 					g_list_free (headerlist);
 					return 1;
 				}
 				cur = headerlist;
 				while (cur) {
 					if (cur->data && g_regex_match (re->regexp, cur->data, 0, NULL) == TRUE) {
+						task_cache_add (task, re, 1);
 						return 1;
 					}
 					cur = g_list_next (cur);
 				}
 				g_list_free (headerlist);
+				task_cache_add (task, re, 0);
 				return 0;
 			}
 			break;
@@ -200,33 +210,41 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 			while (cur) {
 				part = (struct mime_text_part *)cur->data;
 				if (g_regex_match_full (re->regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) {
+					task_cache_add (task, re, 1);
 					return 1;
 				}
 				cur = g_list_next (cur);
 			}
+			task_cache_add (task, re, 0);
 			return 0;
 		case REGEXP_MESSAGE:
 			msg_debug ("process_regexp: checking message regexp: /%s/", re->regexp_text);
 			if (g_regex_match_full (re->regexp, task->msg->begin, task->msg->len, 0, 0, NULL, NULL) == TRUE) {
+				task_cache_add (task, re, 1);
 				return 1;
 			}
+			task_cache_add (task, re, 0);
 			return 0;
 		case REGEXP_URL:
 			msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text);
 			TAILQ_FOREACH (url, &task->urls, next) {
 				if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) {
+					task_cache_add (task, re, 1);
 					return 1;
 				}
 			}
+			task_cache_add (task, re, 0);
 			return 0;
 		case REGEXP_RAW_HEADER:
 			msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text);
 			if (task->raw_headers == NULL) {
 				msg_debug ("process_regexp: cannot check for raw header in message, no headers found");
+				task_cache_add (task, re, 0);
 				return 0;
 			}
 			if ((headerv = strstr (task->raw_headers, re->header)) == NULL) {
 				/* No header was found */
+				task_cache_add (task, re, 0);
 				return 0;
 			}
 			/* Skip header name and start matching after regexp */
@@ -256,9 +274,11 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
 			*c = '\0';
 			if (g_regex_match (re->regexp, headerv, 0, NULL) == TRUE) {
 				*c = t;
+				task_cache_add (task, re, 1);
 				return 1;
 			}
 			*c = t;
+			task_cache_add (task, re, 0);
 			return 0;
 	}
 
diff --git a/src/worker.c b/src/worker.c
index 1f1548dd1..20e04ed5e 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -274,6 +274,8 @@ accept_socket (int fd, short what, void *arg)
 	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
 	new_task->results = g_hash_table_new (g_str_hash, g_str_equal);
 	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)g_hash_table_destroy, new_task->results);
+	new_task->re_cache = g_hash_table_new (g_direct_hash, g_direct_equal);
+	memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)g_hash_table_destroy, new_task->re_cache);
 
 	worker->srv->stat->connections_count ++;
 
-- 
2.39.5