]> source.dussan.org Git - rspamd.git/commitdiff
* Add a per-task regexp results cache that optimizes regexp engine performance
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Sun, 5 Apr 2009 16:12:44 +0000 (20:12 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Sun, 5 Apr 2009 16:12:44 +0000 (20:12 +0400)
  by avoiding multiple matches of the same regexp

src/expressions.c
src/expressions.h
src/main.h
src/plugins/regexp.c
src/worker.c

index e86ebb5abd7f77d6a430a347e9c00c4d00bfdb73..e3747c01301584324d1e600d81d7a94e58fbb0f0 100644 (file)
@@ -101,6 +101,23 @@ re_cache_add (char *line, void *pointer)
        g_hash_table_insert (re_cache, line, pointer);
 }
 
+/* Task cache functions */
+/*
+ * Remember the outcome of a regexp for this task.
+ * The result is packed into a pointer-sized value and stored in
+ * task->re_cache under the key `pointer`.
+ */
+void
+task_cache_add (struct worker_task *task, void *pointer, int32_t result)
+{
+       gpointer packed_result = GINT_TO_POINTER (result);
+
+       g_hash_table_insert (task->re_cache, pointer, packed_result);
+}
+
+/*
+ * Look up a previously cached regexp result for this task.
+ * Returns the stored result when `pointer` is present in task->re_cache,
+ * or -1 when nothing has been cached for it yet.
+ */
+int32_t
+task_cache_check (struct worker_task *task, void *pointer)
+{
+       gpointer res = NULL;
+
+       /*
+        * A cached result of 0 is stored as GINT_TO_POINTER (0), i.e. NULL.
+        * Plain g_hash_table_lookup () cannot tell that apart from a missing
+        * key, so negative results would never be served from the cache.
+        * The extended lookup reports key presence separately from the value.
+        */
+       if (g_hash_table_lookup_extended (task->re_cache, pointer, NULL, &res)) {
+               return GPOINTER_TO_INT (res);
+       }
+       return -1;
+}
+
 /*
  * Functions for parsing expressions
  */
@@ -748,6 +765,7 @@ rspamd_content_type_compare_param (struct worker_task *task, GList *args)
        struct expression_argument *arg;
        GMimeObject *part;
        const GMimeContentType *ct;
+       int r;
        
        if (args == NULL) {
                msg_warn ("rspamd_content_type_compare_param: no parameters to function");
@@ -781,8 +799,15 @@ rspamd_content_type_compare_param (struct worker_task *task, GList *args)
                                }
                                re_cache_add (param_pattern, re);
                        }
-                       if (g_regex_match (re->regexp, param_data, 0, NULL) == TRUE) {
-                               return TRUE;
+                       if ((r = task_cache_check (task, re)) == -1) {
+                               if (g_regex_match (re->regexp, param_data, 0, NULL) == TRUE) {
+                                       task_cache_add (task, re, 1);
+                                       return TRUE;
+                               }
+                               task_cache_add (task, re, 0);
+                       }
+                       else {
+                               return r == 1;
                        }
                }
                else {
@@ -842,6 +867,7 @@ rspamd_content_type_is_subtype (struct worker_task *task, GList *args)
        struct expression_argument *arg;
        GMimeObject *part;
        const localContentType *ct;
+       int r;
        
        if (args == NULL) {
                msg_warn ("rspamd_content_type_compare_param: no parameters to function");
@@ -869,8 +895,15 @@ rspamd_content_type_is_subtype (struct worker_task *task, GList *args)
                                }
                                re_cache_add (param_pattern, re);
                        }
-                       if (g_regex_match (re->regexp, ct->subtype, 0, NULL) == TRUE) {
-                               return TRUE;
+                       if ((r = task_cache_check (task, re)) == -1) {
+                               if (g_regex_match (re->regexp, ct->subtype, 0, NULL) == TRUE) {
+                                       task_cache_add (task, re, 1);
+                                       return TRUE;
+                               }
+                               task_cache_add (task, re, 0);
+                       }
+                       else {
+                               return r == 1;
                        }
                }
                else {
@@ -892,6 +925,7 @@ rspamd_content_type_is_type (struct worker_task *task, GList *args)
        GMimeObject *part;
        const localContentType *ct;
        struct expression_argument *arg;
+       int r;
        
        if (args == NULL) {
                msg_warn ("rspamd_content_type_compare_param: no parameters to function");
@@ -920,8 +954,15 @@ rspamd_content_type_is_type (struct worker_task *task, GList *args)
                                }
                                re_cache_add (param_pattern, re);
                        }
-                       if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
-                               return TRUE;
+                       if ((r = task_cache_check (task, re)) == -1) {
+                               if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
+                                       task_cache_add (task, re, 1);
+                                       return TRUE;
+                               }
+                               task_cache_add (task, re, 0);
+                       }
+                       else {
+                               return r == 1;
                        }
                }
                else {
@@ -1085,6 +1126,7 @@ static inline gboolean
 compare_subtype (struct worker_task *task, const localContentType *ct, char *subtype)
 {
        struct rspamd_regexp *re;
+       int r;
 
        if (*subtype == '/') {
                /* This is regexp, so compile and create g_regexp object */
@@ -1096,8 +1138,15 @@ compare_subtype (struct worker_task *task, const localContentType *ct, char *sub
                        }
                        re_cache_add (subtype, re);
                }
-               if (g_regex_match (re->regexp, ct->subtype , 0, NULL) == TRUE) {
-                       return TRUE;
+               if ((r = task_cache_check (task, re)) == -1) {
+                       if (g_regex_match (re->regexp, subtype, 0, NULL) == TRUE) {
+                               task_cache_add (task, re, 1);
+                               return TRUE;
+                       }
+                       task_cache_add (task, re, 0);
+               }
+               else {
+                       return r == 1;
                }
        }
        else {
@@ -1135,7 +1184,7 @@ common_has_content_part (struct worker_task *task, char *param_type, char *param
        struct mime_part *part;
        GList *cur;
        const localContentType *ct;
-       
+       int r;
        
        cur = g_list_first (task->parts);
        while (cur) {
@@ -1157,17 +1206,32 @@ common_has_content_part (struct worker_task *task, char *param_type, char *param
                                }
                                re_cache_add (param_type, re);
                        }
-                       if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
-                               if (param_subtype) {
-                                       if (compare_subtype (task, ct, param_subtype)) {
+                       if ((r = task_cache_check (task, re)) == -1) {
+                               if (g_regex_match (re->regexp, ct->type, 0, NULL) == TRUE) {
+                                       if (param_subtype) {
+                                               if (compare_subtype (task, ct, param_subtype)) {
+                                                       if (compare_len (part, min_len, max_len)) {
+                                                               return TRUE;
+                                                       }
+                                               }
+                                       }
+                                       else {
                                                if (compare_len (part, min_len, max_len)) {
                                                        return TRUE;
                                                }
                                        }
+                                       task_cache_add (task, re, 1);
                                }
                                else {
-                                       if (compare_len (part, min_len, max_len)) {
-                                               return TRUE;
+                                       task_cache_add (task, re, 0);
+                               }
+                       }
+                       else {
+                               if (r == 1) {
+                                       if (compare_subtype (task, ct, param_subtype)) {
+                                               if (compare_len (part, min_len, max_len)) {
+                                                       return TRUE;
+                                               }
                                        }
                                }
                        }
index 5c2a391eb8d8b77467b704ae88a70d5e23774e24..5e7e134e6644232f585c7b0fc38ecb19447553d0 100644 (file)
@@ -88,4 +88,20 @@ void re_cache_add (char *line, void *pointer);
  */
 void * re_cache_check (const char *line);
 
+/**
+ * Add regexp to regexp task cache
+ * @param task task object
+ * @param pointer regexp data
+ * @param result numeric result of this regexp
+ */
+void task_cache_add (struct worker_task *task, void *pointer, int32_t result);
+
+/**
+ * Check regexp in cache
+ * @param task task object
+ * @param pointer regexp data
+ * @return numeric result if value exists or -1 if not
+ */
+int32_t task_cache_check (struct worker_task *task, void *pointer);
+
 #endif
index ab785fa5a647f348b23259e558234110e8fd55d5..f7ab2eda4bda140f15617f8f67a5bf8eea8abae5 100644 (file)
@@ -180,6 +180,7 @@ struct worker_task {
        TAILQ_HEAD (uriq, uri) urls;                                                            /**< list of parsed urls                                                        */
        GHashTable *results;                                                                            /**< hash table of metric_result indexed by 
                                                                                                                                 *    metric's name                                                                     */
+       GHashTable *re_cache;                                                                           /**< cache for matched or not matched regexps           */
        struct config_file *cfg;                                                                        /**< pointer to config object                                           */
        struct save_point save;                                                                         /**< save point for delayed processing                          */
        char *last_error;                                                                                       /**< last error                                                                         */
index 8fca04bdd2194a662a32540813f2a3d76b43bba6..573d370f9b2ae4e1a943a0d7efe0e62c49db5558 100644 (file)
@@ -158,11 +158,16 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
        struct mime_text_part *part;
        GList *cur, *headerlist;
        struct uri *url;
+       int r;
 
        if (re == NULL) {
                msg_info ("process_regexp: invalid regexp passed");
                return 0;
        }
+       
+       if ((r = task_cache_check (task, re)) != -1) {
+               return r == 1;
+       }
 
        switch (re->type) {
                case REGEXP_NONE:
@@ -170,27 +175,32 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
                case REGEXP_HEADER:
                        if (re->header == NULL) {
                                msg_info ("process_regexp: header regexp without header name");
+                               task_cache_add (task, re, 0);
                                return 0;
                        }
                        msg_debug ("process_regexp: checking header regexp: %s = /%s/", re->header, re->regexp_text);
                        headerlist = message_get_header (task->task_pool, task->message, re->header);
                        if (headerlist == NULL) {
+                               task_cache_add (task, re, 0);
                                return 0;
                        }
                        else {
                                if (re->regexp == NULL) {
                                        msg_debug ("process_regexp: regexp contains only header and it is found %s", re->header);
+                                       task_cache_add (task, re, 1);
                                        g_list_free (headerlist);
                                        return 1;
                                }
                                cur = headerlist;
                                while (cur) {
                                        if (cur->data && g_regex_match (re->regexp, cur->data, 0, NULL) == TRUE) {
+                                               task_cache_add (task, re, 1);
                                                return 1;
                                        }
                                        cur = g_list_next (cur);
                                }
                                g_list_free (headerlist);
+                               task_cache_add (task, re, 0);
                                return 0;
                        }
                        break;
@@ -200,33 +210,41 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
                        while (cur) {
                                part = (struct mime_text_part *)cur->data;
                                if (g_regex_match_full (re->regexp, part->orig->data, part->orig->len, 0, 0, NULL, NULL) == TRUE) {
+                                       task_cache_add (task, re, 1);
                                        return 1;
                                }
                                cur = g_list_next (cur);
                        }
+                       task_cache_add (task, re, 0);
                        return 0;
                case REGEXP_MESSAGE:
                        msg_debug ("process_regexp: checking message regexp: /%s/", re->regexp_text);
                        if (g_regex_match_full (re->regexp, task->msg->begin, task->msg->len, 0, 0, NULL, NULL) == TRUE) {
+                               task_cache_add (task, re, 1);
                                return 1;
                        }
+                       task_cache_add (task, re, 0);
                        return 0;
                case REGEXP_URL:
                        msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text);
                        TAILQ_FOREACH (url, &task->urls, next) {
                                if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) {
+                                       task_cache_add (task, re, 1);
                                        return 1;
                                }
                        }
+                       task_cache_add (task, re, 0);
                        return 0;
                case REGEXP_RAW_HEADER:
                        msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text);
                        if (task->raw_headers == NULL) {
                                msg_debug ("process_regexp: cannot check for raw header in message, no headers found");
+                               task_cache_add (task, re, 0);
                                return 0;
                        }
                        if ((headerv = strstr (task->raw_headers, re->header)) == NULL) {
                                /* No header was found */
+                               task_cache_add (task, re, 0);
                                return 0;
                        }
                        /* Skip header name and start matching after regexp */
@@ -256,9 +274,11 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
                        *c = '\0';
                        if (g_regex_match (re->regexp, headerv, 0, NULL) == TRUE) {
                                *c = t;
+                               task_cache_add (task, re, 1);
                                return 1;
                        }
                        *c = t;
+                       task_cache_add (task, re, 0);
                        return 0;
        }
 
index 1f1548dd1b094380e922a10b80ec55a3d203dac7..20e04ed5e8b0b90cd0166fc720ca1714436d8832 100644 (file)
@@ -274,6 +274,8 @@ accept_socket (int fd, short what, void *arg)
        memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
        new_task->results = g_hash_table_new (g_str_hash, g_str_equal);
        memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)g_hash_table_destroy, new_task->results);
+       new_task->re_cache = g_hash_table_new (g_direct_hash, g_direct_equal);
+       memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)g_hash_table_destroy, new_task->re_cache);
 
        worker->srv->stat->connections_count ++;