diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-06-02 19:32:34 +0400 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2009-06-02 19:32:34 +0400 |
commit | 7bae787900fea17ca82393886217c6287d7e8cea (patch) | |
tree | 4f358b3624d7b2ba6c86a25057d4ba7db10965ae /src | |
parent | 4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff) | |
download | rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip |
* Rework url parsing algorithms
* Adapt all parts of rspamd to the new url parser
* Improve url-extracter utility by avoiding cut&paste of mime parsing
* Small fixes to rspamc client
* Bump version to 0.1.3
Diffstat (limited to 'src')
-rw-r--r-- | src/lmtp.c | 4 | ||||
-rw-r--r-- | src/main.h | 2 | ||||
-rw-r--r-- | src/message.c | 20 | ||||
-rw-r--r-- | src/message.h | 2 | ||||
-rw-r--r-- | src/plugins/regexp.c | 51 | ||||
-rw-r--r-- | src/plugins/surbl.c | 55 | ||||
-rw-r--r-- | src/protocol.c | 8 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 6 | ||||
-rw-r--r-- | src/url.c | 31 | ||||
-rw-r--r-- | src/url.h | 3 | ||||
-rw-r--r-- | src/worker.c | 4 |
11 files changed, 141 insertions, 45 deletions
diff --git a/src/lmtp.c b/src/lmtp.c index d649d11c3..5f2878a08 100644 --- a/src/lmtp.c +++ b/src/lmtp.c @@ -109,6 +109,9 @@ free_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft) else { rspamd_remove_dispatcher (lmtp->task->dispatcher); } + if (lmtp->task->urls) { + g_list_free (lmtp->task->urls); + } close (lmtp->task->sock); g_free (lmtp->task); g_free (lmtp); @@ -230,7 +233,6 @@ accept_socket (int fd, short what, void *arg) new_task->state = READ_COMMAND; new_task->sock = nfd; new_task->cfg = worker->srv->cfg; - TAILQ_INIT (&new_task->urls); new_task->task_pool = memory_pool_new (memory_pool_get_size ()); /* Add destructor for recipients list (it would be better to use anonymous function here */ memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task); diff --git a/src/main.h b/src/main.h index c1a057824..924fe13ed 100644 --- a/src/main.h +++ b/src/main.h @@ -183,7 +183,7 @@ struct worker_task { GList *parts; /**< list of parsed parts */ GList *text_parts; /**< list of text parts */ char *raw_headers; /**< list of raw headers */ - TAILQ_HEAD (uriq, uri) urls; /**< list of parsed urls */ + GList *urls; /**< list of parsed urls */ GHashTable *results; /**< hash table of metric_result indexed by * metric's name */ GHashTable *re_cache; /**< cache for matched or not matched regexps */ diff --git a/src/message.c b/src/message.c index 510d407e9..f664122d0 100644 --- a/src/message.c +++ b/src/message.c @@ -301,28 +301,36 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part->is_balanced = TRUE; text_part->html_nodes = NULL; text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL); + text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); + text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); if (text_part->html_nodes == NULL) { - url_parse_text (task, text_part->orig, FALSE); + url_parse_text (task->task_pool, task, text_part, 
FALSE); } else { - url_parse_text (task, text_part->orig, TRUE); + url_parse_text (task->task_pool, task, text_part, FALSE); + url_parse_text (task->task_pool, task, text_part, TRUE); } text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->html_urls); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls); task->text_parts = g_list_prepend (task->text_parts, text_part); } else if (g_mime_content_type_is_type (type, "text", "plain")) { msg_debug ("mime_foreach_callback: got urls from text/plain part"); - url_parse_text (task, part_content, FALSE); text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); text_part->content = text_part->orig; text_part->is_html = FALSE; text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); + text_part->html_urls = NULL; + text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); + url_parse_text (task->task_pool, task, text_part, FALSE); task->text_parts = g_list_prepend (task->text_parts, text_part); + memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls); } } @@ -489,8 +497,10 @@ process_message (struct worker_task *task) if (task->rcpts) { memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts); } - - task->worker->srv->stat->messages_scanned ++; + + if (task->worker) { + task->worker->srv->stat->messages_scanned ++; + } /* free the parser (and the stream) */ g_object_unref (parser); diff --git a/src/message.h b/src/message.h index 72711638f..9a63b0824 100644 --- a/src/message.h +++ b/src/message.h @@ -21,6 +21,8 @@ struct 
mime_text_part { GByteArray *orig; GByteArray *content; GNode *html_nodes; + GTree *urls; + GTree *html_urls; fuzzy_hash_t *fuzzy; }; diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c index 1b109bf55..fbe36f291 100644 --- a/src/plugins/regexp.c +++ b/src/plugins/regexp.c @@ -237,6 +237,28 @@ find_raw_header_pos (const char *headers, const char *headerv) return NULL; } +struct url_regexp_param { + struct worker_task *task; + GRegex *regexp; + struct rspamd_regexp *re; + gboolean found; +}; + +static gboolean +tree_url_callback (gpointer key, gpointer value, void *data) +{ + struct url_regexp_param *param = data; + struct uri *url = value; + + if (g_regex_match (param->regexp, struri (url), 0, NULL) == TRUE) { + task_cache_add (param->task, param->re, 1); + param->found = TRUE; + return TRUE; + } + + return FALSE; +} + static gsize process_regexp (struct rspamd_regexp *re, struct worker_task *task) { @@ -244,7 +266,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) struct mime_text_part *part, *tmp; GList *cur, *headerlist; GRegex *regexp; - struct uri *url; + struct url_regexp_param callback_param; int r; @@ -333,13 +355,30 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task) return 0; case REGEXP_URL: msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text); - TAILQ_FOREACH (url, &task->urls, next) { - if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) { - task_cache_add (task, re, 1); - return 1; + cur = g_list_first (task->text_parts); + while (cur) { + part = (struct mime_text_part *)cur->data; + if (part->is_raw) { + regexp = re->raw_regexp; } + else { + regexp = re->regexp; + } + callback_param.task = task; + callback_param.regexp = regexp; + callback_param.re = re; + callback_param.found = FALSE; + if (part->urls) { + g_tree_foreach (part->urls, tree_url_callback, &callback_param); + } + if (part->html_urls && callback_param.found == FALSE) { + g_tree_foreach (part->html_urls, 
tree_url_callback, &callback_param); + } + cur = g_list_next (cur); + } + if (callback_param.found == FALSE) { + task_cache_add (task, re, 0); } - task_cache_add (task, re, 0); return 0; case REGEXP_RAW_HEADER: msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text); diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index d44c7fbe7..1514cae1c 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -28,6 +28,7 @@ #include "../config.h" #include "../util.h" +#include "../message.h" #include <evdns.h> #include "surbl.h" @@ -647,29 +648,53 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree *url_ event_add (&param->ev, &timeout); } +static gboolean +tree_url_callback (gpointer key, gpointer value, void *data) +{ + struct redirector_param *param = data; + struct uri *url = value; + + msg_debug ("surbl_test_url: check url %s", struri (url)); + if (surbl_module_ctx->use_redirector) { + register_redirector_call (url, param->task, param->tree); + param->task->save.saved++; + } + else { + if (param->task->worker->srv->cfg->memcached_servers_num > 0) { + register_memcached_call (url, param->task, param->tree); + param->task->save.saved++; + } + else { + make_surbl_requests (url, param->task, param->tree); + } + } + + return FALSE; +} + static int surbl_test_url (struct worker_task *task) { - struct uri *url; GTree *url_tree; + GList *cur; + struct mime_text_part *part; + struct redirector_param param; url_tree = g_tree_new ((GCompareFunc)g_ascii_strcasecmp); - - TAILQ_FOREACH (url, &task->urls, next) { - msg_debug ("surbl_test_url: check url %s", struri (url)); - if (surbl_module_ctx->use_redirector) { - register_redirector_call (url, task, url_tree); - task->save.saved++; + + param.tree = url_tree; + param.task = task; + cur = task->text_parts; + while (cur) { + part = cur->data; + if (part->urls) { + g_tree_foreach (part->urls, tree_url_callback, &param); } - else { - if 
(task->worker->srv->cfg->memcached_servers_num > 0) { - register_memcached_call (url, task, url_tree); - task->save.saved++; - } - else { - make_surbl_requests (url, task, url_tree); - } + if (part->html_urls) { + g_tree_foreach (part->html_urls, tree_url_callback, &param); } + + cur = g_list_next (cur); } memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, url_tree); diff --git a/src/protocol.c b/src/protocol.c index 2cd025287..c551bb783 100644 --- a/src/protocol.c +++ b/src/protocol.c @@ -369,10 +369,13 @@ show_url_header (struct worker_task *task) int r = 0; char outbuf[OUTBUFSIZ], c; struct uri *url; + GList *cur; f_str_t host; r = snprintf (outbuf, sizeof (outbuf), "Urls: "); - TAILQ_FOREACH (url, &task->urls, next) { + cur = task->urls; + while (cur) { + url = cur->data; host.begin = url->host; host.len = url->hostlen; /* Skip long hosts to avoid protocol coollisions */ @@ -386,7 +389,7 @@ show_url_header (struct worker_task *task) r = 0; } /* Write url host to buf */ - if (TAILQ_NEXT (url, next) != NULL) { + if (g_list_next (cur) != NULL) { c = *(host.begin + host.len); *(host.begin + host.len) = '\0'; msg_debug ("show_url_header: write url: %s", host.begin); @@ -400,6 +403,7 @@ show_url_header (struct worker_task *task) r += snprintf (outbuf + r, sizeof (outbuf) - r, "%s" CRLF, host.begin); *(host.begin + host.len) = c; } + cur = g_list_next (cur); } rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE); } diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index 1b47289a2..4527e699c 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -122,6 +122,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree) token_node_t *new = NULL; f_str_t url_domain; struct uri *url; + GList *cur; uint32_t h; if (*tree == NULL) { @@ -129,7 +130,9 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree) memory_pool_add_destructor (pool, 
(pool_destruct_func)g_tree_destroy, *tree); } - TAILQ_FOREACH (url, &task->urls, next) { + cur = task->urls; + while (cur) { + url = cur->data; url_domain.begin = url->host; url_domain.len = url->hostlen; new = memory_pool_alloc (pool, sizeof (token_node_t)); @@ -139,6 +142,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree) if (g_tree_lookup (*tree, new) == NULL) { g_tree_insert (*tree, new, new); } + cur = g_list_next (cur); } return TRUE; diff --git a/src/url.c b/src/url.c --- a/src/url.c +++ b/src/url.c @@ -23,10 +23,11 @@ */ #include "config.h" +#include "url.h" #include "util.h" #include "fstring.h" #include "main.h" -#include "url.h" +#include "message.h" #define POST_CHAR 1 #define POST_CHAR_S "\001" @@ -853,7 +854,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool) } void -url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html) +url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html) { GMatchInfo *info; GError *err = NULL; @@ -861,26 +862,32 @@ url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html) char *url_str = NULL; struct uri *new; - if (!content->data || content->len == 0) { + if (!part->orig->data || part->orig->len == 0) { msg_warn ("url_parse_text: got empty text part"); return; } if (url_init () == 0) { - rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err); + if (is_html) { + rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err); + } + else { + rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err); + + } if (rc) { while (g_match_info_matches (info)) { url_str = g_match_info_fetch (info, is_html ? 1 : 0); msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? 
"on" : "off"); if (url_str != NULL) { - new = memory_pool_alloc (task->task_pool, sizeof (struct uri)); - if (new != NULL) { - rc = parse_uri (new, url_str, task->task_pool); - if (rc != URI_ERRNO_OK) { - msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc)); - } - if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { - TAILQ_INSERT_TAIL (&task->urls, new, next); + if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { + new = memory_pool_alloc (pool, sizeof (struct uri)); + if (new != NULL) { + rc = parse_uri (new, url_str, pool); + if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { + g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); + task->urls = g_list_prepend (task->urls, new); + } } } } diff --git a/src/url.h b/src/url.h --- a/src/url.h +++ b/src/url.h @@ -6,6 +6,7 @@ #include "mem_pool.h" struct worker_task; +struct mime_text_part; struct uri { /* The start of the uri (and thus start of the protocol string). */ @@ -73,7 +74,7 @@ enum protocol { #define struri(uri) ((uri)->string) -void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html); +void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html); enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool); #endif diff --git a/src/worker.c b/src/worker.c index 3f223241b..af4575919 100644 --- a/src/worker.c +++ b/src/worker.c @@ -130,6 +130,9 @@ free_task (struct worker_task *task, gboolean is_soft) if (task->text_parts) { g_list_free (task->text_parts); } + if (task->urls) { + g_list_free (task->urls); + } memory_pool_delete (task->task_pool); if (is_soft) { /* Plan dispatcher shutdown */ @@ -287,7 +290,6 @@ accept_socket (int fd, short what, void *arg) #endif io_tv.tv_sec = WORKER_IO_TIMEOUT; io_tv.tv_usec = 0; - TAILQ_INIT (&new_task->urls); new_task->task_pool = memory_pool_new (memory_pool_get_size ()); /* Add destructor for recipients list (it would be better to use anonymous 
function here */ memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task); |