aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-06-02 19:32:34 +0400
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-06-02 19:32:34 +0400
commit: 7bae787900fea17ca82393886217c6287d7e8cea (patch)
tree: 4f358b3624d7b2ba6c86a25057d4ba7db10965ae /src
parent: 4eb2985d1ef3631fca82cbf18cc2e8d7aab9b096 (diff)
download: rspamd-7bae787900fea17ca82393886217c6287d7e8cea.tar.gz
download: rspamd-7bae787900fea17ca82393886217c6287d7e8cea.zip
* Rework url parsing algorithms
* Adopt all parts of rspamd for new url parser
* Improve url-extracter utility by avoiding cut&paste of mime parsing
* Small fixes to rspamc client
* Bump version to 0.1.3
Diffstat (limited to 'src')
-rw-r--r-- src/lmtp.c | 4
-rw-r--r-- src/main.h | 2
-rw-r--r-- src/message.c | 20
-rw-r--r-- src/message.h | 2
-rw-r--r-- src/plugins/regexp.c | 51
-rw-r--r-- src/plugins/surbl.c | 55
-rw-r--r-- src/protocol.c | 8
-rw-r--r-- src/tokenizers/tokenizers.c | 6
-rw-r--r-- src/url.c | 31
-rw-r--r-- src/url.h | 3
-rw-r--r-- src/worker.c | 4
11 files changed, 141 insertions, 45 deletions
diff --git a/src/lmtp.c b/src/lmtp.c
index d649d11c3..5f2878a08 100644
--- a/src/lmtp.c
+++ b/src/lmtp.c
@@ -109,6 +109,9 @@ free_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft)
else {
rspamd_remove_dispatcher (lmtp->task->dispatcher);
}
+ if (lmtp->task->urls) {
+ g_list_free (lmtp->task->urls);
+ }
close (lmtp->task->sock);
g_free (lmtp->task);
g_free (lmtp);
@@ -230,7 +233,6 @@ accept_socket (int fd, short what, void *arg)
new_task->state = READ_COMMAND;
new_task->sock = nfd;
new_task->cfg = worker->srv->cfg;
- TAILQ_INIT (&new_task->urls);
new_task->task_pool = memory_pool_new (memory_pool_get_size ());
/* Add destructor for recipients list (it would be better to use an anonymous function here) */
memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
diff --git a/src/main.h b/src/main.h
index c1a057824..924fe13ed 100644
--- a/src/main.h
+++ b/src/main.h
@@ -183,7 +183,7 @@ struct worker_task {
GList *parts; /**< list of parsed parts */
GList *text_parts; /**< list of text parts */
char *raw_headers; /**< list of raw headers */
- TAILQ_HEAD (uriq, uri) urls; /**< list of parsed urls */
+ GList *urls; /**< list of parsed urls */
GHashTable *results; /**< hash table of metric_result indexed by
* metric's name */
GHashTable *re_cache; /**< cache for matched or not matched regexps */
diff --git a/src/message.c b/src/message.c
index 510d407e9..f664122d0 100644
--- a/src/message.c
+++ b/src/message.c
@@ -301,28 +301,36 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
text_part->is_balanced = TRUE;
text_part->html_nodes = NULL;
text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
+ text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+ text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
if (text_part->html_nodes == NULL) {
- url_parse_text (task, text_part->orig, FALSE);
+ url_parse_text (task->task_pool, task, text_part, FALSE);
}
else {
- url_parse_text (task, text_part->orig, TRUE);
+ url_parse_text (task->task_pool, task, text_part, FALSE);
+ url_parse_text (task->task_pool, task, text_part, TRUE);
}
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->html_urls);
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
task->text_parts = g_list_prepend (task->text_parts, text_part);
}
else if (g_mime_content_type_is_type (type, "text", "plain")) {
msg_debug ("mime_foreach_callback: got urls from text/plain part");
- url_parse_text (task, part_content, FALSE);
text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
text_part->content = text_part->orig;
text_part->is_html = FALSE;
text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+ text_part->html_urls = NULL;
+ text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+ url_parse_text (task->task_pool, task, text_part, FALSE);
task->text_parts = g_list_prepend (task->text_parts, text_part);
+ memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
}
}
@@ -489,8 +497,10 @@ process_message (struct worker_task *task)
if (task->rcpts) {
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts);
}
-
- task->worker->srv->stat->messages_scanned ++;
+
+ if (task->worker) {
+ task->worker->srv->stat->messages_scanned ++;
+ }
/* free the parser (and the stream) */
g_object_unref (parser);
diff --git a/src/message.h b/src/message.h
index 72711638f..9a63b0824 100644
--- a/src/message.h
+++ b/src/message.h
@@ -21,6 +21,8 @@ struct mime_text_part {
GByteArray *orig;
GByteArray *content;
GNode *html_nodes;
+ GTree *urls;
+ GTree *html_urls;
fuzzy_hash_t *fuzzy;
};
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c
index 1b109bf55..fbe36f291 100644
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -237,6 +237,28 @@ find_raw_header_pos (const char *headers, const char *headerv)
return NULL;
}
+struct url_regexp_param { /* parameters tree_url_callback receives through its data pointer */
+struct worker_task *task; /**< task handed to task_cache_add when a url matches */
+GRegex *regexp; /**< compiled expression matched against each url string */
+struct rspamd_regexp *re; /**< regexp descriptor handed to task_cache_add alongside the task */
+gboolean found; /**< set to TRUE by tree_url_callback when a url matches */
+};
+
+/*
+ * Tree traversal callback used for url regexps.
+ *
+ * The node value is a struct uri; its string form is matched against the
+ * expression carried in the url_regexp_param passed as data. On a match the
+ * result is stored with task_cache_add, the found flag is raised and TRUE is
+ * returned (GLib stops a g_tree_foreach traversal when the callback returns
+ * TRUE). Without a match FALSE is returned. The key argument is not used.
+ */
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+	struct url_regexp_param *ctx = data;
+	struct uri *candidate = value;
+
+	if (g_regex_match (ctx->regexp, struri (candidate), 0, NULL) != TRUE) {
+		/* This url does not match */
+		return FALSE;
+	}
+
+	task_cache_add (ctx->task, ctx->re, 1);
+	ctx->found = TRUE;
+
+	return TRUE;
+}
+
static gsize
process_regexp (struct rspamd_regexp *re, struct worker_task *task)
{
@@ -244,7 +266,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
struct mime_text_part *part, *tmp;
GList *cur, *headerlist;
GRegex *regexp;
- struct uri *url;
+ struct url_regexp_param callback_param;
int r;
@@ -333,13 +355,30 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
return 0;
case REGEXP_URL:
msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text);
- TAILQ_FOREACH (url, &task->urls, next) {
- if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) {
- task_cache_add (task, re, 1);
- return 1;
+ cur = g_list_first (task->text_parts);
+ while (cur) {
+ part = (struct mime_text_part *)cur->data;
+ if (part->is_raw) {
+ regexp = re->raw_regexp;
}
+ else {
+ regexp = re->regexp;
+ }
+ callback_param.task = task;
+ callback_param.regexp = regexp;
+ callback_param.re = re;
+ callback_param.found = FALSE;
+ if (part->urls) {
+ g_tree_foreach (part->urls, tree_url_callback, &callback_param);
+ }
+ if (part->html_urls && callback_param.found == FALSE) {
+ g_tree_foreach (part->html_urls, tree_url_callback, &callback_param);
+ }
+ cur = g_list_next (cur);
+ }
+ if (callback_param.found == FALSE) {
+ task_cache_add (task, re, 0);
}
- task_cache_add (task, re, 0);
return 0;
case REGEXP_RAW_HEADER:
msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text);
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index d44c7fbe7..1514cae1c 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -28,6 +28,7 @@
#include "../config.h"
#include "../util.h"
+#include "../message.h"
#include <evdns.h>
#include "surbl.h"
@@ -647,29 +648,53 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree *url_
event_add (&param->ev, &timeout);
}
+/*
+ * Tree traversal callback that dispatches one url to the surbl checks.
+ *
+ * The node value is a struct uri and data is a struct redirector_param
+ * holding the task and the url tree. Depending on configuration the url is
+ * handed to the redirector, to memcached or straight to the surbl requests;
+ * the first two paths also increment task->save.saved. Always returns FALSE.
+ * The key argument is not used.
+ */
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+	struct redirector_param *ctx = data;
+	struct worker_task *task = ctx->task;
+	struct uri *url = value;
+
+	msg_debug ("surbl_test_url: check url %s", struri (url));
+
+	if (surbl_module_ctx->use_redirector) {
+		register_redirector_call (url, task, ctx->tree);
+		task->save.saved++;
+	}
+	else if (task->worker->srv->cfg->memcached_servers_num > 0) {
+		register_memcached_call (url, task, ctx->tree);
+		task->save.saved++;
+	}
+	else {
+		make_surbl_requests (url, task, ctx->tree);
+	}
+
+	return FALSE;
+}
+
static int
surbl_test_url (struct worker_task *task)
{
- struct uri *url;
GTree *url_tree;
+ GList *cur;
+ struct mime_text_part *part;
+ struct redirector_param param;
url_tree = g_tree_new ((GCompareFunc)g_ascii_strcasecmp);
-
- TAILQ_FOREACH (url, &task->urls, next) {
- msg_debug ("surbl_test_url: check url %s", struri (url));
- if (surbl_module_ctx->use_redirector) {
- register_redirector_call (url, task, url_tree);
- task->save.saved++;
+
+ param.tree = url_tree;
+ param.task = task;
+ cur = task->text_parts;
+ while (cur) {
+ part = cur->data;
+ if (part->urls) {
+ g_tree_foreach (part->urls, tree_url_callback, &param);
}
- else {
- if (task->worker->srv->cfg->memcached_servers_num > 0) {
- register_memcached_call (url, task, url_tree);
- task->save.saved++;
- }
- else {
- make_surbl_requests (url, task, url_tree);
- }
+ if (part->html_urls) {
+ g_tree_foreach (part->html_urls, tree_url_callback, &param);
}
+
+ cur = g_list_next (cur);
}
memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, url_tree);
diff --git a/src/protocol.c b/src/protocol.c
index 2cd025287..c551bb783 100644
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -369,10 +369,13 @@ show_url_header (struct worker_task *task)
int r = 0;
char outbuf[OUTBUFSIZ], c;
struct uri *url;
+ GList *cur;
f_str_t host;
r = snprintf (outbuf, sizeof (outbuf), "Urls: ");
- TAILQ_FOREACH (url, &task->urls, next) {
+ cur = task->urls;
+ while (cur) {
+ url = cur->data;
host.begin = url->host;
host.len = url->hostlen;
/* Skip long hosts to avoid protocol collisions */
@@ -386,7 +389,7 @@ show_url_header (struct worker_task *task)
r = 0;
}
/* Write url host to buf */
- if (TAILQ_NEXT (url, next) != NULL) {
+ if (g_list_next (cur) != NULL) {
c = *(host.begin + host.len);
*(host.begin + host.len) = '\0';
msg_debug ("show_url_header: write url: %s", host.begin);
@@ -400,6 +403,7 @@ show_url_header (struct worker_task *task)
r += snprintf (outbuf + r, sizeof (outbuf) - r, "%s" CRLF, host.begin);
*(host.begin + host.len) = c;
}
+ cur = g_list_next (cur);
}
rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE);
}
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index 1b47289a2..4527e699c 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -122,6 +122,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
token_node_t *new = NULL;
f_str_t url_domain;
struct uri *url;
+ GList *cur;
uint32_t h;
if (*tree == NULL) {
@@ -129,7 +130,9 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
}
- TAILQ_FOREACH (url, &task->urls, next) {
+ cur = task->urls;
+ while (cur) {
+ url = cur->data;
url_domain.begin = url->host;
url_domain.len = url->hostlen;
new = memory_pool_alloc (pool, sizeof (token_node_t));
@@ -139,6 +142,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
if (g_tree_lookup (*tree, new) == NULL) {
g_tree_insert (*tree, new, new);
}
+ cur = g_list_next (cur);
}
return TRUE;
diff --git a/src/url.c b/src/url.c
index 875358ae0..cc58a2caf 100644
--- a/src/url.c
+++ b/src/url.c
@@ -23,10 +23,11 @@
*/
#include "config.h"
+#include "url.h"
#include "util.h"
#include "fstring.h"
#include "main.h"
-#include "url.h"
+#include "message.h"
#define POST_CHAR 1
#define POST_CHAR_S "\001"
@@ -853,7 +854,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
}
void
-url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
+url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
{
GMatchInfo *info;
GError *err = NULL;
@@ -861,26 +862,32 @@ url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
char *url_str = NULL;
struct uri *new;
- if (!content->data || content->len == 0) {
+ if (!part->orig->data || part->orig->len == 0) {
msg_warn ("url_parse_text: got empty text part");
return;
}
if (url_init () == 0) {
- rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+ if (is_html) {
+ rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err);
+ }
+ else {
+ rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err);
+
+ }
if (rc) {
while (g_match_info_matches (info)) {
url_str = g_match_info_fetch (info, is_html ? 1 : 0);
msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
if (url_str != NULL) {
- new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
- if (new != NULL) {
- rc = parse_uri (new, url_str, task->task_pool);
- if (rc != URI_ERRNO_OK) {
- msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
- }
- if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
- TAILQ_INSERT_TAIL (&task->urls, new, next);
+ if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+ new = memory_pool_alloc (pool, sizeof (struct uri));
+ if (new != NULL) {
+ rc = parse_uri (new, url_str, pool);
+ if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+ g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+ task->urls = g_list_prepend (task->urls, new);
+ }
}
}
}
diff --git a/src/url.h b/src/url.h
index 2c367548d..7860f544a 100644
--- a/src/url.h
+++ b/src/url.h
@@ -6,6 +6,7 @@
#include "mem_pool.h"
struct worker_task;
+struct mime_text_part;
struct uri {
/* The start of the uri (and thus start of the protocol string). */
@@ -73,7 +74,7 @@ enum protocol {
#define struri(uri) ((uri)->string)
-void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
+void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
#endif
diff --git a/src/worker.c b/src/worker.c
index 3f223241b..af4575919 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -130,6 +130,9 @@ free_task (struct worker_task *task, gboolean is_soft)
if (task->text_parts) {
g_list_free (task->text_parts);
}
+ if (task->urls) {
+ g_list_free (task->urls);
+ }
memory_pool_delete (task->task_pool);
if (is_soft) {
/* Plan dispatcher shutdown */
@@ -287,7 +290,6 @@ accept_socket (int fd, short what, void *arg)
#endif
io_tv.tv_sec = WORKER_IO_TIMEOUT;
io_tv.tv_usec = 0;
- TAILQ_INIT (&new_task->urls);
new_task->task_pool = memory_pool_new (memory_pool_get_size ());
/* Add destructor for recipients list (it would be better to use an anonymous function here) */
memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);