summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-02-24 18:51:13 +0300
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-02-24 18:51:13 +0300
commit 121efbcddf8ec41eea91aa80574dab3730bf8976 (patch)
tree 2d39b5895526d63c7994aa81575c2db15a46cee1
parent 7eb9b642db888b26a97b06394695e55173c45895 (diff)
download rspamd-121efbcddf8ec41eea91aa80574dab3730bf8976.tar.gz
rspamd-121efbcddf8ec41eea91aa80574dab3730bf8976.zip
* Rewrite URL storage system
-rw-r--r-- CMakeLists.txt 28
-rw-r--r-- src/html.c 4
-rw-r--r-- src/lmtp.c 3
-rw-r--r-- src/lua/lua_task.c 61
-rw-r--r-- src/main.h 4
-rw-r--r-- src/plugins/surbl.c 134
-rw-r--r-- src/protocol.c 202
-rw-r--r-- src/tokenizers/tokenizers.c 32
-rw-r--r-- src/tokenizers/tokenizers.h 2
-rw-r--r-- src/url.c 11
-rw-r--r-- src/worker.c 60
11 files changed, 191 insertions, 350 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf6e7748c..e93793fdf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -735,34 +735,6 @@ IF(ENABLE_STATIC MATCHES "ON")
TARGET_LINK_LIBRARIES(test/rspamd-test ${PCRE_LIBRARIES})
ENDIF(ENABLE_STATIC MATCHES "ON")
-ADD_EXECUTABLE(utils/url-extracter ${UTILSDEPENDS} ${CONTRIBSRC} ${UTILSSRC})
-SET_TARGET_PROPERTIES(utils/url-extracter PROPERTIES LINKER_LANGUAGE C)
-TARGET_LINK_LIBRARIES(utils/url-extracter ${CMAKE_REQUIRED_LIBRARIES})
-TARGET_LINK_LIBRARIES(utils/url-extracter ${GLIB2_LIBRARIES})
-IF(GMIME2_FOUND)
- TARGET_LINK_LIBRARIES(utils/url-extracter ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
- TARGET_LINK_LIBRARIES(utils/url-extracter ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
-IF(ENABLE_STATIC MATCHES "ON")
- TARGET_LINK_LIBRARIES(utils/url-extracter ${PCRE_LIBRARIES})
-ENDIF(ENABLE_STATIC MATCHES "ON")
-
-ADD_EXECUTABLE(utils/expression-parser ${UTILSDEPENDS} ${CONTRIBSRC} ${EXPRSRC})
-SET_TARGET_PROPERTIES(utils/expression-parser PROPERTIES LINKER_LANGUAGE C)
-TARGET_LINK_LIBRARIES(utils/expression-parser ${CMAKE_REQUIRED_LIBRARIES})
-IF(LIBUTIL_LIBRARY)
- TARGET_LINK_LIBRARIES(utils/expression-parser util)
-ENDIF(LIBUTIL_LIBRARY)
-TARGET_LINK_LIBRARIES(utils/expression-parser ${GLIB2_LIBRARIES})
-IF(GMIME2_FOUND)
- TARGET_LINK_LIBRARIES(utils/expression-parser ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
- TARGET_LINK_LIBRARIES(utils/expression-parser ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
-IF(ENABLE_STATIC MATCHES "ON")
- TARGET_LINK_LIBRARIES(utils/expression-parser ${PCRE_LIBRARIES})
-ENDIF(ENABLE_STATIC MATCHES "ON")
##################### INSTALLATION ##########################################
diff --git a/src/html.c b/src/html.c
index 2ac2fe323..60c6eabd0 100644
--- a/src/html.c
+++ b/src/html.c
@@ -800,7 +800,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
}
if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
g_tree_insert (part->html_urls, url_text, url);
- task->urls = g_list_prepend (task->urls, url);
+ }
+ if (!g_tree_lookup (task->urls, url)) { /* add to the task-wide tree only when no equal url is stored yet */
+ g_tree_insert (task->urls, url, url);
}
}
}
diff --git a/src/lmtp.c b/src/lmtp.c
index 6e7d38c99..d080b7d95 100644
--- a/src/lmtp.c
+++ b/src/lmtp.c
@@ -115,9 +115,6 @@ free_lmtp_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft)
else {
rspamd_remove_dispatcher (lmtp->task->dispatcher);
}
- if (lmtp->task->urls) {
- g_list_free (lmtp->task->urls);
- }
close (lmtp->task->sock);
g_free (lmtp->task);
g_free (lmtp);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 48db4c833..0a6185f7e 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -220,27 +220,37 @@ lua_task_insert_result (lua_State * L)
return 1;
}
+struct lua_tree_cb_data { /* state handed to lua_tree_url_callback through g_tree_foreach */
+ lua_State *L; /* Lua state whose stack holds the result table */
+ int i; /* next array index to assign in the result table (callers start it at 1) */
+};
+
+static gboolean
+lua_tree_url_callback (gpointer key, gpointer value, gpointer ud)
+{ /* per-node callback: wraps value in an "rspamd{url}" userdata and stores it in the table just below it on the Lua stack */
+ struct uri **purl;
+ struct lua_tree_cb_data *cb = ud;
+
+ purl = lua_newuserdata (cb->L, sizeof (struct uri *));
+ lua_setclass (cb->L, "rspamd{url}", -1);
+ *purl = value;
+ lua_rawseti (cb->L, -2, cb->i++); /* table[i] = userdata, then advance the index */
+
+ return FALSE; /* FALSE lets the tree traversal continue to the next node */
+}
+
static gint
lua_task_get_urls (lua_State * L)
{
- gint i = 1;
struct worker_task *task = lua_check_task (L);
- GList *cur;
- struct uri **purl;
+ struct lua_tree_cb_data cb;
if (task) {
- cur = task->urls;
- if (cur != NULL) {
- lua_newtable (L);
- while (cur) {
- purl = lua_newuserdata (L, sizeof (struct uri *));
- lua_setclass (L, "rspamd{url}", -1);
- *purl = cur->data;
- lua_rawseti (L, -2, i++);
- cur = g_list_next (cur);
- }
- return 1;
- }
+ lua_newtable (L);
+ cb.i = 1;
+ cb.L = L;
+ g_tree_foreach (task->urls, lua_tree_url_callback, &cb);
+ return 1;
}
lua_pushnil (L);
@@ -250,24 +260,15 @@ lua_task_get_urls (lua_State * L)
static gint
lua_task_get_emails (lua_State * L)
{
- gint i = 1;
struct worker_task *task = lua_check_task (L);
- GList *cur;
- struct uri **purl;
+ struct lua_tree_cb_data cb;
if (task) {
- cur = task->emails;
- if (cur != NULL) {
- lua_newtable (L);
- while (cur) {
- purl = lua_newuserdata (L, sizeof (struct uri *));
- lua_setclass (L, "rspamd{url}", -1);
- *purl = cur->data;
- lua_rawseti (L, -2, i++);
- cur = g_list_next (cur);
- }
- return 1;
- }
+ lua_newtable (L);
+ cb.i = 1;
+ cb.L = L;
+ g_tree_foreach (task->emails, lua_tree_url_callback, &cb);
+ return 1;
}
lua_pushnil (L);
diff --git a/src/main.h b/src/main.h
index acbfe8a72..e6cf67078 100644
--- a/src/main.h
+++ b/src/main.h
@@ -203,8 +203,8 @@ struct worker_task {
GList *text_parts; /**< list of text parts */
gchar *raw_headers; /**< list of raw headers */
GList *received; /**< list of received headers */
- GList *urls; /**< list of parsed urls */
- GList *emails; /**< list of parsed emails */
+ GTree *urls; /**< tree of parsed urls (each url is stored as both key and value) */
+ GTree *emails; /**< tree of parsed emails (each email is stored as both key and value) */
GList *images; /**< list of images */
GList *raw_headers_list; /**< list of raw headers */
GHashTable *results; /**< hash table of metric_result indexed by
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c
index c9de2fb40..f83e0a229 100644
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -53,11 +53,10 @@
static struct surbl_ctx *surbl_module_ctx = NULL;
-static gint surbl_filter (struct worker_task *task);
+static gint surbl_filter (struct worker_task *task);
static void surbl_test_url (struct worker_task *task, void *user_data);
static void dns_callback (struct rspamd_dns_reply *reply, gpointer arg);
static void process_dns_results (struct worker_task *task, struct suffix_item *suffix, gchar *url, guint32 addr);
-static gint urls_command_handler (struct worker_task *task);
#define NO_REGEXP (gpointer)-1
@@ -224,7 +223,6 @@ surbl_module_init (struct config_file *cfg, struct module_ctx **ctx)
*ctx = (struct module_ctx *)surbl_module_ctx;
- register_protocol_command ("urls", urls_command_handler);
/* Register module options */
register_module_opt ("surbl", "redirector", MODULE_OPT_TYPE_STRING);
register_module_opt ("surbl", "url_expire", MODULE_OPT_TYPE_TIME);
@@ -570,7 +568,7 @@ format_surbl_request (memory_pool_t * pool, f_str_t * hostname, struct suffix_it
}
static void
-make_surbl_requests (struct uri *url, struct worker_task *task, GTree * tree,
+make_surbl_requests (struct uri *url, struct worker_task *task,
struct suffix_item *suffix, gboolean forced)
{
gchar *surbl_req;
@@ -583,20 +581,14 @@ make_surbl_requests (struct uri *url, struct worker_task *task, GTree * tree,
if (check_view (task->cfg->views, suffix->symbol, task)) {
if ((surbl_req = format_surbl_request (task->task_pool, &f, suffix, TRUE, &err, forced)) != NULL) {
- if (g_tree_lookup (tree, surbl_req) == NULL) {
- g_tree_insert (tree, surbl_req, surbl_req);
- param = memory_pool_alloc (task->task_pool, sizeof (struct dns_param));
- param->url = url;
- param->task = task;
- param->suffix = suffix;
- param->host_resolve = memory_pool_strdup (task->task_pool, surbl_req);
- debug_task ("send surbl dns request %s", surbl_req);
- if (make_dns_request (task->resolver, task->s, task->task_pool, dns_callback, (void *)param, DNS_REQUEST_A, surbl_req)) {
- param->task->save.saved++;
- }
- }
- else {
- debug_task ("request %s is already sent", surbl_req);
+ param = memory_pool_alloc (task->task_pool, sizeof (struct dns_param));
+ param->url = url;
+ param->task = task;
+ param->suffix = suffix;
+ param->host_resolve = memory_pool_strdup (task->task_pool, surbl_req);
+ debug_task ("send surbl dns request %s", surbl_req);
+ if (make_dns_request (task->resolver, task->s, task->task_pool, dns_callback, (void *)param, DNS_REQUEST_A, surbl_req)) {
+ param->task->save.saved++;
}
}
else if (err != NULL && err->code != WHITELIST_ERROR) {
@@ -732,7 +724,7 @@ memcached_callback (memcached_ctx_t * ctx, memc_error_t error, void *data)
param->task->save.saved = 1;
process_filters (param->task);
}
- make_surbl_requests (param->url, param->task, param->tree, param->suffix, FALSE);
+ make_surbl_requests (param->url, param->task, param->suffix, FALSE);
break;
default:
return;
@@ -740,7 +732,7 @@ memcached_callback (memcached_ctx_t * ctx, memc_error_t error, void *data)
}
static void
-register_memcached_call (struct uri *url, struct worker_task *task, GTree * url_tree, struct suffix_item *suffix)
+register_memcached_call (struct uri *url, struct worker_task *task, struct suffix_item *suffix)
{
struct memcached_param *param;
struct memcached_server *selected;
@@ -754,7 +746,6 @@ register_memcached_call (struct uri *url, struct worker_task *task, GTree * url_
param->url = url;
param->task = task;
- param->tree = url_tree;
param->suffix = suffix;
param->ctx = memory_pool_alloc0 (task->task_pool, sizeof (memcached_ctx_t));
@@ -799,7 +790,7 @@ free_redirector_session (void *ud)
event_del (&param->ev);
close (param->sock);
param->task->save.saved--;
- make_surbl_requests (param->url, param->task, param->tree, param->suffix, FALSE);
+ make_surbl_requests (param->url, param->task, param->suffix, FALSE);
if (param->task->save.saved == 0) {
/* Call other filters */
param->task->save.saved = 1;
@@ -886,7 +877,7 @@ redirector_callback (gint fd, short what, void *arg)
static void
-register_redirector_call (struct uri *url, struct worker_task *task, GTree * url_tree,
+register_redirector_call (struct uri *url, struct worker_task *task,
struct suffix_item *suffix, const gchar *rule)
{
gint s = -1;
@@ -907,7 +898,7 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree * url
if (s == -1) {
msg_info ("<%s> cannot create tcp socket failed: %s", task->message_id, strerror (errno));
task->save.saved--;
- make_surbl_requests (url, task, url_tree, suffix, FALSE);
+ make_surbl_requests (url, task, suffix, FALSE);
return;
}
@@ -916,7 +907,6 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree * url
param->task = task;
param->state = STATE_CONNECT;
param->sock = s;
- param->tree = url_tree;
param->suffix = suffix;
param->redirector = selected;
timeout = memory_pool_alloc (task->task_pool, sizeof (struct timeval));
@@ -966,22 +956,22 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
re = g_hash_table_lookup (surbl_module_ctx->redirector_hosts, red_domain);
if (re != NULL && (re == NO_REGEXP || g_regex_match (re, url->string, 0, NULL))) {
/* If no regexp found or founded regexp matches url string register redirector's call */
- register_redirector_call (url, param->task, param->tree, param->suffix, red_domain);
+ register_redirector_call (url, param->task, param->suffix, red_domain);
param->task->save.saved++;
return FALSE;
}
}
}
}
- make_surbl_requests (url, param->task, param->tree, param->suffix, FALSE);
+ make_surbl_requests (url, param->task, param->suffix, FALSE);
}
else {
if (param->task->worker->srv->cfg->memcached_servers_num > 0) {
- register_memcached_call (url, param->task, param->tree, param->suffix);
+ register_memcached_call (url, param->task, param->suffix);
param->task->save.saved++;
}
else {
- make_surbl_requests (url, param->task, param->tree, param->suffix, FALSE);
+ make_surbl_requests (url, param->task, param->suffix, FALSE);
}
}
@@ -991,31 +981,12 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
static void
surbl_test_url (struct worker_task *task, void *user_data)
{
- GTree *url_tree;
- GList *cur;
- struct mime_text_part *part;
struct redirector_param param;
struct suffix_item *suffix = user_data;
- url_tree = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
-
- param.tree = url_tree;
param.task = task;
param.suffix = suffix;
- cur = task->text_parts;
- while (cur) {
- part = cur->data;
- if (part->urls) {
- g_tree_foreach (part->urls, surbl_tree_url_callback, &param);
- }
- if (part->html_urls) {
- g_tree_foreach (part->html_urls, surbl_tree_url_callback, &param);
- }
-
- cur = g_list_next (cur);
- }
-
- memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, url_tree);
+ g_tree_foreach (task->urls, surbl_tree_url_callback, &param);
}
static gint
@@ -1025,71 +996,6 @@ surbl_filter (struct worker_task *task)
return 0;
}
-static gboolean
-urls_command_handler (struct worker_task *task)
-{
- GList *cur;
- gchar *outbuf, *urlstr;
- gint r, num = 0, buflen;
- struct uri *url;
- GError *err = NULL;
- GTree *url_tree;
- f_str_t f;
-
- url_tree = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
-
- /* First calculate buffer length */
- cur = g_list_first (task->urls);
- buflen = 0;
- while (cur) {
- url = cur->data;
- buflen += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1;
- cur = g_list_next (cur);
- }
-
- buflen += sizeof (RSPAMD_REPLY_BANNER " 0 OK" CRLF CRLF "Urls: ");
-
- outbuf = memory_pool_alloc (task->task_pool, buflen * sizeof (gchar));
-
- r = rspamd_snprintf (outbuf, buflen, "%s 0 %s" CRLF, (task->proto == SPAMC_PROTO) ? SPAMD_REPLY_BANNER : RSPAMD_REPLY_BANNER, "OK");
-
- r += rspamd_snprintf (outbuf + r, buflen - r - 2, "Urls: ");
-
- cur = g_list_first (task->urls);
-
- while (cur) {
- num++;
- url = cur->data;
- if (g_tree_lookup (url_tree, struri (url)) == NULL) {
- g_tree_insert (url_tree, struri (url), url);
- f.begin = url->host;
- f.len = url->hostlen;
- if ((urlstr = format_surbl_request (task->task_pool, &f, NULL, FALSE, &err, FALSE)) != NULL) {
- if (g_list_next (cur) != NULL) {
- r += rspamd_snprintf (outbuf + r, buflen - r - 2, "%s <\"%s\">, ", (gchar *)urlstr, struri (url));
- }
- else {
- r += rspamd_snprintf (outbuf + r, buflen - r - 2, "%s <\"%s\">", (gchar *)urlstr, struri (url));
- }
- }
- }
- cur = g_list_next (cur);
- }
-
- outbuf[r++] = '\r';
- outbuf[r++] = '\n';
-
- g_tree_destroy (url_tree);
- if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, TRUE)) {
- return FALSE;
- }
- msg_info ("msg ok, id: <%s>, %d urls extracted", task->message_id, num);
- task->state = STATE_REPLY;
-
- return TRUE;
-}
-
-
/*
* vi:ts=4
*/
diff --git a/src/protocol.c b/src/protocol.c
index ca49c5ecf..683d291a9 100644
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -29,8 +29,8 @@
#include "settings.h"
#include "message.h"
-/* Max line size as it is defined in rfc2822 */
-#define OUTBUFSIZ 1000
+/* Max line size */
+#define OUTBUFSIZ BUFSIZ
/*
* Just check if the passed message is spam or not and reply as
* described below
@@ -477,162 +477,106 @@ write_hashes_to_log (struct worker_task *task, gchar *logbuf, gint offset, gint
}
}
-static gint
-compare_url_func (gconstpointer a, gconstpointer b)
-{
- const struct uri *u1 = a, *u2 = b;
- if (u1->hostlen != u2->hostlen) {
- return u1->hostlen - u2->hostlen;
- }
- else {
- return memcmp (u1->host, u2->host, u1->hostlen);
- }
-}
+/* State shared with the tree callbacks that format the urls/emails reply headers */
+struct tree_cb_data {
+ gchar *buf; /* destination buffer for the header text */
+ gsize len; /* total size of buf in bytes */
+ gsize off; /* number of bytes already written into buf */
+};
-static gint
-compare_email_func (gconstpointer a, gconstpointer b)
+/*
+ * Callback for writing urls
+ */
+static gboolean
+urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
{
- const struct uri *u1 = a, *u2 = b;
- gint r;
+ struct tree_cb_data *cb = ud;
+ struct uri *url = value;
+ gsize len;
- if (u1->hostlen != u2->hostlen) {
- return u1->hostlen - u2->hostlen;
+ len = url->hostlen + url->userlen + 1; /* NOTE(review): only " <host>," is written below, so userlen looks unneeded and the space/comma are not counted - confirm the estimate */
+ if (cb->off + len >= cb->len) {
+ msg_info ("cannot write urls header completely, stripped reply at: %z", cb->off);
+ return TRUE; /* TRUE stops the traversal: remaining urls are dropped from the header */
}
else {
- if ((r = memcmp (u1->host, u2->host, u1->hostlen)) == 0){
- if (u1->userlen != u2->userlen) {
- return u1->userlen - u2->userlen;
- }
- else {
- return memcmp (u1->user, u2->user, u1->userlen);
- }
- }
- else {
- return r;
- }
+ cb->off += rspamd_snprintf (cb->buf + cb->off, cb->len - cb->off, " %*s,",
+ url->hostlen, url->host);
}
-
- return 0;
+ return FALSE; /* FALSE continues with the next url in the tree */
}
static gboolean
show_url_header (struct worker_task *task)
{
gint r = 0;
- gchar outbuf[OUTBUFSIZ], c;
- struct uri *url;
- GList *cur;
- f_str_t host;
- GTree *url_tree;
+ gchar outbuf[OUTBUFSIZ];
+ struct tree_cb_data cb;
r = rspamd_snprintf (outbuf, sizeof (outbuf), "Urls: ");
- url_tree = g_tree_new (compare_url_func);
- cur = task->urls;
- while (cur) {
- url = cur->data;
- if (task->cfg->log_urls) {
- /* Write this url to log as well */
- msg_info ("url found: <%s>, score: [%.2f / %.2f]", struri (url), default_score, default_required_score);
- }
- if (g_tree_lookup (url_tree, url) == NULL && url->hostlen > 0) {
- g_tree_insert (url_tree, url, url);
- host.begin = url->host;
- host.len = url->hostlen;
- /* Skip long hosts to avoid protocol coollisions */
- if (host.len > OUTBUFSIZ) {
- cur = g_list_next (cur);
- continue;
- }
- /* Do header folding */
- if (host.len + r >= OUTBUFSIZ - 3) {
- outbuf[r++] = '\r';
- outbuf[r++] = '\n';
- outbuf[r] = ' ';
- if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, TRUE, FALSE)) {
- return FALSE;
- }
- r = 0;
- }
- /* Write url host to buf */
- if (g_list_next (cur) != NULL) {
- c = *(host.begin + host.len);
- *(host.begin + host.len) = '\0';
- debug_task ("write url: %s", host.begin);
- r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s, ", host.begin);
- *(host.begin + host.len) = c;
- }
- else {
- c = *(host.begin + host.len);
- *(host.begin + host.len) = '\0';
- debug_task ("write url: %s", host.begin);
- r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s", host.begin);
- *(host.begin + host.len) = c;
- }
- }
- cur = g_list_next (cur);
+
+ cb.buf = outbuf;
+ cb.len = sizeof (outbuf);
+ cb.off = r;
+
+ g_tree_foreach (task->urls, urls_protocol_cb, &cb);
+ /* Strip last ',' */
+ if (cb.buf[cb.off - 1] == ',') {
+ cb.buf[--cb.off] = '\0';
}
- if (r == 0) {
+ cb.off += rspamd_snprintf (cb.buf + cb.off, cb.len - cb.off, CRLF);
+
+ return rspamd_dispatcher_write (task->dispatcher, outbuf, cb.off, FALSE, FALSE);
+}
+
+/*
+ * Callback for writing emails
+ */
+static gboolean
+emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
+{
+ struct tree_cb_data *cb = ud;
+ struct uri *url = value;
+ gsize len;
+
+ len = url->hostlen + url->userlen + 1; /* user + '@' + host; NOTE(review): the leading space and trailing ',' are not counted - confirm */
+ if (cb->off + len >= cb->len) {
+ msg_info ("cannot write emails header completely, stripped reply at: %z", cb->off);
return TRUE;
}
- r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, CRLF);
-
- return rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, FALSE);
+ else {
+ cb->off += rspamd_snprintf (cb->buf + cb->off, cb->len - cb->off, " %*s@%*s,",
+ url->userlen, url->user,
+ url->hostlen, url->host);
+ }
+ return FALSE; /* FALSE continues with the next email in the tree */
}
+/*
+ * Show header for emails found in a message
+ */
static gboolean
show_email_header (struct worker_task *task)
{
gint r = 0;
gchar outbuf[OUTBUFSIZ];
- struct uri *url;
- GList *cur;
- gsize len;
- GTree *url_tree;
+ struct tree_cb_data cb;
r = rspamd_snprintf (outbuf, sizeof (outbuf), "Emails: ");
- url_tree = g_tree_new (compare_email_func);
- cur = task->emails;
- while (cur) {
- url = cur->data;
- if (g_tree_lookup (url_tree, url) == NULL && url->hostlen > 0) {
- g_tree_insert (url_tree, url, url);
- len = url->hostlen + url->userlen + 1;
- /* Skip long hosts to avoid protocol coollisions */
- if (len > OUTBUFSIZ) {
- cur = g_list_next (cur);
- continue;
- }
- /* Do header folding */
- if (len + r >= OUTBUFSIZ - 3) {
- outbuf[r++] = '\r';
- outbuf[r++] = '\n';
- outbuf[r] = ' ';
- if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, TRUE, FALSE)) {
- return FALSE;
- }
- r = 0;
- }
- /* Write url host to buf */
- if (g_list_next (cur) != NULL) {
- r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%*s@%*s, ",
- url->userlen, url->user,
- url->hostlen, url->host);
- }
- else {
- r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%*s@%*s",
- url->userlen, url->user,
- url->hostlen, url->host);
- }
- }
- cur = g_list_next (cur);
- }
- if (r == 0) {
- return TRUE;
+
+ cb.buf = outbuf;
+ cb.len = sizeof (outbuf);
+ cb.off = r;
+
+ g_tree_foreach (task->emails, emails_protocol_cb, &cb);
+ /* Strip last ',' */
+ if (cb.buf[cb.off - 1] == ',') {
+ cb.buf[--cb.off] = '\0';
}
- r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, CRLF);
+ cb.off += rspamd_snprintf (cb.buf + cb.off, cb.len - cb.off, CRLF);
- return rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, FALSE);
+ return rspamd_dispatcher_write (task->dispatcher, outbuf, cb.off, FALSE, FALSE);
}
static void
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c
index faa8f074b..b7318bdfc 100644
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -143,38 +143,6 @@ get_next_word (f_str_t * buf, f_str_t * token)
return token;
}
-int
-tokenize_urls (memory_pool_t * pool, struct worker_task *task, GTree ** tree)
-{
- token_node_t *new = NULL;
- f_str_t url_domain;
- struct uri *url;
- GList *cur;
- uint32_t h;
-
- if (*tree == NULL) {
- *tree = g_tree_new (token_node_compare_func);
- memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree);
- }
-
- cur = task->urls;
- while (cur) {
- url = cur->data;
- url_domain.begin = url->host;
- url_domain.len = url->hostlen;
- new = memory_pool_alloc (pool, sizeof (token_node_t));
- h = fstrhash (&url_domain);
- new->h1 = h * primes[0];
- new->h2 = h * primes[1];
- if (g_tree_lookup (*tree, new) == NULL) {
- g_tree_insert (*tree, new, new);
- }
- cur = g_list_next (cur);
- }
-
- return TRUE;
-}
-
/* Struct to access gmime headers */
struct raw_header {
struct raw_header *next;
diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h
index 21e454e6b..59a2684d0 100644
--- a/src/tokenizers/tokenizers.h
+++ b/src/tokenizers/tokenizers.h
@@ -36,8 +36,6 @@ struct tokenizer* get_tokenizer (char *name);
f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
/* OSB tokenize function */
int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
-/* Common tokenizer for urls */
-int tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Common tokenizer for headers */
int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
/* Make tokens for a subject */
diff --git a/src/url.c b/src/url.c
index b24392056..083ceecba 100644
--- a/src/url.c
+++ b/src/url.c
@@ -1185,13 +1185,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
if (new != NULL) {
g_strstrip (url_str);
rc = parse_uri (new, url_str, pool);
- if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+ if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) &&
+ new->hostlen > 0) {
if (new->protocol == PROTOCOL_MAILTO) {
- task->emails = g_list_prepend (task->emails, new);
+ if (!g_tree_lookup (task->emails, new)) {
+ g_tree_insert (task->emails, new, new);
+ }
}
else {
g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
- task->urls = g_list_prepend (task->urls, new);
+ if (!g_tree_lookup (task->urls, new)) {
+ g_tree_insert (task->urls, new, new);
+ }
}
}
else {
diff --git a/src/worker.c b/src/worker.c
index 1d6ec05fb..0ac952b79 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -261,12 +261,6 @@ free_task (struct worker_task *task, gboolean is_soft)
if (task->text_parts) {
g_list_free (task->text_parts);
}
- if (task->urls) {
- g_list_free (task->urls);
- }
- if (task->emails) {
- g_list_free (task->emails);
- }
if (task->images) {
g_list_free (task->images);
}
@@ -460,6 +454,52 @@ err_socket (GError * err, void *arg)
}
}
+/* Compare two emails for building emails tree: order by host length, then host, then user length, then user (ASCII case-insensitive) */
+static gint
+compare_email_func (gconstpointer a, gconstpointer b)
+{
+ const struct uri *u1 = a, *u2 = b;
+ gint r;
+
+ if (u1->hostlen != u2->hostlen) {
+ return u1->hostlen - u2->hostlen; /* hosts of different length never compare equal */
+ }
+ else {
+ if ((r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen)) == 0){
+ if (u1->userlen != u2->userlen) {
+ return u1->userlen - u2->userlen; /* same host, user parts differ in length */
+ }
+ else {
+ return g_ascii_strncasecmp (u1->user, u2->user, u1->userlen);
+ }
+ }
+ else {
+ return r;
+ }
+ }
+
+ return 0; /* not reached: every branch above returns */
+}
+
+static gint
+compare_url_func (gconstpointer a, gconstpointer b)
+{ /* orders urls by host length, then by host compared ASCII case-insensitively; only the host is examined */
+ const struct uri *u1 = a, *u2 = b;
+ int r;
+
+ if (u1->hostlen != u2->hostlen) {
+ return u1->hostlen - u2->hostlen; /* hosts of different length never compare equal */
+ }
+ else {
+ r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen);
+ }
+
+ return r;
+}
+
+/*
+ * Create new task
+ */
struct worker_task *
construct_task (struct rspamd_worker *worker)
{
@@ -499,6 +539,14 @@ construct_task (struct rspamd_worker *worker)
memory_pool_add_destructor (new_task->task_pool,
(pool_destruct_func) g_hash_table_destroy,
new_task->re_cache);
+ new_task->emails = g_tree_new (compare_email_func);
+ memory_pool_add_destructor (new_task->task_pool,
+ (pool_destruct_func) g_tree_destroy,
+ new_task->emails);
+ new_task->urls = g_tree_new (compare_url_func);
+ memory_pool_add_destructor (new_task->task_pool,
+ (pool_destruct_func) g_tree_destroy,
+ new_task->urls);
new_task->s =
new_async_session (new_task->task_pool, free_task_hard, new_task);
new_task->sock = -1;