diff options
author | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-02-24 18:51:13 +0300 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rambler-co.ru> | 2011-02-24 18:51:13 +0300 |
commit | 121efbcddf8ec41eea91aa80574dab3730bf8976 (patch) | |
tree | 2d39b5895526d63c7994aa81575c2db15a46cee1 | |
parent | 7eb9b642db888b26a97b06394695e55173c45895 (diff) | |
download | rspamd-121efbcddf8ec41eea91aa80574dab3730bf8976.tar.gz rspamd-121efbcddf8ec41eea91aa80574dab3730bf8976.zip |
* Rewrite URL storage system
-rw-r--r-- | CMakeLists.txt | 28 | ||||
-rw-r--r-- | src/html.c | 4 | ||||
-rw-r--r-- | src/lmtp.c | 3 | ||||
-rw-r--r-- | src/lua/lua_task.c | 61 | ||||
-rw-r--r-- | src/main.h | 4 | ||||
-rw-r--r-- | src/plugins/surbl.c | 134 | ||||
-rw-r--r-- | src/protocol.c | 202 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.c | 32 | ||||
-rw-r--r-- | src/tokenizers/tokenizers.h | 2 | ||||
-rw-r--r-- | src/url.c | 11 | ||||
-rw-r--r-- | src/worker.c | 60 |
11 files changed, 191 insertions, 350 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index bf6e7748c..e93793fdf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -735,34 +735,6 @@ IF(ENABLE_STATIC MATCHES "ON") TARGET_LINK_LIBRARIES(test/rspamd-test ${PCRE_LIBRARIES}) ENDIF(ENABLE_STATIC MATCHES "ON") -ADD_EXECUTABLE(utils/url-extracter ${UTILSDEPENDS} ${CONTRIBSRC} ${UTILSSRC}) -SET_TARGET_PROPERTIES(utils/url-extracter PROPERTIES LINKER_LANGUAGE C) -TARGET_LINK_LIBRARIES(utils/url-extracter ${CMAKE_REQUIRED_LIBRARIES}) -TARGET_LINK_LIBRARIES(utils/url-extracter ${GLIB2_LIBRARIES}) -IF(GMIME2_FOUND) - TARGET_LINK_LIBRARIES(utils/url-extracter ${GMIME2_LIBRARIES}) -ELSE(GMIME2_FOUND) - TARGET_LINK_LIBRARIES(utils/url-extracter ${GMIME24_LIBRARIES}) -ENDIF(GMIME2_FOUND) -IF(ENABLE_STATIC MATCHES "ON") - TARGET_LINK_LIBRARIES(utils/url-extracter ${PCRE_LIBRARIES}) -ENDIF(ENABLE_STATIC MATCHES "ON") - -ADD_EXECUTABLE(utils/expression-parser ${UTILSDEPENDS} ${CONTRIBSRC} ${EXPRSRC}) -SET_TARGET_PROPERTIES(utils/expression-parser PROPERTIES LINKER_LANGUAGE C) -TARGET_LINK_LIBRARIES(utils/expression-parser ${CMAKE_REQUIRED_LIBRARIES}) -IF(LIBUTIL_LIBRARY) - TARGET_LINK_LIBRARIES(utils/expression-parser util) -ENDIF(LIBUTIL_LIBRARY) -TARGET_LINK_LIBRARIES(utils/expression-parser ${GLIB2_LIBRARIES}) -IF(GMIME2_FOUND) - TARGET_LINK_LIBRARIES(utils/expression-parser ${GMIME2_LIBRARIES}) -ELSE(GMIME2_FOUND) - TARGET_LINK_LIBRARIES(utils/expression-parser ${GMIME24_LIBRARIES}) -ENDIF(GMIME2_FOUND) -IF(ENABLE_STATIC MATCHES "ON") - TARGET_LINK_LIBRARIES(utils/expression-parser ${PCRE_LIBRARIES}) -ENDIF(ENABLE_STATIC MATCHES "ON") ##################### INSTALLATION ########################################## diff --git a/src/html.c b/src/html.c index 2ac2fe323..60c6eabd0 100644 --- a/src/html.c +++ b/src/html.c @@ -800,7 +800,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i } if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) { g_tree_insert 
(part->html_urls, url_text, url); - task->urls = g_list_prepend (task->urls, url); + } + if (g_tree_lookup (task->urls, url)) { + g_tree_insert (task->urls, url, url); } } } diff --git a/src/lmtp.c b/src/lmtp.c index 6e7d38c99..d080b7d95 100644 --- a/src/lmtp.c +++ b/src/lmtp.c @@ -115,9 +115,6 @@ free_lmtp_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft) else { rspamd_remove_dispatcher (lmtp->task->dispatcher); } - if (lmtp->task->urls) { - g_list_free (lmtp->task->urls); - } close (lmtp->task->sock); g_free (lmtp->task); g_free (lmtp); diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 48db4c833..0a6185f7e 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -220,27 +220,37 @@ lua_task_insert_result (lua_State * L) return 1; } +struct lua_tree_cb_data { + lua_State *L; + int i; +}; + +static gboolean +lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) +{ + struct uri **purl; + struct lua_tree_cb_data *cb = ud; + + purl = lua_newuserdata (cb->L, sizeof (struct uri *)); + lua_setclass (cb->L, "rspamd{url}", -1); + *purl = value; + lua_rawseti (cb->L, -2, cb->i++); + + return FALSE; +} + static gint lua_task_get_urls (lua_State * L) { - gint i = 1; struct worker_task *task = lua_check_task (L); - GList *cur; - struct uri **purl; + struct lua_tree_cb_data cb; if (task) { - cur = task->urls; - if (cur != NULL) { - lua_newtable (L); - while (cur) { - purl = lua_newuserdata (L, sizeof (struct uri *)); - lua_setclass (L, "rspamd{url}", -1); - *purl = cur->data; - lua_rawseti (L, -2, i++); - cur = g_list_next (cur); - } - return 1; - } + lua_newtable (L); + cb.i = 1; + cb.L = L; + g_tree_foreach (task->urls, lua_tree_url_callback, &cb); + return 1; } lua_pushnil (L); @@ -250,24 +260,15 @@ lua_task_get_urls (lua_State * L) static gint lua_task_get_emails (lua_State * L) { - gint i = 1; struct worker_task *task = lua_check_task (L); - GList *cur; - struct uri **purl; + struct lua_tree_cb_data cb; if (task) { - cur = task->emails; - if 
(cur != NULL) { - lua_newtable (L); - while (cur) { - purl = lua_newuserdata (L, sizeof (struct uri *)); - lua_setclass (L, "rspamd{url}", -1); - *purl = cur->data; - lua_rawseti (L, -2, i++); - cur = g_list_next (cur); - } - return 1; - } + lua_newtable (L); + cb.i = 1; + cb.L = L; + g_tree_foreach (task->emails, lua_tree_url_callback, &cb); + return 1; } lua_pushnil (L); diff --git a/src/main.h b/src/main.h index acbfe8a72..e6cf67078 100644 --- a/src/main.h +++ b/src/main.h @@ -203,8 +203,8 @@ struct worker_task { GList *text_parts; /**< list of text parts */ gchar *raw_headers; /**< list of raw headers */ GList *received; /**< list of received headers */ - GList *urls; /**< list of parsed urls */ - GList *emails; /**< list of parsed emails */ + GTree *urls; /**< list of parsed urls */ + GTree *emails; /**< list of parsed emails */ GList *images; /**< list of images */ GList *raw_headers_list; /**< list of raw headers */ GHashTable *results; /**< hash table of metric_result indexed by diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c index c9de2fb40..f83e0a229 100644 --- a/src/plugins/surbl.c +++ b/src/plugins/surbl.c @@ -53,11 +53,10 @@ static struct surbl_ctx *surbl_module_ctx = NULL; -static gint surbl_filter (struct worker_task *task); +static gint surbl_filter (struct worker_task *task); static void surbl_test_url (struct worker_task *task, void *user_data); static void dns_callback (struct rspamd_dns_reply *reply, gpointer arg); static void process_dns_results (struct worker_task *task, struct suffix_item *suffix, gchar *url, guint32 addr); -static gint urls_command_handler (struct worker_task *task); #define NO_REGEXP (gpointer)-1 @@ -224,7 +223,6 @@ surbl_module_init (struct config_file *cfg, struct module_ctx **ctx) *ctx = (struct module_ctx *)surbl_module_ctx; - register_protocol_command ("urls", urls_command_handler); /* Register module options */ register_module_opt ("surbl", "redirector", MODULE_OPT_TYPE_STRING); register_module_opt ("surbl", 
"url_expire", MODULE_OPT_TYPE_TIME); @@ -570,7 +568,7 @@ format_surbl_request (memory_pool_t * pool, f_str_t * hostname, struct suffix_it } static void -make_surbl_requests (struct uri *url, struct worker_task *task, GTree * tree, +make_surbl_requests (struct uri *url, struct worker_task *task, struct suffix_item *suffix, gboolean forced) { gchar *surbl_req; @@ -583,20 +581,14 @@ make_surbl_requests (struct uri *url, struct worker_task *task, GTree * tree, if (check_view (task->cfg->views, suffix->symbol, task)) { if ((surbl_req = format_surbl_request (task->task_pool, &f, suffix, TRUE, &err, forced)) != NULL) { - if (g_tree_lookup (tree, surbl_req) == NULL) { - g_tree_insert (tree, surbl_req, surbl_req); - param = memory_pool_alloc (task->task_pool, sizeof (struct dns_param)); - param->url = url; - param->task = task; - param->suffix = suffix; - param->host_resolve = memory_pool_strdup (task->task_pool, surbl_req); - debug_task ("send surbl dns request %s", surbl_req); - if (make_dns_request (task->resolver, task->s, task->task_pool, dns_callback, (void *)param, DNS_REQUEST_A, surbl_req)) { - param->task->save.saved++; - } - } - else { - debug_task ("request %s is already sent", surbl_req); + param = memory_pool_alloc (task->task_pool, sizeof (struct dns_param)); + param->url = url; + param->task = task; + param->suffix = suffix; + param->host_resolve = memory_pool_strdup (task->task_pool, surbl_req); + debug_task ("send surbl dns request %s", surbl_req); + if (make_dns_request (task->resolver, task->s, task->task_pool, dns_callback, (void *)param, DNS_REQUEST_A, surbl_req)) { + param->task->save.saved++; } } else if (err != NULL && err->code != WHITELIST_ERROR) { @@ -732,7 +724,7 @@ memcached_callback (memcached_ctx_t * ctx, memc_error_t error, void *data) param->task->save.saved = 1; process_filters (param->task); } - make_surbl_requests (param->url, param->task, param->tree, param->suffix, FALSE); + make_surbl_requests (param->url, param->task, param->suffix, 
FALSE); break; default: return; @@ -740,7 +732,7 @@ memcached_callback (memcached_ctx_t * ctx, memc_error_t error, void *data) } static void -register_memcached_call (struct uri *url, struct worker_task *task, GTree * url_tree, struct suffix_item *suffix) +register_memcached_call (struct uri *url, struct worker_task *task, struct suffix_item *suffix) { struct memcached_param *param; struct memcached_server *selected; @@ -754,7 +746,6 @@ register_memcached_call (struct uri *url, struct worker_task *task, GTree * url_ param->url = url; param->task = task; - param->tree = url_tree; param->suffix = suffix; param->ctx = memory_pool_alloc0 (task->task_pool, sizeof (memcached_ctx_t)); @@ -799,7 +790,7 @@ free_redirector_session (void *ud) event_del (&param->ev); close (param->sock); param->task->save.saved--; - make_surbl_requests (param->url, param->task, param->tree, param->suffix, FALSE); + make_surbl_requests (param->url, param->task, param->suffix, FALSE); if (param->task->save.saved == 0) { /* Call other filters */ param->task->save.saved = 1; @@ -886,7 +877,7 @@ redirector_callback (gint fd, short what, void *arg) static void -register_redirector_call (struct uri *url, struct worker_task *task, GTree * url_tree, +register_redirector_call (struct uri *url, struct worker_task *task, struct suffix_item *suffix, const gchar *rule) { gint s = -1; @@ -907,7 +898,7 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree * url if (s == -1) { msg_info ("<%s> cannot create tcp socket failed: %s", task->message_id, strerror (errno)); task->save.saved--; - make_surbl_requests (url, task, url_tree, suffix, FALSE); + make_surbl_requests (url, task, suffix, FALSE); return; } @@ -916,7 +907,6 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree * url param->task = task; param->state = STATE_CONNECT; param->sock = s; - param->tree = url_tree; param->suffix = suffix; param->redirector = selected; timeout = memory_pool_alloc 
(task->task_pool, sizeof (struct timeval)); @@ -966,22 +956,22 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data) re = g_hash_table_lookup (surbl_module_ctx->redirector_hosts, red_domain); if (re != NULL && (re == NO_REGEXP || g_regex_match (re, url->string, 0, NULL))) { /* If no regexp found or founded regexp matches url string register redirector's call */ - register_redirector_call (url, param->task, param->tree, param->suffix, red_domain); + register_redirector_call (url, param->task, param->suffix, red_domain); param->task->save.saved++; return FALSE; } } } } - make_surbl_requests (url, param->task, param->tree, param->suffix, FALSE); + make_surbl_requests (url, param->task, param->suffix, FALSE); } else { if (param->task->worker->srv->cfg->memcached_servers_num > 0) { - register_memcached_call (url, param->task, param->tree, param->suffix); + register_memcached_call (url, param->task, param->suffix); param->task->save.saved++; } else { - make_surbl_requests (url, param->task, param->tree, param->suffix, FALSE); + make_surbl_requests (url, param->task, param->suffix, FALSE); } } @@ -991,31 +981,12 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data) static void surbl_test_url (struct worker_task *task, void *user_data) { - GTree *url_tree; - GList *cur; - struct mime_text_part *part; struct redirector_param param; struct suffix_item *suffix = user_data; - url_tree = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); - - param.tree = url_tree; param.task = task; param.suffix = suffix; - cur = task->text_parts; - while (cur) { - part = cur->data; - if (part->urls) { - g_tree_foreach (part->urls, surbl_tree_url_callback, &param); - } - if (part->html_urls) { - g_tree_foreach (part->html_urls, surbl_tree_url_callback, &param); - } - - cur = g_list_next (cur); - } - - memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, url_tree); + g_tree_foreach (task->urls, surbl_tree_url_callback, &param); } static gint @@ 
-1025,71 +996,6 @@ surbl_filter (struct worker_task *task) return 0; } -static gboolean -urls_command_handler (struct worker_task *task) -{ - GList *cur; - gchar *outbuf, *urlstr; - gint r, num = 0, buflen; - struct uri *url; - GError *err = NULL; - GTree *url_tree; - f_str_t f; - - url_tree = g_tree_new ((GCompareFunc) g_ascii_strcasecmp); - - /* First calculate buffer length */ - cur = g_list_first (task->urls); - buflen = 0; - while (cur) { - url = cur->data; - buflen += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1; - cur = g_list_next (cur); - } - - buflen += sizeof (RSPAMD_REPLY_BANNER " 0 OK" CRLF CRLF "Urls: "); - - outbuf = memory_pool_alloc (task->task_pool, buflen * sizeof (gchar)); - - r = rspamd_snprintf (outbuf, buflen, "%s 0 %s" CRLF, (task->proto == SPAMC_PROTO) ? SPAMD_REPLY_BANNER : RSPAMD_REPLY_BANNER, "OK"); - - r += rspamd_snprintf (outbuf + r, buflen - r - 2, "Urls: "); - - cur = g_list_first (task->urls); - - while (cur) { - num++; - url = cur->data; - if (g_tree_lookup (url_tree, struri (url)) == NULL) { - g_tree_insert (url_tree, struri (url), url); - f.begin = url->host; - f.len = url->hostlen; - if ((urlstr = format_surbl_request (task->task_pool, &f, NULL, FALSE, &err, FALSE)) != NULL) { - if (g_list_next (cur) != NULL) { - r += rspamd_snprintf (outbuf + r, buflen - r - 2, "%s <\"%s\">, ", (gchar *)urlstr, struri (url)); - } - else { - r += rspamd_snprintf (outbuf + r, buflen - r - 2, "%s <\"%s\">", (gchar *)urlstr, struri (url)); - } - } - } - cur = g_list_next (cur); - } - - outbuf[r++] = '\r'; - outbuf[r++] = '\n'; - - g_tree_destroy (url_tree); - if (! 
rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, TRUE)) { - return FALSE; - } - msg_info ("msg ok, id: <%s>, %d urls extracted", task->message_id, num); - task->state = STATE_REPLY; - - return TRUE; -} - - /* * vi:ts=4 */ diff --git a/src/protocol.c b/src/protocol.c index ca49c5ecf..683d291a9 100644 --- a/src/protocol.c +++ b/src/protocol.c @@ -29,8 +29,8 @@ #include "settings.h" #include "message.h" -/* Max line size as it is defined in rfc2822 */ -#define OUTBUFSIZ 1000 +/* Max line size */ +#define OUTBUFSIZ BUFSIZ /* * Just check if the passed message is spam or not and reply as * described below @@ -477,162 +477,106 @@ write_hashes_to_log (struct worker_task *task, gchar *logbuf, gint offset, gint } } -static gint -compare_url_func (gconstpointer a, gconstpointer b) -{ - const struct uri *u1 = a, *u2 = b; - if (u1->hostlen != u2->hostlen) { - return u1->hostlen - u2->hostlen; - } - else { - return memcmp (u1->host, u2->host, u1->hostlen); - } -} +/* Structure for writing tree data */ +struct tree_cb_data { + gchar *buf; + gsize len; + gsize off; +}; -static gint -compare_email_func (gconstpointer a, gconstpointer b) +/* + * Callback for writing urls + */ +static gboolean +urls_protocol_cb (gpointer key, gpointer value, gpointer ud) { - const struct uri *u1 = a, *u2 = b; - gint r; + struct tree_cb_data *cb = ud; + struct uri *url = value; + gsize len; - if (u1->hostlen != u2->hostlen) { - return u1->hostlen - u2->hostlen; + len = url->hostlen + url->userlen + 1; + if (cb->off + len >= cb->len) { + msg_info ("cannot write urls header completely, stripped reply at: %z", cb->off); + return TRUE; } else { - if ((r = memcmp (u1->host, u2->host, u1->hostlen)) == 0){ - if (u1->userlen != u2->userlen) { - return u1->userlen - u2->userlen; - } - else { - return memcmp (u1->user, u2->user, u1->userlen); - } - } - else { - return r; - } + cb->off += rspamd_snprintf (cb->buf + cb->off, cb->len - cb->off, " %*s,", + url->hostlen, url->host); } - - return 0; + 
return FALSE; } static gboolean show_url_header (struct worker_task *task) { gint r = 0; - gchar outbuf[OUTBUFSIZ], c; - struct uri *url; - GList *cur; - f_str_t host; - GTree *url_tree; + gchar outbuf[OUTBUFSIZ]; + struct tree_cb_data cb; r = rspamd_snprintf (outbuf, sizeof (outbuf), "Urls: "); - url_tree = g_tree_new (compare_url_func); - cur = task->urls; - while (cur) { - url = cur->data; - if (task->cfg->log_urls) { - /* Write this url to log as well */ - msg_info ("url found: <%s>, score: [%.2f / %.2f]", struri (url), default_score, default_required_score); - } - if (g_tree_lookup (url_tree, url) == NULL && url->hostlen > 0) { - g_tree_insert (url_tree, url, url); - host.begin = url->host; - host.len = url->hostlen; - /* Skip long hosts to avoid protocol coollisions */ - if (host.len > OUTBUFSIZ) { - cur = g_list_next (cur); - continue; - } - /* Do header folding */ - if (host.len + r >= OUTBUFSIZ - 3) { - outbuf[r++] = '\r'; - outbuf[r++] = '\n'; - outbuf[r] = ' '; - if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, TRUE, FALSE)) { - return FALSE; - } - r = 0; - } - /* Write url host to buf */ - if (g_list_next (cur) != NULL) { - c = *(host.begin + host.len); - *(host.begin + host.len) = '\0'; - debug_task ("write url: %s", host.begin); - r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s, ", host.begin); - *(host.begin + host.len) = c; - } - else { - c = *(host.begin + host.len); - *(host.begin + host.len) = '\0'; - debug_task ("write url: %s", host.begin); - r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s", host.begin); - *(host.begin + host.len) = c; - } - } - cur = g_list_next (cur); + + cb.buf = outbuf; + cb.len = sizeof (outbuf); + cb.off = r; + + g_tree_foreach (task->urls, urls_protocol_cb, &cb); + /* Strip last ',' */ + if (cb.buf[cb.off - 1] == ',') { + cb.buf[--cb.off] = '\0'; } - if (r == 0) { + cb.off += rspamd_snprintf (cb.buf + cb.off, cb.len - cb.off, CRLF); + + return rspamd_dispatcher_write (task->dispatcher, 
outbuf, cb.off, FALSE, FALSE); +} + +/* + * Callback for writing emails + */ +static gboolean +emails_protocol_cb (gpointer key, gpointer value, gpointer ud) +{ + struct tree_cb_data *cb = ud; + struct uri *url = value; + gsize len; + + len = url->hostlen + url->userlen + 1; + if (cb->off + len >= cb->len) { + msg_info ("cannot write emails header completely, stripped reply at: %z", cb->off); return TRUE; } - r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, CRLF); - - return rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, FALSE); + else { + cb->off += rspamd_snprintf (cb->buf + cb->off, cb->len - cb->off, " %*s@%*s,", + url->userlen, url->user, + url->hostlen, url->host); + } + return FALSE; } +/* + * Show header for emails found in a message + */ static gboolean show_email_header (struct worker_task *task) { gint r = 0; gchar outbuf[OUTBUFSIZ]; - struct uri *url; - GList *cur; - gsize len; - GTree *url_tree; + struct tree_cb_data cb; r = rspamd_snprintf (outbuf, sizeof (outbuf), "Emails: "); - url_tree = g_tree_new (compare_email_func); - cur = task->emails; - while (cur) { - url = cur->data; - if (g_tree_lookup (url_tree, url) == NULL && url->hostlen > 0) { - g_tree_insert (url_tree, url, url); - len = url->hostlen + url->userlen + 1; - /* Skip long hosts to avoid protocol coollisions */ - if (len > OUTBUFSIZ) { - cur = g_list_next (cur); - continue; - } - /* Do header folding */ - if (len + r >= OUTBUFSIZ - 3) { - outbuf[r++] = '\r'; - outbuf[r++] = '\n'; - outbuf[r] = ' '; - if (! 
rspamd_dispatcher_write (task->dispatcher, outbuf, r, TRUE, FALSE)) { - return FALSE; - } - r = 0; - } - /* Write url host to buf */ - if (g_list_next (cur) != NULL) { - r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%*s@%*s, ", - url->userlen, url->user, - url->hostlen, url->host); - } - else { - r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%*s@%*s", - url->userlen, url->user, - url->hostlen, url->host); - } - } - cur = g_list_next (cur); - } - if (r == 0) { - return TRUE; + + cb.buf = outbuf; + cb.len = sizeof (outbuf); + cb.off = r; + + g_tree_foreach (task->emails, emails_protocol_cb, &cb); + /* Strip last ',' */ + if (cb.buf[cb.off - 1] == ',') { + cb.buf[--cb.off] = '\0'; } - r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, CRLF); + cb.off += rspamd_snprintf (cb.buf + cb.off, cb.len - cb.off, CRLF); - return rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, FALSE); + return rspamd_dispatcher_write (task->dispatcher, outbuf, cb.off, FALSE, FALSE); } static void diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c index faa8f074b..b7318bdfc 100644 --- a/src/tokenizers/tokenizers.c +++ b/src/tokenizers/tokenizers.c @@ -143,38 +143,6 @@ get_next_word (f_str_t * buf, f_str_t * token) return token; } -int -tokenize_urls (memory_pool_t * pool, struct worker_task *task, GTree ** tree) -{ - token_node_t *new = NULL; - f_str_t url_domain; - struct uri *url; - GList *cur; - uint32_t h; - - if (*tree == NULL) { - *tree = g_tree_new (token_node_compare_func); - memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree); - } - - cur = task->urls; - while (cur) { - url = cur->data; - url_domain.begin = url->host; - url_domain.len = url->hostlen; - new = memory_pool_alloc (pool, sizeof (token_node_t)); - h = fstrhash (&url_domain); - new->h1 = h * primes[0]; - new->h2 = h * primes[1]; - if (g_tree_lookup (*tree, new) == NULL) { - g_tree_insert (*tree, new, new); - } - cur = g_list_next (cur); - } - - 
return TRUE; -} - /* Struct to access gmime headers */ struct raw_header { struct raw_header *next; diff --git a/src/tokenizers/tokenizers.h b/src/tokenizers/tokenizers.h index 21e454e6b..59a2684d0 100644 --- a/src/tokenizers/tokenizers.h +++ b/src/tokenizers/tokenizers.h @@ -36,8 +36,6 @@ struct tokenizer* get_tokenizer (char *name); f_str_t *get_next_word (f_str_t *buf, f_str_t *token); /* OSB tokenize function */ int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur); -/* Common tokenizer for urls */ -int tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Common tokenizer for headers */ int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur); /* Make tokens for a subject */ @@ -1185,13 +1185,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text if (new != NULL) { g_strstrip (url_str); rc = parse_uri (new, url_str, pool); - if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { + if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) && + new->hostlen > 0) { if (new->protocol == PROTOCOL_MAILTO) { - task->emails = g_list_prepend (task->emails, new); + if (!g_tree_lookup (task->emails, new)) { + g_tree_insert (task->emails, new, new); + } } else { g_tree_insert (is_html ? 
part->html_urls : part->urls, url_str, new); - task->urls = g_list_prepend (task->urls, new); + if (!g_tree_lookup (task->urls, new)) { + g_tree_insert (task->urls, new, new); + } } } else { diff --git a/src/worker.c b/src/worker.c index 1d6ec05fb..0ac952b79 100644 --- a/src/worker.c +++ b/src/worker.c @@ -261,12 +261,6 @@ free_task (struct worker_task *task, gboolean is_soft) if (task->text_parts) { g_list_free (task->text_parts); } - if (task->urls) { - g_list_free (task->urls); - } - if (task->emails) { - g_list_free (task->emails); - } if (task->images) { g_list_free (task->images); } @@ -460,6 +454,52 @@ err_socket (GError * err, void *arg) } } +/* Compare two emails for building emails tree */ +static gint +compare_email_func (gconstpointer a, gconstpointer b) +{ + const struct uri *u1 = a, *u2 = b; + gint r; + + if (u1->hostlen != u2->hostlen) { + return u1->hostlen - u2->hostlen; + } + else { + if ((r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen)) == 0){ + if (u1->userlen != u2->userlen) { + return u1->userlen - u2->userlen; + } + else { + return g_ascii_strncasecmp (u1->user, u2->user, u1->userlen); + } + } + else { + return r; + } + } + + return 0; +} + +static gint +compare_url_func (gconstpointer a, gconstpointer b) +{ + const struct uri *u1 = a, *u2 = b; + int r; + + if (u1->hostlen != u2->hostlen) { + return u1->hostlen - u2->hostlen; + } + else { + r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen); + } + + return r; +} + +/* + * Create new task + */ struct worker_task * construct_task (struct rspamd_worker *worker) { @@ -499,6 +539,14 @@ construct_task (struct rspamd_worker *worker) memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func) g_hash_table_destroy, new_task->re_cache); + new_task->emails = g_tree_new (compare_email_func); + memory_pool_add_destructor (new_task->task_pool, + (pool_destruct_func) g_tree_destroy, + new_task->emails); + new_task->urls = g_tree_new (compare_url_func); + memory_pool_add_destructor 
(new_task->task_pool, + (pool_destruct_func) g_tree_destroy, + new_task->urls); new_task->s = new_async_session (new_task->task_pool, free_task_hard, new_task); new_task->sock = -1; |