source.dussan.org Git - rspamd.git/commitdiff
* Rewrite URL storage system
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Thu, 24 Feb 2011 15:51:13 +0000 (18:51 +0300)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Thu, 24 Feb 2011 15:51:13 +0000 (18:51 +0300)
CMakeLists.txt
src/html.c
src/lmtp.c
src/lua/lua_task.c
src/main.h
src/plugins/surbl.c
src/protocol.c
src/tokenizers/tokenizers.c
src/tokenizers/tokenizers.h
src/url.c
src/worker.c

index bf6e7748c4bc07f6a3ebbf224d22ba8ac4b7c54c..e93793fdf501e80df3418834f4d620f64b4815bb 100644 (file)
@@ -735,34 +735,6 @@ IF(ENABLE_STATIC MATCHES "ON")
        TARGET_LINK_LIBRARIES(test/rspamd-test ${PCRE_LIBRARIES})
 ENDIF(ENABLE_STATIC MATCHES "ON")
 
-ADD_EXECUTABLE(utils/url-extracter ${UTILSDEPENDS} ${CONTRIBSRC} ${UTILSSRC})
-SET_TARGET_PROPERTIES(utils/url-extracter PROPERTIES LINKER_LANGUAGE C)
-TARGET_LINK_LIBRARIES(utils/url-extracter ${CMAKE_REQUIRED_LIBRARIES})
-TARGET_LINK_LIBRARIES(utils/url-extracter ${GLIB2_LIBRARIES})
-IF(GMIME2_FOUND)
-       TARGET_LINK_LIBRARIES(utils/url-extracter ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
-       TARGET_LINK_LIBRARIES(utils/url-extracter ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
-IF(ENABLE_STATIC MATCHES "ON")
-       TARGET_LINK_LIBRARIES(utils/url-extracter ${PCRE_LIBRARIES})
-ENDIF(ENABLE_STATIC MATCHES "ON")
-
-ADD_EXECUTABLE(utils/expression-parser ${UTILSDEPENDS} ${CONTRIBSRC} ${EXPRSRC})
-SET_TARGET_PROPERTIES(utils/expression-parser PROPERTIES LINKER_LANGUAGE C)
-TARGET_LINK_LIBRARIES(utils/expression-parser ${CMAKE_REQUIRED_LIBRARIES})
-IF(LIBUTIL_LIBRARY)
-       TARGET_LINK_LIBRARIES(utils/expression-parser util)
-ENDIF(LIBUTIL_LIBRARY)
-TARGET_LINK_LIBRARIES(utils/expression-parser ${GLIB2_LIBRARIES})
-IF(GMIME2_FOUND)
-       TARGET_LINK_LIBRARIES(utils/expression-parser ${GMIME2_LIBRARIES})
-ELSE(GMIME2_FOUND)
-       TARGET_LINK_LIBRARIES(utils/expression-parser ${GMIME24_LIBRARIES})
-ENDIF(GMIME2_FOUND)
-IF(ENABLE_STATIC MATCHES "ON")
-       TARGET_LINK_LIBRARIES(utils/expression-parser ${PCRE_LIBRARIES})
-ENDIF(ENABLE_STATIC MATCHES "ON")
 
 ##################### INSTALLATION ##########################################
 
index 2ac2fe323e9c63610ab1b2cdc44400b2842b2f74..60c6eabd0471e8c3158662221fae8dbb2d498002 100644 (file)
@@ -800,7 +800,9 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                        }
                        if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
                                g_tree_insert (part->html_urls, url_text, url);
-                               task->urls = g_list_prepend (task->urls, url);
+                       }
+                       if (g_tree_lookup (task->urls, url) == NULL) { /* insert only urls that are not yet stored in the task-wide tree */
+                               g_tree_insert (task->urls, url, url);
                        }
                }
        }
index 6e7d38c99b5bd2d120048d0e11a08f80658158c0..d080b7d955913751175950d4f42ee57e166e8910 100644 (file)
@@ -115,9 +115,6 @@ free_lmtp_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft)
                else {
                        rspamd_remove_dispatcher (lmtp->task->dispatcher);
                }
-               if (lmtp->task->urls) {
-                       g_list_free (lmtp->task->urls);
-               }
                close (lmtp->task->sock);
                g_free (lmtp->task);
                g_free (lmtp);
index 48db4c8333491ce54a2ab139de10cd17c3e08df8..0a6185f7e25ac7e645c12d0de419de7ba66d5997 100644 (file)
@@ -220,27 +220,37 @@ lua_task_insert_result (lua_State * L)
        return 1;
 }
 
+struct lua_tree_cb_data { /* state handed to lua_tree_url_callback via the g_tree_foreach user data pointer */
+       lua_State                     *L; /* Lua state that receives the url userdata objects */
+       int                            i; /* next array index passed to lua_rawseti; post-incremented per element */
+};
+
+static gboolean
+lua_tree_url_callback (gpointer key, gpointer value, gpointer ud) /* tree traversal callback: stores `value` as an rspamd{url} userdata in the table under the stack top; `key` is unused */
+{
+       struct uri                    **purl;
+       struct lua_tree_cb_data         *cb = ud;
+
+       purl = lua_newuserdata (cb->L, sizeof (struct uri *)); /* the userdata holds only a pointer to the uri, not a copy */
+       lua_setclass (cb->L, "rspamd{url}", -1);
+       *purl = value;
+       lua_rawseti (cb->L, -2, cb->i++); /* the result table is expected at index -2, right below the new userdata */
+
+       return FALSE; /* FALSE for every element, so the traversal is never stopped from here */
+}
+
 static gint
 lua_task_get_urls (lua_State * L)
 {
-       gint                            i = 1;
        struct worker_task             *task = lua_check_task (L);
-       GList                          *cur;
-       struct uri                    **purl;
+       struct lua_tree_cb_data         cb;
 
        if (task) {
-               cur = task->urls;
-               if (cur != NULL) {
-                       lua_newtable (L);
-                       while (cur) {
-                               purl = lua_newuserdata (L, sizeof (struct uri *));
-                               lua_setclass (L, "rspamd{url}", -1);
-                               *purl = cur->data;
-                               lua_rawseti (L, -2, i++);
-                               cur = g_list_next (cur);
-                       }
-                       return 1;
-               }
+               lua_newtable (L);
+               cb.i = 1;
+               cb.L = L;
+               g_tree_foreach (task->urls, lua_tree_url_callback, &cb);
+               return 1;
        }
 
        lua_pushnil (L);
@@ -250,24 +260,15 @@ lua_task_get_urls (lua_State * L)
 static gint
 lua_task_get_emails (lua_State * L)
 {
-       gint                            i = 1;
        struct worker_task             *task = lua_check_task (L);
-       GList                          *cur;
-       struct uri                    **purl;
+       struct lua_tree_cb_data         cb;
 
        if (task) {
-               cur = task->emails;
-               if (cur != NULL) {
-                       lua_newtable (L);
-                       while (cur) {
-                               purl = lua_newuserdata (L, sizeof (struct uri *));
-                               lua_setclass (L, "rspamd{url}", -1);
-                               *purl = cur->data;
-                               lua_rawseti (L, -2, i++);
-                               cur = g_list_next (cur);
-                       }
-                       return 1;
-               }
+               lua_newtable (L);
+               cb.i = 1;
+               cb.L = L;
+               g_tree_foreach (task->emails, lua_tree_url_callback, &cb);
+               return 1;
        }
 
        lua_pushnil (L);
index acbfe8a721f4f0c8e3ff283bade215c7615afb07..e6cf670789b0461fd0a1ea276aa163192c819bec 100644 (file)
@@ -203,8 +203,8 @@ struct worker_task {
        GList *text_parts;                                                                                      /**< list of text parts                                                         */
        gchar *raw_headers;                                                                                     /**< list of raw headers                                                        */
        GList *received;                                                                                        /**< list of received headers                                           */
-       GList *urls;                                                                                            /**< list of parsed urls                                                        */
-       GList *emails;                                                                                          /**< list of parsed emails                                                      */
+       GTree *urls;                                                                                            /**< tree of parsed urls                                                        */
+       GTree *emails;                                                                                          /**< tree of parsed emails                                                      */
        GList *images;                                                                                          /**< list of images                                                                     */
        GList *raw_headers_list;                                                                        /**< list of raw headers                                                        */
        GHashTable *results;                                                                            /**< hash table of metric_result indexed by 
index c9de2fb406bac438f07d7dcba51e37c2a9be2474..f83e0a2299ea3358060dd116d0291b21e085fae4 100644 (file)
 
 static struct surbl_ctx        *surbl_module_ctx = NULL;
 
-static gint                      surbl_filter (struct worker_task *task);
+static gint                     surbl_filter (struct worker_task *task);
 static void                     surbl_test_url (struct worker_task *task, void *user_data);
 static void                     dns_callback (struct rspamd_dns_reply *reply, gpointer arg);
 static void                     process_dns_results (struct worker_task *task, struct suffix_item *suffix, gchar *url, guint32 addr);
-static gint                      urls_command_handler (struct worker_task *task);
 
 #define NO_REGEXP (gpointer)-1
 
@@ -224,7 +223,6 @@ surbl_module_init (struct config_file *cfg, struct module_ctx **ctx)
 
        *ctx = (struct module_ctx *)surbl_module_ctx;
 
-       register_protocol_command ("urls", urls_command_handler);
        /* Register module options */
        register_module_opt ("surbl", "redirector", MODULE_OPT_TYPE_STRING);
        register_module_opt ("surbl", "url_expire", MODULE_OPT_TYPE_TIME);
@@ -570,7 +568,7 @@ format_surbl_request (memory_pool_t * pool, f_str_t * hostname, struct suffix_it
 }
 
 static void
-make_surbl_requests (struct uri *url, struct worker_task *task, GTree * tree,
+make_surbl_requests (struct uri *url, struct worker_task *task,
                struct suffix_item *suffix, gboolean forced)
 {
        gchar                           *surbl_req;
@@ -583,20 +581,14 @@ make_surbl_requests (struct uri *url, struct worker_task *task, GTree * tree,
 
        if (check_view (task->cfg->views, suffix->symbol, task)) {
                if ((surbl_req = format_surbl_request (task->task_pool, &f, suffix, TRUE, &err, forced)) != NULL) {
-                       if (g_tree_lookup (tree, surbl_req) == NULL) {
-                               g_tree_insert (tree, surbl_req, surbl_req);
-                               param = memory_pool_alloc (task->task_pool, sizeof (struct dns_param));
-                               param->url = url;
-                               param->task = task;
-                               param->suffix = suffix;
-                               param->host_resolve = memory_pool_strdup (task->task_pool, surbl_req);
-                               debug_task ("send surbl dns request %s", surbl_req);
-                               if (make_dns_request (task->resolver, task->s, task->task_pool, dns_callback, (void *)param, DNS_REQUEST_A, surbl_req)) {
-                                       param->task->save.saved++;
-                               }
-                       }
-                       else {
-                               debug_task ("request %s is already sent", surbl_req);
+                       param = memory_pool_alloc (task->task_pool, sizeof (struct dns_param));
+                       param->url = url;
+                       param->task = task;
+                       param->suffix = suffix;
+                       param->host_resolve = memory_pool_strdup (task->task_pool, surbl_req);
+                       debug_task ("send surbl dns request %s", surbl_req);
+                       if (make_dns_request (task->resolver, task->s, task->task_pool, dns_callback, (void *)param, DNS_REQUEST_A, surbl_req)) {
+                               param->task->save.saved++;
                        }
                }
                else if (err != NULL && err->code != WHITELIST_ERROR) {
@@ -732,7 +724,7 @@ memcached_callback (memcached_ctx_t * ctx, memc_error_t error, void *data)
                        param->task->save.saved = 1;
                        process_filters (param->task);
                }
-               make_surbl_requests (param->url, param->task, param->tree, param->suffix, FALSE);
+               make_surbl_requests (param->url, param->task, param->suffix, FALSE);
                break;
        default:
                return;
@@ -740,7 +732,7 @@ memcached_callback (memcached_ctx_t * ctx, memc_error_t error, void *data)
 }
 
 static void
-register_memcached_call (struct uri *url, struct worker_task *task, GTree * url_tree, struct suffix_item *suffix)
+register_memcached_call (struct uri *url, struct worker_task *task, struct suffix_item *suffix)
 {
        struct memcached_param         *param;
        struct memcached_server        *selected;
@@ -754,7 +746,6 @@ register_memcached_call (struct uri *url, struct worker_task *task, GTree * url_
 
        param->url = url;
        param->task = task;
-       param->tree = url_tree;
        param->suffix = suffix;
 
        param->ctx = memory_pool_alloc0 (task->task_pool, sizeof (memcached_ctx_t));
@@ -799,7 +790,7 @@ free_redirector_session (void *ud)
        event_del (&param->ev);
        close (param->sock);
        param->task->save.saved--;
-       make_surbl_requests (param->url, param->task, param->tree, param->suffix, FALSE);
+       make_surbl_requests (param->url, param->task, param->suffix, FALSE);
        if (param->task->save.saved == 0) {
                /* Call other filters */
                param->task->save.saved = 1;
@@ -886,7 +877,7 @@ redirector_callback (gint fd, short what, void *arg)
 
 
 static void
-register_redirector_call (struct uri *url, struct worker_task *task, GTree * url_tree,
+register_redirector_call (struct uri *url, struct worker_task *task,
                struct suffix_item *suffix, const gchar *rule)
 {
        gint                            s = -1;
@@ -907,7 +898,7 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree * url
        if (s == -1) {
                msg_info ("<%s> cannot create tcp socket failed: %s", task->message_id, strerror (errno));
                task->save.saved--;
-               make_surbl_requests (url, task, url_tree, suffix, FALSE);
+               make_surbl_requests (url, task, suffix, FALSE);
                return;
        }
 
@@ -916,7 +907,6 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree * url
        param->task = task;
        param->state = STATE_CONNECT;
        param->sock = s;
-       param->tree = url_tree;
        param->suffix = suffix;
        param->redirector = selected;
        timeout = memory_pool_alloc (task->task_pool, sizeof (struct timeval));
@@ -966,22 +956,22 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
                                        re = g_hash_table_lookup (surbl_module_ctx->redirector_hosts, red_domain);
                                        if (re != NULL && (re == NO_REGEXP || g_regex_match (re, url->string, 0, NULL))) {
                                                /* If no regexp found or founded regexp matches url string register redirector's call */
-                                               register_redirector_call (url, param->task, param->tree, param->suffix, red_domain);
+                                               register_redirector_call (url, param->task, param->suffix, red_domain);
                                                param->task->save.saved++;
                                                return FALSE;
                                        }
                                }
                        }
                }
-               make_surbl_requests (url, param->task, param->tree, param->suffix, FALSE);
+               make_surbl_requests (url, param->task, param->suffix, FALSE);
        }
        else {
                if (param->task->worker->srv->cfg->memcached_servers_num > 0) {
-                       register_memcached_call (url, param->task, param->tree, param->suffix);
+                       register_memcached_call (url, param->task, param->suffix);
                        param->task->save.saved++;
                }
                else {
-                       make_surbl_requests (url, param->task, param->tree, param->suffix, FALSE);
+                       make_surbl_requests (url, param->task, param->suffix, FALSE);
                }
        }
 
@@ -991,31 +981,12 @@ surbl_tree_url_callback (gpointer key, gpointer value, void *data)
 static void
 surbl_test_url (struct worker_task *task, void *user_data) /* runs surbl_tree_url_callback over every url stored in task->urls for one suffix */
 {
-       GTree                          *url_tree;
-       GList                          *cur;
-       struct mime_text_part          *part;
        struct redirector_param         param;
        struct suffix_item             *suffix = user_data; /* user_data is used as the suffix_item for this check */
 
-       url_tree = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
-
-       param.tree = url_tree;
        param.task = task;
        param.suffix = suffix;
-       cur = task->text_parts;
-       while (cur) {
-               part = cur->data;
-               if (part->urls) {
-                       g_tree_foreach (part->urls, surbl_tree_url_callback, &param);
-               }
-               if (part->html_urls) {
-                       g_tree_foreach (part->html_urls, surbl_tree_url_callback, &param);
-               }
-
-               cur = g_list_next (cur);
-       }
-
-       memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_tree_destroy, url_tree);
+       g_tree_foreach (task->urls, surbl_tree_url_callback, &param); /* param lives on this stack frame; only task and suffix are set here */
 }
 
 static gint
@@ -1025,71 +996,6 @@ surbl_filter (struct worker_task *task)
        return 0;
 }
 
-static gboolean
-urls_command_handler (struct worker_task *task)
-{
-       GList                          *cur;
-       gchar                           *outbuf, *urlstr;
-       gint                            r, num = 0, buflen;
-       struct uri                     *url;
-       GError                         *err = NULL;
-       GTree                          *url_tree;
-       f_str_t                         f;
-
-       url_tree = g_tree_new ((GCompareFunc) g_ascii_strcasecmp);
-
-       /* First calculate buffer length */
-       cur = g_list_first (task->urls);
-       buflen = 0;
-       while (cur) {
-               url = cur->data;
-               buflen += strlen (struri (url)) + url->hostlen + sizeof (" <\"\">, ") - 1;
-               cur = g_list_next (cur);
-       }
-
-       buflen += sizeof (RSPAMD_REPLY_BANNER " 0 OK" CRLF CRLF "Urls: ");
-
-       outbuf = memory_pool_alloc (task->task_pool, buflen * sizeof (gchar));
-
-       r = rspamd_snprintf (outbuf, buflen, "%s 0 %s" CRLF, (task->proto == SPAMC_PROTO) ? SPAMD_REPLY_BANNER : RSPAMD_REPLY_BANNER, "OK");
-
-       r += rspamd_snprintf (outbuf + r, buflen - r - 2, "Urls: ");
-
-       cur = g_list_first (task->urls);
-
-       while (cur) {
-               num++;
-               url = cur->data;
-               if (g_tree_lookup (url_tree, struri (url)) == NULL) {
-                       g_tree_insert (url_tree, struri (url), url);
-                       f.begin = url->host;
-                       f.len = url->hostlen;
-                       if ((urlstr = format_surbl_request (task->task_pool, &f, NULL, FALSE, &err, FALSE)) != NULL) {
-                               if (g_list_next (cur) != NULL) {
-                                       r += rspamd_snprintf (outbuf + r, buflen - r - 2, "%s <\"%s\">, ", (gchar *)urlstr, struri (url));
-                               }
-                               else {
-                                       r += rspamd_snprintf (outbuf + r, buflen - r - 2, "%s <\"%s\">", (gchar *)urlstr, struri (url));
-                               }
-                       }
-               }
-               cur = g_list_next (cur);
-       }
-
-       outbuf[r++] = '\r';
-       outbuf[r++] = '\n';
-
-       g_tree_destroy (url_tree);
-       if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, TRUE)) {
-               return FALSE;
-       }
-       msg_info ("msg ok, id: <%s>, %d urls extracted", task->message_id, num);
-       task->state = STATE_REPLY;
-
-       return TRUE;
-}
-
-
 /*
  * vi:ts=4 
  */
index ca49c5ecf22c600bfe8f40b20af83ea610da91bc..683d291a96623c19b4bc42d90d338b98163d6723 100644 (file)
@@ -29,8 +29,8 @@
 #include "settings.h"
 #include "message.h"
 
-/* Max line size as it is defined in rfc2822 */
-#define OUTBUFSIZ 1000
+/* Max line size */
+#define OUTBUFSIZ BUFSIZ
 /*
  * Just check if the passed message is spam or not and reply as
  * described below
@@ -477,162 +477,106 @@ write_hashes_to_log (struct worker_task *task, gchar *logbuf, gint offset, gint
        }
 }
 
-static gint
-compare_url_func (gconstpointer a, gconstpointer b)
-{
-       const struct uri               *u1 = a, *u2 = b;
 
-       if (u1->hostlen != u2->hostlen) {
-               return u1->hostlen - u2->hostlen;
-       }
-       else {
-               return memcmp (u1->host, u2->host, u1->hostlen);
-       }
-}
+/* State shared between the show_*_header functions and the tree callbacks that append to the reply buffer */
+struct tree_cb_data {
+       gchar                          *buf; /* destination buffer the callbacks append to */
+       gsize                           len; /* total size of buf in bytes */
+       gsize                           off; /* current write offset inside buf */
+};
 
-static gint
-compare_email_func (gconstpointer a, gconstpointer b)
+/*
+ * Callback for writing urls
+ */
+static gboolean
+urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
 {
-       const struct uri               *u1 = a, *u2 = b;
-       gint                            r;
+       struct tree_cb_data             *cb = ud;
+       struct uri                      *url = value;
+       gsize                            len;
 
-       if (u1->hostlen != u2->hostlen) {
-               return u1->hostlen - u2->hostlen;
+       len = url->hostlen + sizeof (" ," CRLF); /* bytes for " host," plus room for the closing CRLF and NUL; the user part is not printed for urls */
+       if (cb->off + len >= cb->len) {
+               msg_info ("cannot write urls header completely, stripped reply at: %z", cb->off);
                return TRUE;
        }
        else {
-               if ((r = memcmp (u1->host, u2->host, u1->hostlen)) == 0){
-                       if (u1->userlen != u2->userlen) {
-                               return u1->userlen - u2->userlen;
-                       }
-                       else {
-                               return memcmp (u1->user, u2->user, u1->userlen);
-                       }
-               }
-               else {
-                       return r;
-               }
+               cb->off += rspamd_snprintf (cb->buf + cb->off, cb->len - cb->off, " %*s,",
+                                                               url->hostlen, url->host);
        }
-
-       return 0;
+       return FALSE;
 }
 
 static gboolean
 show_url_header (struct worker_task *task)
 {
        gint                            r = 0;
-       gchar                           outbuf[OUTBUFSIZ], c;
-       struct uri                     *url;
-       GList                          *cur;
-       f_str_t                         host;
-       GTree                          *url_tree;
+       gchar                           outbuf[OUTBUFSIZ];
+       struct tree_cb_data             cb;
 
        r = rspamd_snprintf (outbuf, sizeof (outbuf), "Urls: ");
-       url_tree = g_tree_new (compare_url_func);
-       cur = task->urls;
-       while (cur) {
-               url = cur->data;
-        if (task->cfg->log_urls) {
-            /* Write this url to log as well */
-            msg_info ("url found: <%s>, score: [%.2f / %.2f]", struri (url), default_score, default_required_score);
-        }
-               if (g_tree_lookup (url_tree, url) == NULL && url->hostlen > 0) {
-                       g_tree_insert (url_tree, url, url);
-                       host.begin = url->host;
-                       host.len = url->hostlen;
-                       /* Skip long hosts to avoid protocol coollisions */
-                       if (host.len > OUTBUFSIZ) {
-                               cur = g_list_next (cur);
-                               continue;
-                       }
-                       /* Do header folding */
-                       if (host.len + r >= OUTBUFSIZ - 3) {
-                               outbuf[r++] = '\r';
-                               outbuf[r++] = '\n';
-                               outbuf[r] = ' ';
-                               if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, TRUE, FALSE)) {
-                                       return FALSE;
-                               }
-                               r = 0;
-                       }
-                       /* Write url host to buf */
-                       if (g_list_next (cur) != NULL) {
-                               c = *(host.begin + host.len);
-                               *(host.begin + host.len) = '\0';
-                               debug_task ("write url: %s", host.begin);
-                               r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s, ", host.begin);
-                               *(host.begin + host.len) = c;
-                       }
-                       else {
-                               c = *(host.begin + host.len);
-                               *(host.begin + host.len) = '\0';
-                               debug_task ("write url: %s", host.begin);
-                               r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%s", host.begin);
-                               *(host.begin + host.len) = c;
-                       }
-               }
-               cur = g_list_next (cur);
+
+       cb.buf = outbuf;
+       cb.len = sizeof (outbuf);
+       cb.off = r;
+
+       g_tree_foreach (task->urls, urls_protocol_cb, &cb);
+       /* Strip last ',' */
+       if (cb.buf[cb.off - 1] == ',') {
+               cb.buf[--cb.off] = '\0';
        }
-       if (r == 0) {
+       cb.off += rspamd_snprintf (cb.buf + cb.off, cb.len - cb.off, CRLF);
+
+       return rspamd_dispatcher_write (task->dispatcher, outbuf, cb.off, FALSE, FALSE);
+}
+
+/*
+ * Callback for writing emails
+ */
+static gboolean
+emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
+{
+       struct tree_cb_data             *cb = ud;
+       struct uri                      *url = value;
+       gsize                            len;
+
+       len = url->hostlen + url->userlen + sizeof (" @," CRLF); /* bytes for " user@host," plus room for the closing CRLF and NUL */
+       if (cb->off + len >= cb->len) {
+               msg_info ("cannot write emails header completely, stripped reply at: %z", cb->off);
                return TRUE;
        }
-       r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, CRLF);
-
-       return rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, FALSE);
+       else {
+               cb->off += rspamd_snprintf (cb->buf + cb->off, cb->len - cb->off, " %*s@%*s,",
+                                                               url->userlen, url->user,
+                                                               url->hostlen, url->host);
+       }
+       return FALSE;
 }
 
+/*
+ * Show header for emails found in a message
+ */
 static gboolean
 show_email_header (struct worker_task *task)
 {
        gint                            r = 0;
        gchar                           outbuf[OUTBUFSIZ];
-       struct uri                     *url;
-       GList                          *cur;
-       gsize                           len;
-       GTree                          *url_tree;
+       struct tree_cb_data             cb;
 
        r = rspamd_snprintf (outbuf, sizeof (outbuf), "Emails: ");
-       url_tree = g_tree_new (compare_email_func);
-       cur = task->emails;
-       while (cur) {
-               url = cur->data;
-               if (g_tree_lookup (url_tree, url) == NULL && url->hostlen > 0) {
-                       g_tree_insert (url_tree, url, url);
-                       len = url->hostlen + url->userlen + 1;
-                       /* Skip long hosts to avoid protocol coollisions */
-                       if (len > OUTBUFSIZ) {
-                               cur = g_list_next (cur);
-                               continue;
-                       }
-                       /* Do header folding */
-                       if (len + r >= OUTBUFSIZ - 3) {
-                               outbuf[r++] = '\r';
-                               outbuf[r++] = '\n';
-                               outbuf[r] = ' ';
-                               if (! rspamd_dispatcher_write (task->dispatcher, outbuf, r, TRUE, FALSE)) {
-                                       return FALSE;
-                               }
-                               r = 0;
-                       }
-                       /* Write url host to buf */
-                       if (g_list_next (cur) != NULL) {
-                               r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%*s@%*s, ",
-                                               url->userlen, url->user,
-                                               url->hostlen, url->host);
-                       }
-                       else {
-                               r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, "%*s@%*s",
-                                                                               url->userlen, url->user,
-                                                                               url->hostlen, url->host);
-                       }
-               }
-               cur = g_list_next (cur);
-       }
-       if (r == 0) {
-               return TRUE;
+
+       cb.buf = outbuf;
+       cb.len = sizeof (outbuf);
+       cb.off = r;
+
+       g_tree_foreach (task->emails, emails_protocol_cb, &cb);
+       /* Strip last ',' */
+       if (cb.buf[cb.off - 1] == ',') {
+               cb.buf[--cb.off] = '\0';
        }
-       r += rspamd_snprintf (outbuf + r, sizeof (outbuf) - r, CRLF);
+       cb.off += rspamd_snprintf (cb.buf + cb.off, cb.len - cb.off, CRLF);
 
-       return rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE, FALSE);
+       return rspamd_dispatcher_write (task->dispatcher, outbuf, cb.off, FALSE, FALSE);
 }
 
 static void
index faa8f074b837e2b6b2050dca16d4860df9d779bf..b7318bdfcd14071be25b94a0797f8e561bb4c147 100644 (file)
@@ -143,38 +143,6 @@ get_next_word (f_str_t * buf, f_str_t * token)
        return token;
 }
 
-int
-tokenize_urls (memory_pool_t * pool, struct worker_task *task, GTree ** tree)
-{
-       token_node_t                   *new = NULL;
-       f_str_t                         url_domain;
-       struct uri                     *url;
-       GList                          *cur;
-       uint32_t                        h;
-
-       if (*tree == NULL) {
-               *tree = g_tree_new (token_node_compare_func);
-               memory_pool_add_destructor (pool, (pool_destruct_func) g_tree_destroy, *tree);
-       }
-
-       cur = task->urls;
-       while (cur) {
-               url = cur->data;
-               url_domain.begin = url->host;
-               url_domain.len = url->hostlen;
-               new = memory_pool_alloc (pool, sizeof (token_node_t));
-               h = fstrhash (&url_domain);
-               new->h1 = h * primes[0];
-               new->h2 = h * primes[1];
-               if (g_tree_lookup (*tree, new) == NULL) {
-                       g_tree_insert (*tree, new, new);
-               }
-               cur = g_list_next (cur);
-       }
-
-       return TRUE;
-}
-
 /* Struct to access gmime headers */
 struct raw_header {
        struct raw_header              *next;
index 21e454e6bc38ca2e23e57c032d280aa6ef1660d5..59a2684d071d5948438056cd6614698a9f8b96de 100644 (file)
@@ -36,8 +36,6 @@ struct tokenizer* get_tokenizer (char *name);
 f_str_t *get_next_word (f_str_t *buf, f_str_t *token);
 /* OSB tokenize function */
 int osb_tokenize_text (struct tokenizer *tokenizer, memory_pool_t *pool, f_str_t *input, GTree **cur);
-/* Common tokenizer for urls */
-int tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **cur);
 /* Common tokenizer for headers */
 int tokenize_headers (memory_pool_t *pool, struct worker_task *task, GTree **cur);
 /* Make tokens for a subject */
index b243920567f2f54557a538bba4326e052b790ae4..083ceecba3bed5810eb69955759b8414ab138168 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -1185,13 +1185,18 @@ url_parse_text (memory_pool_t * pool, struct worker_task *task, struct mime_text
                                        if (new != NULL) {
                                                g_strstrip (url_str);
                                                rc = parse_uri (new, url_str, pool);
-                                               if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
+                                               if ((rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) &&
+                                                               new->hostlen > 0) {
                                                        if (new->protocol == PROTOCOL_MAILTO) {
-                                                               task->emails = g_list_prepend (task->emails, new);
+                                                               if (!g_tree_lookup (task->emails, new)) {
+                                                                       g_tree_insert (task->emails, new, new);
+                                                               }
                                                        }
                                                        else {
                                                                g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
-                                                               task->urls = g_list_prepend (task->urls, new);
+                                                               if (!g_tree_lookup (task->urls, new)) {
+                                                                       g_tree_insert (task->urls, new, new);
+                                                               }
                                                        }
                                                }
                                                else {
index 1d6ec05fba62d2601fa6f7535b305b0b879dd53c..0ac952b799e0b8b1aecefb980ea14926179cf3e4 100644 (file)
@@ -261,12 +261,6 @@ free_task (struct worker_task *task, gboolean is_soft)
                if (task->text_parts) {
                        g_list_free (task->text_parts);
                }
-               if (task->urls) {
-                       g_list_free (task->urls);
-               }
-               if (task->emails) {
-                       g_list_free (task->emails);
-               }
                if (task->images) {
                        g_list_free (task->images);
                }
@@ -460,6 +454,52 @@ err_socket (GError * err, void *arg)
        }
 }
 
+/*
+ * Compare two emails for building emails tree: host length, host, user
+ * length, user (text ASCII case-insensitive). Only the sign is meaningful.
+ */
+static gint
+compare_email_func (gconstpointer a, gconstpointer b)
+{
+       const struct uri               *u1 = a, *u2 = b;
+       gint                            r;
+
+       if (u1->hostlen != u2->hostlen) {
+               /* Compare, don't subtract: the sign stays right for any length type */
+               return u1->hostlen < u2->hostlen ? -1 : 1;
+       }
+       if ((r = g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen)) != 0) {
+               return r;
+       }
+       if (u1->userlen != u2->userlen) {
+               return u1->userlen < u2->userlen ? -1 : 1;
+       }
+       if (u1->userlen == 0) {
+               /* Nothing to compare; user may be unset here (TODO confirm) */
+               return 0;
+       }
+       return g_ascii_strncasecmp (u1->user, u2->user, u1->userlen);
+}
+
+/* Order two uris for the urls tree by host only: length first, then text */
+static gint
+compare_url_func (gconstpointer a, gconstpointer b)
+{
+       const struct uri               *u1 = a, *u2 = b;
+
+       if (u1->hostlen != u2->hostlen) {
+               /* Compare, don't subtract: the sign stays right for any length type */
+               return u1->hostlen < u2->hostlen ? -1 : 1;
+       }
+
+       /* Equal lengths: ASCII case-insensitive comparison of the host bytes; */
+       /* uris that differ only beyond the host compare as equal here. */
+       return g_ascii_strncasecmp (u1->host, u2->host, u1->hostlen);
+}
+
+/*
+ * Create new task
+ */
 struct worker_task             *
 construct_task (struct rspamd_worker *worker)
 {
@@ -499,6 +539,14 @@ construct_task (struct rspamd_worker *worker)
        memory_pool_add_destructor (new_task->task_pool,
                        (pool_destruct_func) g_hash_table_destroy,
                        new_task->re_cache);
+       new_task->emails = g_tree_new (compare_email_func);
+       memory_pool_add_destructor (new_task->task_pool,
+                               (pool_destruct_func) g_tree_destroy,
+                               new_task->emails);
+       new_task->urls = g_tree_new (compare_url_func);
+       memory_pool_add_destructor (new_task->task_pool,
+                                       (pool_destruct_func) g_tree_destroy,
+                                       new_task->urls);
        new_task->s =
                        new_async_session (new_task->task_pool, free_task_hard, new_task);
        new_task->sock = -1;