* Rework url parsing algorithms

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Tue, 2 Jun 2009 15:32:34 +0000 (19:32 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Tue, 2 Jun 2009 15:32:34 +0000 (19:32 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 2 Jun 2009 15:32:34 +0000 (19:32 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Tue, 2 Jun 2009 15:32:34 +0000 (19:32 +0400)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index f744fff03a370b9b33fd0b4fe37e38d0a112d39c..2d55565e90c41d02ac46281baa7825c93614155a 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,7 @@ PROJECT(rspamd C)
  
  SET(RSPAMD_VERSION_MAJOR 0)
  SET(RSPAMD_VERSION_MINOR 1)
-SET(RSPAMD_VERSION_PATCH 2)
+SET(RSPAMD_VERSION_PATCH 3)
  
  SET(RSPAMD_VERSION         "${RSPAMD_VERSION_MAJOR}.${RSPAMD_VERSION_MINOR}.${RSPAMD_VERSION_PATCH}")
  SET(RSPAMD_MASTER_SITE_URL "http://cebka.pp.ru/hg/rspamd")
diff --git a/rspamc.pl.in b/rspamc.pl.in

index 6681fd6d9193b9e29977da43dfe1737e98058995..9d5712117c17fd25e341453be989af70684770fe 100755 (executable)
--- a/rspamc.pl.in
+++ b/rspamc.pl.in
@@ -200,7 +200,7 @@ sub do_control_command {
  my %args;
  getopt('c:h:p:Ps:', \%args);
  my $cmd = shift;
-my $do_parse_config = 0;
+my $do_parse_config = 1;
  
  if (!defined ($cmd) || $cmd eq '') {
      HELP_MESSAGE();
@@ -230,6 +230,7 @@ if (defined ($args{h})) {
      $cfg{'host'} = $args{h};
      if ($args{h} =~ /^\/.*$/) {
          $cfg{'is_unix'} = 1;
+        $do_parse_config = 0;
      }
  }
  if (defined ($args{p})) {
diff --git a/src/lmtp.c b/src/lmtp.c

index d649d11c3041f27ae968f9ffeacc407fc301a76a..5f2878a08fdd4b0d90762a7cec336acd915363f0 100644 (file)
--- a/src/lmtp.c
+++ b/src/lmtp.c
@@ -109,6 +109,9 @@ free_task (struct rspamd_lmtp_proto *lmtp, gboolean is_soft)
                 else {
                         rspamd_remove_dispatcher (lmtp->task->dispatcher);
                 }
+               if (lmtp->task->urls) {
+                       g_list_free (lmtp->task->urls);
+               }
                 close (lmtp->task->sock);
                 g_free (lmtp->task);
                 g_free (lmtp);
@@ -230,7 +233,6 @@ accept_socket (int fd, short what, void *arg)
         new_task->state = READ_COMMAND;
         new_task->sock = nfd;
         new_task->cfg = worker->srv->cfg;
-       TAILQ_INIT (&new_task->urls);
         new_task->task_pool = memory_pool_new (memory_pool_get_size ());
         /* Add destructor for recipients list (it would be better to use anonymous function here */
         memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
diff --git a/src/main.h b/src/main.h

index c1a057824d1d0488eb26a8f6d5b07f68418ddb45..924fe13edb8e16347e8c551d0dffafff549e09c3 100644 (file)
--- a/src/main.h
+++ b/src/main.h
@@ -183,7 +183,7 @@ struct worker_task {
         GList *parts;                                                                                           /**< list of parsed parts                                                       */
         GList *text_parts;                                                                                      /**< list of text parts                                                         */
         char *raw_headers;                                                                                      /**< list of raw headers                                                        */
-       TAILQ_HEAD (uriq, uri) urls;                                                            /**< list of parsed urls                                                        */
+       GList *urls;                                                                                            /**< list of parsed urls                                                        */
         GHashTable *results;                                                                            /**< hash table of metric_result indexed by 
                                                                                                                                  *    metric's name                                                                     */
         GHashTable *re_cache;                                                                           /**< cache for matched or not matched regexps           */
diff --git a/src/message.c b/src/message.c

index 510d407e9f9ae06733744dfe90ae1590c0fac699..f664122d08494938d8548d2b97ae73c6029b48e5 100644 (file)
--- a/src/message.c
+++ b/src/message.c
@@ -301,28 +301,36 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
                 text_part->is_balanced = TRUE;
                 text_part->html_nodes = NULL;
                 text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
+               text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+               text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
  
                 if (text_part->html_nodes == NULL) {
-                       url_parse_text (task, text_part->orig, FALSE);
+                       url_parse_text (task->task_pool, task, text_part, FALSE);
                 }
                 else {
-                       url_parse_text (task, text_part->orig, TRUE);
+                       url_parse_text (task->task_pool, task, text_part, FALSE);
+                       url_parse_text (task->task_pool, task, text_part, TRUE);
                 }
  
                 text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
                 memory_pool_add_destructor (task->task_pool, (pool_destruct_func)free_byte_array_callback, text_part->content);
+               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->html_urls);
+               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
                 task->text_parts = g_list_prepend (task->text_parts, text_part);
         } 
         else if (g_mime_content_type_is_type (type, "text", "plain")) {
                 msg_debug ("mime_foreach_callback: got urls from text/plain part");
-               url_parse_text (task, part_content, FALSE);
  
                 text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
                 text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
                 text_part->content = text_part->orig;
                 text_part->is_html = FALSE;
                 text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool);
+               text_part->html_urls = NULL;
+               text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+               url_parse_text (task->task_pool, task, text_part, FALSE);
                 task->text_parts = g_list_prepend (task->text_parts, text_part);
+               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, text_part->urls);
         }
  }
  
@@ -489,8 +497,10 @@ process_message (struct worker_task *task)
         if (task->rcpts) {
                 memory_pool_add_destructor (task->task_pool, (pool_destruct_func)internet_address_list_destroy, task->rcpts);
         }
-
-       task->worker->srv->stat->messages_scanned ++;
+       
+       if (task->worker) {
+               task->worker->srv->stat->messages_scanned ++;
+       }
  
         /* free the parser (and the stream) */
         g_object_unref (parser);
diff --git a/src/message.h b/src/message.h

index 72711638f5e4a3e913c84aba787597cb6a1dbcc9..9a63b0824d5ba6d1cb15cc4adfd6b07a4b82d8f2 100644 (file)
--- a/src/message.h
+++ b/src/message.h
@@ -21,6 +21,8 @@ struct mime_text_part {
         GByteArray *orig;
         GByteArray *content;
         GNode *html_nodes;
+       GTree *urls;
+       GTree *html_urls;
         fuzzy_hash_t *fuzzy;
  };
  
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c

index 1b109bf55ec2d2ca5cd7ae27162a6be4d1b3dd68..fbe36f29129d97fb0f2dc85e370a3bfaab69c390 100644 (file)
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -237,6 +237,28 @@ find_raw_header_pos (const char *headers, const char *headerv)
         return NULL;
  }
  
+struct url_regexp_param {
+       struct worker_task *task;
+       GRegex *regexp;
+       struct rspamd_regexp *re;
+       gboolean found;
+};
+
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+       struct url_regexp_param *param = data;
+       struct uri *url = value;
+
+       if (g_regex_match (param->regexp, struri (url), 0, NULL) == TRUE) {
+               task_cache_add (param->task, param->re, 1);
+               param->found = TRUE;
+               return TRUE;
+       }
+
+       return FALSE;
+}
+
  static gsize
  process_regexp (struct rspamd_regexp *re, struct worker_task *task)
  {
@@ -244,7 +266,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
         struct mime_text_part *part, *tmp;
         GList *cur, *headerlist;
         GRegex *regexp;
-       struct uri *url;
+       struct url_regexp_param callback_param;
         int r;
         
  
@@ -333,13 +355,30 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task)
                         return 0;
                 case REGEXP_URL:
                         msg_debug ("process_regexp: checking url regexp: /%s/", re->regexp_text);
-                       TAILQ_FOREACH (url, &task->urls, next) {
-                               if (g_regex_match (re->regexp, struri (url), 0, NULL) == TRUE) {
-                                       task_cache_add (task, re, 1);
-                                       return 1;
+                       cur = g_list_first (task->text_parts);
+                       while (cur) {
+                               part = (struct mime_text_part *)cur->data;
+                               if (part->is_raw) {
+                                       regexp = re->raw_regexp;
                                 }
+                               else {
+                                       regexp = re->regexp;
+                               }
+                               callback_param.task = task;
+                               callback_param.regexp = regexp;
+                               callback_param.re = re;
+                               callback_param.found = FALSE;
+                               if (part->urls) {
+                                       g_tree_foreach (part->urls, tree_url_callback, &callback_param);
+                               }
+                               if (part->html_urls && callback_param.found == FALSE) {
+                                       g_tree_foreach (part->html_urls, tree_url_callback, &callback_param);
+                               }
+                               cur = g_list_next (cur);
+                       }
+                       if (callback_param.found == FALSE) {
+                               task_cache_add (task, re, 0);
                         }
-                       task_cache_add (task, re, 0);
                         return 0;
                 case REGEXP_RAW_HEADER:
                         msg_debug ("process_regexp: checking for raw header: %s with regexp: /%s/", re->header, re->regexp_text);
diff --git a/src/plugins/surbl.c b/src/plugins/surbl.c

index d44c7fbe7b1e564b66c163aec454649b696cd37c..1514cae1c967af450170de8c1599bea972101fa9 100644 (file)
--- a/src/plugins/surbl.c
+++ b/src/plugins/surbl.c
@@ -28,6 +28,7 @@
  
  #include "../config.h"
  #include "../util.h"
+#include "../message.h"
  #include <evdns.h>
  
  #include "surbl.h"
@@ -647,29 +648,53 @@ register_redirector_call (struct uri *url, struct worker_task *task, GTree *url_
         event_add (&param->ev, &timeout);
  }
  
+static gboolean
+tree_url_callback (gpointer key, gpointer value, void *data)
+{
+       struct redirector_param *param = data;
+       struct uri *url = value;
+
+       msg_debug ("surbl_test_url: check url %s", struri (url));
+       if (surbl_module_ctx->use_redirector) {
+               register_redirector_call (url, param->task, param->tree);
+               param->task->save.saved++;
+       }
+       else {
+               if (param->task->worker->srv->cfg->memcached_servers_num > 0) {
+                       register_memcached_call (url, param->task, param->tree);
+                       param->task->save.saved++;
+               }
+               else {
+                       make_surbl_requests (url, param->task, param->tree);
+               }
+       }
+
+       return FALSE;
+}
+
  static int 
  surbl_test_url (struct worker_task *task)
  {
-       struct uri *url;
         GTree *url_tree;
+       GList *cur;
+       struct mime_text_part *part;
+       struct redirector_param param;
  
         url_tree = g_tree_new ((GCompareFunc)g_ascii_strcasecmp);
-
-       TAILQ_FOREACH (url, &task->urls, next) {
-               msg_debug ("surbl_test_url: check url %s", struri (url));
-               if (surbl_module_ctx->use_redirector) {
-                       register_redirector_call (url, task, url_tree);
-                       task->save.saved++;
+       
+       param.tree = url_tree;
+       param.task = task;
+       cur = task->text_parts;
+       while (cur) {
+               part = cur->data;
+               if (part->urls) {
+                       g_tree_foreach (part->urls, tree_url_callback, &param); 
                 }
-               else {
-                       if (task->worker->srv->cfg->memcached_servers_num > 0) {
-                               register_memcached_call (url, task, url_tree);
-                               task->save.saved++;
-                       }
-                       else {
-                               make_surbl_requests (url, task, url_tree);
-                       }
+               if (part->html_urls) {
+                       g_tree_foreach (part->html_urls, tree_url_callback, &param); 
                 }
+
+               cur = g_list_next (cur);
         }
  
         memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_tree_destroy, url_tree);
diff --git a/src/protocol.c b/src/protocol.c

index 2cd0252871f20c44a237a5dc098941dcd95f7746..c551bb783a253098e6cca2009ec86c442eedaebd 100644 (file)
--- a/src/protocol.c
+++ b/src/protocol.c
@@ -369,10 +369,13 @@ show_url_header (struct worker_task *task)
         int r = 0;
         char outbuf[OUTBUFSIZ], c;
         struct uri *url;
+       GList *cur;
         f_str_t host;
  
         r = snprintf (outbuf, sizeof (outbuf), "Urls: ");
-       TAILQ_FOREACH (url, &task->urls, next) {
+       cur = task->urls;
+       while (cur) {
+               url = cur->data;
                 host.begin = url->host;
                 host.len = url->hostlen;
                 /* Skip long hosts to avoid protocol coollisions */
@@ -386,7 +389,7 @@ show_url_header (struct worker_task *task)
                         r = 0;
                 }
                 /* Write url host to buf */
-               if (TAILQ_NEXT (url, next) != NULL) {
+               if (g_list_next (cur) != NULL) {
                         c = *(host.begin + host.len);
                         *(host.begin + host.len) = '\0';
                         msg_debug ("show_url_header: write url: %s", host.begin);
@@ -400,6 +403,7 @@ show_url_header (struct worker_task *task)
                         r += snprintf (outbuf + r, sizeof (outbuf) - r, "%s" CRLF, host.begin);
                         *(host.begin + host.len) = c;
                 }
+               cur = g_list_next (cur);
         }
         rspamd_dispatcher_write (task->dispatcher, outbuf, r, FALSE);
  }
diff --git a/src/tokenizers/tokenizers.c b/src/tokenizers/tokenizers.c

index 1b47289a29a6fb48bd1c6310aa4a070d18260c5a..4527e699ccf9749ab772ceb9d4e8bfbc2e255798 100644 (file)
--- a/src/tokenizers/tokenizers.c
+++ b/src/tokenizers/tokenizers.c
@@ -122,6 +122,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
         token_node_t *new = NULL;
         f_str_t url_domain;
         struct uri *url;
+       GList *cur;
         uint32_t h;
  
         if (*tree == NULL) {
@@ -129,7 +130,9 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
                 memory_pool_add_destructor (pool, (pool_destruct_func)g_tree_destroy, *tree);
         }
         
-       TAILQ_FOREACH (url, &task->urls, next) {
+       cur = task->urls;
+       while (cur) {
+               url = cur->data;
                 url_domain.begin = url->host;
                 url_domain.len = url->hostlen;
                 new = memory_pool_alloc (pool, sizeof (token_node_t));
@@ -139,6 +142,7 @@ tokenize_urls (memory_pool_t *pool, struct worker_task *task, GTree **tree)
                 if (g_tree_lookup (*tree, new) == NULL) {
                         g_tree_insert (*tree, new, new);
                 }
+               cur = g_list_next (cur);
         }
  
         return TRUE;
diff --git a/src/url.c b/src/url.c

index 875358ae018eb4864290f1708313403406dd25db..cc58a2caf9bbaa1be0076d31206fdd69b85ec6bf 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -23,10 +23,11 @@
   */
  
  #include "config.h"
+#include "url.h"
  #include "util.h"
  #include "fstring.h"
  #include "main.h"
-#include "url.h"
+#include "message.h"
  
  #define POST_CHAR 1
  #define POST_CHAR_S "\001"
@@ -853,7 +854,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
  }
  
  void 
-url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
+url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html)
  {
         GMatchInfo *info;
         GError *err = NULL;
@@ -861,26 +862,32 @@ url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
         char *url_str = NULL;
         struct uri *new;
         
-       if (!content->data || content->len == 0) {
+       if (!part->orig->data || part->orig->len == 0) {
                 msg_warn ("url_parse_text: got empty text part");
                 return;
         }
  
         if (url_init () == 0) {
-               rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+               if (is_html) {
+                       rc = g_regex_match_full (html_re, (const char *)part->orig->data, part->orig->len, 0, 0, &info, &err);
+               }
+               else {
+                       rc = g_regex_match_full (text_re, (const char *)part->content->data, part->content->len, 0, 0, &info, &err);
+               
+               }
                 if (rc) {
                         while (g_match_info_matches (info)) {
                                 url_str = g_match_info_fetch (info, is_html ? 1 : 0);
                                 msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
                                 if (url_str != NULL) {
-                                       new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-                                       if (new != NULL) {
-                                               rc = parse_uri (new, url_str, task->task_pool);
-                                               if (rc != URI_ERRNO_OK) {
-                                                       msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
-                                               }
-                                               if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-                                                       TAILQ_INSERT_TAIL (&task->urls, new, next);
+                                       if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+                                               new = memory_pool_alloc (pool, sizeof (struct uri));
+                                               if (new != NULL) {
+                                                       rc = parse_uri (new, url_str, pool);
+                                                       if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+                                                               g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+                                                               task->urls = g_list_prepend (task->urls, new);
+                                                       }
                                                 }
                                         }
                                 }
diff --git a/src/url.h b/src/url.h

index 2c367548d1d740993a6ea7a959a8ca6d9975d8f3..7860f544a7145ea4606c7bfb175541fe06d7f226 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -6,6 +6,7 @@
  #include "mem_pool.h"
  
  struct worker_task;
+struct mime_text_part;
  
  struct uri {
         /* The start of the uri (and thus start of the protocol string). */
@@ -73,7 +74,7 @@ enum protocol {
  
  #define struri(uri) ((uri)->string)
  
-void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
+void url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_part *part, gboolean is_html);
  enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
  
  #endif
diff --git a/src/worker.c b/src/worker.c

index 3f223241bf4fbe0751c3bfbf20b15839c95bebb9..af457591947114608a6cb5b6fb7dcf0a1115e66a 100644 (file)
--- a/src/worker.c
+++ b/src/worker.c
@@ -130,6 +130,9 @@ free_task (struct worker_task *task, gboolean is_soft)
                 if (task->text_parts) {
                         g_list_free (task->text_parts);
                 }
+               if (task->urls) {
+                       g_list_free (task->urls);
+               }
                 memory_pool_delete (task->task_pool);
                 if (is_soft) {
                         /* Plan dispatcher shutdown */
@@ -287,7 +290,6 @@ accept_socket (int fd, short what, void *arg)
  #endif
         io_tv.tv_sec = WORKER_IO_TIMEOUT;
         io_tv.tv_usec = 0;
-       TAILQ_INIT (&new_task->urls);
         new_task->task_pool = memory_pool_new (memory_pool_get_size ());
         /* Add destructor for recipients list (it would be better to use anonymous function here */
         memory_pool_add_destructor (new_task->task_pool, (pool_destruct_func)rcpt_destruct, new_task);
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c

index 36c9e439bdd84cf534983f371e4130fdcbc535e3..f716c1ab068f577fd8f0c6d71a5a9fe0975f8a79 100644 (file)
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -70,51 +70,5 @@ const char *test_html = "<some_tag>This is test file with <a href=\"http://micro
  void
  rspamd_url_test_func ()
  {
-       GByteArray *text, *html;
-       struct worker_task task;
-       struct uri *url;
-       int i = 0;
-
-       text = g_byte_array_new();
-       text->data = (gchar *)test_text;
-       text->len = strlen (test_text);
-       html = g_byte_array_new();
-       html->data = (gchar *)test_html;
-       html->len = strlen (test_html);
-       bzero (&task, sizeof (task));
-       TAILQ_INIT (&task.urls);
-       task.task_pool = memory_pool_new (8192);
-       
-       g_test_timer_start ();
-       g_test_message ("Testing text URL regexp parser");
-       url_parse_text (&task, text, FALSE);
-
-       TAILQ_FOREACH (url, &task.urls, next) {
-               msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
-               i ++;
-       }
-
-       while (!TAILQ_EMPTY (&task.urls)) {
-               url = TAILQ_FIRST (&task.urls);
-               TAILQ_REMOVE (&task.urls, url, next);
-       }
-       /* g_assert (i == 39); */
-
-       msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ());
-       i = 0;
-       g_test_timer_start ();
-       g_test_message ("Testing html URL regexp parser");
-       url_parse_text (&task, html, TRUE);
-
-       TAILQ_FOREACH (url, &task.urls, next) {
-               msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
-               i ++;
-       }
-
-       while (!TAILQ_EMPTY (&task.urls)) {
-               url = TAILQ_FIRST (&task.urls);
-               TAILQ_REMOVE (&task.urls, url, next);
-       }
-       g_assert (i == 1);
-       msg_debug ("Time elapsed: %.2f", g_test_timer_elapsed ());
+       /* XXX: maybe write test for this */
  }
diff --git a/utils/url_extracter.c b/utils/url_extracter.c

index ac8e8be4e956b84c5d9094678b3cd31241af0afb..97bf72c47b882aba7227f25c9b0b33e34ae60af6 100644 (file)
--- a/utils/url_extracter.c
+++ b/utils/url_extracter.c
@@ -24,107 +24,24 @@
  #include "../src/main.h"
  #include "../src/cfg_file.h"
  #include "../src/url.h"
+#include "../src/util.h"
  #include "../src/message.h"
  
  rspamd_hash_t *counters = NULL;
-#ifdef GMIME24
-static void
-mime_foreach_callback (GMimeObject *parent, GMimeObject *part, gpointer user_data)
-#else
-static void
-mime_foreach_callback (GMimeObject *part, gpointer user_data)
-#endif
-{
-       struct worker_task *task = (struct worker_task *)user_data;
-       struct mime_part *mime_part;
-       GMimeContentType *type;
-       GMimeDataWrapper *wrapper;
-       GMimeStream *part_stream;
-       GByteArray *part_content;
-       GMimeMessage *message;
-       
-       /* 'part' points to the current part node that g_mime_message_foreach_part() is iterating over */
-       
-       /* find out what class 'part' is... */
-       if (GMIME_IS_MESSAGE_PART (part)) {
-               /* message/rfc822 or message/news */
-               printf ("Message part found\n");
-               
-               /* g_mime_message_foreach_part() won't descend into
-                   child message parts, so if we want to count any
-                   subparts of this child message, we'll have to call
-                   g_mime_message_foreach_part() again here. */
-               
-               message = g_mime_message_part_get_message ((GMimeMessagePart *) part);
-#ifdef GMIME24
-               g_mime_message_foreach (message, mime_foreach_callback, task);
-#else
-               g_mime_message_foreach_part (message, mime_foreach_callback, task);
-#endif
-               g_object_unref (message);
-       } else if (GMIME_IS_MESSAGE_PARTIAL (part)) {
-               /* message/partial */
-               printf ("Message/partial part found\n");
-               
-               /* this is an incomplete message part, probably a
-                   large message that the sender has broken into
-                   smaller parts and is sending us bit by bit. we
-                   could save some info about it so that we could
-                   piece this back together again once we get all the
-                   parts? */
-       } else if (GMIME_IS_MULTIPART (part)) {
-               /* multipart/mixed, multipart/alternative, multipart/related, multipart/signed, multipart/encrypted, etc... */
-               
-               /* we'll get to finding out if this is a signed/encrypted multipart later... */
-       } else if (GMIME_IS_PART (part)) {
-               printf ("Normal part found\n");
-               /* a normal leaf part, could be text/plain or image/jpeg etc */
-               wrapper = g_mime_part_get_content_object (GMIME_PART (part));
-               if (wrapper != NULL) {
-                       part_stream = g_mime_stream_mem_new ();
-                       printf ("Get new wrapper object for normal part\n");
-                       if (g_mime_data_wrapper_write_to_stream (wrapper, part_stream) != -1) {
-                               printf ("Write wrapper to stream\n");
-                               part_content = g_mime_stream_mem_get_byte_array (GMIME_STREAM_MEM (part_stream));
-#ifdef GMIME24
-                               type = (GMimeContentType *)g_mime_object_get_content_type (GMIME_OBJECT (part));
-#else
-                               type = (GMimeContentType *)g_mime_part_get_content_type (GMIME_PART (part));
-#endif
-                               mime_part = g_malloc (sizeof (struct mime_part));
-                               mime_part->type = type;
-                               mime_part->content = part_content;
-                               task->parts =  g_list_prepend (task->parts, mime_part);
-                               if (g_mime_content_type_is_type (type, "text", "html")) {
-                                       printf ("Found text/html part\n");
-                                       url_parse_text (task, part_content, TRUE);
-                               } 
-                               else if (g_mime_content_type_is_type (type, "text", "plain")) {
-                                       printf ("Found text/plain part\n");
-                                       url_parse_text (task, part_content, FALSE);
-                               }
-                       }
-               }
-       } else {
-               g_assert_not_reached ();
-       }
-}
-
  
  int
  main (int argc, char **argv)
  {
-       GMimeMessage *message;
-       GMimeParser *parser;
-       GMimeStream *stream;
         struct worker_task task;
         struct uri *url;
         char *buf = NULL;
         size_t pos = 0, size = 65535;
+       GList *cur;
         
         g_mem_set_vtable(glib_mem_profiler_table);
         g_mime_init (0);
         bzero (&task, sizeof (struct worker_task));
+       task.task_pool = memory_pool_new (memory_pool_get_size ());
         
         /* Preallocate buffer */
         buf = g_malloc (size);
@@ -137,32 +54,23 @@ main (int argc, char **argv)
                         buf = g_realloc (buf, size);
                 }
         }
-
-       stream = g_mime_stream_mem_new_with_buffer (buf, pos);
-       /* create a new parser object to parse the stream */
-       parser = g_mime_parser_new_with_stream (stream);
-
-       /* unref the stream (parser owns a ref, so this object does not actually get free'd until we destroy the parser) */
-       g_object_unref (stream);
-
-       /* parse the message from the stream */
-       message = g_mime_parser_construct_message (parser);
         
-       task.message = message;
-       task.task_pool = memory_pool_new (memory_pool_get_size ());
-       TAILQ_INIT (&task.urls);
-
-       /* free the parser (and the stream) */
-       g_object_unref (parser);
+       task.cfg = memory_pool_alloc0 (task.task_pool, sizeof (struct config_file));
+       task.cfg->log_level = G_LOG_LEVEL_CRITICAL;
+       task.cfg->log_fd = STDERR_FILENO;
+       g_log_set_default_handler (file_log_function, task.cfg);
  
-#ifdef GMIME24
-       g_mime_message_foreach (message, mime_foreach_callback, &task);
-#else
-       g_mime_message_foreach_part (message, mime_foreach_callback, &task);
-#endif
-
-       TAILQ_FOREACH (url, &task.urls, next) {
-               printf ("Found url: %s, hostname: %s, data: %s\n", struri (url), url->host, url->data);
+       task.msg = memory_pool_alloc (task.task_pool, sizeof (f_str_t));
+       task.msg->begin = buf;
+       task.msg->len = pos;
+       process_message (&task);
+       
+       cur = task.urls;
+       while (cur) {
+               url = cur->data;
+               printf ("%s\n", struri (url));
+               cur = g_list_next (cur);
         }
-
+       
+       return 0;
  }
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Tue, 2 Jun 2009 15:32:34 +0000 (19:32 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Tue, 2 Jun 2009 15:32:34 +0000 (19:32 +0400)
CMakeLists.txt		patch \| blob \| history
rspamc.pl.in		patch \| blob \| history
src/lmtp.c		patch \| blob \| history
src/main.h		patch \| blob \| history
src/message.c		patch \| blob \| history
src/message.h		patch \| blob \| history
src/plugins/regexp.c		patch \| blob \| history
src/plugins/surbl.c		patch \| blob \| history
src/protocol.c		patch \| blob \| history
src/tokenizers/tokenizers.c		patch \| blob \| history
src/url.c		patch \| blob \| history
src/url.h		patch \| blob \| history
src/worker.c		patch \| blob \| history
test/rspamd_url_test.c		patch \| blob \| history
utils/url_extracter.c		patch \| blob \| history