* Fix URL extraction, avoid code duplication

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
diff --git a/src/message.c b/src/message.c

index 14f9245cb2450c5b08ada43b5706ec6f36fdce73..4d6ba95d198ab6a6a0b0331f27b7d0a6ca25a5c0 100644 (file)
--- a/src/message.c
+++ b/src/message.c
@@ -278,7 +278,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
  
         if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
                 msg_debug ("mime_foreach_callback: got urls from text/html part");
-               url_parse_html (task, part_content);
+               url_parse_text (task, part_content, TRUE);
  
                 text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
                 text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
@@ -290,7 +290,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
         } 
         else if (g_mime_content_type_is_type (type, "text", "plain")) {
                 msg_debug ("mime_foreach_callback: got urls from text/plain part");
-               url_parse_text (task, part_content);
+               url_parse_text (task, part_content, FALSE);
  
                 text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
                 text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
diff --git a/src/url.c b/src/url.c

index c9a1875362366ec422ee7f3ba507f747889b07b9..875358ae018eb4864290f1708313403406dd25db 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -853,11 +853,10 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
  }
  
  void 
-url_parse_text (struct worker_task *task, GByteArray *content)
+url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
  {
         GMatchInfo *info;
         GError *err = NULL;
-       int pos = 0, start;
         int rc;
         char *url_str = NULL;
         struct uri *new;
@@ -868,86 +867,39 @@ url_parse_text (struct worker_task *task, GByteArray *content)
         }
  
         if (url_init () == 0) {
-               do {
-                       rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err);
-                       if (rc) {
-                               if (g_match_info_matches (info)) {
-                                       g_match_info_fetch_pos (info, 0, &start, &pos);
-                                       url_str = g_match_info_fetch (info, 0);
-                                       msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
-                                       if (url_str != NULL) {
-                                               new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-                                               if (new != NULL) {
-                                                       rc = parse_uri (new, url_str, task->task_pool);
-                                                       if (rc != URI_ERRNO_OK) {
-                                                               msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
-                                                       }
-                                                       if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-                                                               TAILQ_INSERT_TAIL (&task->urls, new, next);
-                                                       }
+               rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+               if (rc) {
+                       while (g_match_info_matches (info)) {
+                               url_str = g_match_info_fetch (info, is_html ? 1 : 0);
+                               msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
+                               if (url_str != NULL) {
+                                       new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+                                       if (new != NULL) {
+                                               rc = parse_uri (new, url_str, task->task_pool);
+                                               if (rc != URI_ERRNO_OK) {
+                                                       msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
                                                 }
-                                       }
-                                       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
-                               }
-                       }
-                       else if (err != NULL) {
-                               msg_debug ("url_parse_text: error matching regexp: %s", err->message);
-                               g_free (err);
-                       }
-                       else {
-                               msg_debug ("url_parse_text: cannot find url pattern in given string");
-                       }
-                       g_match_info_free (info);
-               } while (rc);
-       }
-}
-
-void 
-url_parse_html (struct worker_task *task, GByteArray *content)
-{
-       GMatchInfo *info;
-       GError *err = NULL;
-       int pos = 0, start;
-       int rc;
-       char *url_str = NULL;
-       struct uri *new;
-
-       if (!content->data || content->len == 0) {
-               msg_warn ("url_parse_text: got empty text part");
-               return;
-       }
-
-       if (url_init () == 0) {
-               do {
-                       rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err);
-                       if (rc) {
-                               if (g_match_info_matches (info)) {
-                                       g_match_info_fetch_pos (info, 0, &start, &pos);
-                                       url_str = g_match_info_fetch (info, 1);
-                                       msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
-                                       if (url_str != NULL) {
-                                               new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-                                               if (new != NULL) {
-                                                       rc = parse_uri (new, url_str, task->task_pool);
-                                                       if (rc != URI_ERRNO_OK) {
-                                                               msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
-                                                       }
-                                                       if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-                                                               TAILQ_INSERT_TAIL (&task->urls, new, next);
-                                                       }
+                                               if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+                                                       TAILQ_INSERT_TAIL (&task->urls, new, next);
                                                 }
                                         }
-                                       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
                                 }
+                               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
+                               /* Get next match */
+                               g_match_info_next (info, &err);
                         }
-                       else if (err) {
-                               msg_debug ("url_parse_html: error matching regexp: %s", err->message);
-                               g_free (err);
-                       }
-                       else {
-                               msg_debug ("url_parse_html: cannot find url pattern in given string");
-                       }
-                       g_match_info_free (info);
-               } while (rc);
+               }
+               else if (err != NULL) {
+                       msg_debug ("url_parse_text: error matching regexp: %s", err->message);
+                       g_free (err);
+               }
+               else {
+                       msg_debug ("url_parse_text: cannot find url pattern in given string");
+               }
+               g_match_info_free (info);
         }
  }
+
+/*
+ * vi: ts=4
+ */
diff --git a/src/url.h b/src/url.h

index 08a5cb0a6e290776e14c2f6dfa4a78fb345130be..2c367548d1d740993a6ea7a959a8ca6d9975d8f3 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -73,8 +73,7 @@ enum protocol {
  
  #define struri(uri) ((uri)->string)
  
-void url_parse_html (struct worker_task *task, GByteArray *part);
-void url_parse_text (struct worker_task *task, GByteArray *part);
+void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
  enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
  
  #endif
diff --git a/test/rspamd_url_test.c b/test/rspamd_url_test.c

index 808659757a589ab43538e266ca817aa80a36b551..36c9e439bdd84cf534983f371e4130fdcbc535e3 100644 (file)
--- a/test/rspamd_url_test.c
+++ b/test/rspamd_url_test.c
@@ -87,7 +87,7 @@ rspamd_url_test_func ()
         
         g_test_timer_start ();
         g_test_message ("Testing text URL regexp parser");
-       url_parse_text (&task, text);
+       url_parse_text (&task, text, FALSE);
  
         TAILQ_FOREACH (url, &task.urls, next) {
                 msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
@@ -104,7 +104,7 @@ rspamd_url_test_func ()
         i = 0;
         g_test_timer_start ();
         g_test_message ("Testing html URL regexp parser");
-       url_parse_html (&task, html);
+       url_parse_text (&task, html, TRUE);
  
         TAILQ_FOREACH (url, &task.urls, next) {
                 msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
diff --git a/utils/url_extracter.c b/utils/url_extracter.c

index 8e16a689f33a1eb3fe2909156d266e7bf0f3fd8d..1b98fa67e46d8463d184fae27481bb0c4156bbff 100644 (file)
--- a/utils/url_extracter.c
+++ b/utils/url_extracter.c
@@ -96,11 +96,11 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
                                 task->parts =  g_list_prepend (task->parts, mime_part);
                                 if (g_mime_content_type_is_type (type, "text", "html")) {
                                         printf ("Found text/html part\n");
-                                       url_parse_html (task, part_content);
+                                       url_parse_text (task, part_content, TRUE);
                                 } 
                                 else if (g_mime_content_type_is_type (type, "text", "plain")) {
                                         printf ("Found text/plain part\n");
-                                       url_parse_text (task, part_content);
+                                       url_parse_text (task, part_content, FALSE);
                                 }
                         }
                 }
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
src/message.c		patch \| blob \| history
src/url.c		patch \| blob \| history
src/url.h		patch \| blob \| history
test/rspamd_url_test.c		patch \| blob \| history
utils/url_extracter.c		patch \| blob \| history