source.dussan.org Git - rspamd.git/commitdiff
* Fix urls extracting, avoid code repeating
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 17 Apr 2009 15:04:04 +0000 (19:04 +0400)
src/message.c
src/url.c
src/url.h
test/rspamd_url_test.c
utils/url_extracter.c

index 14f9245cb2450c5b08ada43b5706ec6f36fdce73..4d6ba95d198ab6a6a0b0331f27b7d0a6ca25a5c0 100644 (file)
@@ -278,7 +278,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
 
        if (g_mime_content_type_is_type (type, "text", "html") || g_mime_content_type_is_type (type, "text", "xhtml")) {
                msg_debug ("mime_foreach_callback: got urls from text/html part");
-               url_parse_html (task, part_content);
+               url_parse_text (task, part_content, TRUE);
 
                text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
                text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
@@ -290,7 +290,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
        } 
        else if (g_mime_content_type_is_type (type, "text", "plain")) {
                msg_debug ("mime_foreach_callback: got urls from text/plain part");
-               url_parse_text (task, part_content);
+               url_parse_text (task, part_content, FALSE);
 
                text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part));
                text_part->orig = convert_text_to_utf (task, part_content, type, text_part);
index c9a1875362366ec422ee7f3ba507f747889b07b9..875358ae018eb4864290f1708313403406dd25db 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -853,11 +853,10 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
 }
 
 void 
-url_parse_text (struct worker_task *task, GByteArray *content)
+url_parse_text (struct worker_task *task, GByteArray *content, gboolean is_html)
 {
        GMatchInfo *info;
        GError *err = NULL;
-       int pos = 0, start;
        int rc;
        char *url_str = NULL;
        struct uri *new;
@@ -868,86 +867,39 @@ url_parse_text (struct worker_task *task, GByteArray *content)
        }
 
        if (url_init () == 0) {
-               do {
-                       rc = g_regex_match_full (text_re, (const char *)content->data, content->len, pos, 0, &info, &err);
-                       if (rc) {
-                               if (g_match_info_matches (info)) {
-                                       g_match_info_fetch_pos (info, 0, &start, &pos);
-                                       url_str = g_match_info_fetch (info, 0);
-                                       msg_debug ("url_parse_text: extracted string with regexp: '%s'", url_str);
-                                       if (url_str != NULL) {
-                                               new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-                                               if (new != NULL) {
-                                                       rc = parse_uri (new, url_str, task->task_pool);
-                                                       if (rc != URI_ERRNO_OK) {
-                                                               msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
-                                                       }
-                                                       if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-                                                               TAILQ_INSERT_TAIL (&task->urls, new, next);
-                                                       }
+               rc = g_regex_match_full (is_html ? html_re : text_re, (const char *)content->data, content->len, 0, 0, &info, &err);
+               if (rc) {
+                       while (g_match_info_matches (info)) {
+                               url_str = g_match_info_fetch (info, is_html ? 1 : 0);
+                               msg_debug ("url_parse_text: extracted string with regexp: '%s', html is %s", url_str, is_html ? "on" : "off");
+                               if (url_str != NULL) {
+                                       new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+                                       if (new != NULL) {
+                                               rc = parse_uri (new, url_str, task->task_pool);
+                                               if (rc != URI_ERRNO_OK) {
+                                                       msg_debug ("url_parse_text: error while parsing url %s: %s", url_str, url_strerror (rc));
                                                }
-                                       }
-                                       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
-                               }
-                       }
-                       else if (err != NULL) {
-                               msg_debug ("url_parse_text: error matching regexp: %s", err->message);
-                               g_free (err);
-                       }
-                       else {
-                               msg_debug ("url_parse_text: cannot find url pattern in given string");
-                       }
-                       g_match_info_free (info);
-               } while (rc);
-       }
-}
-
-void 
-url_parse_html (struct worker_task *task, GByteArray *content)
-{
-       GMatchInfo *info;
-       GError *err = NULL;
-       int pos = 0, start;
-       int rc;
-       char *url_str = NULL;
-       struct uri *new;
-
-       if (!content->data || content->len == 0) {
-               msg_warn ("url_parse_text: got empty text part");
-               return;
-       }
-
-       if (url_init () == 0) {
-               do {
-                       rc = g_regex_match_full (html_re, (const char *)content->data, content->len, pos, 0, &info, &err);
-                       if (rc) {
-                               if (g_match_info_matches (info)) {
-                                       g_match_info_fetch_pos (info, 0, &start, &pos);
-                                       url_str = g_match_info_fetch (info, 1);
-                                       msg_debug ("url_parse_html: extracted string with regexp: '%s'", url_str);
-                                       if (url_str != NULL) {
-                                               new = memory_pool_alloc (task->task_pool, sizeof (struct uri));
-                                               if (new != NULL) {
-                                                       rc = parse_uri (new, url_str, task->task_pool);
-                                                       if (rc != URI_ERRNO_OK) {
-                                                               msg_debug ("url_parse_html: error while parsing url %s: %s", url_str, url_strerror (rc));
-                                                       }
-                                                       if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-                                                               TAILQ_INSERT_TAIL (&task->urls, new, next);
-                                                       }
+                                               if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+                                                       TAILQ_INSERT_TAIL (&task->urls, new, next);
                                                }
                                        }
-                                       memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
                                }
+                               memory_pool_add_destructor (task->task_pool, (pool_destruct_func)g_free, url_str);
+                               /* Get next match */
+                               g_match_info_next (info, &err);
                        }
-                       else if (err) {
-                               msg_debug ("url_parse_html: error matching regexp: %s", err->message);
-                               g_free (err);
-                       }
-                       else {
-                               msg_debug ("url_parse_html: cannot find url pattern in given string");
-                       }
-                       g_match_info_free (info);
-               } while (rc);
+               }
+               else if (err != NULL) {
+                       msg_debug ("url_parse_text: error matching regexp: %s", err->message);
+                       g_free (err);
+               }
+               else {
+                       msg_debug ("url_parse_text: cannot find url pattern in given string");
+               }
+               g_match_info_free (info);
        }
 }
+
+/*
+ * vi: ts=4
+ */
index 08a5cb0a6e290776e14c2f6dfa4a78fb345130be..2c367548d1d740993a6ea7a959a8ca6d9975d8f3 100644 (file)
--- a/src/url.h
+++ b/src/url.h
@@ -73,8 +73,7 @@ enum protocol {
 
 #define struri(uri) ((uri)->string)
 
-void url_parse_html (struct worker_task *task, GByteArray *part);
-void url_parse_text (struct worker_task *task, GByteArray *part);
+void url_parse_text (struct worker_task *task, GByteArray *part, gboolean is_html);
 enum uri_errno parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool);
 
 #endif
index 808659757a589ab43538e266ca817aa80a36b551..36c9e439bdd84cf534983f371e4130fdcbc535e3 100644 (file)
@@ -87,7 +87,7 @@ rspamd_url_test_func ()
        
        g_test_timer_start ();
        g_test_message ("Testing text URL regexp parser");
-       url_parse_text (&task, text);
+       url_parse_text (&task, text, FALSE);
 
        TAILQ_FOREACH (url, &task.urls, next) {
                msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
@@ -104,7 +104,7 @@ rspamd_url_test_func ()
        i = 0;
        g_test_timer_start ();
        g_test_message ("Testing html URL regexp parser");
-       url_parse_html (&task, html);
+       url_parse_text (&task, html, TRUE);
 
        TAILQ_FOREACH (url, &task.urls, next) {
                msg_debug ("Found url: %s, hostname: %s, data: %s", struri (url), url->host, url->data);
index 8e16a689f33a1eb3fe2909156d266e7bf0f3fd8d..1b98fa67e46d8463d184fae27481bb0c4156bbff 100644 (file)
@@ -96,11 +96,11 @@ mime_foreach_callback (GMimeObject *part, gpointer user_data)
                                task->parts =  g_list_prepend (task->parts, mime_part);
                                if (g_mime_content_type_is_type (type, "text", "html")) {
                                        printf ("Found text/html part\n");
-                                       url_parse_html (task, part_content);
+                                       url_parse_text (task, part_content, TRUE);
                                } 
                                else if (g_mime_content_type_is_type (type, "text", "plain")) {
                                        printf ("Found text/plain part\n");
-                                       url_parse_text (task, part_content);
+                                       url_parse_text (task, part_content, FALSE);
                                }
                        }
                }