]> source.dussan.org Git - rspamd.git/commitdiff
* Extract url encoded urls from html texts
author: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 3 Jul 2009 13:24:37 +0000 (17:24 +0400)
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 3 Jul 2009 13:24:37 +0000 (17:24 +0400)
src/html.c
src/html.h
src/message.c
src/url.c

index 69f5e09c24ff7de0bf59da0e6a5e47203023254c..5b3552c7f954b942e266018648deb14e275565de 100644 (file)
@@ -27,6 +27,7 @@
 #include "main.h"
 #include "message.h"
 #include "html.h"
+#include "url.h"
 
 sig_atomic_t tags_sorted = 0;
 
@@ -258,8 +259,61 @@ get_tag_by_name (const char *name)
        return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
 }
 
+/*
+ * Extract the URL from the href= (A tags) or src= (IMG tags) attribute of
+ * tag_text and register it in part->html_urls and task->urls if it parses
+ * and has not been seen before. The value may be quoted or unquoted.
+ */
+static void
+parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, char *tag_text)
+{
+       char *c = NULL, *p, *url_text, quote = '\0';
+       int len = 0, rc;
+       struct uri *url;
+
+       /* For A tags search for href= and for IMG tags search for src= */
+       if (id == Tag_A) {
+               c = strcasestr (tag_text, "href=");
+               len = sizeof ("href=") - 1;
+       }
+       else if (id == Tag_IMG) {
+               c = strcasestr (tag_text, "src=");
+               len = sizeof ("src=") - 1;
+       }
+       if (c == NULL) {
+               return;
+       }
+
+       /* Skip attribute name and remember the opening quote, if any */
+       c += len;
+       if (*c == '"' || *c == '\'') {
+               quote = *c++;
+       }
+       /* A quoted value ends at its matching quote, an unquoted one at whitespace */
+       for (p = c; *p != '\0' && *p != '\r' && *p != '\n'; p ++) {
+               if (quote != '\0' ? *p == quote : (*p == ' ' || *p == '\t')) {
+                       break;
+               }
+       }
+       len = p - c;
+       if (len == 0) {
+               return;
+       }
+
+       url_text = memory_pool_alloc (task->task_pool, len + 1);
+       g_strlcpy (url_text, c, len + 1);
+       url = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+       rc = parse_uri (url, url_text, task->task_pool);
+
+       if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST &&
+                       part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
+               g_tree_insert (part->html_urls, url_text, url);
+               task->urls = g_list_prepend (task->urls, url);
+       }
+}
+
 gboolean
-add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
+add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
 {
        GNode *new;
        struct html_node *data;
@@ -277,7 +331,7 @@ add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text,
                part->html_nodes = new;
                memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes);
                /* Call once again with root node */
-               return add_html_node (pool, part, tag_text, cur_level);
+               return add_html_node (task, pool, part, tag_text, cur_level);
        }
        else {
                new = construct_html_node (pool, tag_text);
@@ -286,6 +340,9 @@ add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text,
                        return -1;
                }
                data = new->data;
+               if (data->tag->id == Tag_A || data->tag->id == Tag_IMG) {
+                       parse_tag_url (task, part, data->tag->id, tag_text);
+               }
                if (data->flags & FL_CLOSING) {
                        if (! *cur_level) {
                                msg_debug ("add_html_node: bad parent node");
index 70f20de49e54aeffaebe4973a47bed6c300ca715..1a7924e08c3b5088466e9bf0934999815587c7e4 100644 (file)
@@ -204,7 +204,10 @@ struct html_node {
        int flags;
 };
 
-gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
+/* Forward declaration */
+struct worker_task;
+
+gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
 struct html_tag * get_tag_by_name (const char *name);
 
 #endif
index 65187d4784fa9796fc43b70f39c5cea26d81dac9..9afc4fa1984c33cc65f1a13d965d0b32f9ca4757 100644 (file)
@@ -31,7 +31,7 @@
 #include "modules.h"
 
 GByteArray*
-strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
+strip_html_tags (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
 {
        uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin = NULL, c, lc;
        int br, i = 0, depth = 0, in_q = 0;
@@ -105,7 +105,7 @@ strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *s
                                                lc = '>';
                                                in_q = state = 0;
                                                *p = '\0';
-                                               add_html_node (pool, part, tbegin, &level_ptr);
+                                               add_html_node (task, pool, part, tbegin, &level_ptr);
                                                *p = '>';
                                                break;
                                                
@@ -300,10 +300,12 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
                text_part->is_html = TRUE;
                text_part->is_balanced = TRUE;
                text_part->html_nodes = NULL;
-               text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
+
                text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
                text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
 
+               text_part->content = strip_html_tags (task, task->task_pool, text_part, part_content, NULL);
+
                if (text_part->html_nodes == NULL) {
                        url_parse_text (task->task_pool, task, text_part, FALSE);
                }
index 221b8ef635a6dcd24b563424e7138ae39e03d6ad..7cb67199195c90c03223d7ee078bd2b22a5d5b72 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -351,7 +351,7 @@ get_protocol_length(const unsigned char *url)
    string intact, make a copy before calling this function.  */
 
 static void
-url_unescape (char *s)
+url_unescape (char *s, unsigned int *len)
 {
        char *t = s;                    /* t - tortoise */
        char *h = s;                    /* h - hare     */
@@ -373,6 +373,7 @@ url_unescape (char *s)
                                goto copychar;
                        *t = c;
                        h += 2;
+                       *len -=2;
                }
        }
        *t = '\0';
@@ -846,7 +847,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
      don't), but to support binary characters (which will have been
      converted to %HH by reencode_escapes).  */
        if (strchr (uri->host, '%')) {
-               url_unescape (uri->host);
+               url_unescape (uri->host, &uri->hostlen);
        }
        path_simplify (uri->data);
 
@@ -885,8 +886,10 @@ url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_
                                                if (new != NULL) {
                                                        rc = parse_uri (new, url_str, pool);
                                                        if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
-                                                               g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
-                                                               task->urls = g_list_prepend (task->urls, new);
+                                                               if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+                                                                       g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+                                                                       task->urls = g_list_prepend (task->urls, new);
+                                                               }
                                                        }
                                                }
                                        }