From ad56efc14e371b6a452c1ccc46aa68d800125468 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Fri, 3 Jul 2009 17:24:37 +0400
Subject: [PATCH] * Extract url encoded urls from html texts

---
 src/html.c    | 61 +++++++++++++++++++++++++++++++++++++++++++++++++--
 src/html.h    |  5 ++++-
 src/message.c |  8 ++++---
 src/url.c     | 11 ++++++----
 4 files changed, 75 insertions(+), 10 deletions(-)

diff --git a/src/html.c b/src/html.c
index 69f5e09c2..5b3552c7f 100644
--- a/src/html.c
+++ b/src/html.c
@@ -27,6 +27,7 @@
 #include "main.h"
 #include "message.h"
 #include "html.h"
+#include "url.h"
 
 sig_atomic_t tags_sorted = 0;
 
@@ -258,8 +259,61 @@ get_tag_by_name (const char *name)
 	return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
 }
 
+/*
+ * Look for the href= (A tag) or src= (IMG tag) attribute inside tag_text, parse its value
+ * with parse_uri and, unless parsing reports an empty url or a missing host, record the url
+ * in part->html_urls and task->urls when that tree exists and has no entry for it yet
+ */
+static void
+parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, char *tag_text)
+{
+	char *c = NULL, *p;
+	int len = 0, rc;
+	char *url_text;
+	struct uri *url;
+	gboolean got_quote = FALSE;
+
+	if (id == Tag_A) {
+		c = strcasestr (tag_text, "href=");
+		len = sizeof ("href=") - 1;
+	}
+	else if (id == Tag_IMG) {
+		c = strcasestr (tag_text, "src=");
+		len = sizeof ("src=") - 1;
+	}
+
+	if (c != NULL) {
+		/* Skip the attribute name; a quote opens the value only as its first character */
+		c += len;
+		len = 0;
+		if (*c == '"') {
+			got_quote = TRUE;
+			c ++;
+		}
+		/* Count value characters up to the closing quote, a line break or the end of tag_text */
+		for (p = c; *p != '\0'; p ++) {
+			if (*p == '\r' || *p == '\n' || (got_quote && *p == '"')) {
+				break;
+			}
+			len ++;
+		}
+
+		url_text = memory_pool_alloc (task->task_pool, len + 1);
+		g_strlcpy (url_text, c, len + 1);
+		url = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+		rc = parse_uri (url, url_text, task->task_pool);
+
+		if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+			if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
+				g_tree_insert (part->html_urls, url_text, url);
+				task->urls = g_list_prepend (task->urls, url);
+			}
+		}
+	}
+}
+
gboolean -add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level) +add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level) { GNode *new; struct html_node *data; @@ -277,7 +331,7 @@ add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, part->html_nodes = new; memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes); /* Call once again with root node */ - return add_html_node (pool, part, tag_text, cur_level); + return add_html_node (task, pool, part, tag_text, cur_level); } else { new = construct_html_node (pool, tag_text); @@ -286,6 +340,9 @@ add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, return -1; } data = new->data; + if (data->tag->id == Tag_A || data->tag->id == Tag_IMG) { + parse_tag_url (task, part, data->tag->id, tag_text); + } if (data->flags & FL_CLOSING) { if (! *cur_level) { msg_debug ("add_html_node: bad parent node"); diff --git a/src/html.h b/src/html.h index 70f20de49..1a7924e08 100644 --- a/src/html.h +++ b/src/html.h @@ -204,7 +204,10 @@ struct html_node { int flags; }; -gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level); +/* Forwarded declaration */ +struct worker_task; + +gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level); struct html_tag * get_tag_by_name (const char *name); #endif diff --git a/src/message.c b/src/message.c index 65187d478..9afc4fa19 100644 --- a/src/message.c +++ b/src/message.c @@ -31,7 +31,7 @@ #include "modules.h" GByteArray* -strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr) +strip_html_tags (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr) { uint8_t *tbuf = NULL, *p, *tp = 
NULL, *rp, *tbegin = NULL, c, lc; int br, i = 0, depth = 0, in_q = 0; @@ -105,7 +105,7 @@ strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *s lc = '>'; in_q = state = 0; *p = '\0'; - add_html_node (pool, part, tbegin, &level_ptr); + add_html_node (task, pool, part, tbegin, &level_ptr); *p = '>'; break; @@ -300,10 +300,12 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part->is_html = TRUE; text_part->is_balanced = TRUE; text_part->html_nodes = NULL; - text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL); + text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp); + text_part->content = strip_html_tags (task, task->task_pool, text_part, part_content, NULL); + if (text_part->html_nodes == NULL) { url_parse_text (task->task_pool, task, text_part, FALSE); } diff --git a/src/url.c b/src/url.c index 221b8ef63..7cb671991 100644 --- a/src/url.c +++ b/src/url.c @@ -351,7 +351,7 @@ get_protocol_length(const unsigned char *url) string intact, make a copy before calling this function. */ static void -url_unescape (char *s) +url_unescape (char *s, unsigned int *len) { char *t = s; /* t - tortoise */ char *h = s; /* h - hare */ @@ -373,6 +373,7 @@ url_unescape (char *s) goto copychar; *t = c; h += 2; + *len -=2; } } *t = '\0'; @@ -846,7 +847,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool) don't), but to support binary characters (which will have been converted to %HH by reencode_escapes). */ if (strchr (uri->host, '%')) { - url_unescape (uri->host); + url_unescape (uri->host, &uri->hostlen); } path_simplify (uri->data); @@ -885,8 +886,10 @@ url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_ if (new != NULL) { rc = parse_uri (new, url_str, pool); if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) { - g_tree_insert (is_html ? 
part->html_urls : part->urls, url_str, new); - task->urls = g_list_prepend (task->urls, new); + if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) { + g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new); + task->urls = g_list_prepend (task->urls, new); + } } } } -- 2.39.5