summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/html.c 61
-rw-r--r-- src/html.h 5
-rw-r--r-- src/message.c 8
-rw-r--r-- src/url.c 11
4 files changed, 75 insertions, 10 deletions
diff --git a/src/html.c b/src/html.c
index 69f5e09c2..5b3552c7f 100644
--- a/src/html.c
+++ b/src/html.c
@@ -27,6 +27,7 @@
#include "main.h"
#include "message.h"
#include "html.h"
+#include "url.h"
sig_atomic_t tags_sorted = 0;
@@ -258,8 +259,61 @@ get_tag_by_name (const char *name)
return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
}
+/*
+ * Extract the URL carried by an A (href=) or IMG (src=) tag and register it.
+ *
+ * task:     worker task; its task_pool backs all allocations made here and
+ *           every newly seen url is prepended to task->urls.
+ * part:     text part being parsed; new urls are stored in part->html_urls
+ *           keyed by the url text. Nothing is done if that tree is NULL.
+ * id:       tag identifier; anything other than Tag_A or Tag_IMG is ignored.
+ * tag_text: NUL-terminated text of the tag, searched case-insensitively for
+ *           the attribute name.
+ *
+ * The attribute value may be enclosed in double quotes, single quotes, or
+ * be unquoted. A quoted value ends at the matching quote; an unquoted value
+ * ends at the first whitespace or '>'. CR, LF and end of string always
+ * terminate the value.
+ */
+static void
+parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, char *tag_text)
+{
+	const char *attr = NULL;
+	char *c, *p, *url_text;
+	char quote = '\0';
+	size_t attr_len = 0, len;
+	struct uri *url;
+	int rc;
+
+	/* For A tags search for href= and for IMG tags search for src= */
+	if (id == Tag_A) {
+		attr = "href=";
+		attr_len = sizeof ("href=") - 1;
+	}
+	else if (id == Tag_IMG) {
+		attr = "src=";
+		attr_len = sizeof ("src=") - 1;
+	}
+
+	/* Without a destination tree nothing could be stored, so skip the work */
+	if (attr == NULL || part->html_urls == NULL) {
+		return;
+	}
+
+	c = strcasestr (tag_text, attr);
+	if (c == NULL) {
+		return;
+	}
+	c += attr_len;
+
+	/* Remember which quote (if any) opens the value and step over it */
+	if (*c == '"' || *c == '\'') {
+		quote = *c++;
+	}
+
+	/* Find the end of the value; p stops on the first char that is not part of it */
+	for (p = c; *p != '\0' && *p != '\r' && *p != '\n'; p++) {
+		if (quote != '\0') {
+			if (*p == quote) {
+				break;
+			}
+		}
+		else if (g_ascii_isspace (*p) || *p == '>') {
+			break;
+		}
+	}
+
+	len = (size_t)(p - c);
+	if (len == 0) {
+		/* Empty attribute value: there is no url to parse */
+		return;
+	}
+
+	/* g_strlcpy copies at most len bytes and always NUL-terminates */
+	url_text = memory_pool_alloc (task->task_pool, len + 1);
+	g_strlcpy (url_text, c, len + 1);
+	url = memory_pool_alloc (task->task_pool, sizeof (struct uri));
+	rc = parse_uri (url, url_text, task->task_pool);
+
+	if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
+		/* Register each distinct url text only once per part */
+		if (g_tree_lookup (part->html_urls, url_text) == NULL) {
+			g_tree_insert (part->html_urls, url_text, url);
+			task->urls = g_list_prepend (task->urls, url);
+		}
+	}
+}
+
+
gboolean
-add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
+add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level)
{
GNode *new;
struct html_node *data;
@@ -277,7 +331,7 @@ add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text,
part->html_nodes = new;
memory_pool_add_destructor (pool, (pool_destruct_func)g_node_destroy, part->html_nodes);
/* Call once again with root node */
- return add_html_node (pool, part, tag_text, cur_level);
+ return add_html_node (task, pool, part, tag_text, cur_level);
}
else {
new = construct_html_node (pool, tag_text);
@@ -286,6 +340,9 @@ add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text,
return -1;
}
data = new->data;
+ if (data->tag->id == Tag_A || data->tag->id == Tag_IMG) {
+ parse_tag_url (task, part, data->tag->id, tag_text);
+ }
if (data->flags & FL_CLOSING) {
if (! *cur_level) {
msg_debug ("add_html_node: bad parent node");
diff --git a/src/html.h b/src/html.h
index 70f20de49..1a7924e08 100644
--- a/src/html.h
+++ b/src/html.h
@@ -204,7 +204,10 @@ struct html_node {
int flags;
};
-gboolean add_html_node (memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
+/* Forward declaration */
+struct worker_task;
+
+gboolean add_html_node (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, char *tag_text, GNode **cur_level);
struct html_tag * get_tag_by_name (const char *name);
#endif
diff --git a/src/message.c b/src/message.c
index 65187d478..9afc4fa19 100644
--- a/src/message.c
+++ b/src/message.c
@@ -31,7 +31,7 @@
#include "modules.h"
GByteArray*
-strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
+strip_html_tags (struct worker_task *task, memory_pool_t *pool, struct mime_text_part *part, GByteArray *src, int *stateptr)
{
uint8_t *tbuf = NULL, *p, *tp = NULL, *rp, *tbegin = NULL, c, lc;
int br, i = 0, depth = 0, in_q = 0;
@@ -105,7 +105,7 @@ strip_html_tags (memory_pool_t *pool, struct mime_text_part *part, GByteArray *s
lc = '>';
in_q = state = 0;
*p = '\0';
- add_html_node (pool, part, tbegin, &level_ptr);
+ add_html_node (task, pool, part, tbegin, &level_ptr);
*p = '>';
break;
@@ -300,10 +300,12 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont
text_part->is_html = TRUE;
text_part->is_balanced = TRUE;
text_part->html_nodes = NULL;
- text_part->content = strip_html_tags (task->task_pool, text_part, part_content, NULL);
+
text_part->html_urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
text_part->urls = g_tree_new ( (GCompareFunc)g_ascii_strcasecmp);
+ text_part->content = strip_html_tags (task, task->task_pool, text_part, part_content, NULL);
+
if (text_part->html_nodes == NULL) {
url_parse_text (task->task_pool, task, text_part, FALSE);
}
diff --git a/src/url.c b/src/url.c
index 221b8ef63..7cb671991 100644
--- a/src/url.c
+++ b/src/url.c
@@ -351,7 +351,7 @@ get_protocol_length(const unsigned char *url)
string intact, make a copy before calling this function. */
static void
-url_unescape (char *s)
+url_unescape (char *s, unsigned int *len)
{
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
@@ -373,6 +373,7 @@ url_unescape (char *s)
goto copychar;
*t = c;
h += 2;
+ *len -=2;
}
}
*t = '\0';
@@ -846,7 +847,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
don't), but to support binary characters (which will have been
converted to %HH by reencode_escapes). */
if (strchr (uri->host, '%')) {
- url_unescape (uri->host);
+ url_unescape (uri->host, &uri->hostlen);
}
path_simplify (uri->data);
@@ -885,8 +886,10 @@ url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_
if (new != NULL) {
rc = parse_uri (new, url_str, pool);
if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
- g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
- task->urls = g_list_prepend (task->urls, new);
+ if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+ g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+ task->urls = g_list_prepend (task->urls, new);
+ }
}
}
}