summary | refs | log | tree | commit | diff | stats
path: root/src/url.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-03 17:24:37 +0400
committer Vsevolod Stakhov <vsevolod@rambler-co.ru> 2009-07-03 17:24:37 +0400
commit ad56efc14e371b6a452c1ccc46aa68d800125468 (patch)
tree b2dbc42171dc5c846dfc3240e760cc6f0f2dad9a /src/url.c
parent 7348a381a903eea67611fbce0782cf968b965ebf (diff)
download rspamd-ad56efc14e371b6a452c1ccc46aa68d800125468.tar.gz
download rspamd-ad56efc14e371b6a452c1ccc46aa68d800125468.zip
* Extract url encoded urls from html texts
Diffstat (limited to 'src/url.c')
-rw-r--r-- src/url.c | 11
1 file changed, 7 insertions, 4 deletions
diff --git a/src/url.c b/src/url.c
index 221b8ef63..7cb671991 100644
--- a/src/url.c
+++ b/src/url.c
@@ -351,7 +351,7 @@ get_protocol_length(const unsigned char *url)
string intact, make a copy before calling this function. */
static void
-url_unescape (char *s)
+url_unescape (char *s, unsigned int *len)
{
char *t = s; /* t - tortoise */
char *h = s; /* h - hare */
@@ -373,6 +373,7 @@ url_unescape (char *s)
goto copychar;
*t = c;
h += 2;
+ *len -=2;
}
}
*t = '\0';
@@ -846,7 +847,7 @@ parse_uri(struct uri *uri, unsigned char *uristring, memory_pool_t *pool)
don't), but to support binary characters (which will have been
converted to %HH by reencode_escapes). */
if (strchr (uri->host, '%')) {
- url_unescape (uri->host);
+ url_unescape (uri->host, &uri->hostlen);
}
path_simplify (uri->data);
@@ -885,8 +886,10 @@ url_parse_text (memory_pool_t *pool, struct worker_task *task, struct mime_text_
if (new != NULL) {
rc = parse_uri (new, url_str, pool);
if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST) {
- g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
- task->urls = g_list_prepend (task->urls, new);
+ if (g_tree_lookup (is_html ? part->html_urls : part->urls, url_str) == NULL) {
+ g_tree_insert (is_html ? part->html_urls : part->urls, url_str, new);
+ task->urls = g_list_prepend (task->urls, new);
+ }
}
}
}