/*
 * Origin: rspamd src/html.c, commit 3499fefcd2a33d80d5f9400a161106a2bd1ff72f
 * (Vsevolod Stakhov, Mon, 20 Jul 2009): "Decode html entitles in urls while
 * extracting urls values from html tags". NOTE: works only for ascii symbols.
 *
 * In that commit the function below is inserted after get_tag_by_name (), and
 * parse_tag_url () calls decode_entitles (url_text) on the freshly copied URL
 * text right before handing it to parse_uri ().
 */

/*
 * Decode decimal HTML numeric character references ("&#NN;") in place.
 *
 * s: NUL-terminated, writable string. The decoded text is never longer than
 *    the input, so it is written back into the same buffer and re-terminated.
 *
 * Rules:
 *  - "&#" + decimal digits + ";" with a value in 1..127 becomes that single
 *    ASCII character;
 *  - any other "&#...;" sequence (no digits, non-digit characters, value 0 or
 *    above 127) is logged through msg_info and replaced by a single 'U';
 *  - "&#" with no terminating ';' anywhere after it is not an entity; the
 *    remaining text is kept verbatim;
 *  - everything else is copied unchanged.
 */
static void
decode_entitles (char *s)
{
	char *t = s;	/* t - tortoise: next position to write */
	char *h = s;	/* h - hare: next position to read */
	char *semi, *p;
	unsigned long val;
	int ndigits;

	while (*h) {
		/* Out of entitle: plain copy */
		if (*h != '&' || *(h + 1) != '#') {
			*t++ = *h++;
			continue;
		}

		/* Locate the terminator of this entitle */
		semi = h + 2;
		while (*semi != '\0' && *semi != ';') {
			semi ++;
		}
		if (*semi == '\0') {
			/* Unterminated "&#...": keep the tail as is instead of dropping it */
			while (*h) {
				*t++ = *h++;
			}
			break;
		}

		/*
		 * Parse the digits from the read cursor. The write cursor lags behind
		 * once a previous entitle has been decoded, so it must not be used as
		 * the source of the number.
		 */
		val = 0;
		ndigits = 0;
		for (p = h + 2; *p >= '0' && *p <= '9'; p ++) {
			/* Stop accumulating once out of range, so val can never overflow */
			if (val <= 127) {
				val = val * 10 + (unsigned long)(*p - '0');
			}
			ndigits ++;
		}

		if (ndigits == 0 || p != semi || val == 0 || val > 127) {
			/* val == 0 is rejected too: it would cut the string short */
			msg_info ("decode_entitles: invalid entitle code, cannot convert, %d", (int)val);
			*t = 'U';
		}
		else {
			*t = (char)val;
		}
		t ++;
		h = semi + 1;
	}
	*t = '\0';
}