]> source.dussan.org Git - rspamd.git/commitdiff
* Decode HTML entities in URLs while extracting URL values from HTML tags
authorVsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 20 Jul 2009 15:57:24 +0000 (19:57 +0400)
committerVsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 20 Jul 2009 15:57:24 +0000 (19:57 +0400)
  NOTE: works only for ASCII symbols (numeric entities such as &#47;)

src/html.c

index 7c2cf4409ca847460d527ea524cbb316be019779..84db67d80beb2586b4a71c52cf77b9c40bcf33c6 100644 (file)
@@ -259,6 +259,51 @@ get_tag_by_name (const char *name)
        return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp);
 }
 
+/*
+ * Decode decimal numeric HTML entities ("&#NN;") in text, in place.
+ *
+ * s is a NUL-terminated, writable string; it is rewritten in its own buffer
+ * (the result is never longer than the input) and NUL-terminated again.
+ * Codes 1..127 become the corresponding ASCII character. A well-formed
+ * reference with any other code (0 would truncate the string, >127 is not
+ * ASCII) is logged and replaced by 'U'. Anything that is not exactly
+ * "&#" digits ";" (named or hex entities, missing ';') is copied unchanged.
+ */
+static void
+decode_entitles (char *s)
+{
+       char *t = s;                    /* t - tortoise, write position */
+       char *h = s;                    /* h - hare, read position      */
+       char *end_ptr;
+       unsigned long val;
+
+       while (*h) {
+               /* Require a digit right after "&#": strtoul alone would also accept spaces and signs */
+               if (*h == '&' && *(h + 1) == '#' && *(h + 2) >= '0' && *(h + 2) <= '9') {
+                       /* Parse from the hare: the tortoise lags behind after the first decoded entity */
+                       val = strtoul (h + 2, &end_ptr, 10);
+                       if (*end_ptr == ';') {
+                               if (val == 0 || val > 127) {
+                                       msg_info ("decode_entitles: invalid entitle code, cannot convert, %lu", val);
+                                       *t = 'U';
+                               }
+                               else {
+                                       *t = (char)val;
+                               }
+                               t ++;
+                               /* Skip the whole reference including ';' */
+                               h = end_ptr + 1;
+                               continue;
+                       }
+               }
+               /* Ordinary character (or malformed reference): copy as is */
+               *t = *h;
+               t ++;
+               h ++;
+       }
+       *t = '\0';
+}
+
 static void
 parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t id, char *tag_text)
 {
@@ -331,6 +376,7 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                
                url_text = memory_pool_alloc (task->task_pool, len + 1);
                g_strlcpy (url_text, c, len + 1);
+               decode_entitles (url_text);
                url = memory_pool_alloc (task->task_pool, sizeof (struct uri));
                rc = parse_uri (url, url_text, task->task_pool);