diff options
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- | src/libserver/html.c | 249 |
1 files changed, 98 insertions, 151 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index 0b7b584f5..028c54f6c 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -23,15 +23,15 @@ */ #include "config.h" -#include "html.h" +#include "util.h" #include "main.h" #include "message.h" +#include "html.h" #include "url.h" -#include "util.h" -static sig_atomic_t tags_sorted = 0; +static sig_atomic_t tags_sorted = 0; -static struct html_tag tag_defs[] = { +static struct html_tag tag_defs[] = { /* W3C defined elements */ {Tag_A, "a", (CM_INLINE)}, {Tag_ABBR, "abbr", (CM_INLINE)}, @@ -93,8 +93,7 @@ static struct html_tag tag_defs[] = { {Tag_META, "meta", (CM_HEAD | CM_EMPTY)}, {Tag_NOFRAMES, "noframes", (CM_BLOCK | CM_FRAMES)}, {Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_MIXED)}, - {Tag_OBJECT, "object", - (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)}, + {Tag_OBJECT, "object", (CM_OBJECT | CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)}, {Tag_OL, "ol", (CM_BLOCK)}, {Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)}, {Tag_OPTION, "option", (CM_FIELD | CM_OPT)}, @@ -157,21 +156,21 @@ static struct html_tag tag_defs[] = { {Tag_WBR, "wbr", (CM_INLINE | CM_EMPTY)}, }; -static sig_atomic_t entities_sorted = 0; +static sig_atomic_t entities_sorted = 0; struct _entity; -typedef struct _entity entity; +typedef struct _entity entity; struct _entity { - gchar *name; - uint code; - gchar *replacement; + gchar *name; + uint code; + gchar *replacement; }; -static entity entities_defs[] = { +static entity entities_defs[] = { /* - ** Markup pre-defined character entities - */ + ** Markup pre-defined character entities + */ {"quot", 34, "\""}, {"amp", 38, "&"}, {"apos", 39, "'"}, @@ -179,8 +178,8 @@ static entity entities_defs[] = { {"gt", 62, ">"}, /* - ** Latin-1 character entities - */ + ** Latin-1 character entities + */ {"nbsp", 160, " "}, {"iexcl", 161, "!"}, {"cent", 162, "cent"}, @@ -279,8 +278,8 @@ static entity entities_defs[] = { {"yuml", 255, "y"}, /* - ** Extended Entities defined in HTML 4: Symbols - */ + ** Extended Entities defined in HTML 4: Symbols + */ {"fnof", 402, "f"}, {"Alpha", 913, "alpha"}, {"Beta", 914, "beta"}, @@ -407,8 +406,8 @@ static entity entities_defs[] = { {"diams", 9830, NULL}, /* - ** Extended Entities defined in HTML 4: Special (less Markup at top) - */ + ** Extended Entities defined in HTML 4: Special (less Markup at top) + */ {"OElig", 338, NULL}, {"oelig", 339, NULL}, {"Scaron", 352, NULL}, @@ -444,8 +443,8 @@ static entity entities_defs_num[ (G_N_ELEMENTS (entities_defs)) ]; static gint tag_cmp (const void *m1, const void *m2) { - const struct html_tag *p1 = m1; - const struct html_tag *p2 = m2; + const struct html_tag *p1 = m1; + const struct html_tag *p2 = m2; return g_ascii_strcasecmp (p1->name, p2->name); } @@ -453,8 +452,8 @@ tag_cmp (const void *m1, const void *m2) static gint entity_cmp (const void *m1, const void *m2) { - const entity *p1 = m1; - const entity *p2 = m2; + const entity *p1 = m1; + const entity *p2 = m2; return g_ascii_strcasecmp (p1->name, p2->name); } @@ -462,19 +461,19 @@ entity_cmp (const void *m1, const void *m2) static gint entity_cmp_num (const void *m1, const void *m2) { - const entity *p1 = m1; - const entity *p2 = m2; + const entity *p1 = m1; + const entity *p2 = m2; return p1->code - p2->code; } -static GNode * +static GNode * construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len) { - struct html_node *html; - GNode *n = NULL; - struct html_tag key, *found; - gchar t; + struct html_node *html; + GNode *n = NULL; + struct html_tag key, *found; + gchar t; if (text == NULL || *text == '\0') { return NULL; @@ -488,8 +487,7 @@ construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len) } /* Check xml tag */ - if (*text == '?' && - g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) { + if (*text == '?' && g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) { html->flags |= FL_XML; html->tag = NULL; } @@ -501,15 +499,13 @@ construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len) /* Find end of tag name */ key.name = text; - while (*text && g_ascii_isalnum (*(++text))) ; + while (*text && g_ascii_isalnum (*(++text))); t = *text; *text = '\0'; /* Match tag id by tag name */ - if ((found = - bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), - sizeof (struct html_tag), tag_cmp)) != NULL) { + if ((found = bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp)) != NULL) { *text = t; html->tag = found; } @@ -524,20 +520,18 @@ construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len) return n; } -static gboolean +static gboolean check_balance (GNode * node, GNode ** cur_level) { - struct html_node *arg = node->data, *tmp; - GNode *cur; + struct html_node *arg = node->data, *tmp; + GNode *cur; if (arg->flags & FL_CLOSING) { /* First of all check whether this tag is closing tag for parent node */ cur = node->parent; while (cur && cur->data) { tmp = cur->data; - if ((tmp->tag && - arg->tag) && tmp->tag->id == arg->tag->id && - (tmp->flags & FL_CLOSED) == 0) { + if ((tmp->tag && arg->tag) && tmp->tag->id == arg->tag->id && (tmp->flags & FL_CLOSED) == 0) { tmp->flags |= FL_CLOSED; /* Destroy current node as we find corresponding parent node */ g_node_destroy (node); @@ -555,28 +549,27 @@ check_balance (GNode * node, GNode ** cur_level) return FALSE; } -struct html_tag * +struct html_tag * get_tag_by_name (const gchar *name) { - struct html_tag key; + struct html_tag key; key.name = name; - return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), - sizeof (struct html_tag), tag_cmp); + return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp); } /* Decode HTML entitles in text */ void decode_entitles (gchar *s, guint * len) { - guint l, rep_len; - gchar *t = s; /* t - tortoise */ - gchar *h = s; /* h - hare */ - gchar *e = s; - gchar *end_ptr; - gint state = 0, val, base; - entity *found, key; + guint l, rep_len; + gchar *t = s; /* t - tortoise */ + gchar *h = s; /* h - hare */ + gchar *e = s; + gchar *end_ptr; + gint state = 0, val, base; + entity *found, key; if (len == NULL || *len == 0) { l = strlen (s); @@ -587,7 +580,7 @@ decode_entitles (gchar *s, guint * len) while (h - s < (gint)l) { switch (state) { - /* Out of entitle */ + /* Out of entitle */ case 0: if (*h == '&') { state = 1; @@ -608,10 +601,7 @@ decode_entitles (gchar *s, guint * len) key.name = e + 1; *h = '\0'; - if (*(e + 1) != '#' && - (found = - bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), - sizeof (entity), entity_cmp)) != NULL) { + if (*(e + 1) != '#' && (found = bsearch (&key, entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp)) != NULL) { if (found->replacement) { rep_len = strlen (found->replacement); memcpy (t, found->replacement, rep_len); @@ -641,10 +631,7 @@ decode_entitles (gchar *s, guint * len) else { /* Search for a replacement */ key.code = val; - found = - bsearch (&key, entities_defs_num, G_N_ELEMENTS ( - entities_defs), sizeof (entity), - entity_cmp_num); + found = bsearch (&key, entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num); if (found) { if (found->replacement) { rep_len = strlen (found->replacement); @@ -669,19 +656,15 @@ decode_entitles (gchar *s, guint * len) } static void -check_phishing (struct rspamd_task *task, - struct uri *href_url, - const gchar *url_text, - gsize remain, - tag_id_t id) +check_phishing (struct rspamd_task *task, struct uri *href_url, const gchar *url_text, gsize remain, tag_id_t id) { - struct uri *new; - gchar *url_str; - const gchar *p, *c; - gchar tagbuf[128]; - struct html_tag *tag; - gsize len = 0; - gint rc; + struct uri *new; + gchar *url_str; + const gchar *p, *c; + gchar tagbuf[128]; + struct html_tag *tag; + gsize len = 0; + gint rc; p = url_text; while (len < remain) { @@ -695,8 +678,8 @@ check_phishing (struct rspamd_task *task, } while (len < remain) { if (!g_ascii_isspace (*p) && *p != '>') { - p++; - len++; + p ++; + len ++; } else { break; @@ -710,11 +693,11 @@ check_phishing (struct rspamd_task *task, else if (tag->id == Tag_IMG) { /* We should ignore IMG tag here */ while (len < remain && *p != '>' && *p != '<') { - p++; - len++; + p ++; + len ++; } if (*p == '>' && len < remain) { - p++; + p ++; } remain -= p - url_text; @@ -724,21 +707,19 @@ check_phishing (struct rspamd_task *task, } } } - len++; - p++; + len ++; + p ++; } - if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str, - TRUE) && url_str != NULL) { + if (url_try_text (task->task_pool, url_text, len, NULL, NULL, &url_str, TRUE) && url_str != NULL) { new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct uri)); if (new != NULL) { g_strstrip (url_str); rc = parse_uri (new, url_str, task->task_pool); - if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == - URI_ERRNO_NO_HOST_SLASH) { + if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) { if (g_ascii_strncasecmp (href_url->host, new->host, - MAX (href_url->hostlen, new->hostlen)) != 0) { + MAX (href_url->hostlen, new->hostlen)) != 0) { /* Special check for urls beginning with 'www' */ if (new->hostlen > 4 && href_url->hostlen > 4) { p = new->host; @@ -781,9 +762,7 @@ check_phishing (struct rspamd_task *task, } } else { - msg_info ("extract of url '%s' failed: %s", - url_str, - url_strerror (rc)); + msg_info ("extract of url '%s' failed: %s", url_str, url_strerror (rc)); } } } @@ -791,17 +770,13 @@ check_phishing (struct rspamd_task *task, } static void -parse_tag_url (struct rspamd_task *task, - struct mime_text_part *part, - tag_id_t id, - gchar *tag_text, - gsize tag_len, - gsize remain) +parse_tag_url (struct rspamd_task *task, struct mime_text_part *part, tag_id_t id, + gchar *tag_text, gsize tag_len, gsize remain) { - gchar *c = NULL, *p, *url_text; - gint len, rc; - struct uri *url; - gboolean got_single_quote = FALSE, got_double_quote = FALSE; + gchar *c = NULL, *p, *url_text; + gint len, rc; + struct uri *url; + gboolean got_single_quote = FALSE, got_double_quote = FALSE; /* For A tags search for href= and for IMG tags search for src= */ if (id == Tag_A) { @@ -839,8 +814,7 @@ parse_tag_url (struct rspamd_task *task, len++; } } - else if (g_ascii_isspace (*p) || *p == '>' || - (*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') { + else if (g_ascii_isspace (*p) || *p == '>' || (*p == '/' && *(p + 1) == '>') || *p == '\r' || *p == '\n') { break; } else { @@ -869,27 +843,22 @@ parse_tag_url (struct rspamd_task *task, rspamd_strlcpy (url_text, c, len + 1); decode_entitles (url_text, NULL); - if (g_ascii_strncasecmp (url_text, "http://", - sizeof ("http://") - 1) != 0 && - g_ascii_strncasecmp (url_text, "www", - sizeof ("www") - 1) != 0 && - g_ascii_strncasecmp (url_text, "ftp://", - sizeof ("ftp://") - 1) != 0 && - g_ascii_strncasecmp (url_text, "mailto:", - sizeof ("mailto:") - 1) != 0) { + if (g_ascii_strncasecmp (url_text, "http://", sizeof ("http://") - 1) != 0 && + g_ascii_strncasecmp (url_text, "www", sizeof ("www") - 1) != 0 && + g_ascii_strncasecmp (url_text, "ftp://", sizeof ("ftp://") - 1) != 0 && + g_ascii_strncasecmp (url_text, "mailto:", sizeof ("mailto:") - 1) != 0) { return; } url = rspamd_mempool_alloc (task->task_pool, sizeof (struct uri)); rc = parse_uri (url, url_text, task->task_pool); - if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != - 0) { + if (rc != URI_ERRNO_EMPTY && rc != URI_ERRNO_NO_HOST && url->hostlen != 0) { /* * Check for phishing */ if ((p = strchr (c, '>')) != NULL && id == Tag_A) { - p++; + p ++; check_phishing (task, url, p, remain - (p - tag_text), id); } if (g_tree_lookup (task->urls, url) == NULL) { @@ -900,28 +869,20 @@ parse_tag_url (struct rspamd_task *task, } gboolean -add_html_node (struct rspamd_task *task, - rspamd_mempool_t * pool, - struct mime_text_part *part, - gchar *tag_text, - gsize tag_len, - gsize remain, - GNode ** cur_level) +add_html_node (struct rspamd_task *task, rspamd_mempool_t * pool, struct mime_text_part *part, + gchar *tag_text, gsize tag_len, gsize remain, GNode ** cur_level) { - GNode *new; - struct html_node *data; + GNode *new; + struct html_node *data; if (!tags_sorted) { - qsort (tag_defs, G_N_ELEMENTS ( - tag_defs), sizeof (struct html_tag), tag_cmp); + qsort (tag_defs, G_N_ELEMENTS (tag_defs), sizeof (struct html_tag), tag_cmp); tags_sorted = 1; } if (!entities_sorted) { - qsort (entities_defs, G_N_ELEMENTS ( - entities_defs), sizeof (entity), entity_cmp); + qsort (entities_defs, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp); memcpy (entities_defs_num, entities_defs, sizeof (entities_defs)); - qsort (entities_defs_num, G_N_ELEMENTS ( - entities_defs), sizeof (entity), entity_cmp_num); + qsort (entities_defs_num, G_N_ELEMENTS (entities_defs), sizeof (entity), entity_cmp_num); entities_sorted = 1; } @@ -931,32 +892,19 @@ add_html_node (struct rspamd_task *task, new = g_node_new (NULL); *cur_level = new; part->html_nodes = new; - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t) g_node_destroy, - part->html_nodes); + rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_node_destroy, part->html_nodes); /* Call once again with root node */ - return add_html_node (task, - pool, - part, - tag_text, - tag_len, - remain, - cur_level); + return add_html_node (task, pool, part, tag_text, tag_len, remain, cur_level); } else { new = construct_html_node (pool, tag_text, tag_len); if (new == NULL) { - debug_task ("cannot construct HTML node for text '%*s'", - tag_len, - tag_text); + debug_task ("cannot construct HTML node for text '%*s'", tag_len, tag_text); return FALSE; } data = new->data; - if (data->tag && - (data->tag->id == Tag_A || - data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) { - parse_tag_url (task, part, data->tag->id, tag_text, tag_len, - remain); + if (data->tag && (data->tag->id == Tag_A || data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) { + parse_tag_url (task, part, data->tag->id, tag_text, tag_len, remain); } if (data->flags & FL_CLOSING) { @@ -966,8 +914,7 @@ add_html_node (struct rspamd_task *task, } g_node_append (*cur_level, new); if (!check_balance (new, cur_level)) { - debug_task ( - "mark part as unbalanced as it has not pairable closing tags"); + debug_task ("mark part as unbalanced as it has not pairable closing tags"); part->is_balanced = FALSE; } } @@ -979,9 +926,9 @@ add_html_node (struct rspamd_task *task, } /* Skip some tags */ if (data->tag && (data->tag->id == Tag_STYLE || - data->tag->id == Tag_SCRIPT || - data->tag->id == Tag_OBJECT || - data->tag->id == Tag_TITLE)) { + data->tag->id == Tag_SCRIPT || + data->tag->id == Tag_OBJECT || + data->tag->id == Tag_TITLE)) { return FALSE; } } |