diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-15 17:31:00 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2015-07-15 17:31:00 +0100 |
commit | a76fd88266a59d7b59438a3f1985ccf2c33e7e0a (patch) | |
tree | 1fe34539fb80ecf54a249ccd2acabb51ee6717c1 /src/libserver/html.c | |
parent | f195719ee3385afa09c65b5b665bee70ef735b06 (diff) | |
download | rspamd-a76fd88266a59d7b59438a3f1985ccf2c33e7e0a.tar.gz rspamd-a76fd88266a59d7b59438a3f1985ccf2c33e7e0a.zip |
More rework.
Diffstat (limited to 'src/libserver/html.c')
-rw-r--r-- | src/libserver/html.c | 196 |
1 files changed, 59 insertions, 137 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index ef72f1397..fb450c823 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -210,12 +210,12 @@ typedef enum #define CM_OMITST (1 << 21) /* XML tag */ -#define FL_XML (1 << 0) +#define FL_XML (1 << 22) /* Closing tag */ -#define FL_CLOSING (1 << 1) +#define FL_CLOSING (1 << 23) /* Fully closed tag (e.g. <a attrs />) */ -#define FL_CLOSED (1 << 2) -#define FL_BROKEN (1 << 3) +#define FL_CLOSED (1 << 24) +#define FL_BROKEN (1 << 25) struct html_tag_def { gint id; @@ -648,7 +648,7 @@ tag_find (const void *skey, const void *elt) const struct html_tag *tag = skey; const struct html_tag_def *d = elt; - return g_ascii_strcnasecmp (tag->name.start, d->name, tag->name.len); + return g_ascii_strncasecmp (tag->name.start, d->name, tag->name.len); } static gint @@ -669,70 +669,10 @@ entity_cmp_num (const void *m1, const void *m2) return p1->code - p2->code; } -static GNode * -construct_html_node (rspamd_mempool_t * pool, gchar *text, gsize tag_len) -{ - struct html_node *html; - GNode *n = NULL; - struct html_tag key, *found; - gchar t; - - if (text == NULL || *text == '\0') { - return NULL; - } - - html = rspamd_mempool_alloc0 (pool, sizeof (struct html_node)); - - /* Check whether this tag is fully closed */ - if (*(text + tag_len - 1) == '/') { - html->flags |= FL_CLOSED; - } - - /* Check xml tag */ - if (*text == '?' && - g_ascii_strncasecmp (text + 1, "xml", sizeof ("xml") - 1) == 0) { - html->flags |= FL_XML; - html->tag = NULL; - } - else if (*text == '!') { - html->flags |= FL_SGML; - html->tag = NULL; - } - else { - if (*text == '/') { - html->flags |= FL_CLOSING; - text++; - } - - /* Find end of tag name */ - key.name = text; - while (*text && g_ascii_isalnum (*(++text))) ; - - t = *text; - *text = '\0'; - - /* Match tag id by tag name */ - if ((found = - bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), - sizeof (struct html_tag), tag_cmp)) != NULL) { - *text = t; - html->tag = found; - } - else { - *text = t; - return NULL; - } - } - - n = g_node_new (html); - - return n; -} - static gboolean -check_balance (GNode * node, GNode ** cur_level) +rspamd_html_check_balance (GNode * node, GNode ** cur_level) { - struct html_node *arg = node->data, *tmp; + struct html_tag *arg = node->data, *tmp; GNode *cur; if (arg->flags & FL_CLOSING) { @@ -740,8 +680,7 @@ check_balance (GNode * node, GNode ** cur_level) cur = node->parent; while (cur && cur->data) { tmp = cur->data; - if ((tmp->tag && - arg->tag) && tmp->tag->id == arg->tag->id && + if (tmp->id == arg->id && (tmp->flags & FL_CLOSED) == 0) { tmp->flags |= FL_CLOSED; /* Destroy current node as we find corresponding parent node */ @@ -763,12 +702,6 @@ check_balance (GNode * node, GNode ** cur_level) struct html_tag * get_tag_by_name (const gchar *name) { - struct html_tag key; - - key.name = name; - - return bsearch (&key, tag_defs, G_N_ELEMENTS (tag_defs), - sizeof (struct html_tag), tag_cmp); } /* Decode HTML entitles in text */ @@ -1082,76 +1015,53 @@ add_html_node (struct rspamd_task *task, gsize remain, GNode ** cur_level) { - GNode *new; - struct html_node *data; - - /* First call of this function */ - if (part->html_nodes == NULL) { - /* Insert root node */ - new = g_node_new (NULL); - *cur_level = new; - part->html_nodes = new; +} + +static gboolean +rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc, + struct html_tag *tag, GNode **cur_level) +{ + GNode *nnode; + + if (hc->html_tags == NULL) { + nnode = g_node_new (NULL); + *cur_level = nnode; + hc->html_tags = nnode; rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_node_destroy, - part->html_nodes); - /* Call once again with root node */ - return add_html_node (task, - pool, - part, - tag_text, - tag_len, - remain, - cur_level); + nnode); } - else { - new = construct_html_node (pool, tag_text, tag_len); - if (new == NULL) { - debug_task ("cannot construct HTML node for text '%*s'", - tag_len, - tag_text); + + nnode = g_node_new (tag); + + if (tag->flags & FL_CLOSING) { + if (!*cur_level) { + debug_task ("bad parent node"); return FALSE; } - data = new->data; - if (data->tag && - (data->tag->id == Tag_A || - data->tag->id == Tag_IMG) && ((data->flags & FL_CLOSING) == 0)) { - parse_tag_url (task, part, data->tag->id, tag_text, tag_len, - remain); - } + g_node_append (*cur_level, nnode); - if (data->flags & FL_CLOSING) { - if (!*cur_level) { - debug_task ("bad parent node"); - return FALSE; - } - g_node_append (*cur_level, new); - if (!check_balance (new, cur_level)) { - debug_task ( + if (!rspamd_html_check_balance (nnode, cur_level)) { + debug_task ( "mark part as unbalanced as it has not pairable closing tags"); - part->flags &= ~RSPAMD_MIME_PART_FLAG_BALANCED; - } + hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; } - else if ((data->flags & (FL_XML|FL_SGML)) == 0) { + } + else { + g_node_append (*cur_level, nnode); - g_node_append (*cur_level, new); - if ((data->flags & FL_CLOSED) == 0) { - *cur_level = new; - } - /* Skip some tags */ - if (data->tag && (data->tag->id == Tag_STYLE || - data->tag->id == Tag_SCRIPT || - data->tag->id == Tag_OBJECT || - data->tag->id == Tag_TITLE)) { - return FALSE; - } + if ((tag->flags & FL_CLOSED) == 0) { + *cur_level = nnode; } - else { - /* Destroy ignored nodes */ - g_node_destroy (new); + + if (tag->flags & (CM_HEAD|CM_EMPTY|CM_UNKNOWN|FL_BROKEN)) { + return FALSE; } + + return TRUE; } - return TRUE; + return FALSE; } static void @@ -1421,11 +1331,12 @@ gboolean rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, GByteArray *in) { - const guchar *p, *c, *end, t, *tag_start = NULL; - guchar *savep = NULL; + const guchar *p, *c, *end, *tag_start = NULL; + guchar *savep = NULL, t; gboolean closing = FALSE; GByteArray *dest; guint obrace = 0, ebrace = 0; + GNode *cur_level = NULL; gint substate; struct html_tag *cur_tag; enum { @@ -1599,7 +1510,7 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, break; case content_ignore: - if (p != '<') { + if (t != '<') { p ++; } else { @@ -1608,7 +1519,7 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, break; case content_write: - if (p != '<') { + if (t != '<') { p ++; } else { @@ -1624,6 +1535,17 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, rspamd_html_parse_tag_content (pool, hc, cur_tag, p, &substate, &savep); if (t == '>') { + if (closing) { + cur_tag->flags |= FL_CLOSING; + + if (cur_tag->flags & FL_CLOSED) { + /* Bad mix of closed and closing */ + hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + } + + closing = FALSE; + } + state = tag_end; continue; } @@ -1636,7 +1558,7 @@ rspamd_html_process_part (rspamd_mempool_t *pool, struct html_content *hc, savep = NULL; if (cur_tag != NULL) { - if (rspamd_html_process_tag (pool, hc, cur_tag)) { + if (rspamd_html_process_tag (pool, hc, cur_tag, &cur_level)) { state = content_write; } else { |