From 832e08e53f2364bda26afc6498ce968874d0fb33 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 5 Aug 2016 11:11:12 +0100 Subject: [PATCH] [Fix] Further fixes in HTML tags parsing --- src/libserver/html.c | 54 ++++++++++++++++++++++++++++----------- src/libserver/html.h | 19 ++++++++++++++ src/libserver/html_tags.h | 10 +------- 3 files changed, 59 insertions(+), 24 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index a65dd9a95..f897acd8e 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -539,23 +539,37 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level) return FALSE; } -gboolean -rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname) +gint +rspamd_html_tag_by_name (const gchar *name) { struct html_tag tag; struct html_tag_def *found; - g_assert (hc != NULL); - g_assert (hc->tags_seen != NULL); - - tag.name.start = tagname; - tag.name.len = strlen (tagname); + tag.name.start = name; + tag.name.len = strlen (name); found = bsearch (&tag, tag_defs, G_N_ELEMENTS (tag_defs), sizeof (tag_defs[0]), tag_find); if (found) { - return isset (hc->tags_seen, found->id); + return found->id; + } + + return -1; +} + +gboolean +rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname) +{ + gint id; + + g_assert (hc != NULL); + g_assert (hc->tags_seen != NULL); + + id = rspamd_html_tag_by_name (tagname); + + if (id != -1) { + return isset (hc->tags_seen, id); } return FALSE; @@ -775,8 +789,12 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc, else { parent = (*cur_level)->data; - if (parent && (parent->flags & FL_IGNORE)) { - tag->flags |= FL_IGNORE; + if (parent) { + if ((parent->flags & FL_IGNORE)) { + tag->flags |= FL_IGNORE; + } + + parent->content_length += tag->content_length; } g_node_append (*cur_level, nnode); @@ -1605,7 +1623,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, guint obrace = 0, ebrace = 0; GNode *cur_level = NULL; 
gint substate = 0, len, href_offset = -1; - struct html_tag *cur_tag = NULL; + struct html_tag *cur_tag = NULL, *content_tag = NULL; struct rspamd_url *url = NULL, *turl; struct rspamd_process_exception *ex; enum { @@ -1830,10 +1848,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, save_space = FALSE; } } - - if (cur_tag) { - cur_tag->content_length ++; - } } else { if (c != p) { @@ -1847,6 +1861,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, } g_byte_array_append (dest, c, len); + + if (content_tag) { + content_tag->content_length = len; + content_tag->content = c; + content_tag = NULL; + } } state = tag_begin; @@ -1922,6 +1942,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, setbit (hc->tags_seen, cur_tag->id); } + if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) { + content_tag = cur_tag; + } + /* Handle newlines */ if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) { if (dest->len > 0 && dest->data[dest->len - 1] != '\n') { diff --git a/src/libserver/html.h b/src/libserver/html.h index eb8ca8070..35af8f05c 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -77,10 +77,22 @@ struct html_block { gchar *class; }; +/* Public tags flags */ +/* XML tag */ +#define FL_XML (1 << 23) +/* Closing tag */ +#define FL_CLOSING (1 << 24) +/* Fully closed tag (e.g. <a attrs />) */ +#define FL_CLOSED (1 << 25) +#define FL_BROKEN (1 << 26) +#define FL_IGNORE (1 << 27) +#define FL_BLOCK (1 << 28) + struct html_tag { gint id; gint flags; gsize content_length; + const gchar *content; struct html_tag_component name; GQueue *params; gpointer extra; /** Additional data associated with tag (e.g. 
image) */ @@ -123,6 +135,13 @@ gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname); */ const gchar* rspamd_html_tag_by_id (gint id); +/** + * Returns HTML tag id by name + * @param name + * @return + */ +gint rspamd_html_tag_by_name (const gchar *name); + /** * Extract URL from HTML tag component and sets component elements if needed * @param pool diff --git a/src/libserver/html_tags.h b/src/libserver/html_tags.h index 1b82832f1..8809975e3 100644 --- a/src/libserver/html_tags.h +++ b/src/libserver/html_tags.h @@ -195,14 +195,6 @@ typedef enum #define CM_OMITST (1 << 21) /* Unique elements */ #define CM_UNIQUE (1 << 22) -/* XML tag */ -#define FL_XML (1 << 23) -/* Closing tag */ -#define FL_CLOSING (1 << 24) -/* Fully closed tag (e.g. <a attrs />) */ -#define FL_CLOSED (1 << 25) -#define FL_BROKEN (1 << 26) -#define FL_IGNORE (1 << 27) -#define FL_BLOCK (1 << 28) + #endif /* SRC_LIBSERVER_HTML_TAGS_H_ */ -- 2.39.5