diff options
-rw-r--r-- | src/libserver/html.c | 29 | ||||
-rw-r--r-- | src/libserver/html.h | 1 |
2 files changed, 19 insertions, 11 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index aac6af731..16d966c81 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -208,20 +208,21 @@ typedef enum #define CM_NEW (1 << 20) /* Elements that cannot be omitted. */ #define CM_OMITST (1 << 21) - +/* Unique elements */ +#define CM_UNIQUE (1 << 22) /* XML tag */ -#define FL_XML (1 << 22) +#define FL_XML (1 << 23) /* Closing tag */ -#define FL_CLOSING (1 << 23) +#define FL_CLOSING (1 << 24) /* Fully closed tag (e.g. <a attrs />) */ -#define FL_CLOSED (1 << 24) -#define FL_BROKEN (1 << 25) -#define FL_IGNORE (1 << 26) +#define FL_CLOSED (1 << 25) +#define FL_BROKEN (1 << 26) +#define FL_IGNORE (1 << 27) struct html_tag_def { gint id; const gchar *name; - gint flags; + guint flags; }; static struct html_tag_def tag_defs[] = { @@ -238,7 +239,7 @@ static struct html_tag_def tag_defs[] = { {Tag_BDO, "bdo", (CM_INLINE)}, {Tag_BIG, "big", (CM_INLINE)}, {Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)}, - {Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST)}, + {Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)}, {Tag_BR, "br", (CM_INLINE | CM_EMPTY)}, {Tag_BUTTON, "button", (CM_INLINE)}, {Tag_CAPTION, "caption", (CM_TABLE)}, @@ -266,9 +267,9 @@ static struct html_tag_def tag_defs[] = { {Tag_H4, "h4", (CM_BLOCK | CM_HEADING)}, {Tag_H5, "h5", (CM_BLOCK | CM_HEADING)}, {Tag_H6, "h6", (CM_BLOCK | CM_HEADING)}, - {Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST)}, + {Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)}, {Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)}, - {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST)}, + {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)}, {Tag_I, "i", (CM_INLINE)}, {Tag_IFRAME, "iframe", (CM_INLINE)}, {Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)}, @@ -320,7 +321,7 @@ static struct html_tag_def tag_defs[] = { {Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)}, {Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT)}, {Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)}, - {Tag_TITLE, "title", (CM_HEAD)}, + {Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)}, {Tag_TR, "tr", (CM_TABLE | CM_OPT)}, {Tag_TT, "tt", (CM_INLINE)}, {Tag_U, "u", (CM_INLINE)}, @@ -1596,6 +1597,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, } if (cur_tag->id != -1 && cur_tag->id < N_TAGS) { + if (cur_tag->flags & CM_UNIQUE) { + if (isset (hc->tags_seen, cur_tag->id)) { + /* Duplicate tag has been found */ + hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS; + } + } setbit (hc->tags_seen, cur_tag->id); } diff --git a/src/libserver/html.h b/src/libserver/html.h index 4b17b5000..5516594e4 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -13,6 +13,7 @@ #define RSPAMD_HTML_FLAG_XML (1 << 2) #define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3) #define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4) +#define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5) enum html_component_type { RSPAMD_HTML_COMPONENT_NAME = 0, |