source.dussan.org Git - rspamd.git/commitdiff
Implement unique HTML tags.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 22 Jul 2015 15:36:35 +0000 (16:36 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 22 Jul 2015 15:36:35 +0000 (16:36 +0100)
src/libserver/html.c
src/libserver/html.h

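diff --git a/src/libserver/html.c b/src/libserver/html.c
index aac6af731e8379f00acde2015f3234010d0991d9..16d966c816e2331af6525185112e36d6d2fccf94 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c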
@@ -208,20 +208,21 @@ typedef enum
 #define CM_NEW          (1 << 20)
 /* Elements that cannot be omitted. */
 #define CM_OMITST       (1 << 21)
-
+/* Unique elements */
+#define CM_UNIQUE       (1 << 22)
 /* XML tag */
-#define FL_XML          (1 << 22)
+#define FL_XML          (1 << 23)
 /* Closing tag */
-#define FL_CLOSING      (1 << 23)
+#define FL_CLOSING      (1 << 24)
 /* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED       (1 << 24)
-#define FL_BROKEN       (1 << 25)
-#define FL_IGNORE       (1 << 26)
+#define FL_CLOSED       (1 << 25)
+#define FL_BROKEN       (1 << 26)
+#define FL_IGNORE       (1 << 27)
 
 struct html_tag_def {
        gint id;
        const gchar *name;
-       gint flags;
+       guint flags;
 };
 
 static struct html_tag_def tag_defs[] = {
@@ -238,7 +239,7 @@ static struct html_tag_def tag_defs[] = {
        {Tag_BDO, "bdo", (CM_INLINE)},
        {Tag_BIG, "big", (CM_INLINE)},
        {Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)},
-       {Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST)},
+       {Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)},
        {Tag_BR, "br", (CM_INLINE | CM_EMPTY)},
        {Tag_BUTTON, "button", (CM_INLINE)},
        {Tag_CAPTION, "caption", (CM_TABLE)},
@@ -266,9 +267,9 @@ static struct html_tag_def tag_defs[] = {
        {Tag_H4, "h4", (CM_BLOCK | CM_HEADING)},
        {Tag_H5, "h5", (CM_BLOCK | CM_HEADING)},
        {Tag_H6, "h6", (CM_BLOCK | CM_HEADING)},
-       {Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST)},
+       {Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)},
        {Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)},
-       {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST)},
+       {Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)},
        {Tag_I, "i", (CM_INLINE)},
        {Tag_IFRAME, "iframe", (CM_INLINE)},
        {Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)},
@@ -320,7 +321,7 @@ static struct html_tag_def tag_defs[] = {
        {Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)},
        {Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT)},
        {Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)},
-       {Tag_TITLE, "title", (CM_HEAD)},
+       {Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)},
        {Tag_TR, "tr", (CM_TABLE | CM_OPT)},
        {Tag_TT, "tt", (CM_INLINE)},
        {Tag_U, "u", (CM_INLINE)},
@@ -1596,6 +1597,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                }
 
                                if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
+                                       if (cur_tag->flags & CM_UNIQUE) {
+                                               if (isset (hc->tags_seen, cur_tag->id)) {
+                                                       /* Duplicate tag has been found */
+                                                       hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS;
+                                               }
+                                       }
                                        setbit (hc->tags_seen, cur_tag->id);
                                }
 
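diff --git a/src/libserver/html.h b/src/libserver/html.h
index 4b17b5000f5fd9a57693118c48bc6fa931a0328e..5516594e44ed08878c2e9700f6aae1fc4733c800 100644
--- a/src/libserver/html.h
+++ b/src/libserver/html.h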
@@ -13,6 +13,7 @@
 #define RSPAMD_HTML_FLAG_XML (1 << 2)
 #define RSPAMD_HTML_FLAG_UNBALANCED (1 << 3)
 #define RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS (1 << 4)
+#define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
 
 enum html_component_type {
        RSPAMD_HTML_COMPONENT_NAME = 0,