source.dussan.org Git - rspamd.git/commitdiff
[Fix] Further fixes in HTML tags parsing
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 5 Aug 2016 10:11:12 +0000 (11:11 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 5 Aug 2016 10:11:12 +0000 (11:11 +0100)
src/libserver/html.c
src/libserver/html.h
src/libserver/html_tags.h

index a65dd9a956e24adc514dd1284b9054c478e1d203..f897acd8e9471b0123cb9895fa42561a20c86bb6 100644 (file)
@@ -539,23 +539,37 @@ rspamd_html_check_balance (GNode * node, GNode ** cur_level)
        return FALSE;
 }
 
-gboolean
-rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
+gint
+rspamd_html_tag_by_name (const gchar *name)
 {
        struct html_tag tag;
        struct html_tag_def *found;
 
-       g_assert (hc != NULL);
-       g_assert (hc->tags_seen != NULL);
-
-       tag.name.start = tagname;
-       tag.name.len = strlen (tagname);
+       tag.name.start = name;
+       tag.name.len = strlen (name);
 
        found = bsearch (&tag, tag_defs, G_N_ELEMENTS (tag_defs),
                        sizeof (tag_defs[0]), tag_find);
 
        if (found) {
-               return isset (hc->tags_seen, found->id);
+               return found->id;
+       }
+
+       return -1;
+}
+
+gboolean
+rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname)
+{
+       gint id;
+
+       g_assert (hc != NULL);
+       g_assert (hc->tags_seen != NULL);
+
+       id = rspamd_html_tag_by_name (tagname);
+
+       if (id != -1) {
+               return isset (hc->tags_seen, id);
        }
 
        return FALSE;
@@ -775,8 +789,12 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
                else {
                        parent = (*cur_level)->data;
 
-                       if (parent && (parent->flags & FL_IGNORE)) {
-                               tag->flags |= FL_IGNORE;
+                       if (parent) {
+                               if ((parent->flags & FL_IGNORE)) {
+                                       tag->flags |= FL_IGNORE;
+                               }
+
+                               parent->content_length += tag->content_length;
                        }
 
                        g_node_append (*cur_level, nnode);
@@ -1605,7 +1623,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
        guint obrace = 0, ebrace = 0;
        GNode *cur_level = NULL;
        gint substate = 0, len, href_offset = -1;
-       struct html_tag *cur_tag = NULL;
+       struct html_tag *cur_tag = NULL, *content_tag = NULL;
        struct rspamd_url *url = NULL, *turl;
        struct rspamd_process_exception *ex;
        enum {
@@ -1830,10 +1848,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                save_space = FALSE;
                                        }
                                }
-
-                               if (cur_tag) {
-                                       cur_tag->content_length ++;
-                               }
                        }
                        else {
                                if (c != p) {
@@ -1847,6 +1861,12 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                        }
 
                                        g_byte_array_append (dest, c, len);
+
+                                       if (content_tag) {
+                                               content_tag->content_length = len;
+                                               content_tag->content = c;
+                                               content_tag = NULL;
+                                       }
                                }
 
                                state = tag_begin;
@@ -1922,6 +1942,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                        setbit (hc->tags_seen, cur_tag->id);
                                }
 
+                               if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) {
+                                       content_tag = cur_tag;
+                               }
+
                                /* Handle newlines */
                                if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
                                        if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
index eb8ca80700cff90a55ae5d4ffb18e18abdc33833..35af8f05cc90370f258b2ff10b436367ce596119 100644 (file)
@@ -77,10 +77,22 @@ struct html_block {
        gchar *class;
 };
 
+/* Public tags flags */
+/* XML tag */
+#define FL_XML          (1 << 23)
+/* Closing tag */
+#define FL_CLOSING      (1 << 24)
+/* Fully closed tag (e.g. <a attrs />) */
+#define FL_CLOSED       (1 << 25)
+#define FL_BROKEN       (1 << 26)
+#define FL_IGNORE       (1 << 27)
+#define FL_BLOCK        (1 << 28)
+
 struct html_tag {
        gint id;
        gint flags;
        gsize content_length;
+       const gchar *content;
        struct html_tag_component name;
        GQueue *params;
        gpointer extra; /** Additional data associated with tag (e.g. image) */
@@ -123,6 +135,13 @@ gboolean rspamd_html_tag_seen (struct html_content *hc, const gchar *tagname);
  */
 const gchar* rspamd_html_tag_by_id (gint id);
 
+/**
+ * Returns HTML tag id by name
+ * @param name
+ * @return
+ */
+gint rspamd_html_tag_by_name (const gchar *name);
+
 /**
  * Extract URL from HTML tag component and sets component elements if needed
  * @param pool
index 1b82832f15d2489a5b244e7dfb854453628aea8b..8809975e30f1e4ec56b9c61dacf3071d69740848 100644 (file)
@@ -195,14 +195,6 @@ typedef enum
 #define CM_OMITST       (1 << 21)
 /* Unique elements */
 #define CM_UNIQUE       (1 << 22)
-/* XML tag */
-#define FL_XML          (1 << 23)
-/* Closing tag */
-#define FL_CLOSING      (1 << 24)
-/* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED       (1 << 25)
-#define FL_BROKEN       (1 << 26)
-#define FL_IGNORE       (1 << 27)
-#define FL_BLOCK        (1 << 28)
+
 
 #endif /* SRC_LIBSERVER_HTML_TAGS_H_ */