source.dussan.org Git - rspamd.git/commitdiff
[Rework] Rework HTML tags content attachment
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 6 Jan 2020 17:08:02 +0000 (17:08 +0000)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 6 Jan 2020 17:08:02 +0000 (17:08 +0000)
src/libserver/html.c
src/libserver/html.h
src/lua/lua_html.c

index d9cddb468c4d4e0e6a761a2c9391fdc74e8f9419..502fa42faa51979c52f205692df0dff2a72a3f45 100644 (file)
@@ -815,8 +815,6 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc,
                                                return TRUE;
                                        }
                                }
-
-                               parent->content_length += tag->content_length;
                        }
 
                        if (hc->total_tags < max_tags) {
@@ -2774,13 +2772,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                p ++;
                        }
                        else {
-                               if (content_tag) {
-                                       if (content_tag->content == NULL) {
-                                               content_tag->content = c;
-                                       }
-
-                                       content_tag->content_length += p - c;
-                               }
                                state = tag_begin;
                        }
                        break;
@@ -2798,24 +2789,35 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                if (need_decode) {
                                                        goffset old_offset = dest->len;
 
+                                                       if (content_tag) {
+                                                               if (content_tag->content_offset == 0) {
+                                                                       content_tag->content_offset = old_offset;
+                                                               }
+                                                       }
+
                                                        g_byte_array_append (dest, c, (p - c));
 
                                                        len = rspamd_html_decode_entitles_inplace (
                                                                        dest->data + old_offset,
                                                                        p - c);
                                                        dest->len = dest->len + len - (p - c);
+
+                                                       if (content_tag) {
+                                                               content_tag->content_length += len;
+                                                       }
                                                }
                                                else {
                                                        len = p - c;
-                                                       g_byte_array_append (dest, c, len);
-                                               }
 
-                                               if (content_tag) {
-                                                       if (content_tag->content == NULL) {
-                                                               content_tag->content = c;
+                                                       if (content_tag) {
+                                                               if (content_tag->content_offset == 0) {
+                                                                       content_tag->content_offset = dest->len;
+                                                               }
+
+                                                               content_tag->content_length += len;
                                                        }
 
-                                                       content_tag->content_length += p - c + 1;
+                                                       g_byte_array_append (dest, c, len);
                                                }
                                        }
 
@@ -2828,6 +2830,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                if (dest->len > 0 &&
                                                                !g_ascii_isspace (dest->data[dest->len - 1])) {
                                                        g_byte_array_append (dest, " ", 1);
+                                                       if (content_tag) {
+                                                               content_tag->content_length ++;
+                                                       }
                                                }
                                                save_space = FALSE;
                                        }
@@ -2839,24 +2844,34 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                        if (need_decode) {
                                                goffset old_offset = dest->len;
 
+                                               if (content_tag) {
+                                                       if (content_tag->content_offset == 0) {
+                                                               content_tag->content_offset = dest->len;
+                                                       }
+                                               }
+
                                                g_byte_array_append (dest, c, (p - c));
                                                len = rspamd_html_decode_entitles_inplace (
                                                                dest->data + old_offset,
                                                                p - c);
                                                dest->len = dest->len + len - (p - c);
+
+                                               if (content_tag) {
+                                                       content_tag->content_length += len;
+                                               }
                                        }
                                        else {
                                                len = p - c;
-                                               g_byte_array_append (dest, c, len);
-                                       }
 
+                                               if (content_tag) {
+                                                       if (content_tag->content_offset == 0) {
+                                                               content_tag->content_offset = dest->len;
+                                                       }
 
-                                       if (content_tag) {
-                                               if (content_tag->content == NULL) {
-                                                       content_tag->content = c;
+                                                       content_tag->content_length += len;
                                                }
 
-                                               content_tag->content_length += p - c;
+                                               g_byte_array_append (dest, c, len);
                                        }
                                }
 
@@ -2876,10 +2891,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                continue;
                        }
 
-                       if (content_tag) {
-                               content_tag->content_length ++;
-                       }
-
                        p ++;
                        break;
 
@@ -2949,6 +2960,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) {
                                        if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
                                                g_byte_array_append (dest, "\r\n", 2);
+
+                                               if (content_tag) {
+                                                       content_tag->content_length += 2;
+                                               }
                                        }
                                        save_space = FALSE;
                                }
@@ -2958,6 +2973,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
                                                cur_tag->id == Tag_DIV)) {
                                        if (dest->len > 0 && dest->data[dest->len - 1] != '\n') {
                                                g_byte_array_append (dest, "\r\n", 2);
+
+                                               if (content_tag) {
+                                                       content_tag->content_length += 2;
+                                               }
                                        }
                                        save_space = FALSE;
                                }
@@ -3106,6 +3125,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
        }
 
        g_queue_free (styles_blocks);
+       hc->parsed = dest;
 
        return dest;
 }
index 86a266a6240b735b1b4c2eb6d1a7f4d3dcd58064..b369bd8907932de13e7cbb8ad5087117c42e1794 100644 (file)
@@ -107,9 +107,9 @@ struct html_block {
 struct html_tag {
        gint id;
        gint flags;
-       guint content_length;
        struct html_tag_component name;
-       const gchar *content;
+       guint content_length;
+       goffset content_offset;
        GQueue *params;
        gpointer extra; /** Additional data associated with tag (e.g. image) */
        GNode *parent;
@@ -127,6 +127,7 @@ struct html_content {
        guchar *tags_seen;
        GPtrArray *images;
        GPtrArray *blocks;
+       GByteArray *parsed;
 };
 
 /*
index 0af0457da39dc7375aa7377e2d4816a8ab736492..43c34797cfd6fe451c12409237a61eadca870395 100644 (file)
@@ -186,12 +186,17 @@ lua_check_html (lua_State * L, gint pos)
        return ud ? *((struct html_content **)ud) : NULL;
 }
 
-static struct html_tag *
+struct lua_html_tag {
+       struct html_content *html;
+       struct html_tag *tag;
+};
+
+static struct lua_html_tag *
 lua_check_html_tag (lua_State * L, gint pos)
 {
        void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html_tag}");
        luaL_argcheck (L, ud != NULL, pos, "'html_tag' expected");
-       return ud ? *((struct html_tag **)ud) : NULL;
+       return ud ? ((struct lua_html_tag *)ud) : NULL;
 }
 
 static gint
@@ -263,7 +268,7 @@ static void
 lua_html_push_image (lua_State *L, struct html_image *img)
 {
        LUA_TRACE_POINT;
-       struct html_tag **ptag;
+       struct lua_html_tag *ltag;
        struct rspamd_url **purl;
 
        lua_newtable (L);
@@ -298,8 +303,9 @@ lua_html_push_image (lua_State *L, struct html_image *img)
 
        if (img->tag) {
                lua_pushstring (L, "tag");
-               ptag = lua_newuserdata (L, sizeof (gpointer));
-               *ptag = img->tag;
+               ltag = lua_newuserdata (L, sizeof (struct lua_html_tag));
+               ltag->tag = img->tag;
+               ltag->html = NULL;
                rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
                lua_settable (L, -3);
        }
@@ -440,6 +446,7 @@ lua_html_get_blocks (lua_State *L)
 
 struct lua_html_traverse_ud {
        lua_State *L;
+       struct html_content *html;
        gint cbref;
        GHashTable *tags;
        gboolean any;
@@ -449,15 +456,17 @@ static gboolean
 lua_html_node_foreach_cb (GNode *n, gpointer d)
 {
        struct lua_html_traverse_ud *ud = d;
-       struct html_tag *tag = n->data, **ptag;
+       struct html_tag *tag = n->data;
+       struct lua_html_tag *ltag;
 
        if (tag && (ud->any || g_hash_table_lookup (ud->tags,
                        GSIZE_TO_POINTER (mum_hash64 (tag->id, 0))))) {
 
                lua_rawgeti (ud->L, LUA_REGISTRYINDEX, ud->cbref);
 
-               ptag = lua_newuserdata (ud->L, sizeof (*ptag));
-               *ptag = tag;
+               ltag = lua_newuserdata (ud->L, sizeof (*ltag));
+               ltag->tag = tag;
+               ltag->html = ud->html;
                rspamd_lua_setclass (ud->L, "rspamd{html_tag}", -1);
                lua_pushinteger (ud->L, tag->content_length);
 
@@ -489,6 +498,7 @@ lua_html_foreach_tag (lua_State *L)
 
        ud.tags = g_hash_table_new (g_direct_hash, g_direct_equal);
        ud.any = FALSE;
+       ud.html = hc;
 
        if (lua_type (L, 2) == LUA_TSTRING) {
                tagname = luaL_checkstring (L, 2);
@@ -556,11 +566,11 @@ static gint
 lua_html_tag_get_type (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_tag *tag = lua_check_html_tag (L, 1);
+       struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
        const gchar *tagname;
 
-       if (tag != NULL) {
-               tagname = rspamd_html_tag_by_id (tag->id);
+       if (ltag != NULL) {
+               tagname = rspamd_html_tag_by_id (ltag->tag->id);
 
                if (tagname) {
                        lua_pushstring (L, tagname);
@@ -580,15 +590,16 @@ static gint
 lua_html_tag_get_parent (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_tag *tag = lua_check_html_tag (L, 1), **ptag;
+       struct lua_html_tag *ltag = lua_check_html_tag (L, 1), *ptag;
        GNode *node;
 
-       if (tag != NULL) {
-               node = tag->parent;
+       if (ltag != NULL) {
+               node = ltag->tag->parent;
 
                if (node && node->data) {
-                       ptag = lua_newuserdata (L, sizeof (gpointer));
-                       *ptag = node->data;
+                       ptag = lua_newuserdata (L, sizeof (*ptag));
+                       ptag->tag = node->data;
+                       ptag->html = ltag->html;
                        rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
                }
                else {
@@ -606,33 +617,33 @@ static gint
 lua_html_tag_get_flags (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_tag *tag = lua_check_html_tag (L, 1);
+       struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
        gint i = 1;
 
-       if (tag) {
+       if (ltag->tag) {
                /* Push flags */
                lua_createtable (L, 4, 0);
-               if (tag->flags & FL_CLOSING) {
+               if (ltag->tag->flags & FL_CLOSING) {
                        lua_pushstring (L, "closing");
                        lua_rawseti (L, -2, i++);
                }
-               if (tag->flags & FL_HREF) {
+               if (ltag->tag->flags & FL_HREF) {
                        lua_pushstring (L, "href");
                        lua_rawseti (L, -2, i++);
                }
-               if (tag->flags & FL_CLOSED) {
+               if (ltag->tag->flags & FL_CLOSED) {
                        lua_pushstring (L, "closed");
                        lua_rawseti (L, -2, i++);
                }
-               if (tag->flags & FL_BROKEN) {
+               if (ltag->tag->flags & FL_BROKEN) {
                        lua_pushstring (L, "broken");
                        lua_rawseti (L, -2, i++);
                }
-               if (tag->flags & FL_XML) {
+               if (ltag->tag->flags & FL_XML) {
                        lua_pushstring (L, "xml");
                        lua_rawseti (L, -2, i++);
                }
-               if (tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) {
+               if (ltag->tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) {
                        lua_pushstring (L, "unbalanced");
                        lua_rawseti (L, -2, i++);
                }
@@ -648,15 +659,16 @@ static gint
 lua_html_tag_get_content (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_tag *tag = lua_check_html_tag (L, 1);
+       struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
        struct rspamd_lua_text *t;
 
-       if (tag) {
-               if (tag->content && tag->content_length) {
+       if (ltag) {
+               if (ltag->html && ltag->tag->content_offset && ltag->tag->content_length &&
+                               ltag->html->parsed->len >= ltag->tag->content_offset + ltag->tag->content_length) {
                        t = lua_newuserdata (L, sizeof (*t));
                        rspamd_lua_setclass (L, "rspamd{text}", -1);
-                       t->start = tag->content;
-                       t->len = tag->content_length;
+                       t->start = ltag->html->parsed->data + ltag->tag->content_offset;
+                       t->len = ltag->tag->content_length;
                        t->flags = 0;
                }
                else {
@@ -674,10 +686,10 @@ static gint
 lua_html_tag_get_content_length (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_tag *tag = lua_check_html_tag (L, 1);
+       struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
 
-       if (tag) {
-               lua_pushinteger (L, tag->content_length);
+       if (ltag) {
+               lua_pushinteger (L, ltag->tag->content_length);
        }
        else {
                return luaL_error (L, "invalid arguments");
@@ -690,24 +702,24 @@ static gint
 lua_html_tag_get_extra (lua_State *L)
 {
        LUA_TRACE_POINT;
-       struct html_tag *tag = lua_check_html_tag (L, 1);
+       struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
        struct html_image *img;
        struct rspamd_url **purl;
 
-       if (tag) {
-               if (tag->extra) {
-                       if ((tag->flags & FL_HREF) || tag->id == Tag_BASE) {
+       if (ltag) {
+               if (ltag->tag->extra) {
+                       if ((ltag->tag->flags & FL_HREF) || ltag->tag->id == Tag_BASE) {
                                /* For A that's URL */
                                purl = lua_newuserdata (L, sizeof (gpointer));
-                               *purl = tag->extra;
+                               *purl = ltag->tag->extra;
                                rspamd_lua_setclass (L, "rspamd{url}", -1);
                        }
-                       else if (tag->id == Tag_IMG) {
-                               img = tag->extra;
+                       else if (ltag->tag->id == Tag_IMG) {
+                               img = ltag->tag->extra;
                                lua_html_push_image (L, img);
                        }
-                       else if (tag->flags & FL_BLOCK) {
-                               lua_html_push_block (L, tag->extra);
+                       else if (ltag->tag->flags & FL_BLOCK) {
+                               lua_html_push_block (L, ltag->tag->extra);
                        }
                        else {
                                /* Unknown extra ? */