From 58fafcd653930ec374aba9dc6a1876052d9a1881 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 6 Jan 2020 17:08:02 +0000 Subject: [PATCH] [Rework] Rework HTML tags content attachment --- src/libserver/html.c | 70 +++++++++++++++++++++------------ src/libserver/html.h | 5 ++- src/lua/lua_html.c | 94 +++++++++++++++++++++++++------------------- 3 files changed, 101 insertions(+), 68 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index d9cddb468..502fa42fa 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -815,8 +815,6 @@ rspamd_html_process_tag (rspamd_mempool_t *pool, struct html_content *hc, return TRUE; } } - - parent->content_length += tag->content_length; } if (hc->total_tags < max_tags) { @@ -2774,13 +2772,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, p ++; } else { - if (content_tag) { - if (content_tag->content == NULL) { - content_tag->content = c; - } - - content_tag->content_length += p - c; - } state = tag_begin; } break; @@ -2798,24 +2789,35 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, if (need_decode) { goffset old_offset = dest->len; + if (content_tag) { + if (content_tag->content_offset == 0) { + content_tag->content_offset = old_offset; + } + } + g_byte_array_append (dest, c, (p - c)); len = rspamd_html_decode_entitles_inplace ( dest->data + old_offset, p - c); dest->len = dest->len + len - (p - c); + + if (content_tag) { + content_tag->content_length += len; + } } else { len = p - c; - g_byte_array_append (dest, c, len); - } - if (content_tag) { - if (content_tag->content == NULL) { - content_tag->content = c; + if (content_tag) { + if (content_tag->content_offset == 0) { + content_tag->content_offset = dest->len; + } + + content_tag->content_length += len; } - content_tag->content_length += p - c + 1; + g_byte_array_append (dest, c, len); } } @@ -2828,6 +2830,9 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, if (dest->len > 0 && !g_ascii_isspace (dest->data[dest->len - 1])) { g_byte_array_append (dest, " ", 1); + if (content_tag) { + content_tag->content_length ++; + } } save_space = FALSE; } @@ -2839,24 +2844,34 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, if (need_decode) { goffset old_offset = dest->len; + if (content_tag) { + if (content_tag->content_offset == 0) { + content_tag->content_offset = dest->len; + } + } + g_byte_array_append (dest, c, (p - c)); len = rspamd_html_decode_entitles_inplace ( dest->data + old_offset, p - c); dest->len = dest->len + len - (p - c); + + if (content_tag) { + content_tag->content_length += len; + } } else { len = p - c; - g_byte_array_append (dest, c, len); - } + if (content_tag) { + if (content_tag->content_offset == 0) { + content_tag->content_offset = dest->len; + } - if (content_tag) { - if (content_tag->content == NULL) { - content_tag->content = c; + content_tag->content_length += len; } - content_tag->content_length += p - c; + g_byte_array_append (dest, c, len); } } @@ -2876,10 +2891,6 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, continue; } - if (content_tag) { - content_tag->content_length ++; - } - p ++; break; @@ -2949,6 +2960,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) { if (dest->len > 0 && dest->data[dest->len - 1] != '\n') { g_byte_array_append (dest, "\r\n", 2); + + if (content_tag) { + content_tag->content_length += 2; + } } save_space = FALSE; } @@ -2958,6 +2973,10 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, cur_tag->id == Tag_DIV)) { if (dest->len > 0 && dest->data[dest->len - 1] != '\n') { g_byte_array_append (dest, "\r\n", 2); + + if (content_tag) { + content_tag->content_length += 2; + } } save_space = FALSE; } @@ -3106,6 +3125,7 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc, } g_queue_free (styles_blocks); + hc->parsed = dest; return dest; } diff --git a/src/libserver/html.h b/src/libserver/html.h index 86a266a62..b369bd890 100644 --- a/src/libserver/html.h +++ b/src/libserver/html.h @@ -107,9 +107,9 @@ struct html_block { struct html_tag { gint id; gint flags; - guint content_length; struct html_tag_component name; - const gchar *content; + guint content_length; + goffset content_offset; GQueue *params; gpointer extra; /** Additional data associated with tag (e.g. image) */ GNode *parent; @@ -127,6 +127,7 @@ struct html_content { guchar *tags_seen; GPtrArray *images; GPtrArray *blocks; + GByteArray *parsed; }; /* diff --git a/src/lua/lua_html.c b/src/lua/lua_html.c index 0af0457da..43c34797c 100644 --- a/src/lua/lua_html.c +++ b/src/lua/lua_html.c @@ -186,12 +186,17 @@ lua_check_html (lua_State * L, gint pos) return ud ? *((struct html_content **)ud) : NULL; } -static struct html_tag * +struct lua_html_tag { + struct html_content *html; + struct html_tag *tag; +}; + +static struct lua_html_tag * lua_check_html_tag (lua_State * L, gint pos) { void *ud = rspamd_lua_check_udata (L, pos, "rspamd{html_tag}"); luaL_argcheck (L, ud != NULL, pos, "'html_tag' expected"); - return ud ? *((struct html_tag **)ud) : NULL; + return ud ? ((struct lua_html_tag *)ud) : NULL; } static gint @@ -263,7 +268,7 @@ static void lua_html_push_image (lua_State *L, struct html_image *img) { LUA_TRACE_POINT; - struct html_tag **ptag; + struct lua_html_tag *ltag; struct rspamd_url **purl; lua_newtable (L); @@ -298,8 +303,9 @@ lua_html_push_image (lua_State *L, struct html_image *img) if (img->tag) { lua_pushstring (L, "tag"); - ptag = lua_newuserdata (L, sizeof (gpointer)); - *ptag = img->tag; + ltag = lua_newuserdata (L, sizeof (struct lua_html_tag)); + ltag->tag = img->tag; + ltag->html = NULL; rspamd_lua_setclass (L, "rspamd{html_tag}", -1); lua_settable (L, -3); } @@ -440,6 +446,7 @@ lua_html_get_blocks (lua_State *L) struct lua_html_traverse_ud { lua_State *L; + struct html_content *html; gint cbref; GHashTable *tags; gboolean any; @@ -449,15 +456,17 @@ static gboolean lua_html_node_foreach_cb (GNode *n, gpointer d) { struct lua_html_traverse_ud *ud = d; - struct html_tag *tag = n->data, **ptag; + struct html_tag *tag = n->data; + struct lua_html_tag *ltag; if (tag && (ud->any || g_hash_table_lookup (ud->tags, GSIZE_TO_POINTER (mum_hash64 (tag->id, 0))))) { lua_rawgeti (ud->L, LUA_REGISTRYINDEX, ud->cbref); - ptag = lua_newuserdata (ud->L, sizeof (*ptag)); - *ptag = tag; + ltag = lua_newuserdata (ud->L, sizeof (*ltag)); + ltag->tag = tag; + ltag->html = ud->html; rspamd_lua_setclass (ud->L, "rspamd{html_tag}", -1); lua_pushinteger (ud->L, tag->content_length); @@ -489,6 +498,7 @@ lua_html_foreach_tag (lua_State *L) ud.tags = g_hash_table_new (g_direct_hash, g_direct_equal); ud.any = FALSE; + ud.html = hc; if (lua_type (L, 2) == LUA_TSTRING) { tagname = luaL_checkstring (L, 2); @@ -556,11 +566,11 @@ static gint lua_html_tag_get_type (lua_State *L) { LUA_TRACE_POINT; - struct html_tag *tag = lua_check_html_tag (L, 1); + struct lua_html_tag *ltag = lua_check_html_tag (L, 1); const gchar *tagname; - if (tag != NULL) { - tagname = rspamd_html_tag_by_id (tag->id); + if (ltag != NULL) { + tagname = rspamd_html_tag_by_id (ltag->tag->id); if (tagname) { lua_pushstring (L, tagname); @@ -580,15 +590,16 @@ static gint lua_html_tag_get_parent (lua_State *L) { LUA_TRACE_POINT; - struct html_tag *tag = lua_check_html_tag (L, 1), **ptag; + struct lua_html_tag *ltag = lua_check_html_tag (L, 1), *ptag; GNode *node; - if (tag != NULL) { - node = tag->parent; + if (ltag != NULL) { + node = ltag->tag->parent; if (node && node->data) { - ptag = lua_newuserdata (L, sizeof (gpointer)); - *ptag = node->data; + ptag = lua_newuserdata (L, sizeof (*ptag)); + ptag->tag = node->data; + ptag->html = ltag->html; rspamd_lua_setclass (L, "rspamd{html_tag}", -1); } else { @@ -606,33 +617,33 @@ static gint lua_html_tag_get_flags (lua_State *L) { LUA_TRACE_POINT; - struct html_tag *tag = lua_check_html_tag (L, 1); + struct lua_html_tag *ltag = lua_check_html_tag (L, 1); gint i = 1; - if (tag) { + if (ltag->tag) { /* Push flags */ lua_createtable (L, 4, 0); - if (tag->flags & FL_CLOSING) { + if (ltag->tag->flags & FL_CLOSING) { lua_pushstring (L, "closing"); lua_rawseti (L, -2, i++); } - if (tag->flags & FL_HREF) { + if (ltag->tag->flags & FL_HREF) { lua_pushstring (L, "href"); lua_rawseti (L, -2, i++); } - if (tag->flags & FL_CLOSED) { + if (ltag->tag->flags & FL_CLOSED) { lua_pushstring (L, "closed"); lua_rawseti (L, -2, i++); } - if (tag->flags & FL_BROKEN) { + if (ltag->tag->flags & FL_BROKEN) { lua_pushstring (L, "broken"); lua_rawseti (L, -2, i++); } - if (tag->flags & FL_XML) { + if (ltag->tag->flags & FL_XML) { lua_pushstring (L, "xml"); lua_rawseti (L, -2, i++); } - if (tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) { + if (ltag->tag->flags & RSPAMD_HTML_FLAG_UNBALANCED) { lua_pushstring (L, "unbalanced"); lua_rawseti (L, -2, i++); } @@ -648,15 +659,16 @@ static gint lua_html_tag_get_content (lua_State *L) { LUA_TRACE_POINT; - struct html_tag *tag = lua_check_html_tag (L, 1); + struct lua_html_tag *ltag = lua_check_html_tag (L, 1); struct rspamd_lua_text *t; - if (tag) { - if (tag->content && tag->content_length) { + if (ltag) { + if (ltag->html && ltag->tag->content_offset && ltag->tag->content_length && + ltag->html->parsed->len >= ltag->tag->content_offset + ltag->tag->content_length) { t = lua_newuserdata (L, sizeof (*t)); rspamd_lua_setclass (L, "rspamd{text}", -1); - t->start = tag->content; - t->len = tag->content_length; + t->start = ltag->html->parsed->data + ltag->tag->content_offset; + t->len = ltag->tag->content_length; t->flags = 0; } else { @@ -674,10 +686,10 @@ static gint lua_html_tag_get_content_length (lua_State *L) { LUA_TRACE_POINT; - struct html_tag *tag = lua_check_html_tag (L, 1); + struct lua_html_tag *ltag = lua_check_html_tag (L, 1); - if (tag) { - lua_pushinteger (L, tag->content_length); + if (ltag) { + lua_pushinteger (L, ltag->tag->content_length); } else { return luaL_error (L, "invalid arguments"); @@ -690,24 +702,24 @@ static gint lua_html_tag_get_extra (lua_State *L) { LUA_TRACE_POINT; - struct html_tag *tag = lua_check_html_tag (L, 1); + struct lua_html_tag *ltag = lua_check_html_tag (L, 1); struct html_image *img; struct rspamd_url **purl; - if (tag) { - if (tag->extra) { - if ((tag->flags & FL_HREF) || tag->id == Tag_BASE) { + if (ltag) { + if (ltag->tag->extra) { + if ((ltag->tag->flags & FL_HREF) || ltag->tag->id == Tag_BASE) { /* For A that's URL */ purl = lua_newuserdata (L, sizeof (gpointer)); - *purl = tag->extra; + *purl = ltag->tag->extra; rspamd_lua_setclass (L, "rspamd{url}", -1); } - else if (tag->id == Tag_IMG) { - img = tag->extra; + else if (ltag->tag->id == Tag_IMG) { + img = ltag->tag->extra; lua_html_push_image (L, img); } - else if (tag->flags & FL_BLOCK) { - lua_html_push_block (L, tag->extra); + else if (ltag->tag->flags & FL_BLOCK) { + lua_html_push_block (L, ltag->tag->extra); } else { /* Unknown extra ? */ -- 2.39.5