From: Vsevolod Stakhov Date: Mon, 21 Jun 2021 23:21:24 +0000 (+0100) Subject: [Rework] Html: Further rework of the tags content extraction X-Git-Tag: 3.0~264 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=ce88665b0cd344fff4b2924448499b5a4ebe4f94;p=rspamd.git [Rework] Html: Further rework of the tags content extraction --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 925735f41..f82bd0359 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1069,6 +1069,54 @@ html_append_content(struct html_content *hc, std::string_view data) -> auto return nlen; } +static auto +html_append_tag_content(const gchar *start, gsize len, + struct html_content *hc, const struct html_tag *tag) -> void +{ + auto cur_offset = tag->content_offset; + auto total_len = tag->content_length; + + if (cur_offset > len || total_len + cur_offset > len) { + RSPAMD_UNREACHABLE; + } + + if (tag->id == Tag_BR || tag->id == Tag_HR) { + hc->parsed.append("\n"); + return; + } + + if (!tag->block) { + return; /* XXX: is it always true? */ + } + + if (tag->block->has_display() && tag->block->display == css::css_display_value::DISPLAY_BLOCK) { + hc->parsed.append("\n"); + } + + for (const auto &cld_tag : tag->children) { + if (cld_tag->tag_start > cur_offset) { + if (tag->block->is_visible()) { + html_append_content(hc, {start + cur_offset, + cld_tag->tag_start - cur_offset}); + } + } + html_append_tag_content(start, len, hc, cld_tag); + auto old_offset = cur_offset; + cur_offset = cld_tag->content_offset + cld_tag->content_length; + + if (total_len < cur_offset - old_offset) { + /* Child tag spans over parent (e.g. wrong nesting) */ + total_len = 0; + break; + } + total_len -= cur_offset - old_offset; + } + + if (total_len > 0 && tag->block->is_visible()) { + html_append_content(hc, {start + cur_offset, total_len}); + } +} + static auto html_process_input(rspamd_mempool_t *pool, GByteArray *in, @@ -1490,17 +1538,8 @@ html_process_input(rspamd_mempool_t *pool, } } - /* Summarize content length from children */ - hc->traverse_block_tags([](const html_tag *tag) -> bool { - - for (const auto *cld_tag : tag->children) { - tag->content_length += cld_tag->content_length; - } - return true; - }, html_content::traverse_type::POST_ORDER); - /* Propagate styles */ - hc->traverse_block_tags([&hc, &exceptions,&pool](const html_tag *tag) -> bool { + hc->traverse_block_tags([&hc](const html_tag *tag) -> bool { if (hc->css_style) { auto *css_block = hc->css_style->check_tag_block(tag); @@ -1514,62 +1553,18 @@ html_process_input(rspamd_mempool_t *pool, } } if (tag->block) { - tag->block->compute_visibility(); - - if (exceptions) { - if (!tag->block->is_visible()) { - if (tag->parent == nullptr || (tag->parent->block && tag->parent->block->is_visible())) { - /* Add exception for an invisible element */ - auto * ex = rspamd_mempool_alloc_type (pool,struct rspamd_process_exception); - ex->pos = tag->content_offset; - ex->len = tag->content_length; - ex->type = RSPAMD_EXCEPTION_INVISIBLE; - ex->ptr = (void *)tag; - - *exceptions = g_list_prepend(*exceptions, ex); - } + if (!tag->block->has_display()) { + /* If we have no display field, we can check it by tag */ + if (tag->flags & CM_BLOCK) { + tag->block->set_display(css::css_display_value::DISPLAY_BLOCK); } - else if (*exceptions && tag->parent) { - /* Current block is visible, check if parent is invisible */ - auto *ex = (struct rspamd_process_exception*)g_list_first(*exceptions)->data; - - /* - * TODO: we need to handle the following cases: - * -< insert one more exception - * -< increase content_offset decrease length - * -< decrease length - */ - if (ex && ex->type == RSPAMD_EXCEPTION_INVISIBLE && - ex->ptr == (void *)tag->parent) { - auto *parent = tag->parent; - - if (tag->content_offset + tag->content_length == - parent->content_offset + parent->content_length) { - /* */ - ex->len -= tag->content_length; - } - else if (tag->content_offset == parent->content_offset) { - /* */ - ex->len -= tag->content_length; - ex->pos += tag->content_length; - } - else if (tag->content_offset > ex->pos) { - auto *nex = rspamd_mempool_alloc_type (pool, - struct rspamd_process_exception); - auto start_len = tag->content_offset - ex->pos; - auto end_len = ex->len - tag->content_length - tag->content_length; - nex->pos = tag->content_offset + tag->content_length; - nex->len = end_len; - nex->type = RSPAMD_EXCEPTION_INVISIBLE; - nex->ptr = (void *)parent; /* ! */ - ex->len = start_len; - *exceptions = g_list_prepend(*exceptions, ex); - } - - } + else { + tag->block->set_display(css::css_display_value::DISPLAY_INLINE); } } + tag->block->compute_visibility(); + for (const auto *cld_tag : tag->children) { if (cld_tag->block) { cld_tag->block->propagate_block(*tag->block); @@ -1582,6 +1577,10 @@ html_process_input(rspamd_mempool_t *pool, return true; }, html_content::traverse_type::PRE_ORDER); + if (hc->root_tag) { + html_append_tag_content(start, end - start, hc, hc->root_tag); + } + /* Leftover */ switch (state) { case html_text_content: diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx index 51f3dbb9d..f2bbf1d64 100644 --- a/src/libserver/html/html_block.hxx +++ b/src/libserver/html/html_block.hxx @@ -219,6 +219,10 @@ struct html_block { return (mask & transparent_flag) != 0; } + constexpr auto has_display(void) const -> bool { + return (mask & display_mask) != 0; + } + /** * Returns a default html block for root HTML element * @return @@ -227,7 +231,7 @@ struct html_block { return html_block{rspamd::css::css_color::black(), rspamd::css::css_color::white(), 0, 0, - (fg_color_mask|bg_color_mask|display_mask|font_size_mask), + (fg_color_mask|bg_color_mask|font_size_mask), rspamd::css::css_display_value::DISPLAY_INLINE, 12}; } diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 2dd4a6f5a..b4f8ac75c 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -275,14 +275,6 @@ rspamd_tokenize_exception (struct rspamd_process_exception *ex, GArray *res) g_array_append_val (res, token); token.flags = 0; } - else if (ex->type == RSPAMD_EXCEPTION_INVISIBLE) { - token.original.begin = "!!INV!!"; - token.original.len = sizeof ("!!INV!!") - 1; - token.flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION; - - g_array_append_val (res, token); - token.flags = 0; - } } diff --git a/src/libutil/util.h b/src/libutil/util.h index 9ee8a09ae..d993fcbdf 100644 --- a/src/libutil/util.h +++ b/src/libutil/util.h @@ -25,7 +25,6 @@ enum rspamd_exception_type { RSPAMD_EXCEPTION_NEWLINE = 0, RSPAMD_EXCEPTION_URL, RSPAMD_EXCEPTION_GENERIC, - RSPAMD_EXCEPTION_INVISIBLE, }; /** * Structure to point exception in text from processing