From bb995323cf4d2b0897c84066cd39d1b8c7f343a5 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 22 Jun 2021 11:10:54 +0100 Subject: [PATCH] [Project] Html: Insert closing tags as well :( --- src/libserver/html/html.cxx | 57 ++++++++++++++++++++++----------- src/libserver/html/html_tag.hxx | 4 +-- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index f82bd0359..15433cc72 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -79,7 +79,8 @@ static auto html_check_balance(struct html_tag *tag, struct html_tag *parent, std::vector &tags_stack, - goffset tag_start_offset) -> bool + goffset tag_start_offset, + goffset tag_end_offset) -> bool { if (tag->flags & FL_CLOSING) { @@ -131,7 +132,8 @@ html_process_tag(rspamd_mempool_t *pool, struct html_content *hc, struct html_tag *tag, std::vector &tags_stack, - goffset tag_start_offset) -> bool + goffset tag_start_offset, + goffset tag_end_offset) -> bool { struct html_tag *parent; @@ -164,14 +166,14 @@ html_process_tag(rspamd_mempool_t *pool, return false; } - if (hc->total_tags < rspamd::html::max_tags) { - if (!html_check_balance(tag, parent, tags_stack, tag_start_offset)) { - msg_debug_html ( - "mark part as unbalanced as it has not pairable closing tags"); - hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; - } - - hc->total_tags++; + if (!html_check_balance(tag, parent, tags_stack, + tag_start_offset, tag_end_offset)) { + msg_debug_html ( + "mark part as unbalanced as it has not pairable closing tags"); + hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; + } + else { + parent->children.push_back(tag); } } else { @@ -1076,12 +1078,18 @@ html_append_tag_content(const gchar *start, gsize len, auto cur_offset = tag->content_offset; auto total_len = tag->content_length; + if (tag->flags & FL_CLOSING) { + return; + } + if (cur_offset > len || total_len + cur_offset > len) { RSPAMD_UNREACHABLE; } if (tag->id == Tag_BR || tag->id == Tag_HR) { - hc->parsed.append("\n"); + if (!hc->parsed.empty()) { + hc->parsed.append("\n"); + } return; } @@ -1089,8 +1097,12 @@ html_append_tag_content(const gchar *start, gsize len, return; /* XXX: is it always true? */ } - if (tag->block->has_display() && tag->block->display == css::css_display_value::DISPLAY_BLOCK) { - hc->parsed.append("\n"); + auto is_block = tag->block->has_display() && + tag->block->display == css::css_display_value::DISPLAY_BLOCK; + if (is_block) { + if (!hc->parsed.empty()) { + hc->parsed.append("\n"); + } } for (const auto &cld_tag : tag->children) { @@ -1102,19 +1114,25 @@ html_append_tag_content(const gchar *start, gsize len, } html_append_tag_content(start, len, hc, cld_tag); auto old_offset = cur_offset; + cur_offset = cld_tag->content_offset + cld_tag->content_length; if (total_len < cur_offset - old_offset) { /* Child tag spans over parent (e.g. wrong nesting) */ total_len = 0; - break; } - total_len -= cur_offset - old_offset; + else { + total_len -= cur_offset - old_offset; + } } if (total_len > 0 && tag->block->is_visible()) { html_append_content(hc, {start + cur_offset, total_len}); } + + if (is_block) { + hc->parsed.append("\n"); + } } static auto @@ -1412,7 +1430,7 @@ html_process_input(rspamd_mempool_t *pool, cur_tag->content_offset = p - start + 1; if (!html_process_tag(pool, hc, cur_tag, tags_stack, - c - start)) { + c - start, p - start)) { if (cur_tag->id == Tag_STYLE) { state = content_style; } @@ -1540,6 +1558,10 @@ html_process_input(rspamd_mempool_t *pool, /* Propagate styles */ hc->traverse_block_tags([&hc](const html_tag *tag) -> bool { + if (tag->flags & FL_CLOSING) { + return true; + } + if (hc->css_style) { auto *css_block = hc->css_style->check_tag_block(tag); @@ -1688,14 +1710,13 @@ TEST_CASE("html text extraction") { const std::vector> cases{ - {"foobarbaz", "foobarbaz"}, {"foobarbaz", "foobarbaz"}, {"test", "test"}, {"test ", "test "}, {"test foo, bar", "test foo, bar"}, {"

text

", "text"}, {"olo

text

lolo", "olo\ntext\nlolo"}, - + {"foobarbaz", "foobarbaz"}, {"foo
baz", "foo\nbaz"}, {"
foo
bar
", "foo\nbar"}, }; diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index f6442bdc3..cad5368cf 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -58,10 +58,10 @@ struct html_tag_component { struct html_tag { unsigned int tag_start = 0; - mutable unsigned int content_length = 0; /* Allow content length propagation */ + unsigned int content_length = 0; unsigned int content_offset = 0; std::uint32_t flags = 0; - std::int16_t id = -1; + std::int32_t id = -1; std::string_view name; std::vector parameters; -- 2.39.5