From 5e99aef6caea1bc206825c20d51491a6571aec52 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 21 Jun 2021 23:46:26 +0100 Subject: [PATCH] [Project] Html: Rework tags placement --- src/libserver/html/html.cxx | 110 ++++++++++++-------------------- src/libserver/html/html_tag.hxx | 5 +- 2 files changed, 45 insertions(+), 70 deletions(-) diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 00dcebad6..925735f41 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -78,7 +78,8 @@ INIT_LOG_MODULE(html) static auto html_check_balance(struct html_tag *tag, struct html_tag *parent, - std::vector &tags_stack) -> bool + std::vector &tags_stack, + goffset tag_start_offset) -> bool { if (tag->flags & FL_CLOSING) { @@ -89,7 +90,19 @@ html_check_balance(struct html_tag *tag, }); if (found_opening != tags_stack.rend()) { - (*found_opening)->flags |= FL_CLOSED; + auto *opening_tag = (*found_opening); + opening_tag->flags |= FL_CLOSED; + + /* Adjust size */ + auto opening_content_offset = opening_tag->content_offset; + + if (opening_content_offset <= tag_start_offset) { + opening_tag->content_length = + tag_start_offset - opening_content_offset; + } + else { + opening_tag->content_length = 0; + } if (found_opening == tags_stack.rbegin()) { tags_stack.pop_back(); @@ -100,7 +113,7 @@ html_check_balance(struct html_tag *tag, /* Move to front */ std::iter_swap(found_opening, tags_stack.rbegin()); tags_stack.pop_back(); - return true; + return false; } } else { @@ -117,7 +130,8 @@ static auto html_process_tag(rspamd_mempool_t *pool, struct html_content *hc, struct html_tag *tag, - std::vector &tags_stack) -> bool + std::vector &tags_stack, + goffset tag_start_offset) -> bool { struct html_tag *parent; @@ -141,7 +155,7 @@ html_process_tag(rspamd_mempool_t *pool, tag->parent = parent; - if (!(tag->flags & (CM_INLINE | CM_EMPTY))) { + if (!(tag->flags & (CM_EMPTY))) { /* Block tag */ if ((tag->flags & (FL_CLOSING | FL_CLOSED))) { /* Closed block tag */ @@ -151,7 +165,7 @@ html_process_tag(rspamd_mempool_t *pool, } if (hc->total_tags < rspamd::html::max_tags) { - if (!html_check_balance(tag, parent, tags_stack)) { + if (!html_check_balance(tag, parent, tags_stack, tag_start_offset)) { msg_debug_html ( "mark part as unbalanced as it has not pairable closing tags"); hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; @@ -168,7 +182,7 @@ html_process_tag(rspamd_mempool_t *pool, } if (!(tag->flags & FL_CLOSED) && - !(parent->flags & FL_BLOCK)) { + (parent->flags & CM_EMPTY)) { /* We likely have some bad nesting */ if (parent->id == tag->id) { /* Something like blafoo... */ @@ -197,6 +211,13 @@ html_process_tag(rspamd_mempool_t *pool, } else { hc->root_tag = tag; + if (hc->total_tags < rspamd::html::max_tags) { + if ((tag->flags & FL_CLOSED) == 0) { + tags_stack.push_back(tag); + } + + hc->total_tags++; + } } if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) { @@ -204,7 +225,6 @@ html_process_tag(rspamd_mempool_t *pool, return false; } - } } else { @@ -1062,8 +1082,8 @@ html_process_input(rspamd_mempool_t *pool, gboolean closing = FALSE; guint obrace = 0, ebrace = 0; struct rspamd_url *url = NULL; - gint len, href_offset = -1; - struct html_tag *cur_tag = NULL, *content_tag = NULL; + gint href_offset = -1; + struct html_tag *cur_tag = NULL; std::vector tags_stack; struct tag_content_parser_state content_parser_env; @@ -1121,6 +1141,7 @@ html_process_input(rspamd_mempool_t *pool, case tag_begin: switch (t) { case '<': + c = p; p ++; closing = FALSE; break; @@ -1148,6 +1169,7 @@ html_process_input(rspamd_mempool_t *pool, hc->all_tags.emplace_back(std::make_unique()); cur_tag = hc->all_tags.back().get(); + cur_tag->tag_start = c - start; break; } @@ -1314,6 +1336,7 @@ html_process_input(rspamd_mempool_t *pool, case tag_content: html_parse_tag_content(pool, hc, cur_tag, p, content_parser_env); + if (t == '>') { if (closing) { cur_tag->flags |= FL_CLOSING; @@ -1338,8 +1361,10 @@ html_process_input(rspamd_mempool_t *pool, if (cur_tag != nullptr) { state = html_text_content; - cur_tag->content_offset = p - start; - if (!html_process_tag(pool, hc, cur_tag, tags_stack)) { + cur_tag->content_offset = p - start + 1; + + if (!html_process_tag(pool, hc, cur_tag, tags_stack, + c - start)) { if (cur_tag->id == Tag_STYLE) { state = content_style; } @@ -1355,59 +1380,6 @@ html_process_input(rspamd_mempool_t *pool, hc->tags_seen[cur_tag->id] = true; } - if (!(cur_tag->flags & (FL_CLOSED|FL_CLOSING))) { - content_tag = cur_tag; - } - - /* Handle newlines */ - if (cur_tag->id == Tag_BR || cur_tag->id == Tag_HR) { - if (!hc->parsed.empty() && - hc->parsed.back() != '\n') { - - hc->parsed += "\r\n"; - - if (content_tag) { - if (content_tag->content_length == 0) { - /* - * Special case - * we have a \r\n at the beginning but - * we have no set content_offset - * so we need to do it here - */ - content_tag->content_offset = hc->parsed.size(); - } - else { - content_tag->content_length += 2; - } - } - } - } - - if ((cur_tag->id == Tag_P || - cur_tag->id == Tag_TR || - cur_tag->id == Tag_DIV)) { - if (!hc->parsed.empty() && - hc->parsed.back() != '\n') { - - hc->parsed += "\r\n"; - - if (content_tag) { - if (content_tag->content_length == 0) { - /* - * Special case - * we have a \r\n at the beginning but - * we have no set content_offset - * so we need to get it here - */ - content_tag->content_offset = hc->parsed.size(); - } - else { - content_tag->content_length += 2; - } - } - } - } - /* XXX: uncomment when styles parsing is not so broken */ if (cur_tag->flags & FL_HREF /* && !(cur_tag->flags & FL_IGNORE) */) { if (!(cur_tag->flags & (FL_CLOSING))) { @@ -1502,7 +1474,7 @@ html_process_input(rspamd_mempool_t *pool, part_urls); } - if (cur_tag->flags & FL_BLOCK) { + if (!(cur_tag->flags & CM_EMPTY)) { if (!(cur_tag->flags & FL_CLOSING)) { html_process_block_tag(pool, cur_tag, hc); @@ -1520,6 +1492,7 @@ html_process_input(rspamd_mempool_t *pool, /* Summarize content length from children */ hc->traverse_block_tags([](const html_tag *tag) -> bool { + for (const auto *cld_tag : tag->children) { tag->content_length += cld_tag->content_length; } @@ -1716,13 +1689,14 @@ TEST_CASE("html text extraction") { const std::vector> cases{ + {"foobarbaz", "foobarbaz"}, + {"foobarbaz", "foobarbaz"}, {"test", "test"}, {"test ", "test "}, {"test foo, bar", "test foo, bar"}, {"

text

", "text"}, {"olo

text

lolo", "olo\ntext\nlolo"}, - {"foobarbaz", "foobarbaz"}, - {"foobarbaz", "foobarbaz"}, + {"foo
baz", "foo\nbaz"}, {"
foo
bar
", "foo\nbar"}, }; diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 72ae6d616..f6442bdc3 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -57,10 +57,11 @@ struct html_tag_component { }; struct html_tag { - int id = -1; - unsigned int flags = 0; + unsigned int tag_start = 0; mutable unsigned int content_length = 0; /* Allow content length propagation */ unsigned int content_offset = 0; + std::uint32_t flags = 0; + std::int16_t id = -1; std::string_view name; std::vector parameters; -- 2.39.5