From: Vsevolod Stakhov Date: Sat, 26 Jun 2021 13:30:19 +0000 (+0100) Subject: [Project] Html: Implement logic for tags pairing X-Git-Tag: 3.0~238 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=620d003e7ab56e8f4cf1fc4a2569fd20f8f98b3f;p=rspamd.git [Project] Html: Implement logic for tags pairing --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 894b1ee45..8d312b733 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -76,51 +76,136 @@ auto html_components_map = frozen::make_unordered_map &tags_stack, goffset tag_start_offset, goffset tag_end_offset) -> bool { - if (tag->flags & FL_CLOSING) { - /* Find the opening pair if any and check if it is correctly placed */ - auto found_opening = std::find_if(tags_stack.rbegin(), tags_stack.rend(), - [&](const html_tag *t) { - return (t->flags & FL_CLOSED) == 0 && t->id == tag->id; - }); + auto calculate_content_length = [tag_start_offset](html_tag *t) { + auto opening_content_offset = t->content_offset; - if (found_opening != tags_stack.rend()) { - auto *opening_tag = (*found_opening); - opening_tag->flags |= FL_CLOSED; + if (opening_content_offset <= tag_start_offset) { + t->content_length = tag_start_offset - opening_content_offset; + } + else { + t->content_length = 0; + } + }; - /* Adjust size */ - auto opening_content_offset = opening_tag->content_offset; + auto balance_tag = [&]() -> void { + auto it = tags_stack.rbegin(); - if (opening_content_offset <= tag_start_offset) { - opening_tag->content_length = - tag_start_offset - opening_content_offset; - } - else { - opening_tag->content_length = 0; + for (auto end_it = tags_stack.rend(); it != end_it; ++it) { + if ((*it)->id == tag->id && !((*it)->flags & FL_CLOSING)) { + break; } + /* Insert a virtual closing tag for all tags that are not closed */ + auto &&vtag = std::make_unique(); + vtag->id = (*it)->id; + vtag->flags = FL_VIRTUAL|FL_CLOSING; + vtag->tag_start = tag->tag_start; + vtag->content_offset = tag->content_offset; + vtag->content_length = 0; + vtag->parent = (*it)->parent; + calculate_content_length(*it); + (*it)->flags |= FL_CLOSED; + hc->all_tags.emplace_back(std::move(vtag)); + } + + /* Remove tags */ + tags_stack.erase(it.base(), std::end(tags_stack)); + }; - if (found_opening == tags_stack.rbegin()) { + if (tag->flags & FL_CLOSING) { + if (!tags_stack.empty()) { + auto *last_tag = tags_stack.back(); + + if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) { + last_tag->flags |= FL_CLOSED; + + calculate_content_length(last_tag); tags_stack.pop_back(); /* All good */ return true; } else { - /* Move to front */ - std::iter_swap(found_opening, tags_stack.rbegin()); - tags_stack.pop_back(); + balance_tag(); + return false; } } else { - /* We have unpaired tag */ - return false; + /* + * We have no opening tags in the stack, so we need to assume that there + * is an opening tag at the beginning of the document. + * There are two possibilities: + * + * 1) We have some block tag in hc->all_tags; + * 2) We have no tags + */ + + if (hc->all_tags.empty()) { + auto &&vtag = std::make_unique(); + vtag->id = tag->id; + vtag->flags = FL_VIRTUAL|FL_CLOSED; + vtag->tag_start = 0; + vtag->content_offset = 0; + calculate_content_length(vtag.get()); + + + if (!hc->root_tag) { + hc->root_tag = vtag.get(); + } + else { + vtag->parent = hc->root_tag; + } + hc->all_tags.emplace_back(std::move(vtag)); + } + else { + auto found_closing = std::find_if(hc->all_tags.rbegin(), + hc->all_tags.rend(), + [&](const auto &t) { + constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING; + return (t->flags & expect_flags) == (expect_flags) && + t.get() != tag && + t->parent != nullptr; + }); + + if (found_closing != hc->all_tags.rend()) { + auto *closing_tag = (*found_closing).get(); + auto &&vtag = std::make_unique(); + vtag->id = tag->id; + vtag->flags = FL_VIRTUAL|FL_CLOSED; + vtag->tag_start = closing_tag->content_offset - 1; + vtag->content_offset = vtag->tag_start + 1; + vtag->parent = closing_tag->parent; + vtag->content_length = tag->tag_start - vtag->content_offset; + hc->all_tags.emplace_back(std::move(vtag)); + } + else { + auto &&vtag = std::make_unique(); + vtag->id = tag->id; + vtag->flags = FL_VIRTUAL|FL_CLOSED; + vtag->tag_start = 0; + vtag->content_offset = 0; + calculate_content_length(vtag.get()); + + + if (!hc->root_tag) { + hc->root_tag = vtag.get(); + } + else { + vtag->parent = hc->root_tag; + } + hc->all_tags.emplace_back(std::move(vtag)); + } + } } + + return false; } /* Misuse */ @@ -166,7 +251,7 @@ html_process_tag(rspamd_mempool_t *pool, return false; } - if (!html_check_balance(tag, parent, tags_stack, + if (!html_check_balance(hc, tag, parent, tags_stack, tag_start_offset, tag_end_offset)) { msg_debug_html ( "mark part as unbalanced as it has not pairable closing tags"); diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 9091b9060..36110c8c7 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -43,16 +43,17 @@ enum class html_component_type : std::uint8_t { /* Public tags flags */ /* XML tag */ -#define FL_XML (1 << 23) +#define FL_XML (1 << 22) /* Closing tag */ -#define FL_CLOSING (1 << 24) +#define FL_CLOSING (1 << 23) /* Fully closed tag (e.g. ) */ -#define FL_CLOSED (1 << 25) -#define FL_BROKEN (1 << 26) -#define FL_IGNORE (1 << 27) -#define FL_BLOCK (1 << 28) -#define FL_HREF (1 << 29) +#define FL_CLOSED (1 << 24) +#define FL_BROKEN (1 << 25) +#define FL_IGNORE (1 << 26) +#define FL_BLOCK (1 << 27) +#define FL_HREF (1 << 28) #define FL_COMMENT (1 << 29) +#define FL_VIRTUAL (1 << 30) /** * Returns component type from a string diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h index 8f430204e..3f209c08e 100644 --- a/src/libserver/html/html_tags.h +++ b/src/libserver/html/html_tags.h @@ -191,13 +191,10 @@ typedef enum { #define CM_NO_INDENT (1 << 18) /* Elements that are obsolete (such as "dir", "menu"). */ #define CM_OBSOLETE (1 << 19) -/* User defined elements. Used to determine how attributes without value - should be printed. */ -#define CM_NEW (1 << 20) /* Elements that cannot be omitted. */ -#define CM_OMITST (1 << 21) +#define CM_OMITST (1 << 20) /* Unique elements */ -#define CM_UNIQUE (1 << 22) +#define CM_UNIQUE (1 << 21) #ifdef __cplusplus }