From: Vsevolod Stakhov Date: Wed, 30 Jun 2021 11:50:43 +0000 (+0100) Subject: [Project] Html: Another rework of the tags structure X-Git-Tag: 3.0~231 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=eae0eddc9625e66d47aabcc5d854a73c5366661c;p=rspamd.git [Project] Html: Another rework of the tags structure --- diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx index 51f537b5a..9e26eb42f 100644 --- a/src/libserver/css/css.cxx +++ b/src/libserver/css/css.cxx @@ -114,7 +114,7 @@ css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> } /* First, find id in a tag and a class */ - for (const auto &param : tag->parameters) { + for (const auto &param : tag->components) { if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) { id_comp = param.value; } diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index f0315434a..9d1531f04 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -75,259 +75,91 @@ auto html_components_map = frozen::make_unordered_map &tags_stack, goffset tag_start_offset, - goffset tag_end_offset) -> bool + goffset tag_end_offset) -> html_tag * { + /* As agreed, the closing tag has the last opening at the parent ptr */ + auto *opening_tag = tag->parent; - auto calculate_content_length = [tag_start_offset](html_tag *t) { + auto calculate_content_length = [tag_start_offset,tag_end_offset](html_tag *t) { auto opening_content_offset = t->content_offset; if (opening_content_offset <= tag_start_offset) { - t->content_length = tag_start_offset - opening_content_offset; + t->closing.start = tag_start_offset; + t->closing.end = tag_end_offset; } else { - t->content_length = 0; + + t->closing.start = t->content_offset; + t->closing.end = tag_end_offset; } }; - auto balance_tag = [&]() -> void { - auto it = tags_stack.rbegin(); + auto balance_tag = [&]() -> html_tag * { + auto it = tag->parent; - for (auto end_it = tags_stack.rend(); it != end_it; ++it) { - if ((*it)->id == 
tag->id && !((*it)->flags & FL_CLOSING)) { + for (; it != nullptr; it = it->parent) { + if (it->id == tag->id && !(it->flags & FL_CLOSED)) { break; } /* Insert a virtual closing tag for all tags that are not closed */ - auto &&vtag = std::make_unique(); - vtag->id = (*it)->id; - vtag->flags = FL_VIRTUAL|FL_CLOSING; - vtag->tag_start = tag->tag_start; - vtag->content_offset = tag->content_offset; - vtag->content_length = 0; - vtag->parent = (*it)->parent; - calculate_content_length(*it); - (*it)->flags |= FL_CLOSED; - hc->all_tags.emplace_back(std::move(vtag)); + calculate_content_length(it); + it->flags |= FL_CLOSED; } /* Remove tags */ - tags_stack.erase(it.base(), std::end(tags_stack)); + return it; }; - if (tag->flags & FL_CLOSING) { - if (!tags_stack.empty()) { - auto *last_tag = tags_stack.back(); + if (opening_tag) { - if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) { - last_tag->flags |= FL_CLOSED; + if (opening_tag->id == tag->id) { + opening_tag->flags |= FL_CLOSED; - calculate_content_length(last_tag); - tags_stack.pop_back(); - /* All good */ - return true; - } - else { - balance_tag(); - - return false; - } + calculate_content_length(opening_tag); + /* All good */ + return opening_tag->parent; } else { - /* - * We have no opening tags in the stack, so we need to assume that there - * is an opening tag at the beginning of the document. 
- * There are two possibilities: - * - * 1) We have some block tag in hc->all_tags; - * 2) We have no tags - */ - - if (hc->all_tags.empty()) { - auto &&vtag = std::make_unique(); - vtag->id = tag->id; - vtag->flags = FL_VIRTUAL|FL_CLOSED; - vtag->tag_start = 0; - vtag->content_offset = 0; - calculate_content_length(vtag.get()); - - - if (!hc->root_tag) { - hc->root_tag = vtag.get(); - } - else { - vtag->parent = hc->root_tag; - } - hc->all_tags.emplace_back(std::move(vtag)); - } - else { - auto found_closing = std::find_if(hc->all_tags.rbegin(), - hc->all_tags.rend(), - [&](const auto &t) { - constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING; - return (t->flags & expect_flags) == (expect_flags) && - t.get() != tag && - t->parent != nullptr; - }); - - if (found_closing != hc->all_tags.rend()) { - auto *closing_tag = (*found_closing).get(); - auto &&vtag = std::make_unique(); - vtag->id = tag->id; - vtag->flags = FL_VIRTUAL|FL_CLOSED; - vtag->tag_start = closing_tag->content_offset - 1; - vtag->content_offset = vtag->tag_start + 1; - vtag->parent = closing_tag->parent; - vtag->content_length = tag->tag_start - vtag->content_offset; - hc->all_tags.emplace_back(std::move(vtag)); - } - else { - auto &&vtag = std::make_unique(); - vtag->id = tag->id; - vtag->flags = FL_VIRTUAL|FL_CLOSED; - vtag->tag_start = 0; - vtag->content_offset = 0; - calculate_content_length(vtag.get()); - - - if (!hc->root_tag) { - hc->root_tag = vtag.get(); - } - else { - vtag->parent = hc->root_tag; - } - hc->all_tags.emplace_back(std::move(vtag)); - } - } + return balance_tag(); } - - return false; - } - - /* Misuse */ - RSPAMD_UNREACHABLE; -} - -static auto -html_process_tag(rspamd_mempool_t *pool, - struct html_content *hc, - struct html_tag *tag, - std::vector &tags_stack, - goffset tag_start_offset, - goffset tag_end_offset) -> bool -{ - struct html_tag *parent; - - if (hc->total_tags > rspamd::html::max_tags) { - hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; - } - - if (tag->id == 
-1) { - /* Ignore unknown tags */ - hc->total_tags++; - return false; - } - - - if (tags_stack.empty()) { - parent = hc->root_tag; } else { - parent = tags_stack.back(); - } - - tag->parent = parent; - - if (!(tag->flags & (CM_EMPTY))) { - /* Block tag */ - if (tag->flags & FL_CLOSING) { - /* Closed block tag */ - if (parent == nullptr) { - msg_debug_html ("bad parent node"); - return false; - } - - if (!html_check_balance(hc, tag, parent, tags_stack, - tag_start_offset, tag_end_offset)) { - msg_debug_html ( - "mark part as unbalanced as it has not pairable closing tags"); - hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; - } - } - else { - /* Opening block tag */ - if (parent) { - if ((parent->flags & FL_IGNORE)) { - tag->flags |= FL_IGNORE; - } - - if (!(tag->flags & FL_CLOSED) && - (parent->flags & CM_EMPTY)) { - /* We likely have some bad nesting */ - if (parent->id == tag->id) { - /* Something like blafoo... */ - hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED; - tag->parent = parent->parent; - - if (hc->total_tags < rspamd::html::max_tags) { - parent->children.push_back(tag); - tags_stack.push_back(tag); - hc->total_tags++; - } - - return true; - } - } - - if (hc->total_tags < rspamd::html::max_tags) { - parent->children.push_back(tag); + /* + * We have no opening tag + * There are two possibilities: + * + * 1) We have some block tag in hc->all_tags; + * 2) We have no tags + */ + + if (hc->all_tags.empty()) { + auto &&vtag = std::make_unique(); + vtag->id = tag->id; + vtag->flags = FL_VIRTUAL|FL_CLOSED; + vtag->tag_start = 0; + vtag->content_offset = 0; + calculate_content_length(vtag.get()); - if ((tag->flags & FL_CLOSED) == 0) { - tags_stack.push_back(tag); - } - hc->total_tags++; - } + if (!hc->root_tag) { + hc->root_tag = vtag.get(); } else { - hc->root_tag = tag; - if (hc->total_tags < rspamd::html::max_tags) { - if ((tag->flags & FL_CLOSED) == 0) { - tags_stack.push_back(tag); - } - - hc->total_tags++; - } - } - - if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) 
{ - tag->flags |= FL_IGNORE; - - return false; - } - } - } - else { - /* Inline tag */ - if (parent) { - if (hc->total_tags < rspamd::html::max_tags) { - parent->children.push_back(tag); - - hc->total_tags++; - } - if ((parent->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE))) { - tag->flags |= FL_IGNORE; - - return false; + vtag->parent = hc->root_tag; } + hc->all_tags.emplace_back(std::move(vtag)); } } - return true; + return nullptr; } auto @@ -404,7 +236,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, spaces_before_eq, spaces_after_eq, spaces_after_param, - ignore_bad_tag + ignore_bad_tag, } state; gboolean store = FALSE; @@ -425,7 +257,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, auto *s = rspamd_mempool_alloc_buffer(pool, sz); memcpy(s, parser_env.saved_p, sz); sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p); - tag->parameters.emplace_back(parser_env.cur_component.value(), + tag->components.emplace_back(parser_env.cur_component.value(), std::string_view{s, sz}); } @@ -941,7 +773,7 @@ html_process_img_tag(rspamd_mempool_t *pool, img = rspamd_mempool_alloc0_type (pool, struct html_image); img->tag = tag; - for (const auto &param : tag->parameters) { + for (const auto &param : tag->components) { if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { /* Check base url */ @@ -1103,7 +935,7 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, { std::optional maybe_fgcolor, maybe_bgcolor; - for (const auto &param : tag->parameters) { + for (const auto &param : tag->components) { if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); } @@ -1173,7 +1005,7 @@ html_append_tag_content(rspamd_mempool_t *pool, khash_t (rspamd_url_hash) *url_set) -> goffset { auto is_visible = true, is_block = false; - goffset next_tag_offset = tag->content_length + tag->content_offset, + goffset next_tag_offset = tag->closing.end, initial_dest_offset = 
hc->parsed.size(); if (tag->id == Tag_BR || tag->id == Tag_HR) { @@ -1182,7 +1014,7 @@ html_append_tag_content(rspamd_mempool_t *pool, return tag->content_offset; } - if ((tag->flags & (FL_COMMENT|FL_XML))) { + if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE))) { is_visible = false; } else { @@ -1213,8 +1045,7 @@ html_append_tag_content(rspamd_mempool_t *pool, if (!enclosed_tags.empty()) { next_enclosed = enclosed_tags.back(); enclosed_start = next_enclosed->tag_start; - enclosed_end = next_enclosed->content_length + - next_enclosed->content_offset; + enclosed_end = next_enclosed->closing.end; if (enclosed_end > next_tag_offset) { next_tag_offset = enclosed_end; @@ -1270,7 +1101,7 @@ html_append_tag_content(rspamd_mempool_t *pool, } } - if (is_visible && !(tag->flags & FL_CLOSING)) { + if (is_visible) { if (tag->id == Tag_A) { auto written_len = hc->parsed.size() - initial_dest_offset; html_process_displayed_href_tag(pool, hc, @@ -1313,7 +1144,7 @@ html_append_tags_content(rspamd_mempool_t *pool, for (auto i = 0; i < hc->all_tags.size();) { const auto &tag = hc->all_tags[i]; html_tag *next_tag = nullptr; - auto next_offset = tag->content_offset + tag->content_length; + auto next_offset = tag->closing.end; auto j = i + 1; while (j < hc->all_tags.size()) { @@ -1321,9 +1152,9 @@ html_append_tags_content(rspamd_mempool_t *pool, if (next_tag->content_offset <= next_offset) { enclosed_tags_stack.push_back(next_tag); - if (next_tag->content_offset + next_tag->content_length > next_offset) { + if (next_tag->closing.end > next_offset) { /* Tag spans over its parent */ - next_offset = next_tag->content_offset + next_tag->content_length; + next_offset = next_tag->closing.end; } j ++; } @@ -1351,10 +1182,9 @@ html_process_input(rspamd_mempool_t *pool, guchar t; auto closing = false, in_head = false; guint obrace = 0, ebrace = 0; - struct rspamd_url *url = NULL; + struct rspamd_url *url = nullptr; gint href_offset = -1; - struct html_tag *cur_tag = NULL; - std::vector 
tags_stack; + struct html_tag *cur_tag = nullptr, cur_closing_tag; struct tag_content_parser_state content_parser_env; enum { @@ -1368,10 +1198,12 @@ html_process_input(rspamd_mempool_t *pool, comment_content, sgml_content, tag_content, - tag_end, + tag_end_opening, + tag_end_closing, html_text_content, xml_tag_end, content_style, + tags_limit_overflow, } state = parse_start; g_assert (in != NULL); @@ -1380,6 +1212,32 @@ html_process_input(rspamd_mempool_t *pool, struct html_content *hc = new html_content; rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc); + auto new_tag = [&](int flags = 0) -> struct html_tag * { + + if (hc->total_tags > rspamd::html::max_tags) { + hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; + + return nullptr; + } + + auto *parent = cur_tag; + + hc->all_tags.emplace_back(std::make_unique()); + auto *ntag = hc->all_tags.back().get(); + ntag->tag_start = c - start; + ntag->flags = flags; + + if (parent) { + ntag->parent = parent; + parent->children.push_back(ntag); + } + else { + hc->root_tag = ntag; + } + + return ntag; + }; + p = (const char *)in->data; c = p; end = p + in->len; @@ -1392,14 +1250,21 @@ html_process_input(rspamd_mempool_t *pool, case parse_start: if (t == '<') { state = tag_begin; + in_head = true; } else { /* We have no starting tag, so assume that it's content */ hc->flags |= RSPAMD_HTML_FLAG_BAD_START; - hc->all_tags.emplace_back(std::make_unique()); - cur_tag = hc->all_tags.back().get(); - cur_tag->id = Tag_HTML; - state = content_before_start; + in_head = false; + cur_tag = new_tag(); + + if (cur_tag) { + cur_tag->id = Tag_HTML; + state = content_before_start; + } + else { + state = tags_limit_overflow; + } } break; case content_before_start: @@ -1419,37 +1284,56 @@ html_process_input(rspamd_mempool_t *pool, closing = FALSE; break; case '!': - state = sgml_tag; - hc->all_tags.emplace_back(std::make_unique()); - cur_tag = hc->all_tags.back().get(); - cur_tag->tag_start = c - start; + cur_tag = 
new_tag(FL_XML); + if (cur_tag) { + state = sgml_tag; + } + else { + state = tags_limit_overflow; + } p ++; break; case '?': - state = xml_tag; - hc->all_tags.emplace_back(std::make_unique()); - cur_tag = hc->all_tags.back().get(); - cur_tag->tag_start = c - start; - cur_tag->flags |= FL_XML; + cur_tag = new_tag(FL_XML); + if (cur_tag) { + state = xml_tag; + } + else { + state = tags_limit_overflow; + } hc->flags |= RSPAMD_HTML_FLAG_XML; p ++; break; case '/': closing = TRUE; + /* We fill fake closing tag to fill it with the content parser */ + cur_closing_tag.clear(); + cur_closing_tag.parent = cur_tag; /* For simplicity */ + cur_tag = &cur_closing_tag; p ++; break; case '>': /* Empty tag */ hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; - state = tag_end; + state = html_text_content; continue; default: - state = tag_content; - content_parser_env.reset(); + if (g_ascii_isalpha(t)) { + state = tag_content; + content_parser_env.reset(); + cur_tag = new_tag(); - hc->all_tags.emplace_back(std::make_unique()); - cur_tag = hc->all_tags.back().get(); - cur_tag->tag_start = c - start; + if (cur_tag) { + state = tag_content; + } + else { + state = tags_limit_overflow; + } + } + else { + /* Wrong bad tag */ + state = html_text_content; + } break; } @@ -1482,7 +1366,7 @@ html_process_input(rspamd_mempool_t *pool, else if (t == '>') { /* Misformed xml tag */ hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; - state = tag_end; + state = tag_end_opening; continue; } /* We efficiently ignore xml tags */ @@ -1491,7 +1375,7 @@ html_process_input(rspamd_mempool_t *pool, case xml_tag_end: if (t == '>') { - state = tag_end; + state = tag_end_opening; continue; } else { @@ -1508,16 +1392,16 @@ html_process_input(rspamd_mempool_t *pool, ebrace ++; } else if (t == '>' && obrace == ebrace) { - state = tag_end; + state = tag_end_opening; continue; } p ++; break; case comment_tag: - if (t != '-') { + if (t != '-') { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; - state = tag_end; + state = 
tag_end_opening; } else { p++; @@ -1534,11 +1418,11 @@ html_process_input(rspamd_mempool_t *pool, if (p[0] == '-' && p + 1 < end && p[1] == '>') { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; p ++; - state = tag_end; + state = tag_end_opening; } else if (*p == '>') { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; - state = tag_end; + state = tag_end_opening; } else { state = comment_content; @@ -1551,7 +1435,7 @@ html_process_input(rspamd_mempool_t *pool, ebrace ++; } else if (t == '>' && ebrace >= 2) { - state = tag_end; + state = tag_end_opening; continue; } else { @@ -1610,7 +1494,7 @@ html_process_input(rspamd_mempool_t *pool, case sgml_content: /* TODO: parse DOCTYPE here */ if (t == '>') { - state = tag_end; + state = tag_end_closing; /* We don't know a lot about sgml tags, ignore them */ cur_tag = nullptr; continue; @@ -1623,32 +1507,38 @@ html_process_input(rspamd_mempool_t *pool, if (t == '>') { if (closing) { - cur_tag->flags |= FL_CLOSING; + cur_tag->closing.start = c - start; + cur_tag->closing.end = p - start + 1; - if (cur_tag->flags & FL_CLOSED) { - /* Bad mix of closed and closing */ - hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; + closing = FALSE; + if (cur_tag->id == Tag_HEAD) { + in_head = false; + } + state = tag_end_closing; + } + else { + cur_tag->content_offset = p - start + 1; + if (cur_tag->id == Tag_HEAD) { + in_head = true; + } + else if (cur_tag->id == Tag_BODY) { + in_head = false; } - closing = FALSE; + state = tag_end_opening; } - state = tag_end; + continue; } p ++; break; - case tag_end: + case tag_end_opening: content_parser_env.reset(); if (cur_tag != nullptr) { - cur_tag->content_offset = p - start + 1; - - html_process_tag(pool, hc, cur_tag, tags_stack, - c - start, p - start); - if (cur_tag->id != -1 && cur_tag->id < N_TAGS) { if (cur_tag->flags & CM_UNIQUE) { if (!hc->tags_seen[cur_tag->id]) { @@ -1659,37 +1549,31 @@ html_process_input(rspamd_mempool_t *pool, hc->tags_seen[cur_tag->id] = true; } - if (cur_tag->id == Tag_HEAD) { - 
in_head = !(cur_tag->flags & FL_CLOSING); - } - /* XXX: uncomment when styles parsing is not so broken */ if (cur_tag->flags & FL_HREF && !in_head) { - if (!(cur_tag->flags & (FL_CLOSING))) { - auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); - if (maybe_url) { - url = maybe_url.value(); - - if (url_set != NULL) { - struct rspamd_url *maybe_existing = - rspamd_url_set_add_or_return (url_set, maybe_url.value()); - if (maybe_existing == maybe_url.value()) { - html_process_query_url(pool, url, url_set, - part_urls); - } - else { - url = maybe_existing; - /* Increase count to avoid odd checks failure */ - url->count ++; - } - } + if (maybe_url) { + url = maybe_url.value(); - href_offset = hc->parsed.size(); + if (url_set != NULL) { + struct rspamd_url *maybe_existing = + rspamd_url_set_add_or_return (url_set, maybe_url.value()); + if (maybe_existing == maybe_url.value()) { + html_process_query_url(pool, url, url_set, + part_urls); + } + else { + url = maybe_existing; + /* Increase count to avoid odd checks failure */ + url->count ++; + } } + + href_offset = hc->parsed.size(); } } - else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) { + else if (cur_tag->id == Tag_BASE) { /* * Base is allowed only within head tag but HTML is retarded */ @@ -1708,42 +1592,53 @@ html_process_input(rspamd_mempool_t *pool, } } - if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) { + if (cur_tag->id == Tag_IMG) { html_process_img_tag(pool, cur_tag, hc, url_set, part_urls); } - else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) { + else if (cur_tag->id == Tag_LINK) { html_process_link_tag(pool, cur_tag, hc, url_set, part_urls); } if (!(cur_tag->flags & CM_EMPTY)) { - - if (!(cur_tag->flags & FL_CLOSING)) { - html_process_block_tag(pool, cur_tag, hc); - } + html_process_block_tag(pool, cur_tag, hc); } } - if (cur_tag && (cur_tag->id == Tag_STYLE && !(cur_tag->flags & 
FL_CLOSING))) { + if (cur_tag && (cur_tag->id == Tag_STYLE)) { state = content_style; } else { state = html_text_content; } + if (!(cur_tag->flags & (FL_CLOSED|CM_EMPTY))) { + /* Pop stack to the parent */ + cur_tag = cur_tag->parent; + } + p++; c = p; - cur_tag = NULL; + break; + case tag_end_closing: + /* cur_tag here is a closing tag */ + html_check_balance(hc, cur_tag, + c - start, p - start); + cur_tag = nullptr; + break; + case tags_limit_overflow: + msg_warn_pool("tags limit of %d tags is reached at the position %d;" + " ignoring the rest of the HTML content", + (int)hc->all_tags.size(), (int)(p - start)); + html_append_content(hc, {p, (std::size_t)(end - p)}); + p = end; break; } } /* Propagate styles */ hc->traverse_block_tags([&hc](const html_tag *tag) -> bool { - if (tag->flags & FL_CLOSING) { - return true; - } if (hc->css_style) { auto *css_block = hc->css_style->check_tag_block(tag); diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index 368155d81..67ef5a612 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -110,7 +110,7 @@ struct html_content { auto traverse_all_tags(fu2::function &&func) const -> bool { for (const auto &tag : all_tags) { - if (!(tag->flags & (FL_CLOSING|FL_XML))) { + if (!(tag->flags & (FL_XML|FL_VIRTUAL))) { if (!func(tag.get())) { return false; } diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index a79195a5f..40b2eb955 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -44,16 +44,14 @@ enum class html_component_type : std::uint8_t { /* Public tags flags */ /* XML tag */ #define FL_XML (1 << 22) -/* Closing tag */ -#define FL_CLOSING (1 << 23) /* Fully closed tag (e.g. 
) */ -#define FL_CLOSED (1 << 24) -#define FL_BROKEN (1 << 25) -#define FL_IGNORE (1 << 26) -#define FL_BLOCK (1 << 27) -#define FL_HREF (1 << 28) -#define FL_COMMENT (1 << 29) -#define FL_VIRTUAL (1 << 30) +#define FL_CLOSED (1 << 23) +#define FL_BROKEN (1 << 24) +#define FL_IGNORE (1 << 25) +#define FL_BLOCK (1 << 26) +#define FL_HREF (1 << 27) +#define FL_COMMENT (1 << 28) +#define FL_VIRTUAL (1 << 29) /** * Returns component type from a string @@ -71,23 +69,33 @@ struct html_tag_component { : type(type), value(value) {} }; +/* Pairing closing tag representation */ +struct html_closing_tag { + int start = -1; + int end = -1; + + auto clear() -> void { + start = end = -1; + } +}; + struct html_tag { unsigned int tag_start = 0; - unsigned int content_length = 0; unsigned int content_offset = 0; std::uint32_t flags = 0; std::int32_t id = -1; + html_closing_tag closing; - std::vector parameters; + std::vector components; html_tag_extra_t extra; - mutable struct html_block *block = nullptr; /* TODO: temporary, must be handled by css */ + mutable struct html_block *block = nullptr; std::vector children; struct html_tag *parent; auto find_component(html_component_type what) const -> std::optional { - for (const auto &comp : parameters) { + for (const auto &comp : components) { if (comp.type == what) { return comp.value; } @@ -104,6 +112,17 @@ struct html_tag { return std::nullopt; } + + auto clear(void) -> void { + id = -1; + tag_start = content_offset = 0; + extra = std::monostate{}; + components.clear(); + flags = 0; + block = nullptr; + children.clear(); + closing.clear(); + } }; } diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx index 376df9fbb..4348d91c0 100644 --- a/src/lua/lua_html.cxx +++ b/src/lua/lua_html.cxx @@ -447,7 +447,7 @@ lua_html_foreach_tag (lua_State *L) ltag->tag = tag; ltag->html = hc; rspamd_lua_setclass (L, "rspamd{html_tag}", -1); - lua_pushinteger (L, tag->content_length); + lua_pushinteger (L, tag->closing.start - 
tag->content_offset); /* Leaf flag */ if (tag->children.empty()) { @@ -541,10 +541,6 @@ lua_html_tag_get_flags (lua_State *L) if (ltag->tag) { /* Push flags */ lua_createtable (L, 4, 0); - if (ltag->tag->flags & FL_CLOSING) { - lua_pushstring (L, "closing"); - lua_rawseti (L, -2, i++); - } if (ltag->tag->flags & FL_HREF) { lua_pushstring (L, "href"); lua_rawseti (L, -2, i++); @@ -581,13 +577,14 @@ lua_html_tag_get_content (lua_State *L) struct rspamd_lua_text *t; if (ltag) { - if (ltag->html && ltag->tag->content_length && - ltag->html->parsed.size() >= ltag->tag->content_offset + ltag->tag->content_length) { + auto clen = ltag->tag->closing.start - ltag->tag->content_offset; + if (ltag->html && clen && + ltag->html->parsed.size() >= ltag->tag->content_offset + clen) { t = static_cast<struct rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t))); rspamd_lua_setclass (L, "rspamd{text}", -1); t->start = reinterpret_cast<const char *>(ltag->html->parsed.data()) + ltag->tag->content_offset; - t->len = ltag->tag->content_length; + t->len = clen; t->flags = 0; } else { @@ -608,7 +605,7 @@ lua_html_tag_get_content_length (lua_State *L) struct lua_html_tag *ltag = lua_check_html_tag (L, 1); if (ltag) { - lua_pushinteger (L, ltag->tag->content_length); + lua_pushinteger (L, ltag->tag->closing.start - ltag->tag->content_offset); } else { return luaL_error (L, "invalid arguments");