source.dussan.org Git - rspamd.git/commitdiff
[Project] Html: Another rework of the tags structure
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 30 Jun 2021 11:50:43 +0000 (12:50 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 30 Jun 2021 11:50:43 +0000 (12:50 +0100)
src/libserver/css/css.cxx
src/libserver/html/html.cxx
src/libserver/html/html.hxx
src/libserver/html/html_tag.hxx
src/lua/lua_html.cxx

index 51f537b5a9c044aabc015089fbe0e9567473080e..9e26eb42f84e31101534906aac27367ed590c540 100644 (file)
@@ -114,7 +114,7 @@ css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) ->
        }
 
        /* First, find id in a tag and a class */
-       for (const auto &param : tag->parameters) {
+       for (const auto &param : tag->components) {
                if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) {
                        id_comp = param.value;
                }
index f0315434a0d3b281535bb909f96ed1a6c1d3020b..9d1531f04839d5c65323edbd1ac5d030827b433d 100644 (file)
@@ -75,259 +75,91 @@ auto html_components_map = frozen::make_unordered_map<frozen::string, html_compo
 
 INIT_LOG_MODULE(html)
 
+/*
+ * This function is expected to be called on a closing tag to fill in the closing
+ * offsets of the tags it closes and return the current parent (i.e. unclosed) tag
+ */
 static auto
 html_check_balance(struct html_content *hc,
                                   struct html_tag *tag,
-                                  struct html_tag *parent,
-                                  std::vector<html_tag *> &tags_stack,
                                   goffset tag_start_offset,
-                                  goffset tag_end_offset) -> bool
+                                  goffset tag_end_offset) -> html_tag *
 {
+       /* By convention, the parent pointer of a closing tag refers to the last opening tag */
+       auto *opening_tag = tag->parent;
 
-       auto calculate_content_length = [tag_start_offset](html_tag *t) {
+       auto calculate_content_length = [tag_start_offset,tag_end_offset](html_tag *t) {
                auto opening_content_offset = t->content_offset;
 
                if (opening_content_offset <= tag_start_offset) {
-                       t->content_length = tag_start_offset - opening_content_offset;
+                       t->closing.start = tag_start_offset;
+                       t->closing.end = tag_end_offset;
                }
                else {
-                       t->content_length = 0;
+
+                       t->closing.start = t->content_offset;
+                       t->closing.end = tag_end_offset;
                }
        };
 
-       auto balance_tag = [&]() -> void {
-               auto it = tags_stack.rbegin();
+       auto balance_tag = [&]() -> html_tag * {
+               auto it = tag->parent;
 
-               for (auto end_it = tags_stack.rend(); it != end_it; ++it) {
-                       if ((*it)->id == tag->id && !((*it)->flags & FL_CLOSING)) {
+               for (; it != nullptr; it = it->parent) {
+                       if (it->id == tag->id && !(it->flags & FL_CLOSED)) {
                                break;
                        }
                        /* Insert a virtual closing tag for all tags that are not closed */
-                       auto &&vtag = std::make_unique<html_tag>();
-                       vtag->id = (*it)->id;
-                       vtag->flags = FL_VIRTUAL|FL_CLOSING;
-                       vtag->tag_start = tag->tag_start;
-                       vtag->content_offset = tag->content_offset;
-                       vtag->content_length = 0;
-                       vtag->parent = (*it)->parent;
-                       calculate_content_length(*it);
-                       (*it)->flags |= FL_CLOSED;
-                       hc->all_tags.emplace_back(std::move(vtag));
+                       calculate_content_length(it);
+                       it->flags |= FL_CLOSED;
                }
 
                /* Remove tags */
-               tags_stack.erase(it.base(), std::end(tags_stack));
+               return it;
        };
 
-       if (tag->flags & FL_CLOSING) {
-               if (!tags_stack.empty()) {
-                       auto *last_tag = tags_stack.back();
+       if (opening_tag) {
 
-                       if (last_tag->id == tag->id && !(last_tag->flags & FL_CLOSED)) {
-                               last_tag->flags |= FL_CLOSED;
+               if (opening_tag->id == tag->id) {
+                       opening_tag->flags |= FL_CLOSED;
 
-                               calculate_content_length(last_tag);
-                               tags_stack.pop_back();
-                               /* All good */
-                               return true;
-                       }
-                       else {
-                               balance_tag();
-
-                               return false;
-                       }
+                       calculate_content_length(opening_tag);
+                       /* All good */
+                       return opening_tag->parent;
                }
                else {
-                       /*
-                        * We have no opening tags in the stack, so we need to assume that there
-                        * is an opening tag at the beginning of the document.
-                        * There are two possibilities:
-                        *
-                        * 1) We have some block tag in hc->all_tags;
-                        * 2) We have no tags
-                        */
-
-                       if (hc->all_tags.empty()) {
-                               auto &&vtag = std::make_unique<html_tag>();
-                               vtag->id = tag->id;
-                               vtag->flags = FL_VIRTUAL|FL_CLOSED;
-                               vtag->tag_start = 0;
-                               vtag->content_offset = 0;
-                               calculate_content_length(vtag.get());
-
-
-                               if (!hc->root_tag) {
-                                       hc->root_tag = vtag.get();
-                               }
-                               else {
-                                       vtag->parent = hc->root_tag;
-                               }
-                               hc->all_tags.emplace_back(std::move(vtag));
-                       }
-                       else {
-                               auto found_closing = std::find_if(hc->all_tags.rbegin(),
-                                               hc->all_tags.rend(),
-                                               [&](const auto &t) {
-                                                       constexpr const auto expect_flags = FL_BLOCK|FL_CLOSING;
-                                                       return (t->flags & expect_flags) == (expect_flags) &&
-                                                                       t.get() != tag &&
-                                                                       t->parent != nullptr;
-                                               });
-
-                               if (found_closing != hc->all_tags.rend()) {
-                                       auto *closing_tag = (*found_closing).get();
-                                       auto &&vtag = std::make_unique<html_tag>();
-                                       vtag->id = tag->id;
-                                       vtag->flags = FL_VIRTUAL|FL_CLOSED;
-                                       vtag->tag_start = closing_tag->content_offset - 1;
-                                       vtag->content_offset = vtag->tag_start + 1;
-                                       vtag->parent = closing_tag->parent;
-                                       vtag->content_length = tag->tag_start - vtag->content_offset;
-                                       hc->all_tags.emplace_back(std::move(vtag));
-                               }
-                               else {
-                                       auto &&vtag = std::make_unique<html_tag>();
-                                       vtag->id = tag->id;
-                                       vtag->flags = FL_VIRTUAL|FL_CLOSED;
-                                       vtag->tag_start = 0;
-                                       vtag->content_offset = 0;
-                                       calculate_content_length(vtag.get());
-
-
-                                       if (!hc->root_tag) {
-                                               hc->root_tag = vtag.get();
-                                       }
-                                       else {
-                                               vtag->parent = hc->root_tag;
-                                       }
-                                       hc->all_tags.emplace_back(std::move(vtag));
-                               }
-                       }
+                       return balance_tag();
                }
-
-               return false;
-       }
-
-       /* Misuse */
-       RSPAMD_UNREACHABLE;
-}
-
-static auto
-html_process_tag(rspamd_mempool_t *pool,
-                                struct html_content *hc,
-                                struct html_tag *tag,
-                                std::vector<html_tag *> &tags_stack,
-                                goffset tag_start_offset,
-                                goffset tag_end_offset) -> bool
-{
-       struct html_tag *parent;
-
-       if (hc->total_tags > rspamd::html::max_tags) {
-               hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
-       }
-
-       if (tag->id == -1) {
-               /* Ignore unknown tags */
-               hc->total_tags++;
-               return false;
-       }
-
-
-       if (tags_stack.empty()) {
-               parent = hc->root_tag;
        }
        else {
-               parent = tags_stack.back();
-       }
-
-       tag->parent = parent;
-
-       if (!(tag->flags & (CM_EMPTY))) {
-               /* Block tag */
-               if (tag->flags & FL_CLOSING) {
-                       /* Closed block tag */
-                       if (parent == nullptr) {
-                               msg_debug_html ("bad parent node");
-                               return false;
-                       }
-
-                       if (!html_check_balance(hc, tag, parent, tags_stack,
-                                       tag_start_offset, tag_end_offset)) {
-                               msg_debug_html (
-                                               "mark part as unbalanced as it has not pairable closing tags");
-                               hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
-                       }
-               }
-               else {
-                       /* Opening block tag */
-                       if (parent) {
-                               if ((parent->flags & FL_IGNORE)) {
-                                       tag->flags |= FL_IGNORE;
-                               }
-
-                               if (!(tag->flags & FL_CLOSED) &&
-                                       (parent->flags & CM_EMPTY)) {
-                                       /* We likely have some bad nesting */
-                                       if (parent->id == tag->id) {
-                                               /* Something like <a>bla<a>foo... */
-                                               hc->flags |= RSPAMD_HTML_FLAG_UNBALANCED;
-                                               tag->parent = parent->parent;
-
-                                               if (hc->total_tags < rspamd::html::max_tags) {
-                                                       parent->children.push_back(tag);
-                                                       tags_stack.push_back(tag);
-                                                       hc->total_tags++;
-                                               }
-
-                                               return true;
-                                       }
-                               }
-
-                               if (hc->total_tags < rspamd::html::max_tags) {
-                                       parent->children.push_back(tag);
+               /*
+                * We have no opening tag
+                * There are two possibilities:
+                *
+                * 1) We have some block tag in hc->all_tags;
+                * 2) We have no tags
+                */
+
+               if (hc->all_tags.empty()) {
+                       auto &&vtag = std::make_unique<html_tag>();
+                       vtag->id = tag->id;
+                       vtag->flags = FL_VIRTUAL|FL_CLOSED;
+                       vtag->tag_start = 0;
+                       vtag->content_offset = 0;
+                       calculate_content_length(vtag.get());
 
-                                       if ((tag->flags & FL_CLOSED) == 0) {
-                                               tags_stack.push_back(tag);
-                                       }
 
-                                       hc->total_tags++;
-                               }
+                       if (!hc->root_tag) {
+                               hc->root_tag = vtag.get();
                        }
                        else {
-                               hc->root_tag = tag;
-                               if (hc->total_tags < rspamd::html::max_tags) {
-                                       if ((tag->flags & FL_CLOSED) == 0) {
-                                               tags_stack.push_back(tag);
-                                       }
-
-                                       hc->total_tags++;
-                               }
-                       }
-
-                       if (tag->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE)) {
-                               tag->flags |= FL_IGNORE;
-
-                               return false;
-                       }
-               }
-       }
-       else {
-               /* Inline tag */
-               if (parent) {
-                       if (hc->total_tags < rspamd::html::max_tags) {
-                               parent->children.push_back(tag);
-
-                               hc->total_tags++;
-                       }
-                       if ((parent->flags & (CM_HEAD | CM_UNKNOWN | FL_IGNORE))) {
-                               tag->flags |= FL_IGNORE;
-
-                               return false;
+                               vtag->parent = hc->root_tag;
                        }
+                       hc->all_tags.emplace_back(std::move(vtag));
                }
        }
 
-       return true;
+       return nullptr;
 }
 
 auto
@@ -404,7 +236,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                spaces_before_eq,
                spaces_after_eq,
                spaces_after_param,
-               ignore_bad_tag
+               ignore_bad_tag,
        } state;
        gboolean store = FALSE;
 
@@ -425,7 +257,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
                                auto *s = rspamd_mempool_alloc_buffer(pool, sz);
                                memcpy(s, parser_env.saved_p, sz);
                                sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p);
-                               tag->parameters.emplace_back(parser_env.cur_component.value(),
+                               tag->components.emplace_back(parser_env.cur_component.value(),
                                                std::string_view{s, sz});
                }
 
@@ -941,7 +773,7 @@ html_process_img_tag(rspamd_mempool_t *pool,
        img = rspamd_mempool_alloc0_type (pool, struct html_image);
        img->tag = tag;
 
-       for (const auto &param : tag->parameters) {
+       for (const auto &param : tag->components) {
 
                if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
                        /* Check base url */
@@ -1103,7 +935,7 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
 {
        std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
 
-       for (const auto &param : tag->parameters) {
+       for (const auto &param : tag->components) {
                if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
                        maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
                }
@@ -1173,7 +1005,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                                                khash_t (rspamd_url_hash) *url_set) -> goffset
 {
        auto is_visible = true, is_block = false;
-       goffset next_tag_offset = tag->content_length + tag->content_offset,
+       goffset next_tag_offset = tag->closing.end,
                        initial_dest_offset = hc->parsed.size();
 
        if (tag->id == Tag_BR || tag->id == Tag_HR) {
@@ -1182,7 +1014,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                return tag->content_offset;
        }
 
-       if ((tag->flags & (FL_COMMENT|FL_XML))) {
+       if ((tag->flags & (FL_COMMENT|FL_XML|FL_IGNORE))) {
                is_visible = false;
        }
        else {
@@ -1213,8 +1045,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                if (!enclosed_tags.empty()) {
                        next_enclosed = enclosed_tags.back();
                        enclosed_start = next_enclosed->tag_start;
-                       enclosed_end = next_enclosed->content_length +
-                                                  next_enclosed->content_offset;
+                       enclosed_end = next_enclosed->closing.end;
 
                        if (enclosed_end > next_tag_offset) {
                                next_tag_offset = enclosed_end;
@@ -1270,7 +1101,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                }
        }
 
-       if (is_visible && !(tag->flags & FL_CLOSING)) {
+       if (is_visible) {
                if (tag->id == Tag_A) {
                        auto written_len = hc->parsed.size() - initial_dest_offset;
                        html_process_displayed_href_tag(pool, hc,
@@ -1313,7 +1144,7 @@ html_append_tags_content(rspamd_mempool_t *pool,
        for (auto i = 0; i < hc->all_tags.size();) {
                const auto &tag = hc->all_tags[i];
                html_tag *next_tag = nullptr;
-               auto next_offset = tag->content_offset + tag->content_length;
+               auto next_offset = tag->closing.end;
 
                auto j = i + 1;
                while (j < hc->all_tags.size()) {
@@ -1321,9 +1152,9 @@ html_append_tags_content(rspamd_mempool_t *pool,
 
                        if (next_tag->content_offset <= next_offset) {
                                enclosed_tags_stack.push_back(next_tag);
-                               if (next_tag->content_offset + next_tag->content_length > next_offset) {
+                               if (next_tag->closing.end > next_offset) {
                                        /* Tag spans over its parent */
-                                       next_offset = next_tag->content_offset + next_tag->content_length;
+                                       next_offset = next_tag->closing.end;
                                }
                                j ++;
                        }
@@ -1351,10 +1182,9 @@ html_process_input(rspamd_mempool_t *pool,
        guchar t;
        auto closing = false, in_head = false;
        guint obrace = 0, ebrace = 0;
-       struct rspamd_url *url = NULL;
+       struct rspamd_url *url = nullptr;
        gint href_offset = -1;
-       struct html_tag *cur_tag = NULL;
-       std::vector<html_tag *> tags_stack;
+       struct html_tag *cur_tag = nullptr, cur_closing_tag;
        struct tag_content_parser_state content_parser_env;
 
        enum {
@@ -1368,10 +1198,12 @@ html_process_input(rspamd_mempool_t *pool,
                comment_content,
                sgml_content,
                tag_content,
-               tag_end,
+               tag_end_opening,
+               tag_end_closing,
                html_text_content,
                xml_tag_end,
                content_style,
+               tags_limit_overflow,
        } state = parse_start;
 
        g_assert (in != NULL);
@@ -1380,6 +1212,32 @@ html_process_input(rspamd_mempool_t *pool,
        struct html_content *hc = new html_content;
        rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc);
 
+       auto new_tag = [&](int flags = 0) -> struct html_tag * {
+
+               if (hc->total_tags > rspamd::html::max_tags) {
+                       hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
+
+                       return nullptr;
+               }
+
+               auto *parent = cur_tag;
+
+               hc->all_tags.emplace_back(std::make_unique<html_tag>());
+               auto *ntag = hc->all_tags.back().get();
+               ntag->tag_start = c - start;
+               ntag->flags = flags;
+
+               if (parent) {
+                       ntag->parent = parent;
+                       parent->children.push_back(ntag);
+               }
+               else {
+                       hc->root_tag = ntag;
+               }
+
+               return ntag;
+       };
+
        p = (const char *)in->data;
        c = p;
        end = p + in->len;
@@ -1392,14 +1250,21 @@ html_process_input(rspamd_mempool_t *pool,
                case parse_start:
                        if (t == '<') {
                                state = tag_begin;
+                               in_head = true;
                        }
                        else {
                                /* We have no starting tag, so assume that it's content */
                                hc->flags |= RSPAMD_HTML_FLAG_BAD_START;
-                               hc->all_tags.emplace_back(std::make_unique<html_tag>());
-                               cur_tag = hc->all_tags.back().get();
-                               cur_tag->id = Tag_HTML;
-                               state = content_before_start;
+                               in_head = false;
+                               cur_tag = new_tag();
+
+                               if (cur_tag) {
+                                       cur_tag->id = Tag_HTML;
+                                       state = content_before_start;
+                               }
+                               else {
+                                       state = tags_limit_overflow;
+                               }
                        }
                        break;
                case content_before_start:
@@ -1419,37 +1284,56 @@ html_process_input(rspamd_mempool_t *pool,
                                closing = FALSE;
                                break;
                        case '!':
-                               state = sgml_tag;
-                               hc->all_tags.emplace_back(std::make_unique<html_tag>());
-                               cur_tag = hc->all_tags.back().get();
-                               cur_tag->tag_start = c - start;
+                               cur_tag = new_tag(FL_XML);
+                               if (cur_tag) {
+                                       state = sgml_tag;
+                               }
+                               else {
+                                       state = tags_limit_overflow;
+                               }
                                p ++;
                                break;
                        case '?':
-                               state = xml_tag;
-                               hc->all_tags.emplace_back(std::make_unique<html_tag>());
-                               cur_tag = hc->all_tags.back().get();
-                               cur_tag->tag_start = c - start;
-                               cur_tag->flags |= FL_XML;
+                               cur_tag = new_tag(FL_XML);
+                               if (cur_tag) {
+                                       state = xml_tag;
+                               }
+                               else {
+                                       state = tags_limit_overflow;
+                               }
                                hc->flags |= RSPAMD_HTML_FLAG_XML;
                                p ++;
                                break;
                        case '/':
                                closing = TRUE;
+                               /* Set up a fake closing tag so that the content parser has a tag to fill in */
+                               cur_closing_tag.clear();
+                               cur_closing_tag.parent = cur_tag; /* For simplicity */
+                               cur_tag = &cur_closing_tag;
                                p ++;
                                break;
                        case '>':
                                /* Empty tag */
                                hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-                               state = tag_end;
+                               state = html_text_content;
                                continue;
                        default:
-                               state = tag_content;
-                               content_parser_env.reset();
+                               if (g_ascii_isalpha(t)) {
+                                       state = tag_content;
+                                       content_parser_env.reset();
+                                       cur_tag = new_tag();
 
-                               hc->all_tags.emplace_back(std::make_unique<html_tag>());
-                               cur_tag = hc->all_tags.back().get();
-                               cur_tag->tag_start = c - start;
+                                       if (cur_tag) {
+                                               state = tag_content;
+                                       }
+                                       else {
+                                               state = tags_limit_overflow;
+                                       }
+                               }
+                               else {
+                                       /* Not a letter after '<': not a valid tag, treat it as text */
+                                       state = html_text_content;
+                               }
                                break;
                        }
 
@@ -1482,7 +1366,7 @@ html_process_input(rspamd_mempool_t *pool,
                        else if (t == '>') {
                                /* Misformed xml tag */
                                hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-                               state = tag_end;
+                               state = tag_end_opening;
                                continue;
                        }
                        /* We efficiently ignore xml tags */
@@ -1491,7 +1375,7 @@ html_process_input(rspamd_mempool_t *pool,
 
                case xml_tag_end:
                        if (t == '>') {
-                               state = tag_end;
+                               state = tag_end_opening;
                                continue;
                        }
                        else {
@@ -1508,16 +1392,16 @@ html_process_input(rspamd_mempool_t *pool,
                                ebrace ++;
                        }
                        else if (t == '>' && obrace == ebrace) {
-                               state = tag_end;
+                               state = tag_end_opening;
                                continue;
                        }
                        p ++;
                        break;
 
                case comment_tag:
-                       if (t != '-')  {
+                       if (t != '-') {
                                hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-                               state = tag_end;
+                               state = tag_end_opening;
                        }
                        else {
                                p++;
@@ -1534,11 +1418,11 @@ html_process_input(rspamd_mempool_t *pool,
                                if (p[0] == '-' && p + 1 < end && p[1] == '>') {
                                        hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
                                        p ++;
-                                       state = tag_end;
+                                       state = tag_end_opening;
                                }
                                else if (*p == '>') {
                                        hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
-                                       state = tag_end;
+                                       state = tag_end_opening;
                                }
                                else {
                                        state = comment_content;
@@ -1551,7 +1435,7 @@ html_process_input(rspamd_mempool_t *pool,
                                ebrace ++;
                        }
                        else if (t == '>' && ebrace >= 2) {
-                               state = tag_end;
+                               state = tag_end_opening;
                                continue;
                        }
                        else {
@@ -1610,7 +1494,7 @@ html_process_input(rspamd_mempool_t *pool,
                case sgml_content:
                        /* TODO: parse DOCTYPE here */
                        if (t == '>') {
-                               state = tag_end;
+                               state = tag_end_closing;
                                /* We don't know a lot about sgml tags, ignore them */
                                cur_tag = nullptr;
                                continue;
@@ -1623,32 +1507,38 @@ html_process_input(rspamd_mempool_t *pool,
 
                        if (t == '>') {
                                if (closing) {
-                                       cur_tag->flags |= FL_CLOSING;
+                                       cur_tag->closing.start = c - start;
+                                       cur_tag->closing.end = p - start + 1;
 
-                                       if (cur_tag->flags & FL_CLOSED) {
-                                               /* Bad mix of closed and closing */
-                                               hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS;
+                                       closing = FALSE;
+                                       if (cur_tag->id == Tag_HEAD) {
+                                               in_head = false;
+                                       }
+                                       state = tag_end_closing;
+                               }
+                               else {
+                                       cur_tag->content_offset = p - start + 1;
+                                       if (cur_tag->id == Tag_HEAD) {
+                                               in_head = true;
+                                       }
+                                       else if (cur_tag->id == Tag_BODY) {
+                                               in_head = false;
                                        }
 
-                                       closing = FALSE;
+                                       state = tag_end_opening;
                                }
 
-                               state = tag_end;
+
                                continue;
                        }
                        p ++;
                        break;
 
-               case tag_end:
+               case tag_end_opening:
                        content_parser_env.reset();
 
                        if (cur_tag != nullptr) {
 
-                               cur_tag->content_offset = p - start + 1;
-
-                               html_process_tag(pool, hc, cur_tag, tags_stack,
-                                               c - start, p - start);
-
                                if (cur_tag->id != -1 && cur_tag->id < N_TAGS) {
                                        if (cur_tag->flags & CM_UNIQUE) {
                                                if (!hc->tags_seen[cur_tag->id]) {
@@ -1659,37 +1549,31 @@ html_process_input(rspamd_mempool_t *pool,
                                        hc->tags_seen[cur_tag->id] = true;
                                }
 
-                               if (cur_tag->id == Tag_HEAD) {
-                                       in_head = !(cur_tag->flags & FL_CLOSING);
-                               }
-
                                /* XXX: uncomment when styles parsing is not so broken */
                                if (cur_tag->flags & FL_HREF && !in_head) {
-                                       if (!(cur_tag->flags & (FL_CLOSING))) {
-                                               auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
+                                       auto maybe_url = html_process_url_tag(pool, cur_tag, hc);
 
-                                               if (maybe_url) {
-                                                       url = maybe_url.value();
-
-                                                       if (url_set != NULL) {
-                                                               struct rspamd_url *maybe_existing =
-                                                                               rspamd_url_set_add_or_return (url_set, maybe_url.value());
-                                                               if (maybe_existing == maybe_url.value()) {
-                                                                       html_process_query_url(pool, url, url_set,
-                                                                                       part_urls);
-                                                               }
-                                                               else {
-                                                                       url = maybe_existing;
-                                                                       /* Increase count to avoid odd checks failure */
-                                                                       url->count ++;
-                                                               }
-                                                       }
+                                       if (maybe_url) {
+                                               url = maybe_url.value();
 
-                                                       href_offset = hc->parsed.size();
+                                               if (url_set != NULL) {
+                                                       struct rspamd_url *maybe_existing =
+                                                                       rspamd_url_set_add_or_return (url_set, maybe_url.value());
+                                                       if (maybe_existing == maybe_url.value()) {
+                                                               html_process_query_url(pool, url, url_set,
+                                                                               part_urls);
+                                                       }
+                                                       else {
+                                                               url = maybe_existing;
+                                                               /* Increase count to avoid odd checks failure */
+                                                               url->count ++;
+                                                       }
                                                }
+
+                                               href_offset = hc->parsed.size();
                                        }
                                }
-                               else if (cur_tag->id == Tag_BASE && !(cur_tag->flags & (FL_CLOSING))) {
+                               else if (cur_tag->id == Tag_BASE) {
                                        /*
                                         * Base is allowed only within head tag but HTML is retarded
                                         */
@@ -1708,42 +1592,53 @@ html_process_input(rspamd_mempool_t *pool,
                                        }
                                }
 
-                               if (cur_tag->id == Tag_IMG && !(cur_tag->flags & FL_CLOSING)) {
+                               if (cur_tag->id == Tag_IMG) {
                                        html_process_img_tag(pool, cur_tag, hc, url_set,
                                                        part_urls);
                                }
-                               else if (cur_tag->id == Tag_LINK && !(cur_tag->flags & FL_CLOSING)) {
+                               else if (cur_tag->id == Tag_LINK) {
                                        html_process_link_tag(pool, cur_tag, hc, url_set,
                                                        part_urls);
                                }
 
                                if (!(cur_tag->flags & CM_EMPTY)) {
-
-                                       if (!(cur_tag->flags & FL_CLOSING)) {
-                                               html_process_block_tag(pool, cur_tag, hc);
-                                       }
+                                       html_process_block_tag(pool, cur_tag, hc);
                                }
                        }
 
-                       if (cur_tag && (cur_tag->id == Tag_STYLE && !(cur_tag->flags & FL_CLOSING))) {
+                       if (cur_tag && (cur_tag->id == Tag_STYLE)) {
                                state = content_style;
                        }
                        else {
                                state = html_text_content;
                        }
 
+                       if (!(cur_tag->flags & (FL_CLOSED|CM_EMPTY))) {
+                               /* Pop stack to the parent */
+                               cur_tag = cur_tag->parent;
+                       }
+
                        p++;
                        c = p;
-                       cur_tag = NULL;
+                       break;
+               case tag_end_closing:
+                       /* cur_tag here is a closing tag */
+                       html_check_balance(hc, cur_tag,
+                                       c - start, p - start);
+                       cur_tag = nullptr;
+                       break;
+               case tags_limit_overflow:
+                       msg_warn_pool("tags limit of %d tags is reached at the position %d;"
+                                " ignoring the rest of the HTML content",
+                                       (int)hc->all_tags.size(), (int)(p - start));
+                       html_append_content(hc, {p, (std::size_t)(end - p)});
+                       p = end;
                        break;
                }
        }
 
        /* Propagate styles */
        hc->traverse_block_tags([&hc](const html_tag *tag) -> bool {
-               if (tag->flags & FL_CLOSING) {
-                       return true;
-               }
 
                if (hc->css_style) {
                        auto *css_block = hc->css_style->check_tag_block(tag);
index 368155d81eac69925fba918b24d61dfb3589da0f..67ef5a612b0397d5326d74ae07c05723f42862ab 100644 (file)
@@ -110,7 +110,7 @@ struct html_content {
 
        auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool {
                for (const auto &tag : all_tags) {
-                       if (!(tag->flags & (FL_CLOSING|FL_XML))) {
+                       if (!(tag->flags & (FL_XML|FL_VIRTUAL))) {
                                if (!func(tag.get())) {
                                        return false;
                                }
index a79195a5f680f66020f15c136def607c8758c8bb..40b2eb955baf381274f9d6c0b2d03953ba322648 100644 (file)
@@ -44,16 +44,14 @@ enum class html_component_type : std::uint8_t {
 /* Public tags flags */
 /* XML tag */
 #define FL_XML          (1 << 22)
-/* Closing tag */
-#define FL_CLOSING      (1 << 23)
 /* Fully closed tag (e.g. <a attrs />) */
-#define FL_CLOSED       (1 << 24)
-#define FL_BROKEN       (1 << 25)
-#define FL_IGNORE       (1 << 26)
-#define FL_BLOCK        (1 << 27)
-#define FL_HREF         (1 << 28)
-#define FL_COMMENT      (1 << 29)
-#define FL_VIRTUAL      (1 << 30)
+#define FL_CLOSED       (1 << 23)
+#define FL_BROKEN       (1 << 24)
+#define FL_IGNORE       (1 << 25)
+#define FL_BLOCK        (1 << 26)
+#define FL_HREF         (1 << 27)
+#define FL_COMMENT      (1 << 28)
+#define FL_VIRTUAL      (1 << 29)
 
 /**
  * Returns component type from a string
@@ -71,23 +69,33 @@ struct html_tag_component {
                : type(type), value(value) {}
 };
 
+/* Offsets of the closing tag paired with an opening tag; -1 in both fields means "not set" */
+struct html_closing_tag {
+       int start = -1; /* presumably where the closing tag begins; start - content_offset is what the Lua bindings report as content length */
+       int end = -1; /* presumably the offset just past the closing tag - TODO confirm against html_check_balance */
+
+       auto clear() -> void { /* put both offsets back to the "not set" value */
+               start = end = -1;
+       }
+};
 struct html_tag {
        unsigned int tag_start = 0;
-       unsigned int content_length = 0;
        unsigned int content_offset = 0;
        std::uint32_t flags = 0;
        std::int32_t id = -1;
+       html_closing_tag closing;
 
-       std::vector<html_tag_component> parameters;
+       std::vector<html_tag_component> components;
 
        html_tag_extra_t extra;
-       mutable struct html_block *block = nullptr; /* TODO: temporary, must be handled by css */
+       mutable struct html_block *block = nullptr;
        std::vector<struct html_tag *> children;
        struct html_tag *parent;
 
        auto find_component(html_component_type what) const -> std::optional<std::string_view>
        {
-               for (const auto &comp : parameters) {
+               for (const auto &comp : components) {
                        if (comp.type == what) {
                                return comp.value;
                        }
@@ -104,6 +112,17 @@ struct html_tag {
 
                return std::nullopt;
        }
+
+       auto clear(void) -> void { /* reset the fields listed below; NOTE(review): `parent` is left untouched - confirm this is intended */
+               id = -1; /* same value as the default member initializer */
+               tag_start = content_offset = 0;
+               extra = std::monostate{}; /* presumably empties the html_tag_extra_t variant - verify its first alternative */
+               components.clear();
+               flags = 0;
+               block = nullptr; /* only the pointer is cleared; nothing is freed here */
+               children.clear();
+               closing.clear(); /* closing offsets go back to -1 */
+       }
 };
 
 }
index 376df9fbb26c3d9854373aeb96d2fc0c89e64475..4348d91c03b2146670a669a8b9c08314bc92e8bb 100644 (file)
@@ -447,7 +447,7 @@ lua_html_foreach_tag (lua_State *L)
                                ltag->tag = tag;
                                ltag->html = hc;
                                rspamd_lua_setclass (L, "rspamd{html_tag}", -1);
-                               lua_pushinteger (L, tag->content_length);
+                               lua_pushinteger (L, tag->closing.start - tag->content_offset);
 
                                /* Leaf flag */
                                if (tag->children.empty()) {
@@ -541,10 +541,6 @@ lua_html_tag_get_flags (lua_State *L)
        if (ltag->tag) {
                /* Push flags */
                lua_createtable (L, 4, 0);
-               if (ltag->tag->flags & FL_CLOSING) {
-                       lua_pushstring (L, "closing");
-                       lua_rawseti (L, -2, i++);
-               }
                if (ltag->tag->flags & FL_HREF) {
                        lua_pushstring (L, "href");
                        lua_rawseti (L, -2, i++);
@@ -581,13 +577,14 @@ lua_html_tag_get_content (lua_State *L)
        struct rspamd_lua_text *t;
 
        if (ltag) {
-               if (ltag->html && ltag->tag->content_length &&
-                               ltag->html->parsed.size() >= ltag->tag->content_offset + ltag->tag->content_length) {
+               auto clen = ltag->tag->closing.start - ltag->tag->content_offset;
+               if (ltag->html && clen &&
+                               ltag->html->parsed.size() >= ltag->tag->content_offset + clen) {
                        t = static_cast<rspamd_lua_text *>(lua_newuserdata(L, sizeof(*t)));
                        rspamd_lua_setclass (L, "rspamd{text}", -1);
                        t->start = reinterpret_cast<const char *>(ltag->html->parsed.data()) +
                                        ltag->tag->content_offset;
-                       t->len = ltag->tag->content_length;
+                       t->len = clen;
                        t->flags = 0;
                }
                else {
@@ -608,7 +605,7 @@ lua_html_tag_get_content_length (lua_State *L)
        struct lua_html_tag *ltag = lua_check_html_tag (L, 1);
 
        if (ltag) {
-               lua_pushinteger (L, ltag->tag->content_length);
+               lua_pushinteger (L, ltag->tag->closing.start - ltag->tag->content_offset);
        }
        else {
                return luaL_error (L, "invalid arguments");