From: Vsevolod Stakhov Date: Thu, 1 Jul 2021 14:25:37 +0000 (+0100) Subject: [Minor] Fix the final tag processing + sgml/xml tags fixes X-Git-Tag: 3.0~224 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=78d955e5735077953966863c1d8c0ade38c27e73;p=rspamd.git [Minor] Fix the final tag processing + sgml/xml tags fixes --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 6cd5aa246..088202286 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1207,6 +1207,116 @@ html_process_input(rspamd_mempool_t *pool, return ntag; }; + auto process_opening_tag = [&]() { + if (cur_tag->id < N_TAGS) { + if (cur_tag->flags & CM_UNIQUE) { + if (!hc->tags_seen[cur_tag->id]) { + /* Duplicate tag has been found */ + hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS; + } + } + hc->tags_seen[cur_tag->id] = true; + } + + /* Shift to the first unclosed tag */ + while (parent_tag && (parent_tag->flags & FL_CLOSED)) { + parent_tag = parent_tag->parent; + } + + if (parent_tag) { + cur_tag->parent = parent_tag; + parent_tag->children.push_back(cur_tag); + } + else { + if (hc->root_tag) { + cur_tag->parent = hc->root_tag; + hc->root_tag->children.push_back(cur_tag); + parent_tag = hc->root_tag; + } + else { + if (cur_tag->id == Tag_HTML) { + hc->root_tag = cur_tag; + } + else { + /* Insert a fake html tag */ + hc->all_tags.emplace_back(std::make_unique()); + auto *top_tag = hc->all_tags.back().get(); + top_tag->tag_start = 0; + top_tag->flags = FL_VIRTUAL; + if (in_head) { + top_tag->flags |= CM_HEAD; + } + top_tag->id = Tag_HTML; + top_tag->content_offset = 0; + top_tag->children.push_back(cur_tag); + cur_tag->parent = top_tag; + hc->root_tag = top_tag; + parent_tag = top_tag; + } + } + } + + if (cur_tag->flags & FL_HREF && !in_head) { + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + + if (maybe_url) { + url = maybe_url.value(); + + if (url_set != NULL) { + struct rspamd_url *maybe_existing = + rspamd_url_set_add_or_return (url_set, maybe_url.value()); + if (maybe_existing == maybe_url.value()) { + html_process_query_url(pool, url, url_set, + part_urls); + } + else { + url = maybe_existing; + /* Increase count to avoid odd checks failure */ + url->count ++; + } + } + + href_offset = hc->parsed.size(); + } + } + else if (cur_tag->id == Tag_BASE) { + /* + * Base is allowed only within head tag but HTML is retarded + */ + if (hc->base_url == NULL) { + auto maybe_url = html_process_url_tag(pool, cur_tag, hc); + + if (maybe_url) { + msg_debug_html ("got valid base tag"); + hc->base_url = url; + cur_tag->extra = url; + cur_tag->flags |= FL_HREF; + } + else { + msg_debug_html ("got invalid base tag!"); + } + } + } + + if (cur_tag->id == Tag_IMG) { + html_process_img_tag(pool, cur_tag, hc, url_set, + part_urls); + } + else if (cur_tag->id == Tag_LINK) { + html_process_link_tag(pool, cur_tag, hc, url_set, + part_urls); + } + + if (!(cur_tag->flags & CM_EMPTY)) { + html_process_block_tag(pool, cur_tag, hc); + } + + if (cur_tag->flags & FL_CLOSED) { + cur_tag->closing.end = cur_tag->content_offset; + cur_tag->closing.start = cur_tag->tag_start; + } + }; + p = (const char *)in->data; c = p; end = p + in->len; @@ -1219,7 +1329,6 @@ html_process_input(rspamd_mempool_t *pool, case parse_start: if (t == '<') { state = tag_begin; - in_head = true; } else { /* We have no starting tag, so assume that it's content */ @@ -1349,6 +1458,7 @@ html_process_input(rspamd_mempool_t *pool, if (t == '>') { state = tag_end_opening; cur_tag->content_offset = p - start + 1; + continue; } else { hc->flags |= RSPAMD_HTML_FLAG_BAD_ELEMENTS; @@ -1366,6 +1476,7 @@ html_process_input(rspamd_mempool_t *pool, else if (t == '>' && obrace == ebrace) { state = tag_end_opening; cur_tag->content_offset = p - start + 1; + continue; } p ++; break; @@ -1468,10 +1579,11 @@ html_process_input(rspamd_mempool_t *pool, /* TODO: parse DOCTYPE here */ if (t == '>') { cur_tag->content_offset = p - start + 1; - state = html_text_content; - /* We don't know a lot about sgml tags, ignore them */ + state = tag_end_opening; + } + else { + p++; } - p ++; break; case tag_content: @@ -1510,111 +1622,7 @@ html_process_input(rspamd_mempool_t *pool, content_parser_env.reset(); if (cur_tag != nullptr) { - - if (cur_tag->id < N_TAGS) { - if (cur_tag->flags & CM_UNIQUE) { - if (!hc->tags_seen[cur_tag->id]) { - /* Duplicate tag has been found */ - hc->flags |= RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS; - } - } - hc->tags_seen[cur_tag->id] = true; - } - - /* Shift to the first unclosed tag */ - while (parent_tag && (parent_tag->flags & FL_CLOSED)) { - parent_tag = parent_tag->parent; - } - - if (parent_tag) { - cur_tag->parent = parent_tag; - parent_tag->children.push_back(cur_tag); - } - else { - if (hc->root_tag) { - cur_tag->parent = hc->root_tag; - hc->root_tag->children.push_back(cur_tag); - parent_tag = hc->root_tag; - } - else { - if (cur_tag->id == Tag_HTML) { - hc->root_tag = cur_tag; - } - else { - /* Insert a fake html tag */ - hc->all_tags.emplace_back(std::make_unique()); - auto *top_tag = hc->all_tags.back().get(); - top_tag->tag_start = 0; - top_tag->flags = CM_HEAD|FL_VIRTUAL; - top_tag->id = Tag_HTML; - top_tag->content_offset = 0; - top_tag->children.push_back(cur_tag); - cur_tag->parent = top_tag; - hc->root_tag = top_tag; - parent_tag = top_tag; - } - } - } - - if (cur_tag->flags & FL_HREF && !in_head) { - auto maybe_url = html_process_url_tag(pool, cur_tag, hc); - - if (maybe_url) { - url = maybe_url.value(); - - if (url_set != NULL) { - struct rspamd_url *maybe_existing = - rspamd_url_set_add_or_return (url_set, maybe_url.value()); - if (maybe_existing == maybe_url.value()) { - html_process_query_url(pool, url, url_set, - part_urls); - } - else { - url = maybe_existing; - /* Increase count to avoid odd checks failure */ - url->count ++; - } - } - - href_offset = hc->parsed.size(); - } - } - else if (cur_tag->id == Tag_BASE) { - /* - * Base is allowed only within head tag but HTML is retarded - */ - if (hc->base_url == NULL) { - auto maybe_url = html_process_url_tag(pool, cur_tag, hc); - - if (maybe_url) { - msg_debug_html ("got valid base tag"); - hc->base_url = url; - cur_tag->extra = url; - cur_tag->flags |= FL_HREF; - } - else { - msg_debug_html ("got invalid base tag!"); - } - } - } - - if (cur_tag->id == Tag_IMG) { - html_process_img_tag(pool, cur_tag, hc, url_set, - part_urls); - } - else if (cur_tag->id == Tag_LINK) { - html_process_link_tag(pool, cur_tag, hc, url_set, - part_urls); - } - - if (!(cur_tag->flags & CM_EMPTY)) { - html_process_block_tag(pool, cur_tag, hc); - } - - if (cur_tag->flags & FL_CLOSED) { - cur_tag->closing.end = cur_tag->content_offset; - cur_tag->closing.start = cur_tag->tag_start; - } + process_opening_tag(); } if (cur_tag && (cur_tag->id == Tag_STYLE)) { @@ -1646,14 +1654,19 @@ html_process_input(rspamd_mempool_t *pool, */ auto *cur_opening_tag = cur_tag->parent; + while (cur_opening_tag && (cur_opening_tag->flags & FL_CLOSED)) { + cur_opening_tag = cur_opening_tag->parent; + } + if (!cur_opening_tag) { cur_opening_tag = hc->root_tag; } + auto &&vtag = std::make_unique(); vtag->id = cur_tag->id; - vtag->flags = FL_VIRTUAL | FL_CLOSED | FL_IGNORE; + vtag->flags = FL_VIRTUAL | FL_CLOSED; vtag->tag_start = cur_tag->closing.start; - vtag->content_offset = p - start; + vtag->content_offset = p - start + 1; vtag->closing = cur_tag->closing; vtag->parent = cur_opening_tag; cur_opening_tag->children.push_back(vtag.get()); @@ -1719,6 +1732,18 @@ html_process_input(rspamd_mempool_t *pool, return true; }, html_content::traverse_type::PRE_ORDER); + /* Leftover before content */ + switch (state) { + case tag_end_opening: + if (cur_tag != nullptr) { + process_opening_tag(); + } + break; + default: + /* Do nothing */ + break; + } + if (!hc->all_tags.empty()) { std::sort(hc->all_tags.begin(), hc->all_tags.end(), [](const auto &pt1, const auto &pt2) -> auto { return pt1->tag_start < pt2->tag_start; @@ -1727,7 +1752,7 @@ html_process_input(rspamd_mempool_t *pool, exceptions, url_set); } - /* Leftover */ + /* Leftover after content */ switch (state) { case html_text_content: case content_before_start: @@ -1735,6 +1760,11 @@ html_process_input(rspamd_mempool_t *pool, html_append_content(hc, {c, std::size_t(p - c)}); } break; + case tag_end_opening: + if (cur_tag != nullptr) { + process_opening_tag(); + } + break; default: /* Do nothing */ break; @@ -1788,8 +1818,13 @@ html_debug_structure(const html_content &hc) -> std::string std::string pluses(level, '+'); if (!(t->flags & (FL_VIRTUAL|FL_IGNORE))) { - output += fmt::format("{}{};", pluses, - html_tags_defs.name_by_id_safe(t->id)); + if (t->flags & FL_XML) { + output += fmt::format("{}xml;", pluses); + } + else { + output += fmt::format("{}{};", pluses, + html_tags_defs.name_by_id_safe(t->id)); + } level ++; } for (const auto *cld : t->children) { @@ -1824,14 +1859,14 @@ TEST_CASE("html parsing") { const std::vector> cases{ - {"", "+html;++body;"}, + {"", "+html;++xml;++body;"}, {"
", "+html;++div;+++div;"}, {"
", "+html;++div;+++div;"}, {"
", "+html;++div;+++div;"}, {"

", "+p;++p;+++a;"}, {"
", "+div;++a;"}, {"", - "+html;++body;+++head;++++body;"} + "+html;++xml;++body;+++head;++++body;"} }; rspamd_url_init(NULL); @@ -1857,23 +1892,6 @@ TEST_CASE("html text extraction") { const std::vector> cases{ - /* Complex html with bad tags */ - {"\n" - "\n" - " \n" - " \n" - " title\n" - " \n" - " \n" - " \n" - " \n" - " \n" - " Hello, world! test\n" - "

data<>\n" - "

\n" - " stuff

?\n" - " \n" - "", "Hello, world! test\ndata<> \nstuff?"}, /* XML tags */ {"\n" " text

", "text\n"}, {"olo

text

lolo", "olo\ntext\nlolo"}, {"
foo
bar
", "foo\nbar\n"}, - {"foobarbaz", "foobarbaz"}, {"foobarbaz", "foobarbaz"}, + {"foobarbaz", "foobarbaz"}, {"foo
baz", "foo\nbaz"}, {"test", "test"}, {"test", "test"}, @@ -1901,10 +1919,27 @@ TEST_CASE("html text extraction") "", "\n\n\ntest\n"}, {"
file " "sharing
", "fish\n"}, - /* FIXME: broken until rework */ + /* FIXME: broken until rework of css parser */ //{"
file " // "sharing
foo", "fish\nfoo"}, - {"

test", "test"}, + /* Complex html with bad tags */ + {"\n" + "\n" + " \n" + " \n" + " title\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " Hello, world! test\n" + "

data<>\n" + "

\n" + " stuff

?\n" + " \n" + "", "Hello, world! test\ndata<> \nstuff?"}, + {"

test

", "test\n"}, }; @@ -1939,7 +1974,7 @@ TEST_CASE("html text extraction") } -} +} /* namespace rspamd::html */ void * rspamd_html_process_part_full(rspamd_mempool_t *pool,