From 0275fe10768b17dd831928df12a030f9b40c9199 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 30 Jun 2021 21:22:43 +0100 Subject: [PATCH] [Project] Html: More fixes --- src/libserver/html/html.cxx | 127 +++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 61 deletions(-) diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index d08cb75b2..202eebb87 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1004,7 +1004,7 @@ html_append_tag_content(rspamd_mempool_t *pool, khash_t (rspamd_url_hash) *url_set) -> goffset { auto is_visible = true, is_block = false; - goffset next_tag_offset = tag->closing.start, + goffset next_tag_offset = tag->closing.end + 1, initial_dest_offset = hc->parsed.size(); if (tag->id == Tag_BR || tag->id == Tag_HR) { @@ -1096,16 +1096,6 @@ html_append_tag_content(rspamd_mempool_t *pool, return next_tag_offset; } -static auto -html_append_tags_content(rspamd_mempool_t *pool, - const gchar *start, gsize len, - struct html_content *hc, - GList **exceptions, - khash_t (rspamd_url_hash) *url_set) -> void -{ - html_append_tag_content(pool, start, len, hc, hc->root_tag, exceptions, url_set); -} - static auto html_process_input(rspamd_mempool_t *pool, GByteArray *in, @@ -1120,7 +1110,7 @@ html_process_input(rspamd_mempool_t *pool, guint obrace = 0, ebrace = 0; struct rspamd_url *url = nullptr; gint href_offset = -1; - struct html_tag *cur_tag = nullptr, cur_closing_tag; + struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag; struct tag_content_parser_state content_parser_env; enum { @@ -1156,39 +1146,17 @@ html_process_input(rspamd_mempool_t *pool, return nullptr; } - auto *parent = cur_tag; - hc->all_tags.emplace_back(std::make_unique()); auto *ntag = hc->all_tags.back().get(); ntag->tag_start = c - start; ntag->flags = flags; - if (parent) { - ntag->parent = parent; - parent->children.push_back(ntag); + if (cur_tag) { + parent_tag = cur_tag; } - else { - if (hc->root_tag) { - ntag->parent = hc->root_tag; - hc->root_tag->children.push_back(ntag); - } - else { - if (ntag->id == Tag_HTML) { - hc->root_tag = ntag; - } - else { - /* Insert a fake html tag */ - hc->all_tags.emplace_back(std::make_unique()); - auto *top_tag = hc->all_tags.back().get(); - top_tag->tag_start = 0; - top_tag->flags = CM_HEAD|FL_VIRTUAL; - top_tag->id = Tag_HTML; - top_tag->content_offset = 0; - top_tag->children.push_back(ntag); - ntag->parent = top_tag; - hc->root_tag = top_tag; - } - } + + if (flags & FL_XML) { + return ntag; } return ntag; @@ -1216,6 +1184,7 @@ html_process_input(rspamd_mempool_t *pool, if (cur_tag) { cur_tag->id = Tag_HTML; + hc->root_tag = cur_tag; state = content_before_start; } else { @@ -1239,7 +1208,7 @@ html_process_input(rspamd_mempool_t *pool, closing = FALSE; break; case '!': - cur_tag = new_tag(FL_XML); + cur_tag = new_tag(FL_XML|FL_CLOSED); if (cur_tag) { state = sgml_tag; } @@ -1249,7 +1218,7 @@ html_process_input(rspamd_mempool_t *pool, p ++; break; case '?': - cur_tag = new_tag(FL_XML); + cur_tag = new_tag(FL_XML|FL_CLOSED); if (cur_tag) { state = xml_tag; } @@ -1503,9 +1472,43 @@ html_process_input(rspamd_mempool_t *pool, } } hc->tags_seen[cur_tag->id] = true; + + /* Shift to the first unclosed tag */ + while (parent_tag && (parent_tag->flags & FL_CLOSED)) { + parent_tag = parent_tag->parent; + } + + if (parent_tag) { + cur_tag->parent = parent_tag; + parent_tag->children.push_back(cur_tag); + } + else { + if (hc->root_tag) { + cur_tag->parent = hc->root_tag; + hc->root_tag->children.push_back(cur_tag); + parent_tag = hc->root_tag; + } + else { + if (cur_tag->id == Tag_HTML) { + hc->root_tag = cur_tag; + } + else { + /* Insert a fake html tag */ + hc->all_tags.emplace_back(std::make_unique()); + auto *top_tag = hc->all_tags.back().get(); + top_tag->tag_start = 0; + top_tag->flags = CM_HEAD|FL_VIRTUAL; + top_tag->id = Tag_HTML; + top_tag->content_offset = 0; + top_tag->children.push_back(cur_tag); + cur_tag->parent = top_tag; + hc->root_tag = top_tag; + parent_tag = top_tag; + } + } + } } - /* XXX: uncomment when styles parsing is not so broken */ if (cur_tag->flags & FL_HREF && !in_head) { auto maybe_url = html_process_url_tag(pool, cur_tag, hc); @@ -1637,7 +1640,8 @@ html_process_input(rspamd_mempool_t *pool, std::sort(hc->all_tags.begin(), hc->all_tags.end(), [](const auto &pt1, const auto &pt2) -> auto { return pt1->tag_start < pt2->tag_start; }); - html_append_tags_content(pool, start, end - start, hc, exceptions, url_set); + html_append_tag_content(pool, start, end - start, hc, hc->root_tag, + exceptions, url_set); } /* Leftover */ @@ -1766,7 +1770,23 @@ TEST_CASE("html text extraction") { const std::vector> cases{ - {"
foo
bar
", "foo\nbar\n"}, + /* Complex html with bad tags */ + {"\n" + "\n" + " \n" + " \n" + " title\n" + " \n" + " \n" + " \n" + " \n" + " \n" + " Hello, world! test\n" + "

data<>\n" + "

\n" + " stuff

?\n" + " \n" + "", "Hello, world! test\ndata<> \nstuff?"}, /* XML tags */ {"\n" " text

", "text\n"}, {"olo

text

lolo", "olo\ntext\nlolo"}, + {"
foo
bar
", "foo\nbar\n"}, {"foobarbaz", "foobarbaz"}, {"foobarbaz", "foobarbaz"}, {"foo
baz", "foo\nbaz"}, @@ -1797,23 +1818,7 @@ TEST_CASE("html text extraction") //{"
file " // "sharing
foo", "fish\nfoo"}, {"

test", "test"}, - /* Complex html with bad tags */ - {"\n" - "\n" - " \n" - " \n" - " title\n" - " \n" - " \n" - " \n" - " \n" - " \n" - " Hello, world! test\n" - "

data<>\n" - "

\n" - " stuff

?\n" - " \n" - "", "Hello, world! test\ndata<> \nstuff?"}, + }; rspamd_url_init(NULL); -- 2.39.5