From: Vsevolod Stakhov Date: Tue, 22 Jun 2021 15:10:52 +0000 (+0100) Subject: [Project] Html: One more attempt to write text content X-Git-Tag: 3.0~261 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f13b2d042d2e42bf2216632f94495aa2244b91d5;p=rspamd.git [Project] Html: One more attempt to write text content --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index e65e1b028..37d464c1d 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1072,8 +1072,11 @@ static auto html_append_tag_content(const gchar *start, gsize len, struct html_content *hc, const html_tag *tag, - goffset next_tag_offset) -> goffset + std::vector &enclosed_tags) -> goffset { + auto is_visible = true, is_block = false; + goffset next_tag_offset = tag->content_length + tag->content_offset; + if (tag->id == Tag_BR || tag->id == Tag_HR) { if (!hc->parsed.empty()) { hc->parsed.append("\n"); @@ -1083,30 +1086,83 @@ html_append_tag_content(const gchar *start, gsize len, } if (!tag->block) { - return next_tag_offset; /* XXX: is it always true? */ + is_visible = false; + } + else if (!tag->block->is_visible()) { + is_visible = false; + } + else { + is_block = tag->block->has_display() && + tag->block->display == css::css_display_value::DISPLAY_BLOCK; } - auto is_block = tag->block->has_display() && - tag->block->display == css::css_display_value::DISPLAY_BLOCK; if (is_block) { - if (!hc->parsed.empty()) { + if (!hc->parsed.empty() && hc->parsed.back() != '\n') { hc->parsed.append("\n"); } } - if (tag->content_length + tag->content_offset <= next_tag_offset) { - if (tag->block->is_visible()) { - html_append_content(hc, {start + tag->content_offset, - tag->content_length}); + goffset cur_offset = tag->content_offset; - if (is_block) { - if (!hc->parsed.empty()) { - hc->parsed.append("\n"); - } + do { + auto enclosed_end = 0, enclosed_start = 0; + decltype(tag) next_enclosed = nullptr; + + if (!enclosed_tags.empty()) { + next_enclosed = enclosed_tags.back(); + enclosed_start = next_enclosed->tag_start; + enclosed_end = next_enclosed->content_length + + next_enclosed->content_offset; + + if (enclosed_end > next_tag_offset) { + next_tag_offset = enclosed_end; + } + enclosed_tags.pop_back(); + } + else { + enclosed_start = next_tag_offset; + } + + goffset initial_part_len = enclosed_start - cur_offset; + + if (is_visible && initial_part_len > 0) { + html_append_content(hc, {start + cur_offset, + std::size_t(initial_part_len)}); + } + + /* Deal with the remaining part */ + std::decay_t nested_stack; + + while (!enclosed_tags.empty() && enclosed_end > 0) { + const auto *last_tag = enclosed_tags.back(); + + if (last_tag->tag_start <= enclosed_end) { + nested_stack.push_back(last_tag); + enclosed_tags.pop_back(); + } + else { + break; + } + } + + if (!nested_stack.empty() && next_enclosed) { + /* Recursively print enclosed tags */ + std::reverse(std::begin(nested_stack), std::end(nested_stack)); + cur_offset = html_append_tag_content(start, len, hc, next_enclosed, nested_stack); + + initial_part_len = next_tag_offset - cur_offset; + if (is_visible && initial_part_len > 0) { + html_append_content(hc, {start + cur_offset, + std::size_t(initial_part_len)}); } } - return tag->content_length + tag->content_offset; + } while (!enclosed_tags.empty()); + + if (is_block && is_visible) { + if (!hc->parsed.empty()) { + hc->parsed.append("\n"); + } } return next_tag_offset; @@ -1117,18 +1173,30 @@ html_append_tags_content(const gchar *start, gsize len, struct html_content *hc) -> void { auto cur_offset = 0; + std::vector enclosed_tags_stack; - for (auto i = 0; i < hc->all_tags.size(); i ++) { + for (auto i = 0; i < hc->all_tags.size();) { const auto &tag = hc->all_tags[i]; html_tag *next_tag = nullptr; - auto next_offset = len; + auto next_offset = tag->content_offset + tag->content_length; - if (i + 1 < hc->all_tags.size()) { - next_tag = hc->all_tags[i + 1].get(); - next_offset = next_tag->tag_start; + auto j = i + 1; + while (j < hc->all_tags.size()) { + next_tag = hc->all_tags[j].get(); + + if (next_tag->content_offset <= next_offset) { + enclosed_tags_stack.push_back(next_tag); + j ++; + } + else { + break; + } } - cur_offset = html_append_tag_content(start, len, hc, tag.get(), next_offset); + std::reverse(enclosed_tags_stack.begin(), enclosed_tags_stack.end()); + cur_offset = html_append_tag_content(start, len, hc, tag.get(), + enclosed_tags_stack); + i = j; } } @@ -1713,16 +1781,16 @@ TEST_CASE("html text extraction") { const std::vector> cases{ - {"foo
baz", "foo\nbaz"}, + {"foobarbaz", "foobarbaz"}, {"foobarbaz", "foobarbaz"}, {"test", "test"}, {"test ", "test "}, {"test foo, bar", "test foo, bar"}, - {"

text

", "text"}, + {"

text

", "text\n"}, {"olo

text

lolo", "olo\ntext\nlolo"}, - {"foobarbaz", "foobarbaz"}, - {"
foo
bar
", "foo\nbar"}, + {"foo
baz", "foo\nbaz"}, + {"
foo
bar
", "foo\nbar\n"}, }; rspamd_url_init(NULL);