diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-09-07 15:00:38 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-09-07 15:01:30 +0100 |
commit | 909a594f8d003dbe89462ad02d1634371f42bec3 (patch) | |
tree | 8bedd9431801c3fcdc28bc8f2d37de774c491ee9 /src/libserver | |
parent | 0d3cdd6e4512233cc1e328c6d517c8fda0e4aef2 (diff) | |
download | rspamd-909a594f8d003dbe89462ad02d1634371f42bec3.tar.gz rspamd-909a594f8d003dbe89462ad02d1634371f42bec3.zip |
[Rework] Save invisible content to a separate buffer
Diffstat (limited to 'src/libserver')
-rw-r--r-- | src/libserver/html/html.cxx | 115 | ||||
-rw-r--r-- | src/libserver/html/html.hxx | 1 | ||||
-rw-r--r-- | src/libserver/html/html_tag.hxx | 16 |
3 files changed, 86 insertions, 46 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index e4cc137b4..97009749f 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -985,12 +985,15 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, } static inline auto -html_append_parsed(struct html_content *hc, std::string_view data, bool transparent, - std::size_t input_len) -> std::size_t +html_append_parsed(struct html_content *hc, + std::string_view data, + bool transparent, + std::size_t input_len, + std::string &dest) -> std::size_t { - auto cur_offset = hc->parsed.size(); + auto cur_offset = dest.size(); - if (hc->parsed.size() > input_len) { + if (dest.size() > input_len) { /* Impossible case, refuse to append */ return 0; } @@ -999,9 +1002,9 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar /* Handle multiple spaces at the begin */ if (cur_offset > 0) { - auto last = hc->parsed.back(); + auto last = dest.back(); if (!g_ascii_isspace(last) && g_ascii_isspace(data.front())) { - hc->parsed.append(" "); + dest.append(" "); data = {data.data() + 1, data.size() - 1}; cur_offset++; } @@ -1020,24 +1023,24 @@ html_append_parsed(struct html_content *hc, std::string_view data, bool transpar } }; - hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD")); - replace_zero_func(data, hc->parsed); + dest.reserve(dest.size() + data.size() + sizeof(u8"\uFFFD")); + replace_zero_func(data, dest); hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS; } else { - hc->parsed.append(data); + dest.append(data); } } - auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset, - hc->parsed.size() - cur_offset, true); + auto nlen = decode_html_entitles_inplace(dest.data() + cur_offset, + dest.size() - cur_offset, true); - hc->parsed.resize(nlen + cur_offset); + dest.resize(nlen + cur_offset); if (transparent) { /* Replace all visible characters with spaces */ - auto start = std::next(hc->parsed.begin(), cur_offset); - std::replace_if(start, std::end(hc->parsed), [](const auto c) { + auto start = std::next(dest.begin(), cur_offset); + std::replace_if(start, std::end(dest), [](const auto c) { return !g_ascii_isspace(c); }, ' '); } @@ -1076,11 +1079,18 @@ html_append_tag_content(rspamd_mempool_t *pool, { auto is_visible = true, is_block = false, is_spaces = false, is_transparent = false; goffset next_tag_offset = tag->closing.end, - initial_dest_offset = hc->parsed.size(); + initial_parsed_offset = hc->parsed.size(), + initial_invisible_offset = hc->invisible.size(); - auto calculate_final_tag_offsets = [&tag, initial_dest_offset, hc]() -> void { - tag->content_offset = initial_dest_offset; - tag->closing.start = hc->parsed.size(); + auto calculate_final_tag_offsets = [&]() -> void { + if (is_visible) { + tag->content_offset = initial_parsed_offset; + tag->closing.start = hc->parsed.size(); + } + else { + tag->content_offset = initial_invisible_offset; + tag->closing.start = hc->invisible.size(); + } }; if (tag->closing.end == -1) { @@ -1098,17 +1108,18 @@ html_append_tag_content(rspamd_mempool_t *pool, } auto append_margin = [&](char c) -> void { + /* We do care about visible margins only */ if (is_visible) { if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') { if (hc->parsed.back() == ' ') { /* We also strip extra spaces at the end, but limiting the start */ - auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_dest_offset); + auto last = std::make_reverse_iterator(hc->parsed.begin() + initial_parsed_offset); auto first = std::find_if(hc->parsed.rbegin(), last, [](auto ch) -> auto { return ch != ' '; }); hc->parsed.erase(first.base(), hc->parsed.end()); - g_assert(hc->parsed.size() >= initial_dest_offset); + g_assert(hc->parsed.size() >= initial_parsed_offset); } hc->parsed.push_back(c); } @@ -1177,10 +1188,17 @@ html_append_tag_content(rspamd_mempool_t *pool, auto enclosed_start = cld->tag_start; goffset initial_part_len = enclosed_start - cur_offset; - if (is_visible && initial_part_len > 0) { - html_append_parsed(hc, - {start + cur_offset, std::size_t(initial_part_len)}, - is_transparent, len); + if (initial_part_len > 0) { + if (is_visible) { + html_append_parsed(hc, + {start + cur_offset, std::size_t(initial_part_len)}, + is_transparent, len, hc->parsed); + } + else { + html_append_parsed(hc, + {start + cur_offset, std::size_t(initial_part_len)}, + is_transparent, len, hc->invisible); + } } auto next_offset = html_append_tag_content(pool, start, len, @@ -1195,11 +1213,21 @@ html_append_tag_content(rspamd_mempool_t *pool, if (cur_offset < tag->closing.start) { goffset final_part_len = tag->closing.start - cur_offset; - if (is_visible && final_part_len > 0) { - html_append_parsed(hc, - {start + cur_offset, std::size_t(final_part_len)}, - is_transparent, - len); + if (final_part_len > 0) { + if (is_visible) { + html_append_parsed(hc, + {start + cur_offset, std::size_t(final_part_len)}, + is_transparent, + len, + hc->parsed); + } + else { + html_append_parsed(hc, + {start + cur_offset, std::size_t(final_part_len)}, + is_transparent, + len, + hc->invisible); + } } } if (is_block) { @@ -1211,11 +1239,11 @@ html_append_tag_content(rspamd_mempool_t *pool, if (is_visible) { if (tag->id == Tag_A) { - auto written_len = hc->parsed.size() - initial_dest_offset; + auto written_len = hc->parsed.size() - initial_parsed_offset; html_process_displayed_href_tag(pool, hc, - {hc->parsed.data() + initial_dest_offset, written_len}, + {hc->parsed.data() + initial_parsed_offset, written_len}, tag, exceptions, - url_set, initial_dest_offset); + url_set, initial_parsed_offset); } else if (tag->id == Tag_IMG) { /* Process ALT if presented */ @@ -1997,7 +2025,7 @@ html_process_input(rspamd_mempool_t *pool, break; case tags_limit_overflow: html_append_parsed(hc, {c, (std::size_t) (end - c)}, - false, end - start); + false, end - start, hc->parsed); break; default: /* Do nothing */ @@ -2084,6 +2112,27 @@ auto html_tag_by_name(const std::string_view &name) return std::nullopt; } +auto +html_tag::get_content(const struct html_content *hc) const -> std::string_view +{ + const std::string *dest = &hc->parsed; + + if (block && !block->is_visible()) { + dest = &hc->invisible; + } + const auto clen = get_content_length(); + if (content_offset < dest->size()) { + if (dest->size() - content_offset >= clen) { + return std::string_view{*dest}.substr(content_offset, clen); + } + else { + return std::string_view{*dest}.substr(content_offset, dest->size() - content_offset); + } + } + + return std::string_view{}; +} + } void * diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index 7e63bedce..5c16d085a 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -47,6 +47,7 @@ struct html_content { std::vector<html_image *> images; std::vector<std::unique_ptr<struct html_tag>> all_tags; std::string parsed; + std::string invisible; std::shared_ptr<css::css_style_sheet> css_style; /* Preallocate and reserve all internal structures */ diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index b6fc73120..5971ca179 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -31,6 +31,8 @@ struct html_image; namespace rspamd::html { +struct html_content; /* Forward declaration */ + enum class html_component_type : std::uint8_t { RSPAMD_HTML_COMPONENT_NAME = 0, RSPAMD_HTML_COMPONENT_HREF, @@ -141,19 +143,7 @@ struct html_tag { return 0; } - constexpr auto get_content(std::string_view parsed) const -> std::string_view { - const auto clen = get_content_length(); - if (content_offset < parsed.size()) { - if (parsed.size() - content_offset >= clen) { - return parsed.substr(content_offset, clen); - } - else { - return parsed.substr(content_offset, parsed.size() - content_offset); - } - } - - return std::string_view{}; - } + auto get_content(const struct html_content *hc) const -> std::string_view; }; static_assert(CM_USER_SHIFT + 7 < sizeof(html_tag::flags) * NBBY); |