From 56f52fcf7a180a3fb8a0b803142ad13c9478e5f6 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 23 Jun 2021 12:04:05 +0100 Subject: [PATCH] [Minor] Another set of fixes in the spaces normalisation --- src/libserver/html/html.cxx | 24 ++++++++++++++++-------- src/libserver/html/html_entities.cxx | 23 +++++++++++++++++++++++ 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 32862ed20..20a38ee09 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1090,9 +1090,7 @@ html_append_tag_content(rspamd_mempool_t *pool, initial_dest_offset = hc->parsed.size(); if (tag->id == Tag_BR || tag->id == Tag_HR) { - if (!hc->parsed.empty()) { - hc->parsed.append("\n"); - } + hc->parsed.append("\n"); return tag->content_offset; } @@ -1163,17 +1161,19 @@ html_append_tag_content(rspamd_mempool_t *pool, cur_offset = html_append_tag_content(pool, start, len, hc, next_enclosed, nested_stack, exceptions, url_set); - initial_part_len = next_tag_offset - cur_offset; - if (is_visible && initial_part_len > 0) { - html_append_content(hc, {start + cur_offset, - std::size_t(initial_part_len)}); + if (enclosed_tags.empty()) { + initial_part_len = next_tag_offset - cur_offset; + if (is_visible && initial_part_len > 0) { + html_append_content(hc, {start + cur_offset, + std::size_t(initial_part_len)}); + } } } } while (!enclosed_tags.empty()); if (is_block && is_visible) { - if (!hc->parsed.empty()) { + if (!hc->parsed.empty() && hc->parsed.back() != '\n') { hc->parsed.append("\n"); } } @@ -1817,6 +1817,14 @@ TEST_CASE("html text extraction") {"
foo
bar
", "foo\nbar\n"}, {"test", "test"}, {"test", "test"}, + {"" + " \n" + "


\n" + "

\n" + "

\n" + "
\n" + " test
" + "", "\ntest\n"}, }; rspamd_url_init(NULL); diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx index 2cc3c11f5..97c84f64e 100644 --- a/src/libserver/html/html_entities.cxx +++ b/src/libserver/html/html_entities.cxx @@ -2236,6 +2236,7 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) normal_content, ampersand, skip_multi_spaces, + skip_start_spaces, } state = parser_state::normal_content; end = s + len; @@ -2441,6 +2442,10 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) return false; }; + if (norm_spaces && g_ascii_isspace(*h)) { + state = parser_state::skip_start_spaces; + } + while (h - s < len && t <= h) { switch (state) { case parser_state::normal_content: @@ -2516,6 +2521,14 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) state = parser_state::normal_content; } break; + case parser_state::skip_start_spaces: + if (g_ascii_isspace(*h)) { + h ++; + } + else { + state = parser_state::normal_content; + } + break; } } @@ -2537,6 +2550,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces) } } + if (norm_spaces && g_ascii_isspace(*t)) { + do { + t --; + } while (t > s && g_ascii_isspace(*t)); + + if (!g_ascii_isspace(*t)) { + t++; /* Preserve last space character */ + } + } + return (t - s); } -- 2.39.5