[Minor] Another set of fixes in the spaces normalisation

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index 32862ed20ccc0eff29cde5cf0616303800b222c7..20a38ee09bee5836d9de6a35aeb09c54397da420 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1090,9 +1090,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                         initial_dest_offset = hc->parsed.size();
  
         if (tag->id == Tag_BR || tag->id == Tag_HR) {
-               if (!hc->parsed.empty()) {
-                       hc->parsed.append("\n");
-               }
+               hc->parsed.append("\n");
  
                 return tag->content_offset;
         }
@@ -1163,17 +1161,19 @@ html_append_tag_content(rspamd_mempool_t *pool,
                         cur_offset = html_append_tag_content(pool, start, len, hc, next_enclosed,
                                         nested_stack, exceptions, url_set);
  
-                       initial_part_len = next_tag_offset - cur_offset;
-                       if (is_visible && initial_part_len > 0) {
-                               html_append_content(hc, {start + cur_offset,
-                                                                                std::size_t(initial_part_len)});
+                       if (enclosed_tags.empty()) {
+                               initial_part_len = next_tag_offset - cur_offset;
+                               if (is_visible && initial_part_len > 0) {
+                                       html_append_content(hc, {start + cur_offset,
+                                                                                        std::size_t(initial_part_len)});
+                               }
                         }
                 }
  
         } while (!enclosed_tags.empty());
  
         if (is_block && is_visible) {
-               if (!hc->parsed.empty()) {
+               if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
                         hc->parsed.append("\n");
                 }
         }
@@ -1817,6 +1817,14 @@ TEST_CASE("html text extraction")
                         {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
                         {"<a href=https://example.com>test</a>", "test"},
                         {"<img alt=test>", "test"},
+                       {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
+                        "  <body>\n"
+                        "    <p><br>\n"
+                        "    </p>\n"
+                        "    <div class=\"moz-forward-container\"><br>\n"
+                        "      <br>\n"
+                        "      test</div>"
+                        "</body>", "\ntest\n"},
         };
  
         rspamd_url_init(NULL);
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx

index 2cc3c11f56356ba16cb7a6eca500c43a29dbc288..97c84f64e8183a2f986c723ef4da8d3c6314f8fc 100644 (file)
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2236,6 +2236,7 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                 normal_content,
                 ampersand,
                 skip_multi_spaces,
+               skip_start_spaces,
         } state = parser_state::normal_content;
  
         end = s + len;
@@ -2441,6 +2442,10 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                 return false;
         };
  
+       if (norm_spaces && g_ascii_isspace(*h)) {
+               state = parser_state::skip_start_spaces;
+       }
+
         while (h - s < len && t <= h) {
                 switch (state) {
                 case parser_state::normal_content:
@@ -2516,6 +2521,14 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                                 state = parser_state::normal_content;
                         }
                         break;
+               case parser_state::skip_start_spaces:
+                       if (g_ascii_isspace(*h)) {
+                               h ++;
+                       }
+                       else {
+                               state = parser_state::normal_content;
+                       }
+                       break;
                 }
         }
  
@@ -2537,6 +2550,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                 }
         }
  
+       if (norm_spaces && g_ascii_isspace(*t)) {
+               do {
+                       t --;
+               } while (t > s && g_ascii_isspace(*t));
+
+               if (!g_ascii_isspace(*t)) {
+                       t++; /* Preserve last space character */
+               }
+       }
+
         return (t - s);
  }
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
src/libserver/html/html.cxx		patch | blob | history
src/libserver/html/html_entities.cxx		patch | blob | history