source.dussan.org Git - rspamd.git/commitdiff
[Minor] Another set of fixes in the spaces normalisation
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 23 Jun 2021 11:04:05 +0000 (12:04 +0100)
src/libserver/html/html.cxx
src/libserver/html/html_entities.cxx

index 32862ed20ccc0eff29cde5cf0616303800b222c7..20a38ee09bee5836d9de6a35aeb09c54397da420 100644 (file)
@@ -1090,9 +1090,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
                        initial_dest_offset = hc->parsed.size();
 
        if (tag->id == Tag_BR || tag->id == Tag_HR) {
-               if (!hc->parsed.empty()) {
-                       hc->parsed.append("\n");
-               }
+               hc->parsed.append("\n");
 
                return tag->content_offset;
        }
@@ -1163,17 +1161,19 @@ html_append_tag_content(rspamd_mempool_t *pool,
                        cur_offset = html_append_tag_content(pool, start, len, hc, next_enclosed,
                                        nested_stack, exceptions, url_set);
 
-                       initial_part_len = next_tag_offset - cur_offset;
-                       if (is_visible && initial_part_len > 0) {
-                               html_append_content(hc, {start + cur_offset,
-                                                                                std::size_t(initial_part_len)});
+                       if (enclosed_tags.empty()) {
+                               initial_part_len = next_tag_offset - cur_offset;
+                               if (is_visible && initial_part_len > 0) {
+                                       html_append_content(hc, {start + cur_offset,
+                                                                                        std::size_t(initial_part_len)});
+                               }
                        }
                }
 
        } while (!enclosed_tags.empty());
 
        if (is_block && is_visible) {
-               if (!hc->parsed.empty()) {
+               if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
                        hc->parsed.append("\n");
                }
        }
@@ -1817,6 +1817,14 @@ TEST_CASE("html text extraction")
                        {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
                        {"<a href=https://example.com>test</a>", "test"},
                        {"<img alt=test>", "test"},
+                       {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
+                        "  <body>\n"
+                        "    <p><br>\n"
+                        "    </p>\n"
+                        "    <div class=\"moz-forward-container\"><br>\n"
+                        "      <br>\n"
+                        "      test</div>"
+                        "</body>", "\ntest\n"},
        };
 
        rspamd_url_init(NULL);
index 2cc3c11f56356ba16cb7a6eca500c43a29dbc288..97c84f64e8183a2f986c723ef4da8d3c6314f8fc 100644 (file)
@@ -2236,6 +2236,7 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                normal_content,
                ampersand,
                skip_multi_spaces,
+               skip_start_spaces,
        } state = parser_state::normal_content;
 
        end = s + len;
@@ -2441,6 +2442,10 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                return false;
        };
 
+       if (norm_spaces && g_ascii_isspace(*h)) {
+               state = parser_state::skip_start_spaces;
+       }
+
        while (h - s < len && t <= h) {
                switch (state) {
                case parser_state::normal_content:
@@ -2516,6 +2521,14 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                                state = parser_state::normal_content;
                        }
                        break;
+               case parser_state::skip_start_spaces:
+                       if (g_ascii_isspace(*h)) {
+                               h ++;
+                       }
+                       else {
+                               state = parser_state::normal_content;
+                       }
+                       break;
                }
        }
 
@@ -2537,6 +2550,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
                }
        }
 
+       if (norm_spaces && g_ascii_isspace(*t)) {
+               do {
+                       t --;
+               } while (t > s && g_ascii_isspace(*t));
+
+               if (!g_ascii_isspace(*t)) {
+                       t++; /* Preserve last space character */
+               }
+       }
+
        return (t - s);
 }