source.dussan.org Git - rspamd.git/commitdiff
[Project] Html: More spaces logic fixes
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 24 Jun 2021 16:38:20 +0000 (17:38 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 24 Jun 2021 16:38:20 +0000 (17:38 +0100)
src/libserver/html/html.cxx
src/libserver/html/html_entities.cxx

index 45094e7f82ad02de228316090c20b1197d090e5f..894b1ee45c987a351fc1e8c65c0e887744b4afec 100644 (file)
@@ -1849,6 +1849,26 @@ TEST_CASE("html text extraction")
                         "</body>", "\n\n\ntest\n"},
                        {"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
                         "sh<span style=\"FONT-SIZE: 0px\">aring </span></div>", "fish\n"},
+                       /* FIXME: broken until rework */
+                       //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
+                       // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
+                       {"<p><!--comment-->test", "test"},
+                       {"<!DOCTYPE html>\n"
+                        "<html lang=\"en\">\n"
+                        "  <head>\n"
+                        "    <meta charset=\"utf-8\">\n"
+                        "    <title>title</title>\n"
+                        "    <link rel=\"stylesheet\" href=\"style.css\">\n"
+                        "    <script src=\"script.js\"></script>\n"
+                        "  </head>\n"
+                        "  <body>\n"
+                        "    <!-- page content -->\n"
+                        "    Hello, world! <b>test</b>\n"
+                        "    <p>data<>\n"
+                        "    </P>\n"
+                        "    <b>stuff</p>?\n"
+                        "  </body>\n"
+                        "</html>", "Hello, world! test\ndata<> \nstuff?"}
        };
 
        rspamd_url_init(NULL);
index 573872f434b11c8873df2145f393aba13284774b..4cbdf02bf51f667dec0236b11f6c911d6a6076a5 100644 (file)
@@ -2552,9 +2552,16 @@ decode_html_entitles_inplace(char *s, std::size_t len, bool norm_spaces)
        }
 
        if (norm_spaces) {
+               bool seen_spaces = false;
+
                while (t > s && g_ascii_isspace(*(t - 1))) {
+                       seen_spaces = true;
                        t --;
                }
+
+               if (seen_spaces) {
+                       *t++ = ' ';
+               }
        }
 
        return (t - s);