source.dussan.org Git - rspamd.git/commitdiff
[Minor] Fix xml tags and comments processing
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 28 Jun 2021 10:51:31 +0000 (11:51 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 28 Jun 2021 13:01:40 +0000 (14:01 +0100)
src/libserver/html/html.cxx

index 8d312b73351ebd357a9d2926617a92ff20267a82..c5d35105c291b59e63689a4df8195a767b9fae8c 100644 (file)
@@ -244,7 +244,7 @@ html_process_tag(rspamd_mempool_t *pool,
 
        if (!(tag->flags & (CM_EMPTY))) {
                /* Block tag */
-               if ((tag->flags & (FL_CLOSING | FL_CLOSED))) {
+               if (tag->flags & FL_CLOSING) {
                        /* Closed block tag */
                        if (parent == nullptr) {
                                msg_debug_html ("bad parent node");
@@ -1178,21 +1178,21 @@ html_append_tag_content(rspamd_mempool_t *pool,
                return tag->content_offset;
        }
 
-       if (!tag->block) {
-               if ((tag->flags & (FL_COMMENT|FL_XML))) {
+       if ((tag->flags & (FL_COMMENT|FL_XML))) {
+               is_visible = false;
+       }
+       else {
+               if (!tag->block) {
+                       is_visible = true;
+               }
+               else if (!tag->block->is_visible()) {
                        is_visible = false;
                }
                else {
-                       is_visible = true;
+                       is_block = tag->block->has_display() &&
+                                          tag->block->display == css::css_display_value::DISPLAY_BLOCK;
                }
        }
-       else if (!tag->block->is_visible()) {
-               is_visible = false;
-       }
-       else {
-               is_block = tag->block->has_display() &&
-                                  tag->block->display == css::css_display_value::DISPLAY_BLOCK;
-       }
 
        if (is_block) {
                if (!hc->parsed.empty() && hc->parsed.back() != '\n') {
@@ -1913,6 +1913,12 @@ TEST_CASE("html text extraction")
 {
 
        const std::vector<std::pair<std::string, std::string>> cases{
+                       /* XML tags */
+                       {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
+                        " <!DOCTYPE html\n"
+                        "   PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
+                        "   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
+                        "<body>test</body>", "test"},
                        {"test", "test"},
                        {"test   ", "test"},
                        {"test   foo,   bar", "test foo, bar"},
@@ -1938,6 +1944,7 @@ TEST_CASE("html text extraction")
                        //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
                        // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
                        {"<p><!--comment-->test", "test"},
+                       /* Complex html with bad tags */
                        {"<!DOCTYPE html>\n"
                         "<html lang=\"en\">\n"
                         "  <head>\n"
@@ -1953,7 +1960,7 @@ TEST_CASE("html text extraction")
                         "    </P>\n"
                         "    <b>stuff</p>?\n"
                         "  </body>\n"
-                        "</html>", "Hello, world! test\ndata<> \nstuff?"}
+                        "</html>", "Hello, world! test\ndata<> \nstuff?"},
        };
 
        rspamd_url_init(NULL);