diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-07-05 17:40:59 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-07-05 17:40:59 +0100 |
commit | 549754288a5715cf278ecccd57c7ed5f9a23da8a (patch) | |
tree | a27182bd3c6c33d64aa28d1ca2edd1a9f0512462 | |
parent | de88fecfb5e67da4df24e3dc310c4a43687d5178 (diff) | |
download | rspamd-549754288a5715cf278ecccd57c7ed5f9a23da8a.tar.gz rspamd-549754288a5715cf278ecccd57c7ed5f9a23da8a.zip |
[Project] Html: Try to deal with bad unknown tags properly
-rw-r--r-- | src/libserver/html/html.cxx | 48 | ||||
-rw-r--r-- | src/libserver/html/html.hxx | 2 | ||||
-rw-r--r-- | src/libserver/html/html_tag.hxx | 4 | ||||
-rw-r--r-- | src/libserver/html/html_tags.h | 5 |
4 files changed, 38 insertions, 21 deletions
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index d46d291d6..b29a7d37d 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -354,7 +354,10 @@ html_parse_tag_content(rspamd_mempool_t *pool, if (tag_def == nullptr) { hc->flags |= RSPAMD_HTML_FLAG_UNKNOWN_ELEMENTS; - tag->id = N_TAGS; + /* Assign -hash to match closing tag if needed */ + auto nhash = static_cast<std::int32_t>(std::hash<std::string_view>{}({s, nsize})); + /* Always negative */ + tag->id = static_cast<tag_id_t>(nhash | G_MININT32); } else { tag->id = tag_def->id; @@ -1040,7 +1043,7 @@ static auto html_append_tag_content(rspamd_mempool_t *pool, const gchar *start, gsize len, struct html_content *hc, - const html_tag *tag, + html_tag *tag, GList **exceptions, khash_t (rspamd_url_hash) *url_set) -> goffset { @@ -1048,6 +1051,20 @@ html_append_tag_content(rspamd_mempool_t *pool, goffset next_tag_offset = tag->closing.end, initial_dest_offset = hc->parsed.size(); + if (tag->closing.end == -1) { + if (tag->closing.start != -1) { + next_tag_offset = tag->closing.start; + tag->closing.end = tag->closing.start; + } + else { + next_tag_offset = len; + tag->closing.end = len; + } + } + if (tag->closing.start == -1) { + tag->closing.start = tag->closing.end; + } + auto append_margin = [&](char c) -> void { if (is_visible) { if (!hc->parsed.empty() && hc->parsed.back() != c && hc->parsed.back() != '\n') { @@ -1072,7 +1089,7 @@ html_append_tag_content(rspamd_mempool_t *pool, return tag->content_offset; } - else if (tag->id == Tag_HEAD || tag->id >= N_TAGS) { + else if (tag->id == Tag_HEAD) { return tag->closing.end; } @@ -1235,7 +1252,7 @@ html_process_input(rspamd_mempool_t *pool, }; auto process_opening_tag = [&]() { - if (cur_tag->id < N_TAGS) { + if (cur_tag->id > Tag_UNKNOWN) { if (cur_tag->flags & CM_UNIQUE) { if (!hc->tags_seen[cur_tag->id]) { /* Duplicate tag has been found */ @@ -1796,12 +1813,6 @@ html_process_input(rspamd_mempool_t *pool, /* Leftover after content */ switch (state) { - case html_text_content: - case content_before_start: - if (p > c) { - html_append_content(hc, {c, std::size_t(p - c)}); - } - break; case tag_end_opening: if (cur_tag != nullptr) { process_opening_tag(); @@ -1934,12 +1945,6 @@ TEST_CASE("html text extraction") { const std::vector<std::pair<std::string, std::string>> cases{ - {"</head>\n" - "<body>\n" - "<p> Hello. I have some bad news.\n" - "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n" - "</body>\n" - "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"}, {" <body>\n" " <!-- escape content -->\n" " a b a > b a < b a & b 'a "a"\n" @@ -2003,6 +2008,7 @@ TEST_CASE("html text extraction") " <td>data2</td>\n" " </tr>\n" " </table>", "heada headb\ndata1 data2\n"}, + /* Invalid closing br and hr + comment */ {" <body>\n" " <!-- page content -->\n" " Hello, world!<br>test</br><br>content</hr>more content<br>\n" @@ -2010,6 +2016,16 @@ TEST_CASE("html text extraction") " content inside div\n" " </div>\n" " </body>", "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"}, + /* First closing tag */ + {"</head>\n" + "<body>\n" + "<p> Hello. I have some bad news.\n" + "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n" + "</body>\n" + "</html>", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"}, + /* Invalid tags */ + {"lol <sht> omg </sht> oh my!\n" + "<name>words words</name> goodbye","lol omg oh my! words words goodbye"}, }; rspamd_url_init(NULL); diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index 67ef5a612..5b5d0ddc0 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -51,7 +51,7 @@ struct html_content { /* Preallocate and reserve all internal structures */ html_content() { - tags_seen.resize(N_TAGS, false); + tags_seen.resize(Tag_MAX, false); all_tags.reserve(128); parsed.reserve(256); } diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 17c15962a..a221a48ff 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -85,7 +85,7 @@ struct html_tag { unsigned int tag_start = 0; unsigned int content_offset = 0; std::uint32_t flags = 0; - tag_id_t id = N_TAGS; + tag_id_t id = Tag_UNKNOWN; html_closing_tag closing; std::vector<html_tag_component> components; @@ -116,7 +116,7 @@ struct html_tag { } auto clear(void) -> void { - id = N_TAGS; + id = Tag_UNKNOWN; tag_start = content_offset = 0; extra = std::monostate{}; components.clear(); diff --git a/src/libserver/html/html_tags.h b/src/libserver/html/html_tags.h index 3f209c08e..e94dd6a9a 100644 --- a/src/libserver/html/html_tags.h +++ b/src/libserver/html/html_tags.h @@ -22,7 +22,7 @@ extern "C" { /* Known HTML tags */ typedef enum { - Tag_UNKNOWN, /**< Unknown tag! */ + Tag_UNKNOWN = 0, /**< Unknown tag! */ Tag_A, /**< A */ Tag_ABBR, /**< ABBR */ Tag_ACRONYM, /**< ACRONYM */ @@ -143,8 +143,9 @@ typedef enum { Tag_XMP, /**< XMP */ Tag_XML, /**< XML */ Tag_NEXTID, /**< NEXTID */ + Tag_MAX, - N_TAGS /**< Must be last */ + N_TAGS = -1 /**< Must be -1 */ } tag_id_t; #define CM_UNKNOWN 0 |