From 87ef0c44cef19ce6498fe5e595097fd09aeaf396 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Tue, 13 Jul 2021 16:52:09 +0100
Subject: [Minor] Ignore bogus head tags inside body

---
 src/libserver/html/html.cxx       | 3 ++-
 src/libserver/html/html_tests.cxx | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'src/libserver/html')

diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index cf12b0a01..51f8589e2 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1085,7 +1085,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
 		return ret;
 	}
-	else if (tag->id == Tag_HEAD) {
+	else if (tag->id == Tag_HEAD && (tag->flags & FL_IGNORE)) {
 		auto ret = tag->closing.end;
 		calculate_final_tag_offsets();
@@ -1706,6 +1706,7 @@ html_process_input(rspamd_mempool_t *pool,
 			if (html_document_state == html_document_state::doctype) {
 				if (cur_tag->id == Tag_HEAD || (cur_tag->flags & CM_HEAD)) {
 					html_document_state = html_document_state::head;
+					cur_tag->flags |= FL_IGNORE;
 				}
 				else if (cur_tag->id != Tag_HTML) {
 					html_document_state = html_document_state::body;
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
index ac06a353b..1181e79ac 100644
--- a/src/libserver/html/html_tests.cxx
+++ b/src/libserver/html/html_tests.cxx
@@ -69,6 +69,7 @@ TEST_CASE("html text extraction")
 	using namespace std::string_literals;
 	const std::vector> cases{
+			{"displayed", "displayed"},
 			{"test", "test"},
 			{"test\0"s, "test\uFFFD"s},
 			{"test\0test"s, "test\uFFFDtest"s},
@@ -184,6 +185,7 @@ TEST_CASE("html text extraction")
 			/* Head tag with some stuff */
 			{"

oh my god", "oh my god\n"},
 			{"oh my god</head><body></body></html>", ""},
+
 	};
 	rspamd_url_init(NULL);
--
cgit v1.2.3