From 4ce4d6163e3c28e548ddb306fc3f52a82394a02b Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 12 Jul 2021 16:50:07 +0100 Subject: [PATCH] [Project] Html: Replace \0 in html content --- src/libserver/html/html.cxx | 46 +++++++++++++++++++++++++++---------- src/libserver/html/html.h | 1 + 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index e4a3097d0..d5a351341 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -968,7 +968,26 @@ html_append_content(struct html_content *hc, std::string_view data, bool transpa } } - hc->parsed.append(data); + if (data.find('\0') != data.npos) { + auto replace_zero_func = [](auto input, auto &output) { /* output is taken by reference: a by-value parameter would append into a discarded copy and leave hc->parsed untouched */ + const auto last = input.cend(); + for (auto it = input.cbegin(); it != last; ++it) { + if (*it == '\0') { + output.append(u8"\uFFFD"); + } + else { + output.push_back(*it); + } + } + }; + + hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD")); + replace_zero_func(data, hc->parsed); + hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS; + } + else { + hc->parsed.append(data); + } } auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset, @@ -2002,19 +2021,12 @@ TEST_CASE("html parsing") TEST_CASE("html text extraction") { - + using namespace std::string_literals; const std::vector> cases{ - {" \n" - " \n" - " a b a > b a < b a & b 'a "a"\n" - " ", R"|(a b a > b a < b a & b 'a "a")|"}, - /* XML tags */ - {"\n" - " \n" - "test", "test"}, {"test", "test"}, + {"test\0"s, "test\uFFFD"}, + {"test\0test"s, "test\uFFFDtest"}, + {"test\0\0test"s, "test\uFFFD\uFFFDtest"}, {"test ", "test"}, {"test foo, bar", "test foo, bar"}, {"

text

", "text\n"}, @@ -2025,6 +2037,16 @@ TEST_CASE("html text extraction") {"foo
baz", "foo\nbaz"}, {"test", "test"}, {"test", "test"}, + {" \n" + " \n" + " a b a > b a < b a & b 'a "a"\n" + " ", R"|(a b a > b a < b a & b 'a "a")|"}, + /* XML tags */ + {"\n" + " \n" + "test", "test"}, {"" " \n" "


\n" diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h index b6307f88f..cc8039c22 100644 --- a/src/libserver/html/html.h +++ b/src/libserver/html/html.h @@ -36,6 +36,7 @@ extern "C" { #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5) #define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6) #define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7) +#define RSPAMD_HTML_FLAG_HAS_ZEROS (1 << 8) /* set by html_append_content when appended data contained '\0' bytes (they are replaced with U+FFFD) */ /* * Image flags -- 2.39.5