[Project] Html: Replace \0 in html content

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 12 Jul 2021 15:50:07 +0000 (16:50 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Mon, 12 Jul 2021 15:50:07 +0000 (16:50 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 12 Jul 2021 15:50:07 +0000 (16:50 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Mon, 12 Jul 2021 15:50:07 +0000 (16:50 +0100)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index e4a3097d0ad299598dbfc9fcbd866fad51109a2d..d5a35134124d4a55e1bdbf5fb157b56fb67770fb 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -968,7 +968,26 @@ html_append_content(struct html_content *hc, std::string_view data, bool transpa
                         }
                 }
  
-               hc->parsed.append(data);
+               if (data.find('\0') != data.npos) {
+                       auto replace_zero_func = [](const auto &input, auto &output) { /* output must be a reference: a by-value copy would silently drop everything appended here */
+                               const auto last = input.cend();
+                               for (auto it = input.cbegin(); it != last; ++it) {
+                                       if (*it == '\0') {
+                                               output.append(u8"\uFFFD"); /* each NUL byte becomes U+FFFD (REPLACEMENT CHARACTER) */
+                                       }
+                                       else {
+                                               output.push_back(*it);
+                                       }
+                               }
+                       };
+
+                       hc->parsed.reserve(hc->parsed.size() + data.size() + sizeof(u8"\uFFFD"));
+                       replace_zero_func(data, hc->parsed);
+                       hc->flags |= RSPAMD_HTML_FLAG_HAS_ZEROS;
+               }
+               else {
+                       hc->parsed.append(data);
+               }
         }
  
         auto nlen = decode_html_entitles_inplace(hc->parsed.data() + cur_offset,
@@ -2002,19 +2021,12 @@ TEST_CASE("html parsing")
  
  TEST_CASE("html text extraction")
  {
-
+       using namespace std::string_literals;
         const std::vector<std::pair<std::string, std::string>> cases{
-                       {"  <body>\n"
-                        "    <!-- escape content -->\n"
-                        "    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
-                        "  </body>", R"|(a b a > b a < b a & b 'a "a")|"},
-                       /* XML tags */
-                       {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
-                        " <!DOCTYPE html\n"
-                        "   PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
-                        "   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
-                        "<body>test</body>", "test"},
                         {"test", "test"},
+                       {"test\0"s, "test\uFFFD"},
+                       {"test\0test"s, "test\uFFFDtest"},
+                       {"test\0\0test"s, "test\uFFFD\uFFFDtest"},
                         {"test   ", "test"},
                         {"test   foo,   bar", "test foo, bar"},
                         {"<p>text</p>", "text\n"},
@@ -2025,6 +2037,16 @@ TEST_CASE("html text extraction")
                         {"foo<br>baz", "foo\nbaz"},
                         {"<a href=https://example.com>test</a>", "test"},
                         {"<img alt=test>", "test"},
+                       {"  <body>\n"
+                        "    <!-- escape content -->\n"
+                        "    a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
+                        "  </body>", R"|(a b a > b a < b a & b 'a "a")|"},
+                       /* XML tags */
+                       {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
+                        " <!DOCTYPE html\n"
+                        "   PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
+                        "   \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
+                        "<body>test</body>", "test"},
                         {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
                          "  <body>\n"
                          "    <p><br>\n"
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h

index b6307f88f88021eb6f180e2dd49404128ed627bd..cc8039c22aaf45f80da74257db9c733de47e0601 100644 (file)
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -36,6 +36,7 @@ extern "C" {
  #define RSPAMD_HTML_FLAG_DUPLICATE_ELEMENTS (1 << 5)
  #define RSPAMD_HTML_FLAG_TOO_MANY_TAGS (1 << 6)
  #define RSPAMD_HTML_FLAG_HAS_DATA_URLS (1 << 7)
+#define RSPAMD_HTML_FLAG_HAS_ZEROS (1 << 8) /* set by html_append_content when appended data contains '\0' bytes */
  
  /*
   * Image flags
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 12 Jul 2021 15:50:07 +0000 (16:50 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Mon, 12 Jul 2021 15:50:07 +0000 (16:50 +0100)
src/libserver/html/html.cxx		patch \| blob \| history
src/libserver/html/html.h		patch \| blob \| history