[Minor] Switch from head state on meaningful tags

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 13 Jul 2021 15:45:46 +0000 (16:45 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Tue, 13 Jul 2021 15:45:46 +0000 (16:45 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 13 Jul 2021 15:45:46 +0000 (16:45 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 13 Jul 2021 15:45:46 +0000 (16:45 +0100)
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx

index bde7c01178141830d249460f1fac65965a4c09ff..cf12b0a0186f22b475f0af4f6a4cb04bf10cdffc 100644 (file)
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1724,6 +1724,9 @@ html_process_input(rspamd_mempool_t *pool,
  
                                                         html_document_state = html_document_state::body;
                                                 }
+                                               else if (cur_tag->id == Tag_BODY) {
+                                                       html_document_state = html_document_state::body;
+                                               }
                                         }
                                 }
  
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx

index fa19463a0b59c678d8411f542d1f2118bb957e67..95eb9f9885ba738130a321d96604e0c11c81bcca 100644 (file)
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2579,9 +2579,9 @@ decode_html_entitles_inplace(std::string &st) -> void
         st.resize(nlen);
  }
  
-TEST_SUITE("html") {
+TEST_SUITE("html entities") {
  
-       TEST_CASE("html entities") {
+       TEST_CASE("html entities decode") {
                 std::vector<std::pair<std::string, std::string>> cases{
                                 {"", ""},
                                 {"abc", "abc"},
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx

index a0c60b2998367671144837bdc752c08b182dae42..ac06a353bb78ccd1a8997d30a548d3d21e35b123 100644 (file)
--- a/src/libserver/html/html_tests.cxx
+++ b/src/libserver/html/html_tests.cxx
@@ -220,9 +220,15 @@ TEST_CASE("html text extraction")
  TEST_CASE("html urls extraction")
  {
         using namespace std::string_literals;
-       const std::vector<std::pair<std::string, std::vector<std::string>>> cases{
-                       {"<a href=\"https://example.com\">test</a>", {"https://example.com"}},
-                       {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}},
+       const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{
+                       {"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"},
+                       {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"},
+                       {"<html>\n"
+                        "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n"
+                        "<body>\n"
+                        "<a href=\"https://www.example.com\">hello</a>\n"
+                        "</body>\n"
+                        "</html>", {"https://www.example.com"}, "hello"}
         };
  
         rspamd_url_init(NULL);
@@ -232,15 +238,20 @@ TEST_CASE("html urls extraction")
         for (const auto &c : cases) {
                 SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) {
                         GPtrArray *purls = g_ptr_array_new();
-                       GByteArray *tmp = g_byte_array_sized_new(c.first.size());
-                       g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+                       auto input = std::get<0>(c);
+                       GByteArray *tmp = g_byte_array_sized_new(input.size());
+                       g_byte_array_append(tmp, (const guint8 *)input.data(), input.size());
                         auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true);
                         CHECK(hc != nullptr);
-                       auto expected = c.second;
-                       CHECK(expected.size() == purls->len);
-                       for (auto j = 0; j < expected.size(); ++j) {
+                       auto &expected_text = std::get<2>(c);
+                       if (expected_text.has_value()) {
+                               CHECK(hc->parsed == expected_text.value());
+                       }
+                       const auto &expected_urls = std::get<1>(c);
+                       CHECK(expected_urls.size() == purls->len);
+                       for (auto j = 0; j < expected_urls.size(); ++j) {
                                 auto *url = (rspamd_url *)g_ptr_array_index(purls, j);
-                               CHECK(expected[j] == std::string{url->string, url->urllen});
+                               CHECK(expected_urls[j] == std::string{url->string, url->urllen});
                         }
                         g_byte_array_free(tmp, TRUE);
                         g_ptr_array_free(purls, TRUE);
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 13 Jul 2021 15:45:46 +0000 (16:45 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Tue, 13 Jul 2021 15:45:46 +0000 (16:45 +0100)
src/libserver/html/html.cxx		patch | blob | history
src/libserver/html/html_entities.cxx		patch | blob | history
src/libserver/html/html_tests.cxx		patch | blob | history