diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-07-13 16:45:46 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-07-13 16:45:46 +0100 |
commit | edf974f4d07d08009fe51409d834cf4a0352e792 (patch) | |
tree | cd1616ce877099bba7daf57cfcd8278b990cd70e /src/libserver/html | |
parent | 4a51df3cc2e822a1401137698adc94bfa49d229a (diff) | |
download | rspamd-edf974f4d07d08009fe51409d834cf4a0352e792.tar.gz rspamd-edf974f4d07d08009fe51409d834cf4a0352e792.zip |
[Minor] Switch from head state on meaningful tags
Diffstat (limited to 'src/libserver/html')
-rw-r--r-- | src/libserver/html/html.cxx | 3 | ||||
-rw-r--r-- | src/libserver/html/html_entities.cxx | 4 | ||||
-rw-r--r-- | src/libserver/html/html_tests.cxx | 29 |
3 files changed, 25 insertions, 11 deletions
```diff
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index bde7c0117..cf12b0a01 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1724,6 +1724,9 @@ html_process_input(rspamd_mempool_t *pool,
 			html_document_state = html_document_state::body;
 		}
+		else if (cur_tag->id == Tag_BODY) {
+			html_document_state = html_document_state::body;
+		}
 	}
 }
diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx
index fa19463a0..95eb9f988 100644
--- a/src/libserver/html/html_entities.cxx
+++ b/src/libserver/html/html_entities.cxx
@@ -2579,9 +2579,9 @@ decode_html_entitles_inplace(std::string &st) -> void
 	st.resize(nlen);
 }
-TEST_SUITE("html") {
+TEST_SUITE("html entities") {
-	TEST_CASE("html entities") {
+	TEST_CASE("html entities decode") {
 		std::vector<std::pair<std::string, std::string>> cases{
 			{"", ""},
 			{"abc", "abc"},
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
index a0c60b299..ac06a353b 100644
--- a/src/libserver/html/html_tests.cxx
+++ b/src/libserver/html/html_tests.cxx
@@ -220,9 +220,15 @@ TEST_CASE("html text extraction")
 TEST_CASE("html urls extraction")
 {
 	using namespace std::string_literals;
-	const std::vector<std::pair<std::string, std::vector<std::string>>> cases{
-		{"<a href=\"https://example.com\">test</a>", {"https://example.com"}},
-		{"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}},
+	const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{
+		{"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"},
+		{"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"},
+		{"<html>\n"
+		 "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n"
+		 "<body>\n"
+		 "<a href=\"https://www.example.com\">hello</a>\n"
+		 "</body>\n"
+		 "</html>", {"https://www.example.com"}, "hello"}
 	};
 	rspamd_url_init(NULL);
@@ -232,15 +238,20 @@ TEST_CASE("html urls extraction")
 	for (const auto &c : cases) {
 		SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) {
 			GPtrArray *purls = g_ptr_array_new();
-			GByteArray *tmp = g_byte_array_sized_new(c.first.size());
-			g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
+			auto input = std::get<0>(c);
+			GByteArray *tmp = g_byte_array_sized_new(input.size());
+			g_byte_array_append(tmp, (const guint8 *)input.data(), input.size());
 			auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true);
 			CHECK(hc != nullptr);
-			auto expected = c.second;
-			CHECK(expected.size() == purls->len);
-			for (auto j = 0; j < expected.size(); ++j) {
+			auto &expected_text = std::get<2>(c);
+			if (expected_text.has_value()) {
+				CHECK(hc->parsed == expected_text.value());
+			}
+			const auto &expected_urls = std::get<1>(c);
+			CHECK(expected_urls.size() == purls->len);
+			for (auto j = 0; j < expected_urls.size(); ++j) {
 				auto *url = (rspamd_url *)g_ptr_array_index(purls, j);
-				CHECK(expected[j] == std::string{url->string, url->urllen});
+				CHECK(expected_urls[j] == std::string{url->string, url->urllen});
 			}
 			g_byte_array_free(tmp, TRUE);
 			g_ptr_array_free(purls, TRUE);
```