From edf974f4d07d08009fe51409d834cf4a0352e792 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 13 Jul 2021 16:45:46 +0100 Subject: [Minor] Switch from head state on meaningful tags --- src/libserver/html/html.cxx | 3 +++ src/libserver/html/html_entities.cxx | 4 ++-- src/libserver/html/html_tests.cxx | 29 ++++++++++++++++++++--------- 3 files changed, 25 insertions(+), 11 deletions(-) (limited to 'src/libserver/html') diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index bde7c0117..cf12b0a01 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1724,6 +1724,9 @@ html_process_input(rspamd_mempool_t *pool, html_document_state = html_document_state::body; } + else if (cur_tag->id == Tag_BODY) { + html_document_state = html_document_state::body; + } } } diff --git a/src/libserver/html/html_entities.cxx b/src/libserver/html/html_entities.cxx index fa19463a0..95eb9f988 100644 --- a/src/libserver/html/html_entities.cxx +++ b/src/libserver/html/html_entities.cxx @@ -2579,9 +2579,9 @@ decode_html_entitles_inplace(std::string &st) -> void st.resize(nlen); } -TEST_SUITE("html") { +TEST_SUITE("html entities") { - TEST_CASE("html entities") { + TEST_CASE("html entities decode") { std::vector<std::pair<std::string, std::string>> cases{ {"", ""}, {"abc", "abc"}, diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx index a0c60b299..ac06a353b 100644 --- a/src/libserver/html/html_tests.cxx +++ b/src/libserver/html/html_tests.cxx @@ -220,9 +220,15 @@ TEST_CASE("html text extraction") TEST_CASE("html urls extraction") { using namespace std::string_literals; - const std::vector<std::pair<std::string, std::vector<std::string>>> cases{ - {"test", {"https://example.com"}}, - {"hello", {"http://example.com"}}, + const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{ + {"test", {"https://example.com"}, "test"}, + {"hello", {"http://example.com"}, "hello"}, + {"\n" + "\n" + "\n" + "hello\n" + "\n" + "", {"https://www.example.com"}, "hello"} }; rspamd_url_init(NULL); @@ -232,15 +238,20 @@ 
TEST_CASE("html urls extraction") for (const auto &c : cases) { SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) { GPtrArray *purls = g_ptr_array_new(); - GByteArray *tmp = g_byte_array_sized_new(c.first.size()); - g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); + auto input = std::get<0>(c); + GByteArray *tmp = g_byte_array_sized_new(input.size()); + g_byte_array_append(tmp, (const guint8 *)input.data(), input.size()); auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true); CHECK(hc != nullptr); - auto expected = c.second; - CHECK(expected.size() == purls->len); - for (auto j = 0; j < expected.size(); ++j) { + auto &expected_text = std::get<2>(c); + if (expected_text.has_value()) { + CHECK(hc->parsed == expected_text.value()); + } + const auto &expected_urls = std::get<1>(c); + CHECK(expected_urls.size() == purls->len); + for (auto j = 0; j < expected_urls.size(); ++j) { auto *url = (rspamd_url *)g_ptr_array_index(purls, j); - CHECK(expected[j] == std::string{url->string, url->urllen}); + CHECK(expected_urls[j] == std::string{url->string, url->urllen}); } g_byte_array_free(tmp, TRUE); g_ptr_array_free(purls, TRUE); -- cgit v1.2.3