/*- * Copyright 2021 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "config.h" #include "html.hxx" #include #include #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL #include "doctest/doctest.h" namespace rspamd::html { /* * Tests part */ TEST_SUITE("html") { TEST_CASE("html parsing") { const std::vector> cases{ {"", "+html;++xml;++body;"}, {"
", "+html;++div;+++div;"}, {"
", "+html;++div;+++div;"}, {"
", "+html;++div;+++div;"}, {"

", "+p;++p;+++a;"}, {"
", "+div;++a;"}, /* Broken, as I don't know how the hell this should be really parsed */ //{"", // "+html;++xml;++body;+++head;+++body;"} }; rspamd_url_init(NULL); auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "html", 0); for (const auto &c : cases) { SUBCASE((std::string("extract tags from: ") + c.first).c_str()) { GByteArray *tmp = g_byte_array_sized_new(c.first.size()); g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true); CHECK(hc != nullptr); auto dump = html_debug_structure(*hc); CHECK(c.second == dump); g_byte_array_free(tmp, TRUE); } } rspamd_mempool_delete(pool); } TEST_CASE("html text extraction") { using namespace std::string_literals; const std::vector> cases{ {"test", "test"}, {"test\0"s, "test\uFFFD"s}, {"test\0test"s, "test\uFFFDtest"s}, {"test\0\0test"s, "test\uFFFD\uFFFDtest"s}, {"test ", "test"}, {"test foo, bar", "test foo, bar"}, {"

text

", "text\n"}, {"olo

text

lolo", "olo\ntext\nlolo"}, {"
foo
bar
", "foo\nbar\n"}, {"foobarbaz", "foobarbaz"}, {"foobarbaz", "foobarbaz"}, {"foo
baz", "foo\nbaz"}, {"test", "test"}, {"test", "test"}, {" \n" " \n" " a b a > b a < b a & b 'a "a"\n" " ", R"|(a b a > b a < b a & b 'a "a")|"}, /* XML tags */ {"\n" " \n" "test", "test"}, {"" " \n" "


\n" "

\n" "

\n" "
\n" " test
" "", "\n\n\ntest\n"}, {"
file " "sharing
", "fish\n"}, /* FIXME: broken until rework of css parser */ //{"
file " // "sharing
foo", "fish\nfoo"}, /* Complex html with bad tags */ {"\n" "\n" " \n" " \n" " title\n" " \n" " \n" " \n" " \n" " \n" " Hello, world! test\n" "

data<>\n" "

\n" " stuff

?\n" " \n" "", "Hello, world! test \ndata<>\nstuff\n?"}, {"

test

", "test\n"}, /* Tables */ {"\n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" " \n" "
headaheadb
data1data2
", "heada headb\ndata1 data2\n"}, /* Invalid closing br and hr + comment */ {" \n" " \n" " Hello, world!
test

contentmore content
\n" "

\n" " content inside div\n" "
\n" " ", "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"}, /* First closing tag */ {"\n" "\n" "

Hello. I have some bad news.\n" "











test

\n" "\n" "", "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"}, /* Invalid tags */ {"lol omg oh my!\n" "words words goodbye","lol omg oh my! words words goodbye"}, /* Invisible stuff */ {"
\n" "

\n" "FSincerely,

\n" "

\n" "8SkypeFWebF

\n" "kreyes\n" "

\n" " 

", " Sincerely,\n Skype Web\n"}, {"lala", "lala"}, {"
\n" "DONKEY\n" "
", ""}, /* bgcolor propagation */ {"\n" "FRevie" "wFΜΉ", " Review"}, {"\n" "hello world\n" "", "hello world"}, /* Colors */ {"goodbye cruel" "world", "goodbye cruelworld"}, /* Font-size propagation */ {"

goodbye cruelworld

", "goodbye world\n"}, /* Newline before tag -> must be space */ {"goodbye cruel\n" "world", "goodbye cruel world"}, /* Head tag with some stuff */ {"

oh my god", "oh my god\n"}, {"oh my god</head><body></body></html>", ""}, {"<html><body><html><head>displayed</body></html></body></html>", "displayed"}, }; rspamd_url_init(NULL); auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "html", 0); auto replace_newlines = [](std::string &str) { auto start_pos = 0; while((start_pos = str.find("\n", start_pos, 1)) != std::string::npos) { str.replace(start_pos, 1, "\\n", 2); start_pos += 2; } }; auto i = 1; for (const auto &c : cases) { SUBCASE((fmt::format("html extraction case {}", i)).c_str()) { GByteArray *tmp = g_byte_array_sized_new(c.first.size()); g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); auto *hc = html_process_input(pool, tmp, nullptr, nullptr, nullptr, true); CHECK(hc != nullptr); replace_newlines(hc->parsed); auto expected = c.second; replace_newlines(expected); CHECK(hc->parsed == expected); g_byte_array_free(tmp, TRUE); } i ++; } rspamd_mempool_delete(pool); } TEST_CASE("html urls extraction") { using namespace std::string_literals; const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{ {"<style></style><a href=\"https://www.example.com\">yolo</a>", {"https://www.example.com"}, "yolo"}, {"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"}, {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"}, {"<html>\n" "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n" "<body>\n" "<a href=\"https://www.example.com\">hello</a>\n" "</body>\n" "</html>", {"https://www.example.com"}, "hello"}, }; rspamd_url_init(NULL); auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(), "html", 0); auto i = 1; for (const auto &c : cases) { SUBCASE((fmt::format("html url extraction case {}", i)).c_str()) { GPtrArray *purls = g_ptr_array_new(); auto input = std::get<0>(c); GByteArray *tmp = g_byte_array_sized_new(input.size()); g_byte_array_append(tmp, (const guint8 *)input.data(), input.size()); auto *hc = html_process_input(pool, tmp, nullptr, nullptr, purls, true); CHECK(hc != nullptr); auto &expected_text = std::get<2>(c); if (expected_text.has_value()) { CHECK(hc->parsed == expected_text.value()); } const auto &expected_urls = std::get<1>(c); CHECK(expected_urls.size() == purls->len); for (auto j = 0; j < expected_urls.size(); ++j) { auto *url = (rspamd_url *)g_ptr_array_index(purls, j); CHECK(expected_urls[j] == std::string{url->string, url->urllen}); } g_byte_array_free(tmp, TRUE); g_ptr_array_free(purls, TRUE); } ++i; } rspamd_mempool_delete(pool); } } } /* namespace rspamd::html */