You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

html_tests.cxx 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304
  1. /*
  2. * Copyright 2024 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "config.h"
  17. #include "html.hxx"
  18. #include "libserver/task.h"
  19. #include <vector>
  20. #include <fmt/core.h>
  21. #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
  22. #include "doctest/doctest.h"
  23. namespace rspamd::html {
  24. /*
  25. * Tests part
  26. */
  27. TEST_SUITE("html")
  28. {
  29. TEST_CASE("html parsing")
  30. {
  31. const std::vector<std::pair<std::string, std::string>> cases{
  32. {"<html><!DOCTYPE html><body>", "+html;++xml;++body;"},
  33. {"<html><div><div></div></div></html>", "+html;++div;+++div;"},
  34. {"<html><div><div></div></html>", "+html;++div;+++div;"},
  35. {"<html><div><div></div></html></div>", "+html;++div;+++div;"},
  36. {"<p><p><a></p></a></a>", "+p;++p;+++a;"},
  37. {"<div><a href=\"http://example.com\"></div></a>", "+div;++a;"},
  38. /* Broken, as I don't know how the hell this should be really parsed */
  39. //{"<html><!DOCTYPE html><body><head><body></body></html></body></html>",
  40. // "+html;++xml;++body;+++head;+++body;"}
  41. };
  42. rspamd_url_init(NULL);
  43. auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
  44. "html", 0);
  45. struct rspamd_task fake_task;
  46. memset(&fake_task, 0, sizeof(fake_task));
  47. fake_task.task_pool = pool;
  48. for (const auto &c: cases) {
  49. SUBCASE((std::string("extract tags from: ") + c.first).c_str())
  50. {
  51. GByteArray *tmp = g_byte_array_sized_new(c.first.size());
  52. g_byte_array_append(tmp, (const uint8_t *) c.first.data(), c.first.size());
  53. auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
  54. CHECK(hc != nullptr);
  55. auto dump = html_debug_structure(*hc);
  56. CHECK(c.second == dump);
  57. g_byte_array_free(tmp, TRUE);
  58. }
  59. }
  60. rspamd_mempool_delete(pool);
  61. }
  62. TEST_CASE("html text extraction")
  63. {
  64. using namespace std::string_literals;
  65. const std::vector<std::pair<std::string, std::string>> cases{
  66. {"test", "test"},
  67. {"test\0"s, "test\uFFFD"s},
  68. {"test\0test"s, "test\uFFFDtest"s},
  69. {"test\0\0test"s, "test\uFFFD\uFFFDtest"s},
  70. {"test ", "test"},
  71. {"test foo, bar", "test foo, bar"},
  72. {"<p>text</p>", "text\n"},
  73. {"olo<p>text</p>lolo", "olo\ntext\nlolo"},
  74. {"<div>foo</div><div>bar</div>", "foo\nbar\n"},
  75. {"<b>foo<i>bar</b>baz</i>", "foobarbaz"},
  76. {"<b>foo<i>bar</i>baz</b>", "foobarbaz"},
  77. {"foo<br>baz", "foo\nbaz"},
  78. {"<a href=https://example.com>test</a>", "test"},
  79. {"<img alt=test>", "test"},
  80. {" <body>\n"
  81. " <!-- escape content -->\n"
  82. " a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;\n"
  83. " </body>",
  84. R"|(a b a > b a < b a & b 'a "a")|"},
  85. /* XML tags */
  86. {"<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
  87. " <!DOCTYPE html\n"
  88. " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"\n"
  89. " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
  90. "<body>test</body>",
  91. "test"},
  92. {"<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=UTF-8\"></head>"
  93. " <body>\n"
  94. " <p><br>\n"
  95. " </p>\n"
  96. " <div class=\"moz-forward-container\"><br>\n"
  97. " <br>\n"
  98. " test</div>"
  99. "</body>",
  100. "\n\n\ntest\n"},
  101. {"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
  102. "sh<span style=\"FONT-SIZE: 0px\">aring </span></div>",
  103. "fish\n"},
  104. /* FIXME: broken until rework of css parser */
  105. //{"<div>fi<span style=\"FONT-SIZE: 0px\">le </span>"
  106. // "sh<span style=\"FONT-SIZE: 0px\">aring </div>foo</span>", "fish\nfoo"},
  107. /* Complex html with bad tags */
  108. {"<!DOCTYPE html>\n"
  109. "<html lang=\"en\">\n"
  110. " <head>\n"
  111. " <meta charset=\"utf-8\">\n"
  112. " <title>title</title>\n"
  113. " <link rel=\"stylesheet\" href=\"style.css\">\n"
  114. " <script src=\"script.js\"></script>\n"
  115. " </head>\n"
  116. " <body>\n"
  117. " <!-- page content -->\n"
  118. " Hello, world! <b>test</b>\n"
  119. " <p>data<>\n"
  120. " </P>\n"
  121. " <b>stuff</p>?\n"
  122. " </body>\n"
  123. "</html>",
  124. "Hello, world! test \ndata<>\nstuff\n?"},
  125. {"<p><!--comment-->test</br></hr><br>", "test\n"},
  126. /* Tables */
  127. {"<table>\n"
  128. " <tr>\n"
  129. " <th>heada</th>\n"
  130. " <th>headb</th>\n"
  131. " </tr>\n"
  132. " <tr>\n"
  133. " <td>data1</td>\n"
  134. " <td>data2</td>\n"
  135. " </tr>\n"
  136. " </table>",
  137. "heada headb\ndata1 data2\n"},
  138. /* Invalid closing br and hr + comment */
  139. {" <body>\n"
  140. " <!-- page content -->\n"
  141. " Hello, world!<br>test</br><br>content</hr>more content<br>\n"
  142. " <div>\n"
  143. " content inside div\n"
  144. " </div>\n"
  145. " </body>",
  146. "Hello, world!\ntest\ncontentmore content\ncontent inside div\n"},
  147. /* First closing tag */
  148. {"</head>\n"
  149. "<body>\n"
  150. "<p> Hello. I have some bad news.\n"
  151. "<br /> <br /> <br /> <strong> <br /> <br /> <br /> <br /> <br /> <br /> <br /> <br /> </strong><span> <br /> </span>test</p>\n"
  152. "</body>\n"
  153. "</html>",
  154. "Hello. I have some bad news. \n\n\n\n\n\n\n\n\n\n\n\ntest\n"},
  155. /* Invalid tags */
  156. {"lol <sht> omg </sht> oh my!\n"
  157. "<name>words words</name> goodbye",
  158. "lol omg oh my! words words goodbye"},
  159. /* Invisible stuff */
  160. {"<div style=\"color:#555555;font-family:Arial, 'Helvetica Neue', Helvetica, sans-serif;line-height:1.2;padding-top:10px;padding-right:10px;padding-bottom:10px;padding-left:10px;font-style: italic;\">\n"
  161. "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n"
  162. "<span style=\"color:#FFFFFF; \">F</span>Sincerely,</p>\n"
  163. "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n"
  164. "<span style=\"color:#FFFFFF; \">8</span>Sky<span style=\"opacity:1;\"></span>pe<span style=\"color:#FFFFFF; \">F</span>Web<span style=\"color:#FFFFFF; \">F</span></p>\n"
  165. "<span style=\"color:#FFFFFF; \">kreyes</span>\n"
  166. "<p style=\"font-size: 11px; line-height: 1.2; color: #555555; font-family: Arial, 'Helvetica Neue', Helvetica, sans-serif; mso-line-height-alt: 14px; margin: 0;\">\n"
  167. "&nbsp;</p>",
  168. " Sincerely,\n Skype Web\n"},
  169. {"lala<p hidden>fafa</p>", "lala"},
  170. {"<table style=\"FONT-SIZE: 0px;\"><tbody><tr><td>\n"
  171. "DONKEY\n"
  172. "</td></tr></tbody></table>",
  173. ""},
  174. /* bgcolor propagation */
  175. {"<a style=\"display: inline-block; color: #ffffff; background-color: #00aff0;\">\n"
  176. "<span style=\"color: #00aff0;\">F</span>Rev<span style=\"opacity: 1;\"></span></span>ie<span style=\"opacity: 1;\"></span>"
  177. "</span>w<span style=\"color: #00aff0;\">F<span style=\"opacity: 1;\">̹</span></span>",
  178. " Review"},
  179. {"<td style=\"color:#ffffff\" bgcolor=\"#005595\">\n"
  180. "hello world\n"
  181. "</td>",
  182. "hello world"},
  183. /* Colors */
  184. {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>"
  185. "<span>world</span>",
  186. "goodbye cruelworld"},
  187. /* Font-size propagation */
  188. {"<p style=\"font-size: 11pt;line-height:22px\">goodbye <span style=\"font-size:0px\">cruel</span>world</p>",
  189. "goodbye world\n"},
  190. /* Newline before tag -> must be space */
  191. {"goodbye <span style=\"COLOR: rgb(64,64,64)\">cruel</span>\n"
  192. "<span>world</span>",
  193. "goodbye cruel world"},
  194. /* Head tag with some stuff */
  195. {"<html><head><p>oh my god</head><body></body></html>", "oh my god\n"},
  196. {"<html><head><title>oh my god</head><body></body></html>", ""},
  197. {"<html><body><html><head>displayed</body></html></body></html>", "displayed"},
  198. };
  199. rspamd_url_init(NULL);
  200. auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
  201. "html", 0);
  202. struct rspamd_task fake_task;
  203. memset(&fake_task, 0, sizeof(fake_task));
  204. fake_task.task_pool = pool;
  205. auto replace_newlines = [](std::string &str) {
  206. auto start_pos = 0;
  207. while ((start_pos = str.find("\n", start_pos, 1)) != std::string::npos) {
  208. str.replace(start_pos, 1, "\\n", 2);
  209. start_pos += 2;
  210. }
  211. };
  212. auto i = 1;
  213. for (const auto &c: cases) {
  214. SUBCASE((fmt::format("html extraction case {}", i)).c_str())
  215. {
  216. GByteArray *tmp = g_byte_array_sized_new(c.first.size());
  217. g_byte_array_append(tmp, (const uint8_t *) c.first.data(), c.first.size());
  218. auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
  219. CHECK(hc != nullptr);
  220. replace_newlines(hc->parsed);
  221. auto expected = c.second;
  222. replace_newlines(expected);
  223. CHECK(hc->parsed == expected);
  224. g_byte_array_free(tmp, TRUE);
  225. }
  226. i++;
  227. }
  228. rspamd_mempool_delete(pool);
  229. }
  230. TEST_CASE("html urls extraction")
  231. {
  232. using namespace std::string_literals;
  233. const std::vector<std::tuple<std::string, std::vector<std::string>, std::optional<std::string>>> cases{
  234. {"<style></style><a href=\"https://www.example.com\">yolo</a>",
  235. {"https://www.example.com"},
  236. "yolo"},
  237. {"<a href=\"https://example.com\">test</a>", {"https://example.com"}, "test"},
  238. {"<a <poo href=\"http://example.com\">hello</a>", {"http://example.com"}, "hello"},
  239. {"<html>\n"
  240. "<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n"
  241. "<body>\n"
  242. "<a href=\"https://www.example.com\">hello</a>\n"
  243. "</body>\n"
  244. "</html>",
  245. {"https://www.example.com"},
  246. "hello"},
  247. };
  248. rspamd_url_init(NULL);
  249. auto *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
  250. "html", 0);
  251. struct rspamd_task fake_task;
  252. memset(&fake_task, 0, sizeof(fake_task));
  253. fake_task.task_pool = pool;
  254. auto i = 1;
  255. for (const auto &c: cases) {
  256. SUBCASE((fmt::format("html url extraction case {}", i)).c_str())
  257. {
  258. GPtrArray *purls = g_ptr_array_new();
  259. auto input = std::get<0>(c);
  260. GByteArray *tmp = g_byte_array_sized_new(input.size());
  261. g_byte_array_append(tmp, (const uint8_t *) input.data(), input.size());
  262. auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr);
  263. CHECK(hc != nullptr);
  264. auto &expected_text = std::get<2>(c);
  265. if (expected_text.has_value()) {
  266. CHECK(hc->parsed == expected_text.value());
  267. }
  268. const auto &expected_urls = std::get<1>(c);
  269. CHECK(expected_urls.size() == purls->len);
  270. for (auto j = 0; j < expected_urls.size(); ++j) {
  271. auto *url = (rspamd_url *) g_ptr_array_index(purls, j);
  272. CHECK(expected_urls[j] == std::string{url->string, url->urllen});
  273. }
  274. g_byte_array_free(tmp, TRUE);
  275. g_ptr_array_free(purls, TRUE);
  276. }
  277. ++i;
  278. }
  279. rspamd_mempool_delete(pool);
  280. }
  281. }
  282. } /* namespace rspamd::html */