You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

css_parser.cxx 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "css_parser.hxx"
  17. #include "css_tokeniser.hxx"
  18. #include "css_selector.hxx"
  19. #include "css_rule.hxx"
  20. #include "fmt/core.h"
  21. #include <vector>
  22. #include <unicode/utf8.h>
  23. #define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL
  24. #include "doctest/doctest.h"
  25. namespace rspamd::css {
  /*
   * Default-constructed sentinel block: the token functors built in
   * css_parser::consume_input return a reference to it once they have no
   * more blocks to hand out.
   */
  26. const css_consumed_block css_parser_eof_block{};
  27. auto css_consumed_block::attach_block(consumed_block_ptr &&block) -> bool {
  28. if (std::holds_alternative<std::monostate>(content)) {
  29. /* Switch from monostate */
  30. content = std::vector<consumed_block_ptr>();
  31. }
  32. else if (!std::holds_alternative<std::vector<consumed_block_ptr>>(content)) {
  33. /* A single component, cannot attach a block ! */
  34. return false;
  35. }
  36. auto &value_vec = std::get<std::vector<consumed_block_ptr>>(content);
  37. value_vec.push_back(std::move(block));
  38. return true;
  39. }
  40. auto css_consumed_block::add_function_argument(consumed_block_ptr &&block) -> bool {
  41. if (!std::holds_alternative<css_function_block>(content)) {
  42. return false;
  43. }
  44. auto &&func_bloc = std::get<css_function_block>(content);
  45. func_bloc.args.push_back(std::move(block));
  46. return true;
  47. }
  48. auto css_consumed_block::token_type_str(void) const -> const char *
  49. {
  50. const auto *ret = "";
  51. switch(tag) {
  52. case parser_tag_type::css_top_block:
  53. ret = "top";
  54. break;
  55. case parser_tag_type::css_qualified_rule:
  56. ret = "qualified rule";
  57. break;
  58. case parser_tag_type::css_at_rule:
  59. ret = "at rule";
  60. break;
  61. case parser_tag_type::css_simple_block:
  62. ret = "simple block";
  63. break;
  64. case parser_tag_type::css_function:
  65. ret = "function";
  66. break;
  67. case parser_tag_type::css_function_arg:
  68. ret = "function arg";
  69. break;
  70. case parser_tag_type::css_component:
  71. ret = "component";
  72. break;
  73. case parser_tag_type::css_selector:
  74. ret = "selector";
  75. break;
  76. case parser_tag_type::css_eof_block:
  77. ret = "eof";
  78. break;
  79. }
  80. return ret;
  81. }
  82. auto css_consumed_block::debug_str(void) -> std::string {
  83. std::string ret = fmt::format(R"("type": "{}", "value": )", token_type_str());
  84. std::visit([&](auto& arg) {
  85. using T = std::decay_t<decltype(arg)>;
  86. if constexpr (std::is_same_v<T, std::vector<consumed_block_ptr>>) {
  87. /* Array of blocks */
  88. ret += "[";
  89. for (const auto &block : arg) {
  90. ret += "{";
  91. ret += block->debug_str();
  92. ret += "}, ";
  93. }
  94. if (*(--ret.end()) == ' ') {
  95. ret.pop_back();
  96. ret.pop_back(); /* Last ',' */
  97. }
  98. ret += "]";
  99. }
  100. else if constexpr (std::is_same_v<T, std::monostate>) {
  101. /* Empty block */
  102. ret += R"("empty")";
  103. }
  104. else if constexpr (std::is_same_v<T, css_function_block>) {
  105. ret += R"({ "content": {"token": )";
  106. ret += "\"" + arg.function.debug_token_str() + "\", ";
  107. ret += R"("arguments": [)";
  108. for (const auto &block : arg.args) {
  109. ret += "{";
  110. ret += block->debug_str();
  111. ret += "}, ";
  112. }
  113. if (*(--ret.end()) == ' ') {
  114. ret.pop_back();
  115. ret.pop_back(); /* Last ',' */
  116. }
  117. ret += "]}}";
  118. }
  119. else {
  120. /* Single element block */
  121. ret += "\"" + arg.debug_token_str() + "\"";
  122. }
  123. },
  124. content);
  125. return ret;
  126. }
  127. class css_parser {
  128. public:
  129. css_parser(void) = delete; /* Require mempool to be set for logging */
  130. explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
  131. bool consume_input(const std::string_view &sv);
  132. auto get_object_maybe(void) -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> {
  133. if (style_object) {
  134. return std::move(style_object);
  135. }
  136. return tl::make_unexpected(error);
  137. }
  138. private:
  139. std::unique_ptr<css_style_sheet> style_object;
  140. std::unique_ptr<css_tokeniser> tokeniser;
  141. css_parse_error error;
  142. rspamd_mempool_t *pool;
  143. int rec_level = 0;
  144. const int max_rec = 20;
  145. bool eof = false;
  146. /* Helper parser methods */
  147. bool need_unescape(const std::string_view &sv);
  148. /* Consumers */
  149. auto component_value_consumer(std::unique_ptr<css_consumed_block> &top) -> bool;
  150. auto function_consumer(std::unique_ptr<css_consumed_block> &top) -> bool;
  151. auto simple_block_consumer(std::unique_ptr<css_consumed_block> &top,
  152. css_parser_token::token_type expected_end,
  153. bool consume_current) -> bool;
  154. auto qualified_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool;
  155. auto at_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool;
  156. };
  157. /*
  158. * Find if we need to unescape css
  159. */
  160. bool
  161. css_parser::need_unescape(const std::string_view &sv)
  162. {
  163. bool in_quote = false;
  164. char quote_char, prev_c = 0;
  165. for (const auto c : sv) {
  166. if (!in_quote) {
  167. if (c == '"' || c == '\'') {
  168. in_quote = true;
  169. quote_char = c;
  170. }
  171. else if (c == '\\') {
  172. return true;
  173. }
  174. }
  175. else {
  176. if (c == quote_char) {
  177. if (prev_c != '\\') {
  178. in_quote = false;
  179. }
  180. }
  181. prev_c = c;
  182. }
  183. }
  184. return false;
  185. }
  186. auto css_parser::function_consumer(std::unique_ptr<css_consumed_block> &top) -> bool
  187. {
  188. auto ret = true, want_more = true;
  189. msg_debug_css("consume function block; top block: %s, recursion level %d",
  190. top->token_type_str(), rec_level);
  191. if (++rec_level > max_rec) {
  192. msg_err_css("max nesting reached, ignore style");
  193. error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING);
  194. return false;
  195. }
  196. while (ret && want_more && !eof) {
  197. auto next_token = tokeniser->next_token();
  198. switch (next_token.type) {
  199. case css_parser_token::token_type::eof_token:
  200. eof = true;
  201. break;
  202. case css_parser_token::token_type::whitespace_token:
  203. /* Ignore whitespaces */
  204. break;
  205. case css_parser_token::token_type::ebrace_token:
  206. ret = true;
  207. want_more = false;
  208. break;
  209. case css_parser_token::token_type::comma_token:
  210. case css_parser_token::token_type::delim_token:
  211. case css_parser_token::token_type::obrace_token:
  212. break;
  213. default:
  214. /* Attach everything to the function block */
  215. top->add_function_argument(std::make_unique<css_consumed_block>(
  216. css::css_consumed_block::parser_tag_type::css_function_arg,
  217. std::move(next_token)));
  218. break;
  219. }
  220. }
  221. --rec_level;
  222. return ret;
  223. }
  224. auto css_parser::simple_block_consumer(std::unique_ptr<css_consumed_block> &top,
  225. css_parser_token::token_type expected_end,
  226. bool consume_current) -> bool
  227. {
  228. auto ret = true;
  229. std::unique_ptr<css_consumed_block> block;
  230. msg_debug_css("consume simple block; top block: %s, recursion level %d",
  231. top->token_type_str(), rec_level);
  232. if (!consume_current && ++rec_level > max_rec) {
  233. msg_err_css("max nesting reached, ignore style");
  234. error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING);
  235. return false;
  236. }
  237. if (!consume_current) {
  238. block = std::make_unique<css_consumed_block>(
  239. css_consumed_block::parser_tag_type::css_simple_block);
  240. }
  241. while (ret && !eof) {
  242. auto next_token = tokeniser->next_token();
  243. if (next_token.type == expected_end) {
  244. break;
  245. }
  246. switch (next_token.type) {
  247. case css_parser_token::token_type::eof_token:
  248. eof = true;
  249. break;
  250. case css_parser_token::token_type::whitespace_token:
  251. /* Ignore whitespaces */
  252. break;
  253. default:
  254. tokeniser->pushback_token(std::move(next_token));
  255. ret = component_value_consumer(consume_current ? top : block);
  256. break;
  257. }
  258. }
  259. if (!consume_current && ret) {
  260. msg_debug_css("attached node 'simple block' rule %s; length=%d",
  261. block->token_type_str(), (int)block->size());
  262. top->attach_block(std::move(block));
  263. }
  264. if (!consume_current) {
  265. --rec_level;
  266. }
  267. return ret;
  268. }
  269. auto css_parser::qualified_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool
  270. {
  271. msg_debug_css("consume qualified block; top block: %s, recursion level %d",
  272. top->token_type_str(), rec_level);
  273. if (++rec_level > max_rec) {
  274. msg_err_css("max nesting reached, ignore style");
  275. error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING);
  276. return false;
  277. }
  278. auto ret = true, want_more = true;
  279. auto block = std::make_unique<css_consumed_block>(
  280. css_consumed_block::parser_tag_type::css_qualified_rule);
  281. while (ret && want_more && !eof) {
  282. auto next_token = tokeniser->next_token();
  283. switch (next_token.type) {
  284. case css_parser_token::token_type::eof_token:
  285. eof = true;
  286. break;
  287. case css_parser_token::token_type::cdo_token:
  288. case css_parser_token::token_type::cdc_token:
  289. if (top->tag == css_consumed_block::parser_tag_type::css_top_block) {
  290. /* Ignore */
  291. ret = true;
  292. }
  293. else {
  294. }
  295. break;
  296. case css_parser_token::token_type::ocurlbrace_token:
  297. ret = simple_block_consumer(block,
  298. css_parser_token::token_type::ecurlbrace_token, false);
  299. want_more = false;
  300. break;
  301. case css_parser_token::token_type::whitespace_token:
  302. /* Ignore whitespaces */
  303. break;
  304. default:
  305. tokeniser->pushback_token(std::move(next_token));
  306. ret = component_value_consumer(block);
  307. break;
  308. };
  309. }
  310. if (ret) {
  311. if (top->tag == css_consumed_block::parser_tag_type::css_top_block) {
  312. msg_debug_css("attached node qualified rule %s; length=%d",
  313. block->token_type_str(), (int)block->size());
  314. top->attach_block(std::move(block));
  315. }
  316. }
  317. --rec_level;
  318. return ret;
  319. }
  320. auto css_parser::at_rule_consumer(std::unique_ptr<css_consumed_block> &top) -> bool
  321. {
  322. msg_debug_css("consume at-rule block; top block: %s, recursion level %d",
  323. top->token_type_str(), rec_level);
  324. if (++rec_level > max_rec) {
  325. msg_err_css("max nesting reached, ignore style");
  326. error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING);
  327. return false;
  328. }
  329. auto ret = true, want_more = true;
  330. auto block = std::make_unique<css_consumed_block>(
  331. css_consumed_block::parser_tag_type::css_at_rule);
  332. while (ret && want_more && !eof) {
  333. auto next_token = tokeniser->next_token();
  334. switch (next_token.type) {
  335. case css_parser_token::token_type::eof_token:
  336. eof = true;
  337. break;
  338. case css_parser_token::token_type::cdo_token:
  339. case css_parser_token::token_type::cdc_token:
  340. if (top->tag == css_consumed_block::parser_tag_type::css_top_block) {
  341. /* Ignore */
  342. ret = true;
  343. }
  344. else {
  345. }
  346. break;
  347. case css_parser_token::token_type::ocurlbrace_token:
  348. ret = simple_block_consumer(block,
  349. css_parser_token::token_type::ecurlbrace_token, false);
  350. want_more = false;
  351. break;
  352. case css_parser_token::token_type::whitespace_token:
  353. /* Ignore whitespaces */
  354. break;
  355. case css_parser_token::token_type::semicolon_token:
  356. want_more = false;
  357. break;
  358. default:
  359. tokeniser->pushback_token(std::move(next_token));
  360. ret = component_value_consumer(block);
  361. break;
  362. };
  363. }
  364. if (ret) {
  365. if (top->tag == css_consumed_block::parser_tag_type::css_top_block) {
  366. msg_debug_css("attached node qualified rule %s; length=%d",
  367. block->token_type_str(), (int)block->size());
  368. top->attach_block(std::move(block));
  369. }
  370. }
  371. --rec_level;
  372. return ret;
  373. }
  374. auto css_parser::component_value_consumer(std::unique_ptr<css_consumed_block> &top) -> bool
  375. {
  376. auto ret = true, need_more = true;
  377. std::unique_ptr<css_consumed_block> block;
  378. msg_debug_css("consume component block; top block: %s, recursion level %d",
  379. top->token_type_str(), rec_level);
  380. if (++rec_level > max_rec) {
  381. error = css_parse_error(css_parse_error_type::PARSE_ERROR_BAD_NESTING);
  382. return false;
  383. }
  384. while (ret && need_more && !eof) {
  385. auto next_token = tokeniser->next_token();
  386. switch (next_token.type) {
  387. case css_parser_token::token_type::eof_token:
  388. eof = true;
  389. break;
  390. case css_parser_token::token_type::ocurlbrace_token:
  391. block = std::make_unique<css_consumed_block>(
  392. css_consumed_block::parser_tag_type::css_simple_block);
  393. ret = simple_block_consumer(block,
  394. css_parser_token::token_type::ecurlbrace_token,
  395. true);
  396. need_more = false;
  397. break;
  398. case css_parser_token::token_type::obrace_token:
  399. block = std::make_unique<css_consumed_block>(
  400. css_consumed_block::parser_tag_type::css_simple_block);
  401. ret = simple_block_consumer(block,
  402. css_parser_token::token_type::ebrace_token,
  403. true);
  404. need_more = false;
  405. break;
  406. case css_parser_token::token_type::osqbrace_token:
  407. block = std::make_unique<css_consumed_block>(
  408. css_consumed_block::parser_tag_type::css_simple_block);
  409. ret = simple_block_consumer(block,
  410. css_parser_token::token_type::esqbrace_token,
  411. true);
  412. need_more = false;
  413. break;
  414. case css_parser_token::token_type::whitespace_token:
  415. /* Ignore whitespaces */
  416. break;
  417. case css_parser_token::token_type::function_token: {
  418. need_more = false;
  419. block = std::make_unique<css_consumed_block>(
  420. css_consumed_block::parser_tag_type::css_function,
  421. std::move(next_token));
  422. /* Consume the rest */
  423. ret = function_consumer(block);
  424. break;
  425. }
  426. default:
  427. block = std::make_unique<css_consumed_block>(
  428. css_consumed_block::parser_tag_type::css_component,
  429. std::move(next_token));
  430. need_more = false;
  431. break;
  432. }
  433. }
  434. if (ret && block) {
  435. msg_debug_css("attached node component rule %s; length=%d",
  436. block->token_type_str(), (int)block->size());
  437. top->attach_block(std::move(block));
  438. }
  439. --rec_level;
  440. return ret;
  441. }
  442. bool css_parser::consume_input(const std::string_view &sv)
  443. {
  444. tokeniser = std::make_unique<css_tokeniser>(pool, sv);
  445. auto ret = true;
  446. auto consumed_blocks =
  447. std::make_unique<css_consumed_block>(css_consumed_block::parser_tag_type::css_top_block);
  448. while (!eof && ret) {
  449. auto next_token = tokeniser->next_token();
  450. switch (next_token.type) {
  451. case css_parser_token::token_type::whitespace_token:
  452. /* Ignore whitespaces */
  453. break;
  454. case css_parser_token::token_type::eof_token:
  455. eof = true;
  456. break;
  457. case css_parser_token::token_type::at_keyword_token:
  458. tokeniser->pushback_token(std::move(next_token));
  459. ret = at_rule_consumer(consumed_blocks);
  460. break;
  461. default:
  462. tokeniser->pushback_token(std::move(next_token));
  463. ret = qualified_rule_consumer(consumed_blocks);
  464. break;
  465. }
  466. }
  467. const auto &rules = consumed_blocks->get_blocks_or_empty();
  468. for (auto &&rule : rules) {
  469. /*
  470. * For now, we do not need any of the at rules, so we can safely ignore them
  471. */
  472. auto &&children = rule->get_blocks_or_empty();
  473. if (children.size() > 1 &&
  474. children[0]->tag == css_consumed_block::parser_tag_type::css_component) {
  475. auto simple_block = std::find_if(children.begin(), children.end(),
  476. [](auto &bl) {
  477. return bl->tag == css_consumed_block::parser_tag_type::css_simple_block;
  478. });
  479. if (simple_block != children.end()) {
  480. /*
  481. * We have a component and a simple block,
  482. * so we can parse a selector and then extract
  483. * declarations from a simple block
  484. */
  485. /* First, tag all components as preamble */
  486. auto selector_it = children.cbegin();
  487. auto selector_token_functor = [&selector_it,&simple_block](void)
  488. -> const css_consumed_block & {
  489. for (;;) {
  490. if (selector_it == simple_block) {
  491. return css_parser_eof_block;
  492. }
  493. const auto &ret = (*selector_it);
  494. ++selector_it;
  495. return *ret;
  496. }
  497. };
  498. auto selectors_vec = process_selector_tokens(pool, selector_token_functor);
  499. auto decls_it = (*simple_block)->get_blocks_or_empty().cbegin();
  500. auto decls_end = (*simple_block)->get_blocks_or_empty().cend();
  501. auto declaration_token_functor = [&decls_it,&decls_end](void)
  502. -> const css_consumed_block & {
  503. for (;;) {
  504. if (decls_it == decls_end) {
  505. return css_parser_eof_block;
  506. }
  507. const auto &ret = (*decls_it);
  508. ++decls_it;
  509. return *ret;
  510. }
  511. };
  512. auto declarations_vec = process_declaration_tokens(pool,
  513. declaration_token_functor);
  514. }
  515. }
  516. }
  517. auto debug_str = consumed_blocks->debug_str();
  518. msg_debug_css("consumed css: {%*s}", (int)debug_str.size(), debug_str.data());
  519. tokeniser.reset(nullptr); /* No longer needed */
  520. return ret;
  521. }
  522. /*
  523. * Wrapper for the parser
  524. */
  525. auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
  526. bool
  527. {
  528. css_parser parser(pool);
  529. if (parser.consume_input(st)) {
  530. return true;
  531. }
  532. return false;
  533. }
  534. TEST_SUITE("css parser") {
  535. TEST_CASE("parse colors") {
  536. const std::vector<const char *> cases{
  537. "p { color: rgb(100%, 50%, 0%); opacity: -1; width: 1em; display: none; } /* very transparent solid orange */",
  538. "p { color: rgb(100%, 50%, 0%); opacity: 2; display: inline; } /* very transparent solid orange */",
  539. "p { color: rgb(100%, 50%, 0%); opacity: 0.5; } /* very transparent solid orange */\n",
  540. "p { color: rgb(100%, 50%, 0%); opacity: 1; width: 99%; } /* very transparent solid orange */\n",
  541. "p { color: rgb(100%, 50%, 0%); opacity: 10%; width: 99%; } /* very transparent solid orange */\n",
  542. "p { color: rgb(100%, 50%, 0%); opacity: 10%; width: 100px; } /* very transparent solid orange */\n",
  543. "p { color: rgb(100%, 50%, 0%); opacity: 10% } /* very transparent solid orange */\n",
  544. "* { color: hsl(0, 100%, 50%) !important } /* red */\n",
  545. "* { color: hsl(120, 100%, 50%) important } /* lime */\n",
  546. "* { color: hsl(120, 100%, 25%) } /* dark green */\n",
  547. "* { color: hsl(120, 100%, 75%) } /* light green */\n",
  548. "* { color: hsl(120, 75%, 75%) } /* pastel green, and so on */\n",
  549. "em { color: #f00 } /* #rgb */\n",
  550. "em { color: #ff0000 } /* #rrggbb */\n",
  551. "em { color: rgb(255,0,0) }\n",
  552. "em { color: rgb(100%, 0%, 0%) }\n",
  553. "body {color: black; background: white }\n",
  554. "h1 { color: maroon }\n",
  555. "h2 { color: olive }\n",
  556. "em { color: rgb(255,0,0) } /* integer range 0 - 255 */\n",
  557. "em { color: rgb(300,0,0) } /* clipped to rgb(255,0,0) */\n",
  558. "em { color: rgb(255,-10,0) } /* clipped to rgb(255,0,0) */\n",
  559. "em { color: rgb(110%, 0%, 0%) } /* clipped to rgb(100%,0%,0%) */\n",
  560. "em { color: rgb(255,0,0) } /* integer range 0 - 255 */\n",
  561. "em { color: rgba(255,0,0,1) /* the same, with explicit opacity of 1 */\n",
  562. "em { color: rgb(100%,0%,0%) } /* float range 0.0% - 100.0% */\n",
  563. "em { color: rgba(100%,0%,0%,1) } /* the same, with explicit opacity of 1 */\n",
  564. "p { color: rgba(0,0,255,0.5) } /* semi-transparent solid blue */\n",
  565. "p { color: rgba(100%, 50%, 0%, 0.1) } /* very transparent solid orange */",
  566. };
  567. rspamd_mempool_t *pool = rspamd_mempool_new(rspamd_mempool_suggest_size(),
  568. "css", 0);
  569. for (const auto &c : cases) {
  570. CHECK_UNARY(parse_css(pool, c));
  571. }
  572. rspamd_mempool_delete(pool);
  573. }
  574. }
  575. }