You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

html.hxx 3.4KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_HTML_HXX
  17. #define RSPAMD_HTML_HXX
  18. #pragma once
  19. #include "config.h"
  20. #include "libserver/url.h"
  21. #include "libserver/html/html_tag.hxx"
  22. #include "libserver/html/html.h"
  23. #include "libserver/html/html_tags.h"
  24. #include <vector>
  25. #include <memory>
  26. #include <string>
  27. #include "function2/function2.hpp"
  28. namespace rspamd::css {
  29. /* Forward declaration only: html_content refers to css_style_sheet through a std::shared_ptr member, and this header declares the class without defining it */
  30. class css_style_sheet;
  31. }// namespace rspamd::css
  32. namespace rspamd::html {
  33. struct html_block; /* Forward declaration; no definition is provided in this header */
  34. struct html_content {
  35. struct rspamd_url *base_url = nullptr;
  36. struct html_tag *root_tag = nullptr;
  37. int flags = 0;
  38. std::vector<bool> tags_seen;
  39. std::vector<html_image *> images;
  40. std::vector<std::unique_ptr<struct html_tag>> all_tags;
  41. std::string parsed;
  42. std::string invisible;
  43. std::shared_ptr<css::css_style_sheet> css_style;
  44. /* Preallocate and reserve all internal structures */
  45. html_content()
  46. {
  47. tags_seen.resize(Tag_MAX, false);
  48. all_tags.reserve(128);
  49. parsed.reserve(256);
  50. }
  51. static void html_content_dtor(void *ptr)
  52. {
  53. delete html_content::from_ptr(ptr);
  54. }
  55. static auto from_ptr(void *ptr) -> html_content *
  56. {
  57. return static_cast<html_content *>(ptr);
  58. }
  59. enum class traverse_type {
  60. PRE_ORDER,
  61. POST_ORDER
  62. };
  63. auto traverse_block_tags(fu2::function<bool(const html_tag *)> &&func,
  64. traverse_type how = traverse_type::PRE_ORDER) const -> bool
  65. {
  66. if (root_tag == nullptr) {
  67. return false;
  68. }
  69. auto rec_functor_pre_order = [&](const html_tag *root, auto &&rec) -> bool {
  70. if (func(root)) {
  71. for (const auto *c: root->children) {
  72. if (!rec(c, rec)) {
  73. return false;
  74. }
  75. }
  76. return true;
  77. }
  78. return false;
  79. };
  80. auto rec_functor_post_order = [&](const html_tag *root, auto &&rec) -> bool {
  81. for (const auto *c: root->children) {
  82. if (!rec(c, rec)) {
  83. return false;
  84. }
  85. }
  86. return func(root);
  87. };
  88. switch (how) {
  89. case traverse_type::PRE_ORDER:
  90. return rec_functor_pre_order(root_tag, rec_functor_pre_order);
  91. case traverse_type::POST_ORDER:
  92. return rec_functor_post_order(root_tag, rec_functor_post_order);
  93. default:
  94. RSPAMD_UNREACHABLE;
  95. }
  96. }
  97. auto traverse_all_tags(fu2::function<bool(const html_tag *)> &&func) const -> bool
  98. {
  99. for (const auto &tag: all_tags) {
  100. if (!(tag->flags & (FL_XML | FL_VIRTUAL))) {
  101. if (!func(tag.get())) {
  102. return false;
  103. }
  104. }
  105. }
  106. return true;
  107. }
  108. private:
  109. ~html_content() = default;
  110. };
  /*
   * Declaration only; the definition is not in this header.
   * Takes a tag name and returns an optional tag id.
   * NOTE(review): presumably an empty optional means the name is not a known tag - confirm against the definition.
   */
  111. auto html_tag_by_name(const std::string_view &name) -> std::optional<tag_id_t>;
  /*
   * Declaration only; the definition is not in this header.
   * Takes a task, an input byte array and several URL-related containers, and
   * returns a pointer to an html_content object.
   * NOTE(review): html_content has a private destructor, so the returned object
   * presumably has to be released through html_content::html_content_dtor -
   * confirm ownership against the definition.
   * NOTE(review): `allow_css` presumably enables CSS processing and
   * `cur_url_order` presumably is an in/out counter - TODO confirm.
   */
  112. auto html_process_input(struct rspamd_task *task,
  113. GByteArray *in,
  114. GList **exceptions,
  115. khash_t(rspamd_url_hash) * url_set,
  116. GPtrArray *part_urls,
  117. bool allow_css,
  118. std::uint16_t *cur_url_order) -> html_content *;
  /*
   * Declaration only; the definition is not in this header.
   * Takes a parsed html_content by const reference and returns a std::string.
   * NOTE(review): presumably a textual dump of the tag structure for debugging - confirm against the definition.
   */
  119. auto html_debug_structure(const html_content &hc) -> std::string;
  120. }// namespace rspamd::html
  121. #endif//RSPAMD_HTML_HXX