您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

html_tag_defs.hxx 7.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_HTML_TAG_DEFS_HXX
  17. #define RSPAMD_HTML_TAG_DEFS_HXX
  18. #include "config.h"
  19. #include "html_tags.h"
  20. #include "libutil/cxx/util.hxx"
  21. #include <string>
  22. #include "contrib/ankerl/unordered_dense.h"
  23. namespace rspamd::html {
  24. struct html_tag_def {
  25. std::string name;
  26. tag_id_t id;
  27. unsigned int flags;
  28. };
  29. #define TAG_DEF(id, name, flags) \
  30. html_tag_def \
  31. { \
  32. (name), (id), (flags) \
  33. }
  34. static const auto html_tag_defs_array = rspamd::array_of(
  35. /* W3C defined elements */
  36. TAG_DEF(Tag_A, "a", FL_HREF),
  37. TAG_DEF(Tag_ABBR, "abbr", (CM_INLINE)),
  38. TAG_DEF(Tag_ACRONYM, "acronym", (CM_INLINE)),
  39. TAG_DEF(Tag_ADDRESS, "address", (CM_BLOCK)),
  40. TAG_DEF(Tag_APPLET, "applet", (CM_IMG | CM_INLINE | CM_PARAM)),
  41. TAG_DEF(Tag_AREA, "area", (CM_BLOCK | CM_EMPTY | FL_HREF)),
  42. TAG_DEF(Tag_B, "b", (CM_INLINE | FL_BLOCK)),
  43. TAG_DEF(Tag_BASE, "base", (CM_HEAD | CM_EMPTY)),
  44. TAG_DEF(Tag_BASEFONT, "basefont", (CM_INLINE | CM_EMPTY)),
  45. TAG_DEF(Tag_BDO, "bdo", (CM_INLINE)),
  46. TAG_DEF(Tag_BIG, "big", (CM_INLINE)),
  47. TAG_DEF(Tag_BLOCKQUOTE, "blockquote", (CM_BLOCK)),
  48. TAG_DEF(Tag_BODY, "body", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE | FL_BLOCK)),
  49. TAG_DEF(Tag_BR, "br", (CM_INLINE | CM_EMPTY)),
  50. TAG_DEF(Tag_BUTTON, "button", (CM_INLINE | FL_BLOCK)),
  51. TAG_DEF(Tag_CAPTION, "caption", (CM_TABLE)),
  52. TAG_DEF(Tag_CENTER, "center", (CM_BLOCK)),
  53. TAG_DEF(Tag_CITE, "cite", (CM_INLINE)),
  54. TAG_DEF(Tag_CODE, "code", (CM_INLINE)),
  55. TAG_DEF(Tag_COL, "col", (CM_TABLE | CM_EMPTY)),
  56. TAG_DEF(Tag_COLGROUP, "colgroup", (CM_TABLE | CM_OPT)),
  57. TAG_DEF(Tag_DD, "dd", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  58. TAG_DEF(Tag_DEL, "del", (CM_INLINE | CM_BLOCK)),
  59. TAG_DEF(Tag_DFN, "dfn", (CM_INLINE)),
  60. TAG_DEF(Tag_DIR, "dir", (CM_BLOCK)),
  61. TAG_DEF(Tag_DIV, "div", (CM_BLOCK | FL_BLOCK)),
  62. TAG_DEF(Tag_DL, "dl", (CM_BLOCK | FL_BLOCK)),
  63. TAG_DEF(Tag_DT, "dt", (CM_DEFLIST | CM_OPT | CM_NO_INDENT)),
  64. TAG_DEF(Tag_EM, "em", (CM_INLINE)),
  65. TAG_DEF(Tag_FIELDSET, "fieldset", (CM_BLOCK)),
  66. TAG_DEF(Tag_FONT, "font", (FL_BLOCK)),
  67. TAG_DEF(Tag_FORM, "form", (CM_BLOCK | FL_HREF)),
  68. TAG_DEF(Tag_FRAME, "frame", (CM_EMPTY | FL_HREF)),
  69. TAG_DEF(Tag_FRAMESET, "frameset", (CM_HTML)),
  70. TAG_DEF(Tag_H1, "h1", (CM_BLOCK)),
  71. TAG_DEF(Tag_H2, "h2", (CM_BLOCK)),
  72. TAG_DEF(Tag_H3, "h3", (CM_BLOCK)),
  73. TAG_DEF(Tag_H4, "h4", (CM_BLOCK)),
  74. TAG_DEF(Tag_H5, "h5", (CM_BLOCK)),
  75. TAG_DEF(Tag_H6, "h6", (CM_BLOCK)),
  76. TAG_DEF(Tag_HEAD, "head", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  77. TAG_DEF(Tag_HR, "hr", (CM_BLOCK | CM_EMPTY)),
  78. TAG_DEF(Tag_HTML, "html", (CM_HTML | CM_OPT | CM_OMITST | CM_UNIQUE)),
  79. TAG_DEF(Tag_I, "i", (CM_INLINE)),
  80. TAG_DEF(Tag_IFRAME, "iframe", (FL_HREF)),
  81. TAG_DEF(Tag_IMG, "img", (CM_INLINE | CM_IMG | CM_EMPTY)),
  82. TAG_DEF(Tag_INPUT, "input", (CM_INLINE | CM_IMG | CM_EMPTY)),
  83. TAG_DEF(Tag_INS, "ins", (CM_INLINE | CM_BLOCK)),
  84. TAG_DEF(Tag_ISINDEX, "isindex", (CM_BLOCK | CM_EMPTY)),
  85. TAG_DEF(Tag_KBD, "kbd", (CM_INLINE)),
  86. TAG_DEF(Tag_LABEL, "label", (CM_INLINE)),
  87. TAG_DEF(Tag_LEGEND, "legend", (CM_INLINE)),
  88. TAG_DEF(Tag_LI, "li", (CM_LIST | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  89. TAG_DEF(Tag_LINK, "link", (CM_EMPTY | FL_HREF)),
  90. TAG_DEF(Tag_LISTING, "listing", (CM_BLOCK)),
  91. TAG_DEF(Tag_MAP, "map", (CM_INLINE | FL_HREF)),
  92. TAG_DEF(Tag_MENU, "menu", (CM_BLOCK)),
  93. TAG_DEF(Tag_META, "meta", (CM_HEAD | CM_INLINE | CM_EMPTY)),
  94. TAG_DEF(Tag_NOFRAMES, "noframes", (CM_BLOCK)),
  95. TAG_DEF(Tag_NOSCRIPT, "noscript", (CM_BLOCK | CM_INLINE | CM_RAW)),
  96. TAG_DEF(Tag_OBJECT, "object", (CM_HEAD | CM_IMG | CM_INLINE | CM_PARAM)),
  97. TAG_DEF(Tag_OL, "ol", (CM_BLOCK | FL_BLOCK)),
  98. TAG_DEF(Tag_OPTGROUP, "optgroup", (CM_FIELD | CM_OPT)),
  99. TAG_DEF(Tag_OPTION, "option", (CM_FIELD | CM_OPT)),
  100. TAG_DEF(Tag_P, "p", (CM_BLOCK | CM_OPT | FL_BLOCK)),
  101. TAG_DEF(Tag_PARAM, "param", (CM_INLINE | CM_EMPTY)),
  102. TAG_DEF(Tag_PLAINTEXT, "plaintext", (CM_BLOCK)),
  103. TAG_DEF(Tag_PRE, "pre", (CM_BLOCK)),
  104. TAG_DEF(Tag_Q, "q", (CM_INLINE)),
  105. TAG_DEF(Tag_RB, "rb", (CM_INLINE)),
  106. TAG_DEF(Tag_RBC, "rbc", (CM_INLINE)),
  107. TAG_DEF(Tag_RP, "rp", (CM_INLINE)),
  108. TAG_DEF(Tag_RT, "rt", (CM_INLINE)),
  109. TAG_DEF(Tag_RTC, "rtc", (CM_INLINE)),
  110. TAG_DEF(Tag_RUBY, "ruby", (CM_INLINE)),
  111. TAG_DEF(Tag_S, "s", (CM_INLINE)),
  112. TAG_DEF(Tag_SAMP, "samp", (CM_INLINE)),
  113. TAG_DEF(Tag_SCRIPT, "script", (CM_HEAD | CM_RAW)),
  114. TAG_DEF(Tag_SELECT, "select", (CM_INLINE | CM_FIELD)),
  115. TAG_DEF(Tag_SMALL, "small", (CM_INLINE)),
  116. TAG_DEF(Tag_SPAN, "span", (CM_NO_INDENT | FL_BLOCK)),
  117. TAG_DEF(Tag_STRIKE, "strike", (CM_INLINE)),
  118. TAG_DEF(Tag_STRONG, "strong", (CM_INLINE)),
  119. TAG_DEF(Tag_STYLE, "style", (CM_HEAD | CM_RAW)),
  120. TAG_DEF(Tag_SUB, "sub", (CM_INLINE)),
  121. TAG_DEF(Tag_SUP, "sup", (CM_INLINE)),
  122. TAG_DEF(Tag_TABLE, "table", (CM_BLOCK | FL_BLOCK)),
  123. TAG_DEF(Tag_TBODY, "tbody", (CM_TABLE | CM_ROWGRP | CM_OPT | FL_BLOCK)),
  124. TAG_DEF(Tag_TD, "td", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  125. TAG_DEF(Tag_TEXTAREA, "textarea", (CM_INLINE | CM_FIELD)),
  126. TAG_DEF(Tag_TFOOT, "tfoot", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  127. TAG_DEF(Tag_TH, "th", (CM_ROW | CM_OPT | CM_NO_INDENT | FL_BLOCK)),
  128. TAG_DEF(Tag_THEAD, "thead", (CM_TABLE | CM_ROWGRP | CM_OPT)),
  129. TAG_DEF(Tag_TITLE, "title", (CM_HEAD | CM_UNIQUE)),
  130. TAG_DEF(Tag_TR, "tr", (CM_TABLE | CM_OPT | FL_BLOCK)),
  131. TAG_DEF(Tag_TT, "tt", (CM_INLINE)),
  132. TAG_DEF(Tag_U, "u", (CM_INLINE)),
  133. TAG_DEF(Tag_UL, "ul", (CM_BLOCK | FL_BLOCK)),
  134. TAG_DEF(Tag_VAR, "var", (CM_INLINE)),
  135. TAG_DEF(Tag_XMP, "xmp", (CM_BLOCK)),
  136. TAG_DEF(Tag_NEXTID, "nextid", (CM_HEAD | CM_EMPTY)));
  137. class html_tags_storage {
  138. ankerl::unordered_dense::map<std::string_view, html_tag_def> tag_by_name;
  139. ankerl::unordered_dense::map<tag_id_t, html_tag_def> tag_by_id;
  140. public:
  141. html_tags_storage()
  142. {
  143. tag_by_name.reserve(html_tag_defs_array.size());
  144. tag_by_id.reserve(html_tag_defs_array.size());
  145. for (const auto &t: html_tag_defs_array) {
  146. tag_by_name[t.name] = t;
  147. tag_by_id[t.id] = t;
  148. }
  149. }
  150. auto by_name(std::string_view name) const -> const html_tag_def *
  151. {
  152. auto it = tag_by_name.find(name);
  153. if (it != tag_by_name.end()) {
  154. return &(it->second);
  155. }
  156. return nullptr;
  157. }
  158. auto by_id(int id) const -> const html_tag_def *
  159. {
  160. auto it = tag_by_id.find(static_cast<tag_id_t>(id));
  161. if (it != tag_by_id.end()) {
  162. return &(it->second);
  163. }
  164. return nullptr;
  165. }
  166. auto name_by_id_safe(int id) const -> std::string_view
  167. {
  168. auto it = tag_by_id.find(static_cast<tag_id_t>(id));
  169. if (it != tag_by_id.end()) {
  170. return it->second.name;
  171. }
  172. return "unknown";
  173. }
  174. };
  175. }// namespace rspamd::html
  176. #endif//RSPAMD_HTML_TAG_DEFS_HXX