diff options
Diffstat (limited to 'src/libserver')
-rw-r--r-- | src/libserver/css/css.cxx | 19 | ||||
-rw-r--r-- | src/libserver/html/html.cxx | 1017 | ||||
-rw-r--r-- | src/libserver/html/html_tag.hxx | 1348 |
3 files changed, 2158 insertions, 226 deletions
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx index 1b369ed17..c53e3c05e 100644 --- a/src/libserver/css/css.cxx +++ b/src/libserver/css/css.cxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block * { - std::optional<std::string_view> id_comp, class_comp; rspamd::html::html_block *res = nullptr; if (!tag) { @@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa } /* First, find id in a tag and a class */ - for (const auto ¶m: tag->components) { - if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) { - id_comp = param.value; - } - else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - class_comp = param.value; - } - } + auto id_comp = tag->find_id(); + auto class_comp = tag->find_class(); /* ID part */ if (id_comp && !pimpl->id_selectors.empty()) { @@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool, return std::make_pair(nullptr, parse_res.error()); } -}// namespace rspamd::css
\ No newline at end of file +}// namespace rspamd::css diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 93d1fdf91..78a6a975c 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -39,6 +39,7 @@ #include "contrib/frozen/include/frozen/string.h" #include "contrib/fmt/include/fmt/core.h" +#include <functional> #include <unicode/uversion.h> namespace rspamd::html { @@ -47,23 +48,88 @@ static const unsigned int max_tags = 8192; /* Ignore tags if this maximum is rea static const html_tags_storage html_tags_defs; -auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>( +auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_enum_type>( { - {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME}, - {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR}, - {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, - {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE}, - {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS}, - {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH}, - {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT}, - {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE}, - {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL}, - {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT}, - {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID}, - {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + {"name", html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME}, + {"href", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF}, + {"src", html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC}, + {"action", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF}, + {"color", html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR}, + {"bgcolor", html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, + {"style", html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE}, + {"class", html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS}, + {"width", html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH}, + {"height", html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT}, + {"size", html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE}, + {"rel", html_component_enum_type::RSPAMD_HTML_COMPONENT_REL}, + {"alt", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT}, + {"id", html_component_enum_type::RSPAMD_HTML_COMPONENT_ID}, + {"hidden", html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + // Typography + {"font-family", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY}, + {"font-size", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE}, + {"font-weight", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT}, + {"font-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE}, + {"text-align", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN}, + {"text-decoration", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION}, + {"line-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT}, + // Layout & positioning + {"margin", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN}, + {"margin-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP}, + {"margin-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM}, + {"margin-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT}, + {"margin-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT}, + {"padding", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING}, + {"padding-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP}, + {"padding-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM}, + {"padding-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT}, + {"padding-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT}, + {"border", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER}, + {"border-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR}, + {"border-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH}, + {"border-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE}, + // Display & visibility + {"display", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY}, + {"visibility", html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY}, + {"opacity", html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY}, + // Dimensions + {"min-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH}, + {"max-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH}, + {"min-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT}, + {"max-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT}, + // Table attributes + {"cellpadding", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING}, + {"cellspacing", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING}, + {"valign", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN}, + {"align", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN}, + // Form attributes + {"type", html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE}, + {"value", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE}, + {"placeholder", html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER}, + {"disabled", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED}, + {"readonly", html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY}, + {"checked", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED}, + {"selected", html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED}, + // Link & media + {"target", html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET}, + {"title", html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE}, + // Meta & document + {"charset", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET}, + {"content", html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT}, + {"http-equiv", html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV}, + // Accessibility + {"role", html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE}, + {"tabindex", html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX}, + // Background + {"background", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND}, + {"background-image", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE}, + {"background-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR}, + {"background-repeat", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT}, + {"background-position", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION}, + // Email-specific tracking + {"data-track", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK}, + {"data-id", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID}, + {"data-url", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL}, }); #define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL, \ @@ -199,18 +265,608 @@ html_check_balance(struct html_content *hc, return nullptr; } -auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type> +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component { - auto known_component_it = html_components_map.find(st); + auto known_component_it = html_components_map.find(name); if (known_component_it != html_components_map.end()) { - return known_component_it->second; + switch (known_component_it->second) { + case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME: + return html_component_name{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF: + return html_component_href{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR: + return html_component_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR: + return html_component_bgcolor{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE: + return html_component_style{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS: + return html_component_class{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH: + return html_component_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT: + return html_component_height{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE: + return html_component_size{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_REL: + return html_component_rel{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT: + return html_component_alt{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ID: + return html_component_id{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN: + return html_component_hidden{}; + // Typography + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY: + return html_component_font_family{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE: + return html_component_font_size{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT: + return html_component_font_weight{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE: + return html_component_font_style{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN: + return html_component_text_align{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION: + return html_component_text_decoration{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT: + return html_component_line_height{value}; + // Layout + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN: + return html_component_margin{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP: + return html_component_margin_top{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM: + return html_component_margin_bottom{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT: + return html_component_margin_left{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT: + return html_component_margin_right{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING: + return html_component_padding{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP: + return html_component_padding_top{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM: + return html_component_padding_bottom{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT: + return html_component_padding_left{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT: + return html_component_padding_right{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER: + return html_component_border{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR: + return html_component_border_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH: + return html_component_border_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE: + return html_component_border_style{value}; + // Display + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY: + return html_component_display{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY: + return html_component_visibility{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY: + return html_component_opacity{value}; + // Dimensions + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH: + return html_component_min_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH: + return html_component_max_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT: + return html_component_min_height{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT: + return html_component_max_height{value}; + // Table + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING: + return html_component_cellpadding{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING: + return html_component_cellspacing{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN: + return html_component_valign{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN: + return html_component_align{value}; + // Form + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE: + return html_component_type{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE: + return html_component_value{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER: + return html_component_placeholder{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED: + return html_component_disabled{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY: + return html_component_readonly{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED: + return html_component_checked{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED: + return html_component_selected{}; + // Link & media + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET: + return html_component_target{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE: + return html_component_title{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC: + return html_component_src{value}; + // Meta + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET: + return html_component_charset{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT: + return html_component_content{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV: + return html_component_http_equiv{value}; + // Accessibility + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE: + return html_component_role{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX: + return html_component_tabindex{value}; + // Background + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND: + return html_component_background{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE: + return html_component_background_image{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR: + return html_component_background_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT: + return html_component_background_repeat{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION: + return html_component_background_position{value}; + // Email tracking + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK: + return html_component_data_track{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID: + return html_component_data_id{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL: + return html_component_data_url{value}; + default: + return html_component_unknown{name, value}; + } } else { - return std::nullopt; + return html_component_unknown{name, value}; } } +using component_extractor_func = std::function<std::optional<std::string_view>(const html_tag *)>; +static const auto component_extractors = frozen::make_unordered_map<frozen::string, component_extractor_func>( + { + // Basic components + {"name", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_name>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"href", [](const html_tag *tag) { return tag->find_href(); }}, + {"src", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_src>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"class", [](const html_tag *tag) { return tag->find_class(); }}, + {"id", [](const html_tag *tag) { return tag->find_id(); }}, + {"style", [](const html_tag *tag) { return tag->find_style(); }}, + {"alt", [](const html_tag *tag) { return tag->find_alt(); }}, + {"rel", [](const html_tag *tag) { return tag->find_rel(); }}, + {"color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"bgcolor", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_bgcolor>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Numeric components (return string representation) + {"width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"size", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_size>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Boolean components + {"hidden", [](const html_tag *tag) -> std::optional<std::string_view> { + return tag->is_hidden() ? std::optional<std::string_view>{"true"} : std::nullopt; + }}, + + // Typography components + {"font-family", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_family>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"font-size", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_size>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"font-weight", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_weight>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"font-style", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_style>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"text-align", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_text_align>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"text-decoration", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_text_decoration>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"line-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_line_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Layout components + {"margin", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-top", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_top>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-bottom", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_bottom>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-left", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_left>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-right", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_right>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-top", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_top>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-bottom", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_bottom>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-left", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_left>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-right", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_right>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border-color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"border-style", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_style>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Display components + {"display", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_display>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"visibility", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_visibility>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"opacity", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_opacity>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Additional dimensions + {"min-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_min_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"max-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_max_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"min-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_min_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"max-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_max_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Table components + {"cellpadding", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_cellpadding>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"cellspacing", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_cellspacing>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"valign", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_valign>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"align", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_align>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Form components + {"type", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_type>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"value", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_value>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"placeholder", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_placeholder>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"disabled", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_disabled>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"readonly", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_readonly>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"checked", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_checked>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"selected", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_selected>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + + // Link & media components + {"target", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_target>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"title", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_title>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Meta components + {"charset", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_charset>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"content", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_content>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"http-equiv", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_http_equiv>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Accessibility components + {"role", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_role>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"tabindex", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_tabindex>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Background components + {"background", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-image", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_image>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-repeat", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_repeat>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-position", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_position>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Email tracking components + {"data-track", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_track>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"data-id", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_id>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"data-url", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_url>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + }); + +auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view> +{ + auto it = component_extractors.find(attr_name); + if (it != component_extractors.end()) { + return it->second(this); + } + + // Fallback to unknown components + return find_unknown_component(attr_name); +} + +auto html_tag::get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>> +{ + std::vector<std::pair<std::string_view, std::string_view>> result; + + // First, get all known attributes using the component_extractors map + for (const auto &[attr_name, extractor_func]: component_extractors) { + if (auto value = extractor_func(this)) { + // Convert frozen::string to std::string_view for the key + std::string_view name_view{attr_name.data(), attr_name.size()}; + result.emplace_back(name_view, value.value()); + } + } + + // Then add all unknown attributes + auto unknown_attrs = get_unknown_components(); + for (const auto &[name, value]: unknown_attrs) { + result.emplace_back(name, value); + } + + return result; +} + enum tag_parser_state { parse_start = 0, parse_name, @@ -234,13 +890,13 @@ enum tag_parser_state { struct tag_content_parser_state { tag_parser_state cur_state = parse_start; std::string buf; - std::optional<html_component_type> cur_component; + std::string attr_name;// Store current attribute name void reset() { cur_state = parse_start; buf.clear(); - cur_component = std::nullopt; + attr_name.clear(); } }; @@ -254,56 +910,50 @@ html_parse_tag_content(rspamd_mempool_t *pool, auto state = parser_env.cur_state; /* - * Stores tag component if it doesn't exist, performing copy of the - * value + decoding of the entities - * Parser env is set to clear the current html attribute fields (saved_p and - * cur_component) + * Stores tag component creating the appropriate variant type + * Parser env is cleared after storing */ auto store_component_value = [&]() -> void { - if (parser_env.cur_component) { + if (!parser_env.attr_name.empty()) { + std::string_view attr_name_view, value_view; - if (parser_env.buf.empty()) { - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{}); + // Store attribute name in persistent memory + if (!parser_env.attr_name.empty()) { + auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size()); + memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size()); + attr_name_view = {name_storage, parser_env.attr_name.size()}; } - else { - /* We need to copy buf to a persistent storage */ - auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); - if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID || - parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - /* Lowercase */ - rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size()); + // Store value in persistent memory if not empty + if (!parser_env.buf.empty()) { + auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); + + // Lowercase for id and class attributes + if (parser_env.attr_name == "id" || parser_env.attr_name == "class") { + rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size()); } else { - memcpy(s, parser_env.buf.data(), parser_env.buf.size()); + memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size()); } - auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size()); - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{s, sz}); + auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size()); + value_view = {value_storage, sz}; } + + // Create the appropriate component variant + auto component = html_component_from_string(attr_name_view, value_view); + tag->components.emplace_back(std::move(component)); } parser_env.buf.clear(); - parser_env.cur_component = std::nullopt; + parser_env.attr_name.clear(); }; auto store_component_name = [&]() -> bool { decode_html_entitles_inplace(parser_env.buf); - auto known_component_it = html_components_map.find(std::string_view{parser_env.buf}); + parser_env.attr_name = parser_env.buf; parser_env.buf.clear(); - - if (known_component_it != html_components_map.end()) { - parser_env.cur_component = known_component_it->second; - - return true; - } - else { - parser_env.cur_component = std::nullopt; - } - - return false; + return true; }; auto store_value_character = [&](bool lc) -> void { @@ -471,6 +1121,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, case parse_start_dquote: if (*in == '"') { + store_component_value(); state = spaces_after_param; } else { @@ -481,6 +1132,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, case parse_start_squote: if (*in == '\'') { + store_component_value(); state = spaces_after_param; } else { @@ -620,7 +1272,7 @@ html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) -> std::optional<struct rspamd_url *> { - auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + auto found_href_maybe = tag->find_href(); if (found_href_maybe) { /* Check base url */ @@ -816,130 +1468,126 @@ html_process_img_tag(rspamd_mempool_t *pool, img = rspamd_mempool_alloc0_type(pool, struct html_image); img->tag = tag; - for (const auto ¶m: tag->components) { + // Process SRC component (preferred for img tags) or HREF component (fallback) + std::optional<std::string_view> href_value; - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { - /* Check base url */ - const auto &href_value = param.value; + // Try SRC first (standard for img tags) + if (auto src_comp = tag->find_component<html_component_src>()) { + href_value = src_comp.value()->value; + } + // Fallback to HREF (for backward compatibility or non-standard usage) + else if (auto href_comp = tag->find_href()) { + href_value = href_comp; + } - if (href_value.size() > 0) { - rspamd_ftok_t fstr; - fstr.begin = href_value.data(); - fstr.len = href_value.size(); - img->src = rspamd_mempool_ftokdup(pool, &fstr); + if (href_value && href_value->size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value->data(); + fstr.len = href_value->size(); + img->src = rspamd_mempool_ftokdup(pool, &fstr); - if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), - "cid:", sizeof("cid:") - 1) == 0) { - /* We have an embedded image */ - img->src += sizeof("cid:") - 1; - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; - } - else { - if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), - "data:", sizeof("data:") - 1) == 0) { - /* We have an embedded image in HTML tag */ - img->flags |= - (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); - html_process_data_image(pool, img, href_value); - hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; - } - else { - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; - if (img->src) { - - std::string_view cpy{href_value}; - auto maybe_url = html_process_url(pool, cpy); - - if (maybe_url) { - img->url = maybe_url.value(); - struct rspamd_url *existing; - - img->url->flags |= RSPAMD_URL_FLAG_IMAGE; - existing = rspamd_url_set_add_or_return(url_set, - img->url); - - if (existing && existing != img->url) { - /* - * We have some other URL that could be - * found, e.g. from another part. However, - * we still want to set an image flag on it - */ - existing->flags |= img->url->flags; - existing->count++; - } - else if (part_urls) { - /* New url */ - g_ptr_array_add(part_urls, img->url); - } - } - } - } - } - } + if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->src += sizeof("cid:") - 1; + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; } + else { + if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, *href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + } + else { + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { + std::string_view cpy{*href_value}; + auto maybe_url = html_process_url(pool, cpy); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { - unsigned long val; + if (maybe_url) { + img->url = maybe_url.value(); + struct rspamd_url *existing; - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->height = val; - } + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, + img->url); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { - unsigned long val; - - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->width = val; + if (existing && existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); + } + } + } + } } + } - /* TODO: rework to css at some time */ - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - if (img->height == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "height", sizeof("height") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("height") - 1); + // Process numeric dimensions using the new helper methods + if (auto height = tag->find_height()) { + img->height = height.value(); + } - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->height = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + if (auto width = tag->find_width()) { + img->width = width.value(); + } + + // Process style component for dimensions + if (auto style_value = tag->find_style()) { + if (img->height == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } - if (img->width == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "width", sizeof("width") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("width") - 1); - - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->width = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + } + if (img->width == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } @@ -968,7 +1616,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, khash_t(rspamd_url_hash) * url_set, GPtrArray *part_urls) -> void { - auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); + auto found_rel_maybe = tag->find_rel(); if (found_rel_maybe) { if (found_rel_maybe.value() == "icon") { @@ -984,24 +1632,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; bool hidden = false; - for (const auto ¶m: tag->components) { - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { - maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); - } - - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { - maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); - } + // Process color components + if (auto color_comp = tag->find_component<html_component_color>()) { + maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - tag->block = rspamd::css::parse_css_declaration(pool, param.value); - } + if (auto bgcolor_comp = tag->find_component<html_component_bgcolor>()) { + maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) { - hidden = true; - } + // Process style component + if (auto style_value = tag->find_style()) { + tag->block = rspamd::css::parse_css_declaration(pool, *style_value); } + // Check if hidden + hidden = tag->is_hidden(); + if (!tag->block) { tag->block = html_block::undefined_html_block_pool(pool); } @@ -1284,7 +1931,7 @@ html_append_tag_content(rspamd_mempool_t *pool, } else if (tag->id == Tag_IMG) { /* Process ALT if presented */ - auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT); + auto maybe_alt = tag->find_alt(); if (maybe_alt) { if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { @@ -1384,9 +2031,7 @@ auto html_process_input(struct rspamd_task *task, overflow_input = true; } - auto new_tag = [&](int flags = 0) -> struct html_tag * - { - + auto new_tag = [&](int flags = 0) -> struct html_tag * { if (hc->all_tags.size() > rspamd::html::max_tags) { hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; @@ -2151,7 +2796,7 @@ auto html_process_input(struct rspamd_task *task, /* Leftover after content */ switch (state) { case tags_limit_overflow: - html_append_parsed(hc, {c, (std::size_t)(end - c)}, + html_append_parsed(hc, {c, (std::size_t) (end - c)}, false, end - start, hc->parsed); break; default: @@ -2390,4 +3035,4 @@ gsize rspamd_html_get_tags_count(void *html_content) } return hc->all_tags.size(); -}
\ No newline at end of file +} diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 309d76177..6d41f1337 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +26,7 @@ #include <cstdint> #include "html_tags.h" +#include "libutil/str_util.h" struct rspamd_url; struct html_image; @@ -34,7 +35,8 @@ namespace rspamd::html { struct html_content; /* Forward declaration */ -enum class html_component_type : std::uint8_t { +// Internal enum for mapping (not exposed in public API) +enum class html_component_enum_type : std::uint8_t { RSPAMD_HTML_COMPONENT_NAME = 0, RSPAMD_HTML_COMPONENT_HREF, RSPAMD_HTML_COMPONENT_COLOR, @@ -48,8 +50,1214 @@ enum class html_component_type : std::uint8_t { RSPAMD_HTML_COMPONENT_ALT, RSPAMD_HTML_COMPONENT_ID, RSPAMD_HTML_COMPONENT_HIDDEN, + // Typography + RSPAMD_HTML_COMPONENT_FONT_FAMILY, + RSPAMD_HTML_COMPONENT_FONT_SIZE, + RSPAMD_HTML_COMPONENT_FONT_WEIGHT, + RSPAMD_HTML_COMPONENT_FONT_STYLE, + RSPAMD_HTML_COMPONENT_TEXT_ALIGN, + RSPAMD_HTML_COMPONENT_TEXT_DECORATION, + RSPAMD_HTML_COMPONENT_LINE_HEIGHT, + // Layout & positioning + RSPAMD_HTML_COMPONENT_MARGIN, + RSPAMD_HTML_COMPONENT_MARGIN_TOP, + RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM, + RSPAMD_HTML_COMPONENT_MARGIN_LEFT, + RSPAMD_HTML_COMPONENT_MARGIN_RIGHT, + RSPAMD_HTML_COMPONENT_PADDING, + RSPAMD_HTML_COMPONENT_PADDING_TOP, + RSPAMD_HTML_COMPONENT_PADDING_BOTTOM, + RSPAMD_HTML_COMPONENT_PADDING_LEFT, + RSPAMD_HTML_COMPONENT_PADDING_RIGHT, + RSPAMD_HTML_COMPONENT_BORDER, + RSPAMD_HTML_COMPONENT_BORDER_COLOR, + RSPAMD_HTML_COMPONENT_BORDER_WIDTH, + RSPAMD_HTML_COMPONENT_BORDER_STYLE, + // Display & visibility + RSPAMD_HTML_COMPONENT_DISPLAY, + RSPAMD_HTML_COMPONENT_VISIBILITY, + RSPAMD_HTML_COMPONENT_OPACITY, + // Dimensions + RSPAMD_HTML_COMPONENT_MIN_WIDTH, + RSPAMD_HTML_COMPONENT_MAX_WIDTH, + RSPAMD_HTML_COMPONENT_MIN_HEIGHT, + RSPAMD_HTML_COMPONENT_MAX_HEIGHT, + // Table attributes + RSPAMD_HTML_COMPONENT_CELLPADDING, + RSPAMD_HTML_COMPONENT_CELLSPACING, + RSPAMD_HTML_COMPONENT_VALIGN, + RSPAMD_HTML_COMPONENT_ALIGN, + // Form attributes + RSPAMD_HTML_COMPONENT_TYPE, + RSPAMD_HTML_COMPONENT_VALUE, + RSPAMD_HTML_COMPONENT_PLACEHOLDER, + RSPAMD_HTML_COMPONENT_DISABLED, + RSPAMD_HTML_COMPONENT_READONLY, + RSPAMD_HTML_COMPONENT_CHECKED, + RSPAMD_HTML_COMPONENT_SELECTED, + // Link & media + RSPAMD_HTML_COMPONENT_TARGET, + RSPAMD_HTML_COMPONENT_TITLE, + RSPAMD_HTML_COMPONENT_SRC, + // Meta & document + RSPAMD_HTML_COMPONENT_CHARSET, + RSPAMD_HTML_COMPONENT_CONTENT, + RSPAMD_HTML_COMPONENT_HTTP_EQUIV, + // Accessibility + RSPAMD_HTML_COMPONENT_ROLE, + RSPAMD_HTML_COMPONENT_TABINDEX, + // Background + RSPAMD_HTML_COMPONENT_BACKGROUND, + RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE, + RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR, + RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT, + RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION, + // Email-specific tracking + RSPAMD_HTML_COMPONENT_DATA_TRACK, + RSPAMD_HTML_COMPONENT_DATA_ID, + RSPAMD_HTML_COMPONENT_DATA_URL, }; +// Forward declarations for component types +struct html_component_name; +struct html_component_href; +struct html_component_color; +struct html_component_bgcolor; +struct html_component_style; +struct html_component_class; +struct html_component_width; +struct html_component_height; +struct html_component_size; +struct html_component_rel; +struct html_component_alt; +struct html_component_id; +struct html_component_hidden; +struct html_component_unknown; + +// Base interface for all components +struct html_component_base { + virtual ~html_component_base() = default; + virtual constexpr std::string_view get_string_value() const = 0; +}; + +// String-based components +struct html_component_name : html_component_base { + std::string_view value; + explicit constexpr html_component_name(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_href : html_component_base { + std::string_view value; + explicit constexpr html_component_href(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_style : html_component_base { + std::string_view value; + explicit constexpr html_component_style(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_class : html_component_base { + std::string_view value; + explicit constexpr html_component_class(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_rel : html_component_base { + std::string_view value; + explicit constexpr html_component_rel(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_alt : html_component_base { + std::string_view value; + explicit constexpr html_component_alt(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_id : html_component_base { + std::string_view value; + explicit constexpr html_component_id(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +// Color components (could be extended to parse actual colors) +struct html_component_color : html_component_base { + std::string_view value; + explicit constexpr html_component_color(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_bgcolor : html_component_base { + std::string_view value; + explicit constexpr html_component_bgcolor(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +// Numeric components +struct html_component_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_width(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_height(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_size : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Boolean/flag component +struct html_component_hidden : html_component_base { + bool present; + explicit constexpr html_component_hidden() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +// Unknown component with both name and value +struct html_component_unknown : html_component_base { + std::string_view name; + std::string_view value; + + constexpr html_component_unknown(std::string_view n, std::string_view v) + : name(n), value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } + constexpr std::string_view get_name() const + { + return name; + } +}; + +// Typography components +struct html_component_font_family : html_component_base { + std::string_view value; + explicit constexpr html_component_font_family(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_font_size : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_font_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_font_weight : html_component_base { + std::string_view value; + explicit constexpr html_component_font_weight(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_font_style : html_component_base { + std::string_view value; + explicit constexpr html_component_font_style(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_text_align : html_component_base { + std::string_view value; + explicit constexpr html_component_text_align(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_text_decoration : html_component_base { + std::string_view value; + explicit constexpr html_component_text_decoration(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_line_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_line_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Layout components (most are string-based for flexibility) +struct html_component_margin : html_component_base { + std::string_view value; + explicit constexpr html_component_margin(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_top : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_top(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_bottom : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_bottom(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_left : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_left(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_right : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_right(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding : html_component_base { + std::string_view value; + explicit constexpr html_component_padding(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_top : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_top(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_bottom : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_bottom(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_left : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_left(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_right : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_right(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border : html_component_base { + std::string_view value; + explicit html_component_border(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border_color : html_component_base { + std::string_view value; + explicit html_component_border_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_border_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_border_style : html_component_base { + std::string_view value; + explicit html_component_border_style(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Display components +struct html_component_display : html_component_base { + std::string_view value; + explicit html_component_display(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_visibility : html_component_base { + std::string_view value; + explicit html_component_visibility(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_opacity : html_component_base { + std::string_view raw_value; + std::optional<float> numeric_value; + + explicit html_component_opacity(std::string_view v) + : raw_value(v) + { + char *endptr; + auto val = std::strtof(v.data(), &endptr); + if (endptr != v.data() && val >= 0.0f && val <= 1.0f) { + numeric_value = val; + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<float> get_numeric_value() const + { + return numeric_value; + } +}; + +// Additional dimension components +struct html_component_min_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_min_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_max_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_max_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_min_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_min_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_max_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_max_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Table components +struct html_component_cellpadding : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_cellpadding(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_cellspacing : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_cellspacing(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_valign : html_component_base { + std::string_view value; + explicit html_component_valign(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_align : html_component_base { + std::string_view value; + explicit html_component_align(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Form components +struct html_component_type : html_component_base { + std::string_view value; + explicit html_component_type(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_value : html_component_base { + std::string_view value; + explicit html_component_value(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_placeholder : html_component_base { + std::string_view value; + explicit html_component_placeholder(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Boolean form components +struct html_component_disabled : html_component_base { + bool present; + explicit constexpr html_component_disabled() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_readonly : html_component_base { + bool present; + explicit constexpr html_component_readonly() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_checked : html_component_base { + bool present; + explicit constexpr html_component_checked() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_selected : html_component_base { + bool present; + explicit constexpr html_component_selected() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +// Link & media components +struct html_component_target : html_component_base { + std::string_view value; + explicit html_component_target(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_title : html_component_base { + std::string_view value; + explicit html_component_title(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_src : html_component_base { + std::string_view value; + explicit html_component_src(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Meta components +struct html_component_charset : html_component_base { + std::string_view value; + explicit html_component_charset(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_content : html_component_base { + std::string_view value; + explicit html_component_content(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_http_equiv : html_component_base { + std::string_view value; + explicit html_component_http_equiv(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Accessibility components +struct html_component_role : html_component_base { + std::string_view value; + explicit html_component_role(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_tabindex : html_component_base { + std::string_view raw_value; + std::optional<std::int32_t> numeric_value; + + explicit html_component_tabindex(std::string_view v) + : raw_value(v) + { + long val; + if (rspamd_strtol(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::int32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::int32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Background components +struct html_component_background : html_component_base { + std::string_view value; + explicit html_component_background(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_image : html_component_base { + std::string_view value; + explicit html_component_background_image(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_color : html_component_base { + std::string_view value; + explicit html_component_background_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_repeat : html_component_base { + std::string_view value; + explicit html_component_background_repeat(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_position : html_component_base { + std::string_view value; + explicit html_component_background_position(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Email tracking components +struct html_component_data_track : html_component_base { + std::string_view value; + explicit html_component_data_track(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_data_id : html_component_base { + std::string_view value; + explicit html_component_data_id(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_data_url : html_component_base { + std::string_view value; + explicit html_component_data_url(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// The variant type that holds all possible components +using html_tag_component = std::variant< + html_component_name, + html_component_href, + html_component_color, + html_component_bgcolor, + html_component_style, + html_component_class, + html_component_width, + html_component_height, + html_component_size, + html_component_rel, + html_component_alt, + html_component_id, + html_component_hidden, + // Typography + html_component_font_family, + html_component_font_size, + html_component_font_weight, + html_component_font_style, + html_component_text_align, + html_component_text_decoration, + html_component_line_height, + // Layout + html_component_margin, + html_component_margin_top, + html_component_margin_bottom, + html_component_margin_left, + html_component_margin_right, + html_component_padding, + html_component_padding_top, + html_component_padding_bottom, + html_component_padding_left, + html_component_padding_right, + html_component_border, + html_component_border_color, + html_component_border_width, + html_component_border_style, + // Display + html_component_display, + html_component_visibility, + html_component_opacity, + // Dimensions + html_component_min_width, + html_component_max_width, + html_component_min_height, + html_component_max_height, + // Table + html_component_cellpadding, + html_component_cellspacing, + html_component_valign, + html_component_align, + // Form + html_component_type, + html_component_value, + html_component_placeholder, + html_component_disabled, + html_component_readonly, + html_component_checked, + html_component_selected, + // Link & media + html_component_target, + html_component_title, + html_component_src, + // Meta + html_component_charset, + html_component_content, + html_component_http_equiv, + // Accessibility + html_component_role, + html_component_tabindex, + // Background + html_component_background, + html_component_background_image, + html_component_background_color, + html_component_background_repeat, + html_component_background_position, + // Email tracking + html_component_data_track, + html_component_data_id, + html_component_data_url, + // Unknown + html_component_unknown>; + +/** + * Returns component variant from a string + * @param name attribute name + * @param value attribute value + * @return variant component + */ +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component; + /* Public tags flags */ /* XML tag */ #define FL_XML (1u << CM_USER_SHIFT) @@ -62,23 +1270,7 @@ enum class html_component_type : std::uint8_t { #define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) #define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) -/** - * Returns component type from a string - * @param st - * @return - */ -auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>; - using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>; -struct html_tag_component { - html_component_type type; - std::string_view value; - - html_tag_component(html_component_type type, std::string_view value) - : type(type), value(value) - { - } -}; /* Pairing closing tag representation */ struct html_closing_tag { @@ -105,26 +1297,128 @@ struct html_tag { std::vector<struct html_tag *> children; struct html_tag *parent; - auto find_component(html_component_type what) const -> std::optional<std::string_view> + // Template method to find component by type + template<typename T> + auto find_component() const -> std::optional<const T *> { for (const auto &comp: components) { - if (comp.type == what) { - return comp.value; + if (std::holds_alternative<T>(comp)) { + return &std::get<T>(comp); } } + return std::nullopt; + } + // Helper methods for common component access + auto find_href() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_href>()) { + return comp.value()->value; + } return std::nullopt; } - auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view> + auto find_class() const -> std::optional<std::string_view> { - if (what) { - return find_component(what.value()); + if (auto comp = find_component<html_component_class>()) { + return comp.value()->value; } + return std::nullopt; + } + auto find_id() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_id>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_width() const -> std::optional<std::uint32_t> + { + if (auto comp = find_component<html_component_width>()) { + return comp.value()->get_numeric_value(); + } + return std::nullopt; + } + + auto find_height() const -> std::optional<std::uint32_t> + { + if (auto comp = find_component<html_component_height>()) { + return comp.value()->get_numeric_value(); + } return std::nullopt; } + auto find_style() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_style>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_alt() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_alt>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_rel() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_rel>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto is_hidden() const -> bool + { + return find_component<html_component_hidden>().has_value(); + } + + auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view> + { + for (const auto &comp: components) { + if (std::holds_alternative<html_component_unknown>(comp)) { + const auto &unknown = std::get<html_component_unknown>(comp); + if (unknown.name == attr_name) { + return unknown.value; + } + } + } + return std::nullopt; + } + + auto get_unknown_components() const -> std::vector<std::pair<std::string_view, std::string_view>> + { + std::vector<std::pair<std::string_view, std::string_view>> unknown_attrs; + for (const auto &comp: components) { + if (std::holds_alternative<html_component_unknown>(comp)) { + const auto &unknown = std::get<html_component_unknown>(comp); + unknown_attrs.emplace_back(unknown.name, unknown.value); + } + } + return unknown_attrs; + } + + // Generic visitor method for processing all components + template<typename Visitor> + auto visit_components(Visitor &&visitor) const + { + for (const auto &comp: components) { + std::visit(std::forward<Visitor>(visitor), comp); + } + } + + // Find any component by attribute name + auto find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>; + + // Get all attributes as name-value pairs + auto get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>; + auto clear(void) -> void { id = Tag_UNKNOWN; @@ -137,7 +1431,7 @@ struct html_tag { closing.clear(); } - constexpr auto get_content_length() const -> std::size_t + auto get_content_length() const -> std::size_t { if (flags & (FL_IGNORE | CM_HEAD)) { return 0; |