aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/libserver/css/css.cxx19
-rw-r--r--src/libserver/html/html.cxx1017
-rw-r--r--src/libserver/html/html_tag.hxx1348
-rw-r--r--src/lua/lua_html.cxx269
-rw-r--r--src/lua/lua_parsers.c67
5 files changed, 2491 insertions, 229 deletions
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 1b369ed17..c53e3c05e 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector
auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block *
{
- std::optional<std::string_view> id_comp, class_comp;
rspamd::html::html_block *res = nullptr;
if (!tag) {
@@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa
}
/* First, find id in a tag and a class */
- for (const auto &param: tag->components) {
- if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) {
- id_comp = param.value;
- }
- else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
- class_comp = param.value;
- }
- }
+ auto id_comp = tag->find_id();
+ auto class_comp = tag->find_class();
/* ID part */
if (id_comp && !pimpl->id_selectors.empty()) {
@@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool,
return std::make_pair(nullptr, parse_res.error());
}
-}// namespace rspamd::css \ No newline at end of file
+}// namespace rspamd::css
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 93d1fdf91..78a6a975c 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -39,6 +39,7 @@
#include "contrib/frozen/include/frozen/string.h"
#include "contrib/fmt/include/fmt/core.h"
+#include <functional>
#include <unicode/uversion.h>
namespace rspamd::html {
@@ -47,23 +48,88 @@ static const unsigned int max_tags = 8192; /* Ignore tags if this maximum is rea
static const html_tags_storage html_tags_defs;
-auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
+auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_enum_type>(
{
- {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
- {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
- {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
- {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
- {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
- {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
- {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
- {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
- {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
- {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
- {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
- {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
- {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
- {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID},
- {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
+ {"name", html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME},
+ {"href", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"src", html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC},
+ {"action", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"color", html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR},
+ {"bgcolor", html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
+ {"style", html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE},
+ {"class", html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS},
+ {"width", html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH},
+ {"height", html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT},
+ {"size", html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE},
+ {"rel", html_component_enum_type::RSPAMD_HTML_COMPONENT_REL},
+ {"alt", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT},
+ {"id", html_component_enum_type::RSPAMD_HTML_COMPONENT_ID},
+ {"hidden", html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN},
+ // Typography
+ {"font-family", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY},
+ {"font-size", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE},
+ {"font-weight", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT},
+ {"font-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE},
+ {"text-align", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN},
+ {"text-decoration", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION},
+ {"line-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT},
+ // Layout & positioning
+ {"margin", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN},
+ {"margin-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP},
+ {"margin-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM},
+ {"margin-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT},
+ {"margin-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT},
+ {"padding", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING},
+ {"padding-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP},
+ {"padding-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM},
+ {"padding-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT},
+ {"padding-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT},
+ {"border", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER},
+ {"border-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR},
+ {"border-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH},
+ {"border-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE},
+ // Display & visibility
+ {"display", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY},
+ {"visibility", html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY},
+ {"opacity", html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY},
+ // Dimensions
+ {"min-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH},
+ {"max-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH},
+ {"min-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT},
+ {"max-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT},
+ // Table attributes
+ {"cellpadding", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING},
+ {"cellspacing", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING},
+ {"valign", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN},
+ {"align", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN},
+ // Form attributes
+ {"type", html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE},
+ {"value", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE},
+ {"placeholder", html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER},
+ {"disabled", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED},
+ {"readonly", html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY},
+ {"checked", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED},
+ {"selected", html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED},
+ // Link & media
+ {"target", html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET},
+ {"title", html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE},
+ // Meta & document
+ {"charset", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET},
+ {"content", html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT},
+ {"http-equiv", html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV},
+ // Accessibility
+ {"role", html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE},
+ {"tabindex", html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX},
+ // Background
+ {"background", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND},
+ {"background-image", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE},
+ {"background-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR},
+ {"background-repeat", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT},
+ {"background-position", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION},
+ // Email-specific tracking
+ {"data-track", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK},
+ {"data-id", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID},
+ {"data-url", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL},
});
#define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL, \
@@ -199,18 +265,608 @@ html_check_balance(struct html_content *hc,
return nullptr;
}
-auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
+auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component
{
- auto known_component_it = html_components_map.find(st);
+ auto known_component_it = html_components_map.find(name);
if (known_component_it != html_components_map.end()) {
- return known_component_it->second;
+ switch (known_component_it->second) {
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME:
+ return html_component_name{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF:
+ return html_component_href{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR:
+ return html_component_color{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR:
+ return html_component_bgcolor{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE:
+ return html_component_style{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS:
+ return html_component_class{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH:
+ return html_component_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT:
+ return html_component_height{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE:
+ return html_component_size{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_REL:
+ return html_component_rel{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT:
+ return html_component_alt{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ID:
+ return html_component_id{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN:
+ return html_component_hidden{};
+ // Typography
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY:
+ return html_component_font_family{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE:
+ return html_component_font_size{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT:
+ return html_component_font_weight{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE:
+ return html_component_font_style{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN:
+ return html_component_text_align{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION:
+ return html_component_text_decoration{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT:
+ return html_component_line_height{value};
+ // Layout
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN:
+ return html_component_margin{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP:
+ return html_component_margin_top{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM:
+ return html_component_margin_bottom{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT:
+ return html_component_margin_left{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT:
+ return html_component_margin_right{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING:
+ return html_component_padding{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP:
+ return html_component_padding_top{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM:
+ return html_component_padding_bottom{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT:
+ return html_component_padding_left{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT:
+ return html_component_padding_right{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER:
+ return html_component_border{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR:
+ return html_component_border_color{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH:
+ return html_component_border_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE:
+ return html_component_border_style{value};
+ // Display
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY:
+ return html_component_display{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY:
+ return html_component_visibility{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY:
+ return html_component_opacity{value};
+ // Dimensions
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH:
+ return html_component_min_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH:
+ return html_component_max_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT:
+ return html_component_min_height{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT:
+ return html_component_max_height{value};
+ // Table
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING:
+ return html_component_cellpadding{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING:
+ return html_component_cellspacing{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN:
+ return html_component_valign{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN:
+ return html_component_align{value};
+ // Form
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE:
+ return html_component_type{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE:
+ return html_component_value{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER:
+ return html_component_placeholder{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED:
+ return html_component_disabled{};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY:
+ return html_component_readonly{};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED:
+ return html_component_checked{};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED:
+ return html_component_selected{};
+ // Link & media
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET:
+ return html_component_target{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE:
+ return html_component_title{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC:
+ return html_component_src{value};
+ // Meta
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET:
+ return html_component_charset{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT:
+ return html_component_content{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV:
+ return html_component_http_equiv{value};
+ // Accessibility
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE:
+ return html_component_role{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX:
+ return html_component_tabindex{value};
+ // Background
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND:
+ return html_component_background{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE:
+ return html_component_background_image{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR:
+ return html_component_background_color{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT:
+ return html_component_background_repeat{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION:
+ return html_component_background_position{value};
+ // Email tracking
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK:
+ return html_component_data_track{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID:
+ return html_component_data_id{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL:
+ return html_component_data_url{value};
+ default:
+ return html_component_unknown{name, value};
+ }
}
else {
- return std::nullopt;
+ return html_component_unknown{name, value};
}
}
+using component_extractor_func = std::function<std::optional<std::string_view>(const html_tag *)>;
+static const auto component_extractors = frozen::make_unordered_map<frozen::string, component_extractor_func>(
+ {
+ // Basic components
+ {"name", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_name>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"href", [](const html_tag *tag) { return tag->find_href(); }},
+ {"src", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_src>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"class", [](const html_tag *tag) { return tag->find_class(); }},
+ {"id", [](const html_tag *tag) { return tag->find_id(); }},
+ {"style", [](const html_tag *tag) { return tag->find_style(); }},
+ {"alt", [](const html_tag *tag) { return tag->find_alt(); }},
+ {"rel", [](const html_tag *tag) { return tag->find_rel(); }},
+ {"color", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_color>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"bgcolor", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_bgcolor>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Numeric components (return string representation)
+ {"width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"size", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_size>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Boolean components
+ {"hidden", [](const html_tag *tag) -> std::optional<std::string_view> {
+ return tag->is_hidden() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }},
+
+ // Typography components
+ {"font-family", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_family>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"font-size", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_size>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"font-weight", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_weight>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"font-style", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_style>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"text-align", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_text_align>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"text-decoration", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_text_decoration>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"line-height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_line_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Layout components
+ {"margin", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-top", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_top>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-bottom", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_bottom>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-left", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_left>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-right", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_right>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-top", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_top>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-bottom", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_bottom>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-left", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_left>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-right", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_right>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"border", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"border-color", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border_color>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"border-width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"border-style", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border_style>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Display components
+ {"display", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_display>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"visibility", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_visibility>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"opacity", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_opacity>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Additional dimensions
+ {"min-width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_min_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"max-width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_max_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"min-height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_min_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"max-height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_max_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Table components
+ {"cellpadding", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_cellpadding>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"cellspacing", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_cellspacing>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"valign", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_valign>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"align", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_align>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Form components
+ {"type", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_type>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"value", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_value>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"placeholder", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_placeholder>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"disabled", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_disabled>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+ {"readonly", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_readonly>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+ {"checked", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_checked>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+ {"selected", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_selected>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+
+ // Link & media components
+ {"target", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_target>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"title", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_title>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Meta components
+ {"charset", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_charset>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"content", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_content>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"http-equiv", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_http_equiv>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Accessibility components
+ {"role", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_role>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"tabindex", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_tabindex>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Background components
+ {"background", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-image", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_image>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-color", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_color>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-repeat", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_repeat>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-position", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_position>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Email tracking components
+ {"data-track", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_data_track>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"data-id", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_data_id>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"data-url", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_data_url>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ });
+
+auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>
+{
+ auto it = component_extractors.find(attr_name);
+ if (it != component_extractors.end()) {
+ return it->second(this);
+ }
+
+ // Fallback to unknown components
+ return find_unknown_component(attr_name);
+}
+
+auto html_tag::get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>
+{
+ std::vector<std::pair<std::string_view, std::string_view>> result;
+
+ // First, get all known attributes using the component_extractors map
+ for (const auto &[attr_name, extractor_func]: component_extractors) {
+ if (auto value = extractor_func(this)) {
+ // Convert frozen::string to std::string_view for the key
+ std::string_view name_view{attr_name.data(), attr_name.size()};
+ result.emplace_back(name_view, value.value());
+ }
+ }
+
+ // Then add all unknown attributes
+ auto unknown_attrs = get_unknown_components();
+ for (const auto &[name, value]: unknown_attrs) {
+ result.emplace_back(name, value);
+ }
+
+ return result;
+}
+
enum tag_parser_state {
parse_start = 0,
parse_name,
@@ -234,13 +890,13 @@ enum tag_parser_state {
struct tag_content_parser_state {
tag_parser_state cur_state = parse_start;
std::string buf;
- std::optional<html_component_type> cur_component;
+ std::string attr_name;// Store current attribute name
void reset()
{
cur_state = parse_start;
buf.clear();
- cur_component = std::nullopt;
+ attr_name.clear();
}
};
@@ -254,56 +910,50 @@ html_parse_tag_content(rspamd_mempool_t *pool,
auto state = parser_env.cur_state;
/*
- * Stores tag component if it doesn't exist, performing copy of the
- * value + decoding of the entities
- * Parser env is set to clear the current html attribute fields (saved_p and
- * cur_component)
+ * Stores tag component creating the appropriate variant type
+ * Parser env is cleared after storing
*/
auto store_component_value = [&]() -> void {
- if (parser_env.cur_component) {
+ if (!parser_env.attr_name.empty()) {
+ std::string_view attr_name_view, value_view;
- if (parser_env.buf.empty()) {
- tag->components.emplace_back(parser_env.cur_component.value(),
- std::string_view{});
+ // Store attribute name in persistent memory
+ if (!parser_env.attr_name.empty()) {
+ auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size());
+ memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size());
+ attr_name_view = {name_storage, parser_env.attr_name.size()};
}
- else {
- /* We need to copy buf to a persistent storage */
- auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
- if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
- parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
- /* Lowercase */
- rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
+ // Store value in persistent memory if not empty
+ if (!parser_env.buf.empty()) {
+ auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
+
+ // Lowercase for id and class attributes
+ if (parser_env.attr_name == "id" || parser_env.attr_name == "class") {
+ rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size());
}
else {
- memcpy(s, parser_env.buf.data(), parser_env.buf.size());
+ memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size());
}
- auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
- tag->components.emplace_back(parser_env.cur_component.value(),
- std::string_view{s, sz});
+ auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size());
+ value_view = {value_storage, sz};
}
+
+ // Create the appropriate component variant
+ auto component = html_component_from_string(attr_name_view, value_view);
+ tag->components.emplace_back(std::move(component));
}
parser_env.buf.clear();
- parser_env.cur_component = std::nullopt;
+ parser_env.attr_name.clear();
};
auto store_component_name = [&]() -> bool {
decode_html_entitles_inplace(parser_env.buf);
- auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
+ parser_env.attr_name = parser_env.buf;
parser_env.buf.clear();
-
- if (known_component_it != html_components_map.end()) {
- parser_env.cur_component = known_component_it->second;
-
- return true;
- }
- else {
- parser_env.cur_component = std::nullopt;
- }
-
- return false;
+ return true;
};
auto store_value_character = [&](bool lc) -> void {
@@ -471,6 +1121,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
case parse_start_dquote:
if (*in == '"') {
+ store_component_value();
state = spaces_after_param;
}
else {
@@ -481,6 +1132,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
case parse_start_squote:
if (*in == '\'') {
+ store_component_value();
state = spaces_after_param;
}
else {
@@ -620,7 +1272,7 @@ html_process_url_tag(rspamd_mempool_t *pool,
struct html_tag *tag,
struct html_content *hc) -> std::optional<struct rspamd_url *>
{
- auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
+ auto found_href_maybe = tag->find_href();
if (found_href_maybe) {
/* Check base url */
@@ -816,130 +1468,126 @@ html_process_img_tag(rspamd_mempool_t *pool,
img = rspamd_mempool_alloc0_type(pool, struct html_image);
img->tag = tag;
- for (const auto &param: tag->components) {
+ // Process SRC component (preferred for img tags) or HREF component (fallback)
+ std::optional<std::string_view> href_value;
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
- /* Check base url */
- const auto &href_value = param.value;
+ // Try SRC first (standard for img tags)
+ if (auto src_comp = tag->find_component<html_component_src>()) {
+ href_value = src_comp.value()->value;
+ }
+ // Fallback to HREF (for backward compatibility or non-standard usage)
+ else if (auto href_comp = tag->find_href()) {
+ href_value = href_comp;
+ }
- if (href_value.size() > 0) {
- rspamd_ftok_t fstr;
- fstr.begin = href_value.data();
- fstr.len = href_value.size();
- img->src = rspamd_mempool_ftokdup(pool, &fstr);
+ if (href_value && href_value->size() > 0) {
+ rspamd_ftok_t fstr;
+ fstr.begin = href_value->data();
+ fstr.len = href_value->size();
+ img->src = rspamd_mempool_ftokdup(pool, &fstr);
- if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
- "cid:", sizeof("cid:") - 1) == 0) {
- /* We have an embedded image */
- img->src += sizeof("cid:") - 1;
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
- }
- else {
- if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
- "data:", sizeof("data:") - 1) == 0) {
- /* We have an embedded image in HTML tag */
- img->flags |=
- (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
- html_process_data_image(pool, img, href_value);
- hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
- }
- else {
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
- if (img->src) {
-
- std::string_view cpy{href_value};
- auto maybe_url = html_process_url(pool, cpy);
-
- if (maybe_url) {
- img->url = maybe_url.value();
- struct rspamd_url *existing;
-
- img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
- existing = rspamd_url_set_add_or_return(url_set,
- img->url);
-
- if (existing && existing != img->url) {
- /*
- * We have some other URL that could be
- * found, e.g. from another part. However,
- * we still want to set an image flag on it
- */
- existing->flags |= img->url->flags;
- existing->count++;
- }
- else if (part_urls) {
- /* New url */
- g_ptr_array_add(part_urls, img->url);
- }
- }
- }
- }
- }
- }
+ if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(),
+ "cid:", sizeof("cid:") - 1) == 0) {
+ /* We have an embedded image */
+ img->src += sizeof("cid:") - 1;
+ img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
}
+ else {
+ if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(),
+ "data:", sizeof("data:") - 1) == 0) {
+ /* We have an embedded image in HTML tag */
+ img->flags |=
+ (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
+ html_process_data_image(pool, img, *href_value);
+ hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+ }
+ else {
+ img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
+ if (img->src) {
+ std::string_view cpy{*href_value};
+ auto maybe_url = html_process_url(pool, cpy);
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
- unsigned long val;
+ if (maybe_url) {
+ img->url = maybe_url.value();
+ struct rspamd_url *existing;
- rspamd_strtoul(param.value.data(), param.value.size(), &val);
- img->height = val;
- }
+ img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
+ existing = rspamd_url_set_add_or_return(url_set,
+ img->url);
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
- unsigned long val;
-
- rspamd_strtoul(param.value.data(), param.value.size(), &val);
- img->width = val;
+ if (existing && existing != img->url) {
+ /*
+ * We have some other URL that could be
+ * found, e.g. from another part. However,
+ * we still want to set an image flag on it
+ */
+ existing->flags |= img->url->flags;
+ existing->count++;
+ }
+ else if (part_urls) {
+ /* New url */
+ g_ptr_array_add(part_urls, img->url);
+ }
+ }
+ }
+ }
}
+ }
- /* TODO: rework to css at some time */
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
- if (img->height == 0) {
- auto style_st = param.value;
- auto pos = rspamd_substring_search_caseless(style_st.data(),
- style_st.size(),
- "height", sizeof("height") - 1);
- if (pos != -1) {
- auto substr = style_st.substr(pos + sizeof("height") - 1);
+ // Process numeric dimensions using the new helper methods
+ if (auto height = tag->find_height()) {
+ img->height = height.value();
+ }
- for (auto i = 0; i < substr.size(); i++) {
- auto t = substr[i];
- if (g_ascii_isdigit(t)) {
- unsigned long val;
- rspamd_strtoul(substr.data(),
- substr.size(), &val);
- img->height = val;
- break;
- }
- else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
- /* Fallback */
- break;
- }
+ if (auto width = tag->find_width()) {
+ img->width = width.value();
+ }
+
+ // Process style component for dimensions
+ if (auto style_value = tag->find_style()) {
+ if (img->height == 0) {
+ auto pos = rspamd_substring_search_caseless(style_value->data(),
+ style_value->size(),
+ "height", sizeof("height") - 1);
+ if (pos != -1) {
+ auto substr = style_value->substr(pos + sizeof("height") - 1);
+
+ for (auto i = 0; i < substr.size(); i++) {
+ auto t = substr[i];
+ if (g_ascii_isdigit(t)) {
+ unsigned long val;
+ rspamd_strtoul(substr.data(),
+ substr.size(), &val);
+ img->height = val;
+ break;
+ }
+ else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+ /* Fallback */
+ break;
}
}
}
- if (img->width == 0) {
- auto style_st = param.value;
- auto pos = rspamd_substring_search_caseless(style_st.data(),
- style_st.size(),
- "width", sizeof("width") - 1);
- if (pos != -1) {
- auto substr = style_st.substr(pos + sizeof("width") - 1);
-
- for (auto i = 0; i < substr.size(); i++) {
- auto t = substr[i];
- if (g_ascii_isdigit(t)) {
- unsigned long val;
- rspamd_strtoul(substr.data(),
- substr.size(), &val);
- img->width = val;
- break;
- }
- else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
- /* Fallback */
- break;
- }
+ }
+ if (img->width == 0) {
+ auto pos = rspamd_substring_search_caseless(style_value->data(),
+ style_value->size(),
+ "width", sizeof("width") - 1);
+ if (pos != -1) {
+ auto substr = style_value->substr(pos + sizeof("width") - 1);
+
+ for (auto i = 0; i < substr.size(); i++) {
+ auto t = substr[i];
+ if (g_ascii_isdigit(t)) {
+ unsigned long val;
+ rspamd_strtoul(substr.data(),
+ substr.size(), &val);
+ img->width = val;
+ break;
+ }
+ else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+ /* Fallback */
+ break;
}
}
}
@@ -968,7 +1616,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
khash_t(rspamd_url_hash) * url_set,
GPtrArray *part_urls) -> void
{
- auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
+ auto found_rel_maybe = tag->find_rel();
if (found_rel_maybe) {
if (found_rel_maybe.value() == "icon") {
@@ -984,24 +1632,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
bool hidden = false;
- for (const auto &param: tag->components) {
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
- maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
- }
-
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
- maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
- }
+ // Process color components
+ if (auto color_comp = tag->find_component<html_component_color>()) {
+ maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value);
+ }
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
- tag->block = rspamd::css::parse_css_declaration(pool, param.value);
- }
+ if (auto bgcolor_comp = tag->find_component<html_component_bgcolor>()) {
+ maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value);
+ }
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
- hidden = true;
- }
+ // Process style component
+ if (auto style_value = tag->find_style()) {
+ tag->block = rspamd::css::parse_css_declaration(pool, *style_value);
}
+ // Check if hidden
+ hidden = tag->is_hidden();
+
if (!tag->block) {
tag->block = html_block::undefined_html_block_pool(pool);
}
@@ -1284,7 +1931,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
}
else if (tag->id == Tag_IMG) {
/* Process ALT if presented */
- auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
+ auto maybe_alt = tag->find_alt();
if (maybe_alt) {
if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
@@ -1384,9 +2031,7 @@ auto html_process_input(struct rspamd_task *task,
overflow_input = true;
}
- auto new_tag = [&](int flags = 0) -> struct html_tag *
- {
-
+ auto new_tag = [&](int flags = 0) -> struct html_tag * {
if (hc->all_tags.size() > rspamd::html::max_tags) {
hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
@@ -2151,7 +2796,7 @@ auto html_process_input(struct rspamd_task *task,
/* Leftover after content */
switch (state) {
case tags_limit_overflow:
- html_append_parsed(hc, {c, (std::size_t)(end - c)},
+ html_append_parsed(hc, {c, (std::size_t) (end - c)},
false, end - start, hc->parsed);
break;
default:
@@ -2390,4 +3035,4 @@ gsize rspamd_html_get_tags_count(void *html_content)
}
return hc->all_tags.size();
-} \ No newline at end of file
+}
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 309d76177..6d41f1337 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -26,6 +26,7 @@
#include <cstdint>
#include "html_tags.h"
+#include "libutil/str_util.h"
struct rspamd_url;
struct html_image;
@@ -34,7 +35,8 @@ namespace rspamd::html {
struct html_content; /* Forward declaration */
-enum class html_component_type : std::uint8_t {
+// Internal enum for mapping (not exposed in public API)
+enum class html_component_enum_type : std::uint8_t {
RSPAMD_HTML_COMPONENT_NAME = 0,
RSPAMD_HTML_COMPONENT_HREF,
RSPAMD_HTML_COMPONENT_COLOR,
@@ -48,8 +50,1214 @@ enum class html_component_type : std::uint8_t {
RSPAMD_HTML_COMPONENT_ALT,
RSPAMD_HTML_COMPONENT_ID,
RSPAMD_HTML_COMPONENT_HIDDEN,
+ // Typography
+ RSPAMD_HTML_COMPONENT_FONT_FAMILY,
+ RSPAMD_HTML_COMPONENT_FONT_SIZE,
+ RSPAMD_HTML_COMPONENT_FONT_WEIGHT,
+ RSPAMD_HTML_COMPONENT_FONT_STYLE,
+ RSPAMD_HTML_COMPONENT_TEXT_ALIGN,
+ RSPAMD_HTML_COMPONENT_TEXT_DECORATION,
+ RSPAMD_HTML_COMPONENT_LINE_HEIGHT,
+ // Layout & positioning
+ RSPAMD_HTML_COMPONENT_MARGIN,
+ RSPAMD_HTML_COMPONENT_MARGIN_TOP,
+ RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM,
+ RSPAMD_HTML_COMPONENT_MARGIN_LEFT,
+ RSPAMD_HTML_COMPONENT_MARGIN_RIGHT,
+ RSPAMD_HTML_COMPONENT_PADDING,
+ RSPAMD_HTML_COMPONENT_PADDING_TOP,
+ RSPAMD_HTML_COMPONENT_PADDING_BOTTOM,
+ RSPAMD_HTML_COMPONENT_PADDING_LEFT,
+ RSPAMD_HTML_COMPONENT_PADDING_RIGHT,
+ RSPAMD_HTML_COMPONENT_BORDER,
+ RSPAMD_HTML_COMPONENT_BORDER_COLOR,
+ RSPAMD_HTML_COMPONENT_BORDER_WIDTH,
+ RSPAMD_HTML_COMPONENT_BORDER_STYLE,
+ // Display & visibility
+ RSPAMD_HTML_COMPONENT_DISPLAY,
+ RSPAMD_HTML_COMPONENT_VISIBILITY,
+ RSPAMD_HTML_COMPONENT_OPACITY,
+ // Dimensions
+ RSPAMD_HTML_COMPONENT_MIN_WIDTH,
+ RSPAMD_HTML_COMPONENT_MAX_WIDTH,
+ RSPAMD_HTML_COMPONENT_MIN_HEIGHT,
+ RSPAMD_HTML_COMPONENT_MAX_HEIGHT,
+ // Table attributes
+ RSPAMD_HTML_COMPONENT_CELLPADDING,
+ RSPAMD_HTML_COMPONENT_CELLSPACING,
+ RSPAMD_HTML_COMPONENT_VALIGN,
+ RSPAMD_HTML_COMPONENT_ALIGN,
+ // Form attributes
+ RSPAMD_HTML_COMPONENT_TYPE,
+ RSPAMD_HTML_COMPONENT_VALUE,
+ RSPAMD_HTML_COMPONENT_PLACEHOLDER,
+ RSPAMD_HTML_COMPONENT_DISABLED,
+ RSPAMD_HTML_COMPONENT_READONLY,
+ RSPAMD_HTML_COMPONENT_CHECKED,
+ RSPAMD_HTML_COMPONENT_SELECTED,
+ // Link & media
+ RSPAMD_HTML_COMPONENT_TARGET,
+ RSPAMD_HTML_COMPONENT_TITLE,
+ RSPAMD_HTML_COMPONENT_SRC,
+ // Meta & document
+ RSPAMD_HTML_COMPONENT_CHARSET,
+ RSPAMD_HTML_COMPONENT_CONTENT,
+ RSPAMD_HTML_COMPONENT_HTTP_EQUIV,
+ // Accessibility
+ RSPAMD_HTML_COMPONENT_ROLE,
+ RSPAMD_HTML_COMPONENT_TABINDEX,
+ // Background
+ RSPAMD_HTML_COMPONENT_BACKGROUND,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION,
+ // Email-specific tracking
+ RSPAMD_HTML_COMPONENT_DATA_TRACK,
+ RSPAMD_HTML_COMPONENT_DATA_ID,
+ RSPAMD_HTML_COMPONENT_DATA_URL,
};
+// Forward declarations for component types
+struct html_component_name;
+struct html_component_href;
+struct html_component_color;
+struct html_component_bgcolor;
+struct html_component_style;
+struct html_component_class;
+struct html_component_width;
+struct html_component_height;
+struct html_component_size;
+struct html_component_rel;
+struct html_component_alt;
+struct html_component_id;
+struct html_component_hidden;
+struct html_component_unknown;
+
+// Base interface for all components.
+// Concrete components override get_string_value() to expose their value as a
+// string view; the virtual destructor makes destruction through a base
+// pointer or reference well-defined.
+struct html_component_base {
+ virtual ~html_component_base() = default;
+ // Returns the component's value in textual form
+ virtual constexpr std::string_view get_string_value() const = 0;
+};
+
+// String-based components
+/*
+ * String component tagged as `name`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_name : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_name(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `href`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_href : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_href(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `style`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_style : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_style(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `class`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_class : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_class(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `rel`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_rel : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_rel(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `alt`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_alt : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_alt(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `id`: wraps a non-owning view of the value and
+ * returns it unchanged from get_string_value().
+ */
+struct html_component_id : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_id(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Color components (could be extended to parse actual colors)
+/*
+ * String component tagged as `color`: wraps a non-owning view of the raw
+ * (unparsed) value and returns it unchanged from get_string_value().
+ */
+struct html_component_color : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_color(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `bgcolor`: wraps a non-owning view of the raw
+ * (unparsed) value and returns it unchanged from get_string_value().
+ */
+struct html_component_bgcolor : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_bgcolor(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Numeric components
+/*
+ * Numeric component tagged as `width`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_width(const std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * Numeric component tagged as `height`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_height(const std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * Numeric component tagged as `size`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_size : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_size(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Boolean/flag component
+/*
+ * Flag component tagged as `hidden`: carries no value of its own, only the
+ * fact of being present. The constructor always sets `present` to true.
+ */
+struct html_component_hidden : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_hidden()
+		: present{true}
+	{
+	}
+
+	/* Textual form of the flag: "true" or "false" */
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr auto is_present() const -> bool
+	{
+		return present;
+	}
+};
+
+// Unknown component with both name and value
+/*
+ * Component for attributes without a dedicated type: keeps non-owning views of
+ * both the attribute name and its value.
+ */
+struct html_component_unknown : html_component_base {
+	std::string_view name;
+	std::string_view value;
+
+	constexpr html_component_unknown(std::string_view attr, std::string_view str)
+		: name{attr}, value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+
+	constexpr auto get_name() const -> std::string_view
+	{
+		return name;
+	}
+};
+
+// Typography components
+/*
+ * String component tagged as `font_family`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_font_family : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_font_family(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * Numeric component tagged as `font_size`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_font_size : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_font_size(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * String component tagged as `font_weight`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_font_weight : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_font_weight(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `font_style`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_font_style : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_font_style(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `text_align`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_text_align : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_text_align(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `text_decoration`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_text_decoration : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_text_decoration(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * Numeric component tagged as `line_height`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_line_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_line_height(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Layout components (most are string-based for flexibility)
+/*
+ * String component tagged as `margin`: wraps a non-owning view of the value
+ * and returns it unchanged from get_string_value().
+ */
+struct html_component_margin : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `margin_top`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_margin_top : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_top(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `margin_bottom`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_margin_bottom : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_bottom(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `margin_left`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_margin_left : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_left(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `margin_right`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_margin_right : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_right(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `padding`: wraps a non-owning view of the value
+ * and returns it unchanged from get_string_value().
+ */
+struct html_component_padding : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `padding_top`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_padding_top : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_top(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `padding_bottom`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_padding_bottom : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_bottom(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `padding_left`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_padding_left : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_left(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `padding_right`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ */
+struct html_component_padding_right : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_right(std::string_view str)
+		: value{str}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `border`: wraps a non-owning view of the value
+ * and returns it unchanged from get_string_value().
+ * Constructor and accessor are constexpr, like the other string components.
+ */
+struct html_component_border : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_border(std::string_view v)
+		: value(v)
+	{
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `border_color`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ * Constructor and accessor are constexpr, like the other string components.
+ */
+struct html_component_border_color : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_border_color(std::string_view v)
+		: value(v)
+	{
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+/*
+ * Numeric component tagged as `border_width`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_border_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_border_width(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * String component tagged as `border_style`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ * Constructor and accessor are constexpr, like the other string components.
+ */
+struct html_component_border_style : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_border_style(std::string_view v)
+		: value(v)
+	{
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Display components
+/*
+ * String component tagged as `display`: wraps a non-owning view of the value
+ * and returns it unchanged from get_string_value().
+ * Constructor and accessor are constexpr, like the other string components.
+ */
+struct html_component_display : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_display(std::string_view v)
+		: value(v)
+	{
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+/*
+ * String component tagged as `visibility`: wraps a non-owning view of the
+ * value and returns it unchanged from get_string_value().
+ * Constructor and accessor are constexpr, like the other string components.
+ */
+struct html_component_visibility : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_visibility(std::string_view v)
+		: value(v)
+	{
+	}
+
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+/*
+ * Opacity component: keeps the raw text and, when its beginning parses as a
+ * float within [0, 1], the numeric value.
+ */
+struct html_component_opacity : html_component_base {
+	std::string_view raw_value;
+	std::optional<float> numeric_value;
+
+	explicit html_component_opacity(std::string_view v)
+		: raw_value(v)
+	{
+		/*
+		 * std::strtof requires a NUL-terminated C string. A string_view does
+		 * not guarantee termination and its data() may be null when the view
+		 * is empty, so parse a bounded, terminated local copy instead.
+		 */
+		char buf[64];
+		const auto copied = v.copy(buf, sizeof(buf) - 1);
+		buf[copied] = '\0';
+
+		char *endptr = nullptr;
+		auto val = std::strtof(buf, &endptr);
+		/* endptr == buf means no characters were consumed, i.e. not a number */
+		if (endptr != buf && val >= 0.0f && val <= 1.0f) {
+			numeric_value = val;
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text is not a number or lies outside [0, 1] */
+	std::optional<float> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Additional dimension components
+/*
+ * Numeric component tagged as `min_width`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_min_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_min_width(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * Numeric component tagged as `max_width`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_max_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_max_width(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * Numeric component tagged as `min_height`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_min_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_min_height(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/*
+ * Numeric component tagged as `max_height`: keeps the raw text and, when
+ * rspamd_strtoul() reports success and the result fits into 32 bits, the
+ * parsed number.
+ */
+struct html_component_max_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_max_height(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		/* unsigned long may be wider than 32 bits: skip values the cast would truncate */
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/* Empty when the text was not accepted as a 32-bit number */
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Table components
+/**
+ * Numeric component used for the `cellpadding` attribute.
+ * Keeps the raw text (as a non-owning view) and, when rspamd_strtoul accepts it
+ * and the result fits into 32 bits, the parsed number.
+ */
+struct html_component_cellpadding : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_cellpadding(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// Reject values that would be truncated by the 32-bit narrowing
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	/// Raw attribute text exactly as passed to the constructor.
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/// Parsed value, or std::nullopt when parsing failed or the value did not fit.
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/**
+ * Numeric component used for the `cellspacing` attribute.
+ * Keeps the raw text (as a non-owning view) and, when rspamd_strtoul accepts it
+ * and the result fits into 32 bits, the parsed number.
+ */
+struct html_component_cellspacing : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_cellspacing(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// Reject values that would be truncated by the 32-bit narrowing
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	/// Raw attribute text exactly as passed to the constructor.
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/// Parsed value, or std::nullopt when parsing failed or the value did not fit.
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_valign : html_component_base {
+	std::string_view value;
+
+	explicit html_component_valign(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_align : html_component_base {
+	std::string_view value;
+
+	explicit html_component_align(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Form components
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_type : html_component_base {
+	std::string_view value;
+
+	explicit html_component_type(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_value : html_component_base {
+	std::string_view value;
+
+	explicit html_component_value(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_placeholder : html_component_base {
+	std::string_view value;
+
+	explicit html_component_placeholder(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Boolean form components
+/**
+ * Flag-style component that carries no text of its own: the only constructor
+ * sets `present` to true, and get_string_value() renders the flag as the
+ * literal "true" or "false".
+ */
+struct html_component_disabled : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_disabled()
+		: present{true}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr auto is_present() const -> bool
+	{
+		return present;
+	}
+};
+
+/**
+ * Flag-style component that carries no text of its own: the only constructor
+ * sets `present` to true, and get_string_value() renders the flag as the
+ * literal "true" or "false".
+ */
+struct html_component_readonly : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_readonly()
+		: present{true}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr auto is_present() const -> bool
+	{
+		return present;
+	}
+};
+
+/**
+ * Flag-style component that carries no text of its own: the only constructor
+ * sets `present` to true, and get_string_value() renders the flag as the
+ * literal "true" or "false".
+ */
+struct html_component_checked : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_checked()
+		: present{true}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr auto is_present() const -> bool
+	{
+		return present;
+	}
+};
+
+/**
+ * Flag-style component that carries no text of its own: the only constructor
+ * sets `present` to true, and get_string_value() renders the flag as the
+ * literal "true" or "false".
+ */
+struct html_component_selected : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_selected()
+		: present{true}
+	{
+	}
+
+	constexpr auto get_string_value() const -> std::string_view override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr auto is_present() const -> bool
+	{
+		return present;
+	}
+};
+
+// Link & media components
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_target : html_component_base {
+	std::string_view value;
+
+	explicit html_component_target(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_title : html_component_base {
+	std::string_view value;
+
+	explicit html_component_title(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_src : html_component_base {
+	std::string_view value;
+
+	explicit html_component_src(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Meta components
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_charset : html_component_base {
+	std::string_view value;
+
+	explicit html_component_charset(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_content : html_component_base {
+	std::string_view value;
+
+	explicit html_component_content(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_http_equiv : html_component_base {
+	std::string_view value;
+
+	explicit html_component_http_equiv(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Accessibility components
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_role : html_component_base {
+	std::string_view value;
+
+	explicit html_component_role(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Numeric component used for the `tabindex` attribute.
+ * Keeps the raw text (as a non-owning view) and, when rspamd_strtol accepts it
+ * and the result fits into a signed 32-bit integer, the parsed number.
+ */
+struct html_component_tabindex : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::int32_t> numeric_value;
+
+	explicit html_component_tabindex(std::string_view v)
+		: raw_value(v)
+	{
+		long val;
+		if (rspamd_strtol(v.data(), v.size(), &val)) {
+			// long may be wider than 32 bits: keep the number only when it
+			// survives the round trip through std::int32_t unchanged
+			const auto narrowed = static_cast<std::int32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	/// Raw attribute text exactly as passed to the constructor.
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	/// Parsed value, or std::nullopt when parsing failed or the value did not fit.
+	std::optional<std::int32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Background components
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_background : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_background_image : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_image(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_background_color : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_color(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_background_repeat : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_repeat(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_background_position : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_position(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// Email tracking components
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_data_track : html_component_base {
+	std::string_view value;
+
+	explicit html_component_data_track(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_data_id : html_component_base {
+	std::string_view value;
+
+	explicit html_component_data_id(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+/**
+ * Text-only component: keeps the string view it is constructed from (nothing
+ * is copied) and returns it unchanged from get_string_value().
+ */
+struct html_component_data_url : html_component_base {
+	std::string_view value;
+
+	explicit html_component_data_url(std::string_view attr_text)
+		: value{attr_text}
+	{
+	}
+
+	auto get_string_value() const -> std::string_view override
+	{
+		return value;
+	}
+};
+
+// The variant type that holds all possible components.
+// Every alternative is listed exactly once, which is what allows lookups by
+// type through std::holds_alternative<T> / std::get<T>. The position of an
+// alternative defines its std::variant index, so new types are safer appended
+// within their group than moved around
+// (NOTE(review): confirm nothing relies on index() before reordering).
+using html_tag_component = std::variant<
+ html_component_name,
+ html_component_href,
+ html_component_color,
+ html_component_bgcolor,
+ html_component_style,
+ html_component_class,
+ html_component_width,
+ html_component_height,
+ html_component_size,
+ html_component_rel,
+ html_component_alt,
+ html_component_id,
+ html_component_hidden,
+ // Typography
+ html_component_font_family,
+ html_component_font_size,
+ html_component_font_weight,
+ html_component_font_style,
+ html_component_text_align,
+ html_component_text_decoration,
+ html_component_line_height,
+ // Layout
+ html_component_margin,
+ html_component_margin_top,
+ html_component_margin_bottom,
+ html_component_margin_left,
+ html_component_margin_right,
+ html_component_padding,
+ html_component_padding_top,
+ html_component_padding_bottom,
+ html_component_padding_left,
+ html_component_padding_right,
+ html_component_border,
+ html_component_border_color,
+ html_component_border_width,
+ html_component_border_style,
+ // Display
+ html_component_display,
+ html_component_visibility,
+ html_component_opacity,
+ // Dimensions
+ html_component_min_width,
+ html_component_max_width,
+ html_component_min_height,
+ html_component_max_height,
+ // Table
+ html_component_cellpadding,
+ html_component_cellspacing,
+ html_component_valign,
+ html_component_align,
+ // Form
+ html_component_type,
+ html_component_value,
+ html_component_placeholder,
+ html_component_disabled,
+ html_component_readonly,
+ html_component_checked,
+ html_component_selected,
+ // Link & media
+ html_component_target,
+ html_component_title,
+ html_component_src,
+ // Meta
+ html_component_charset,
+ html_component_content,
+ html_component_http_equiv,
+ // Accessibility
+ html_component_role,
+ html_component_tabindex,
+ // Background
+ html_component_background,
+ html_component_background_image,
+ html_component_background_color,
+ html_component_background_repeat,
+ html_component_background_position,
+ // Email tracking
+ html_component_data_track,
+ html_component_data_id,
+ html_component_data_url,
+ // Unknown
+ html_component_unknown>;
+
+/**
+ * Builds a component variant from an attribute name/value pair
+ * @param name attribute name
+ * @param value attribute value
+ * @return variant component; the return type is not optional, so a component is
+ *         always produced (presumably html_component_unknown for names that are
+ *         not recognised - confirm against the definition)
+ */
+auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component;
+
/* Public tags flags */
/* XML tag */
#define FL_XML (1u << CM_USER_SHIFT)
@@ -62,23 +1270,7 @@ enum class html_component_type : std::uint8_t {
#define FL_COMMENT (1 << (CM_USER_SHIFT + 6))
#define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7))
-/**
- * Returns component type from a string
- * @param st
- * @return
- */
-auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>;
-
using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>;
-struct html_tag_component {
- html_component_type type;
- std::string_view value;
-
- html_tag_component(html_component_type type, std::string_view value)
- : type(type), value(value)
- {
- }
-};
/* Pairing closing tag representation */
struct html_closing_tag {
@@ -105,26 +1297,128 @@ struct html_tag {
std::vector<struct html_tag *> children;
struct html_tag *parent;
- auto find_component(html_component_type what) const -> std::optional<std::string_view>
+ // Template method to find component by type.
+ // Walks `components` in order and returns a pointer to the first element
+ // whose active alternative is T, or std::nullopt when no element holds T.
+ // An engaged result never contains a null pointer; the pointer refers to an
+ // element stored in `components` and must not outlive it.
+ template<typename T>
+ auto find_component() const -> std::optional<const T *>
 {
 for (const auto &comp: components) {
- if (comp.type == what) {
- return comp.value;
+ if (std::holds_alternative<T>(comp)) {
+ return &std::get<T>(comp);
 }
 }
+ return std::nullopt;
+ }
+ // Helper methods for common component access
+ // find_href: `value` of the first html_component_href on this tag, or
+ // std::nullopt when the tag has no such component.
+ auto find_href() const -> std::optional<std::string_view>
+ {
+ if (auto comp = find_component<html_component_href>()) {
+ return comp.value()->value;
+ }
 return std::nullopt;
 }
- auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view>
+ // `value` of the first html_component_class on this tag, or std::nullopt
+ // when the tag has no such component.
+ auto find_class() const -> std::optional<std::string_view>
 {
- if (what) {
- return find_component(what.value());
+ if (auto comp = find_component<html_component_class>()) {
+ return comp.value()->value;
 }
+ return std::nullopt;
+ }
+	// `value` of the first html_component_id on this tag, or std::nullopt when
+	// the tag has no such component.
+	auto find_id() const -> std::optional<std::string_view>
+	{
+		const auto id_comp = find_component<html_component_id>();
+		if (!id_comp) {
+			return std::nullopt;
+		}
+		return (*id_comp)->value;
+	}
+
+	// Numeric value reported by the first html_component_width on this tag;
+	// std::nullopt when there is no such component or it has no numeric value.
+	auto find_width() const -> std::optional<std::uint32_t>
+	{
+		const auto width_comp = find_component<html_component_width>();
+		if (!width_comp) {
+			return std::nullopt;
+		}
+		return (*width_comp)->get_numeric_value();
+	}
+
+ // Numeric value reported by the first html_component_height on this tag;
+ // std::nullopt when there is no such component or it has no numeric value.
+ auto find_height() const -> std::optional<std::uint32_t>
+ {
+ if (auto comp = find_component<html_component_height>()) {
+ return comp.value()->get_numeric_value();
+ }
 return std::nullopt;
 }
+	// `value` of the first html_component_style on this tag, or std::nullopt
+	// when the tag has no such component.
+	auto find_style() const -> std::optional<std::string_view>
+	{
+		const auto style_comp = find_component<html_component_style>();
+		if (!style_comp) {
+			return std::nullopt;
+		}
+		return (*style_comp)->value;
+	}
+
+	// `value` of the first html_component_alt on this tag, or std::nullopt
+	// when the tag has no such component.
+	auto find_alt() const -> std::optional<std::string_view>
+	{
+		const auto alt_comp = find_component<html_component_alt>();
+		if (!alt_comp) {
+			return std::nullopt;
+		}
+		return (*alt_comp)->value;
+	}
+
+	// `value` of the first html_component_rel on this tag, or std::nullopt
+	// when the tag has no such component.
+	auto find_rel() const -> std::optional<std::string_view>
+	{
+		const auto rel_comp = find_component<html_component_rel>();
+		if (!rel_comp) {
+			return std::nullopt;
+		}
+		return (*rel_comp)->value;
+	}
+
+	// True when at least one html_component_hidden is attached to this tag.
+	auto is_hidden() const -> bool
+	{
+		const auto hidden_comp = find_component<html_component_hidden>();
+		return hidden_comp.has_value();
+	}
+
+	// Looks through the html_component_unknown entries of `components` and
+	// returns the value of the first one whose name equals attr_name
+	// (exact comparison), or std::nullopt when none matches.
+	auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view>
+	{
+		for (const auto &entry: components) {
+			const auto *unk = std::get_if<html_component_unknown>(&entry);
+			if (unk != nullptr && unk->name == attr_name) {
+				return unk->value;
+			}
+		}
+		return std::nullopt;
+	}
+
+	// Collects (name, value) for every html_component_unknown in `components`,
+	// preserving container order. Duplicated names are kept as separate entries.
+	auto get_unknown_components() const -> std::vector<std::pair<std::string_view, std::string_view>>
+	{
+		std::vector<std::pair<std::string_view, std::string_view>> collected;
+		for (const auto &entry: components) {
+			if (const auto *unk = std::get_if<html_component_unknown>(&entry)) {
+				collected.emplace_back(unk->name, unk->value);
+			}
+		}
+		return collected;
+	}
+
+	// Generic visitor method for processing all components.
+	// Calls `visitor` through std::visit once per element of `components`, in
+	// container order; whatever the visitor returns is discarded.
+	// The visitor is intentionally passed as an lvalue on every iteration:
+	// std::forward-ing it inside the loop would present it as an rvalue each
+	// time, so a functor with an rvalue-qualified call operator could be used
+	// after having been moved from.
+	template<typename Visitor>
+	auto visit_components(Visitor &&visitor) const
+	{
+		for (const auto &comp: components) {
+			std::visit(visitor, comp);
+		}
+	}
+
+ // Find any component by attribute name.
+ // Returns the component's string value, or std::nullopt when the tag has no
+ // component for attr_name (defined out of line; matching rules live there).
+ auto find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>;
+
+ // Get all attributes as name-value pairs.
+ // Both halves of each pair are non-owning views (defined out of line).
+ auto get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>;
+
auto clear(void) -> void
{
id = Tag_UNKNOWN;
@@ -137,7 +1431,7 @@ struct html_tag {
closing.clear();
}
- constexpr auto get_content_length() const -> std::size_t
+ auto get_content_length() const -> std::size_t
{
if (flags & (FL_IGNORE | CM_HEAD)) {
return 0;
diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx
index 090e2af55..9b0deed45 100644
--- a/src/lua/lua_html.cxx
+++ b/src/lua/lua_html.cxx
@@ -179,6 +179,44 @@ LUA_FUNCTION_DEF(html_tag, get_style);
*/
LUA_FUNCTION_DEF(html_tag, get_attribute);
+/***
+ * @method html_tag:get_all_attributes()
+ * Returns table of all attributes for the element
+ * @return {table} table with attribute names as keys and values as strings
+ */
+LUA_FUNCTION_DEF(html_tag, get_all_attributes);
+
+/***
+ * @method html_tag:get_unknown_attributes()
+ * Returns table of unknown/unrecognized attributes for the element
+ * @return {table} table with unknown attribute names as keys and values as strings
+ */
+LUA_FUNCTION_DEF(html_tag, get_unknown_attributes);
+
+/***
+ * @method html_tag:get_children()
+ * Returns array of child tags for the element
+ * @return {table} array of child html_tag objects
+ */
+LUA_FUNCTION_DEF(html_tag, get_children);
+
+/***
+ * @method html_tag:has_attribute(name)
+ * Checks if element has the specified attribute
+ * @param {string} name attribute name to check
+ * @return {boolean} true if attribute exists
+ */
+LUA_FUNCTION_DEF(html_tag, has_attribute);
+
+/***
+ * @method html_tag:get_numeric_attribute(name)
+ * Returns numeric value of attribute (if supported and parseable)
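+ * Supported names: width, height, size, font-size, line-height, border-width,
+ * opacity, min-width, max-width, min-height, max-height, cellpadding,
+ * cellspacing, tabindex. Any other name yields nil.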
+ * @param {string} name attribute name
+ * @return {number|nil} numeric value or nil if not numeric/parseable
+ */
+LUA_FUNCTION_DEF(html_tag, get_numeric_attribute);
+
static const struct luaL_reg taglib_m[] = {
LUA_INTERFACE_DEF(html_tag, get_type),
LUA_INTERFACE_DEF(html_tag, get_extra),
@@ -188,6 +226,11 @@ static const struct luaL_reg taglib_m[] = {
LUA_INTERFACE_DEF(html_tag, get_content_length),
LUA_INTERFACE_DEF(html_tag, get_style),
LUA_INTERFACE_DEF(html_tag, get_attribute),
+ LUA_INTERFACE_DEF(html_tag, get_all_attributes),
+ LUA_INTERFACE_DEF(html_tag, get_unknown_attributes),
+ LUA_INTERFACE_DEF(html_tag, get_children),
+ LUA_INTERFACE_DEF(html_tag, has_attribute),
+ LUA_INTERFACE_DEF(html_tag, get_numeric_attribute),
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}};
@@ -704,6 +747,29 @@ lua_html_tag_get_style(lua_State *L)
}
static int
+lua_html_tag_get_all_attributes(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+
+	if (!ltag) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	// One hash slot per attribute; a repeated name overwrites the earlier entry
+	const auto attrs = ltag->tag->get_all_attributes();
+	lua_createtable(L, 0, attrs.size());
+
+	for (const auto &attr: attrs) {
+		lua_pushlstring(L, attr.first.data(), attr.first.size());
+		lua_pushlstring(L, attr.second.data(), attr.second.size());
+		lua_settable(L, -3);
+	}
+
+	return 1;
+}
+
+static int
lua_html_tag_get_attribute(lua_State *L)
{
LUA_TRACE_POINT;
@@ -712,8 +778,7 @@ lua_html_tag_get_attribute(lua_State *L)
const char *attr_name = luaL_checklstring(L, 2, &slen);
if (ltag && attr_name) {
- auto maybe_attr = ltag->tag->find_component(
- rspamd::html::html_component_from_string({attr_name, slen}));
+ auto maybe_attr = ltag->tag->find_component_by_name({attr_name, slen});
if (maybe_attr) {
lua_pushlstring(L, maybe_attr->data(), maybe_attr->size());
@@ -729,6 +794,206 @@ lua_html_tag_get_attribute(lua_State *L)
return 1;
}
+static int
+lua_html_tag_get_unknown_attributes(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+
+	if (!ltag) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	// Result is a name -> value table built from the tag's unknown components
+	const auto unknown = ltag->tag->get_unknown_components();
+	lua_createtable(L, 0, unknown.size());
+
+	for (const auto &attr: unknown) {
+		const auto &key = attr.first;
+		const auto &val = attr.second;
+
+		lua_pushlstring(L, key.data(), key.size());
+		lua_pushlstring(L, val.data(), val.size());
+		lua_settable(L, -3);
+	}
+
+	return 1;
+}
+
+static int
+lua_html_tag_get_children(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+
+	if (!ltag) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	const auto &children = ltag->tag->children;
+	lua_createtable(L, children.size(), 0);
+
+	// Lua arrays are 1-based. Iterating the container directly avoids comparing
+	// a signed index with the unsigned size().
+	int lua_idx = 1;
+	for (auto *child: children) {
+		// Each child gets its own userdata that shares the parent's html pointer
+		auto *child_tag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag)));
+		child_tag->tag = child;
+		child_tag->html = ltag->html;
+		rspamd_lua_setclass(L, rspamd_html_tag_classname, -1);
+		lua_rawseti(L, -2, lua_idx++);
+	}
+
+	return 1;
+}
+
+static int
+lua_html_tag_has_attribute(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+	gsize name_len;
+	const char *name = luaL_checklstring(L, 2, &name_len);
+
+	if (!ltag || !name) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	// Presence is defined by whether the by-name lookup yields a value
+	const std::string_view wanted{name, name_len};
+	const bool found = ltag->tag->find_component_by_name(wanted).has_value();
+	lua_pushboolean(L, found);
+
+	return 1;
+}
+
+static int
+lua_html_tag_get_numeric_attribute(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+	gsize slen;
+	const char *attr_name = luaL_checklstring(L, 2, &slen);
+
+	if (!ltag || !attr_name) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	const std::string_view name_view{attr_name, slen};
+	const auto *tag = ltag->tag;
+
+	// Pushes the component's numeric value as a Lua integer.
+	// Returns false (pushing nothing) when the component is missing or has no
+	// numeric value.
+	auto push_integer = [L](const auto &maybe_comp) -> bool {
+		if (maybe_comp) {
+			if (auto numeric_val = maybe_comp.value()->get_numeric_value()) {
+				lua_pushinteger(L, numeric_val.value());
+				return true;
+			}
+		}
+		return false;
+	};
+	// Same as push_integer, but pushes a Lua number (used for opacity only)
+	auto push_number = [L](const auto &maybe_comp) -> bool {
+		if (maybe_comp) {
+			if (auto numeric_val = maybe_comp.value()->get_numeric_value()) {
+				lua_pushnumber(L, numeric_val.value());
+				return true;
+			}
+		}
+		return false;
+	};
+
+	// Check for numeric components; only the branch matching the name is evaluated
+	bool pushed = false;
+
+	if (name_view == "width") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_width>());
+	}
+	else if (name_view == "height") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_height>());
+	}
+	else if (name_view == "size") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_size>());
+	}
+	else if (name_view == "font-size") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_font_size>());
+	}
+	else if (name_view == "line-height") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_line_height>());
+	}
+	else if (name_view == "border-width") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_border_width>());
+	}
+	else if (name_view == "opacity") {
+		pushed = push_number(tag->find_component<rspamd::html::html_component_opacity>());
+	}
+	else if (name_view == "min-width") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_min_width>());
+	}
+	else if (name_view == "max-width") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_max_width>());
+	}
+	else if (name_view == "min-height") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_min_height>());
+	}
+	else if (name_view == "max-height") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_max_height>());
+	}
+	else if (name_view == "cellpadding") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_cellpadding>());
+	}
+	else if (name_view == "cellspacing") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_cellspacing>());
+	}
+	else if (name_view == "tabindex") {
+		pushed = push_integer(tag->find_component<rspamd::html::html_component_tabindex>());
+	}
+
+	// Unsupported name, missing attribute or non-numeric value: return nil
+	if (!pushed) {
+		lua_pushnil(L);
+	}
+
+	return 1;
+}
+
void luaopen_html(lua_State *L)
{
rspamd_lua_new_class(L, rspamd_html_classname, htmllib_m);
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index 39e1b0317..eb7fa6bf5 100644
--- a/src/lua/lua_parsers.c
+++ b/src/lua/lua_parsers.c
@@ -46,6 +46,14 @@
*/
/***
+ * @function parsers.parse_html_content(input, mempool)
+ * Parses HTML and returns the HTML content object for structure analysis
+ * @param {string|text} input input HTML
+ * @param {rspamd_mempool} mempool memory pool for HTML content management
+ * @return {html_content|nil} HTML content object with tag structure, or nil if the input is neither a string nor a text or parsing produced nothing
+ */
+LUA_FUNCTION_DEF(parsers, parse_html_content);
+/***
* @function parsers.parse_mail_address(str, [pool])
* Parses email address and returns a table of tables in the following format:
*
@@ -93,6 +101,7 @@
static const struct luaL_reg parserslib_f[] = {
LUA_INTERFACE_DEF(parsers, tokenize_text),
LUA_INTERFACE_DEF(parsers, parse_html),
+ LUA_INTERFACE_DEF(parsers, parse_html_content),
LUA_INTERFACE_DEF(parsers, parse_mail_address),
LUA_INTERFACE_DEF(parsers, parse_content_type),
LUA_INTERFACE_DEF(parsers, parse_smtp_date),
@@ -242,6 +251,62 @@ int lua_parsers_parse_html(lua_State *L)
return 1;
}
+static int lua_parsers_parse_html_content(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	const char *start = NULL;
+	gsize len = 0;
+	rspamd_mempool_t *pool;
+	GByteArray *in;
+	void *hc;
+
+	/* Argument 1: either a text userdata or a plain Lua string */
+	switch (lua_type(L, 1)) {
+	case LUA_TUSERDATA: {
+		struct rspamd_lua_text *t = lua_check_text(L, 1);
+
+		if (t != NULL) {
+			start = t->start;
+			len = t->len;
+		}
+		break;
+	}
+	case LUA_TSTRING:
+		start = luaL_checklstring(L, 1, &len);
+		break;
+	default:
+		/* Any other type leaves start == NULL, which yields nil below */
+		break;
+	}
+
+	/* Argument 2: the memory pool is mandatory */
+	if (lua_type(L, 2) != LUA_TUSERDATA) {
+		return luaL_error(L, "invalid arguments: mempool expected as second argument");
+	}
+
+	pool = rspamd_lua_check_mempool(L, 2);
+	if (!pool) {
+		return luaL_error(L, "invalid mempool argument");
+	}
+
+	if (start == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* The parser takes a GByteArray: copy the input into one, free it afterwards */
+	in = g_byte_array_sized_new(len);
+	g_byte_array_append(in, start, len);
+
+	hc = rspamd_html_process_part(pool, in);
+
+	if (hc != NULL) {
+		/* Hand the parsed content to Lua as a pointer-sized userdata */
+		void **phc = lua_newuserdata(L, sizeof(void *));
+
+		*phc = hc;
+		rspamd_lua_setclass(L, rspamd_html_classname, -1);
+	}
+	else {
+		lua_pushnil(L);
+	}
+
+	g_byte_array_free(in, TRUE);
+
+	return 1;
+}
+
int lua_parsers_parse_mail_address(lua_State *L)
{
LUA_TRACE_POINT;
@@ -409,4 +474,4 @@ lua_load_parsers(lua_State *L)
void luaopen_parsers(lua_State *L)
{
rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers);
-} \ No newline at end of file
+}