aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--interface/css/rspamd.css4
-rw-r--r--interface/index.html18
-rw-r--r--interface/js/app/history.js57
-rw-r--r--interface/js/app/rspamd.js21
-rw-r--r--src/libserver/css/css.cxx19
-rw-r--r--src/libserver/html/html.cxx1017
-rw-r--r--src/libserver/html/html_tag.hxx1348
-rw-r--r--src/lua/lua_html.cxx269
-rw-r--r--src/lua/lua_parsers.c67
-rw-r--r--test/lua/unit/html.lua414
10 files changed, 2981 insertions, 253 deletions
diff --git a/interface/css/rspamd.css b/interface/css/rspamd.css
index 9f97a668b..54310049b 100644
--- a/interface/css/rspamd.css
+++ b/interface/css/rspamd.css
@@ -420,8 +420,10 @@ table#symbolsTable input[type="number"] {
display: none;
}
+#history-from,
+#history-count,
#history_page_size {
- width: 6em !important;
+ width: 6em;
text-align: center;
}
diff --git a/interface/index.html b/interface/index.html
index 30181e788..165eae200 100644
--- a/interface/index.html
+++ b/interface/index.html
@@ -124,6 +124,16 @@
</div>
</div>
</div>
+
+ <div class="card mt-1">
+ <div class="card-body">
+ <h6 class="card-title fw-bolder">History rows per load</h6>
+ <div class="input-group input-group-sm was-validated">
+ <input type="number" id="settings-history-count" class="form-control" min="1" step="1" placeholder="1000">
+ <button id="settings-history-count-restore" class="btn btn-secondary">Restore default</button>
+ </div>
+ </div>
+ </div>
</div>
</div>
</form>
@@ -681,9 +691,13 @@
<option value="score">Score value</option>
<option value="name">Name</option>
</select>
- <label for="history_page_size" class="ms-2">Rows per page:</label>
+ <label for="history-from" class="ms-3" title="Start from this row number">Offset:</label>
+ <input type="number" id="history-from" class="form-control ms-1" value="0" min="0" step="1" title="Start from this row number">
+ <label for="history-count" class="ms-2" title="Number of rows to load">Count:</label>
+ <input type="number" id="history-count" class="form-control ms-1" value="1000" min="1" step="1" title="Number of rows to load">
+ <label for="history_page_size" class="ms-2">Rows/page:</label>
<input id="history_page_size" class="form-control ms-1" value="25" min="1" type="number">
- <button class="btn btn-outline-secondary btn-sm ms-2 d-flex align-items-center dropdown-toggle ft-columns-btn" type="button" data-bs-toggle="dropdown" data-bs-auto-close="outside" aria-expanded="false" disabled>
+ <button class="btn btn-outline-secondary btn-sm ms-3 d-flex align-items-center dropdown-toggle ft-columns-btn" type="button" data-bs-toggle="dropdown" data-bs-auto-close="outside" aria-expanded="false" disabled>
<i class="fas fa-columns me-1"></i>Columns
</button>
<div class="dropdown-menu ft-columns-dropdown p-2"></div>
diff --git a/interface/js/app/history.js b/interface/js/app/history.js
index 185922087..bf1dbae53 100644
--- a/interface/js/app/history.js
+++ b/interface/js/app/history.js
@@ -30,6 +30,12 @@ define(["jquery", "app/common", "app/libft", "footable"],
const ui = {};
let prevVersion = null;
+ // History range: offset and count
+ const histFromDef = 0;
+ const historyCountDef = 1000;
+ let histFrom = histFromDef;
+ let histCount = parseInt(localStorage.getItem("historyCount"), 10) || historyCountDef;
+
function process_history_legacy(data) {
const items = [];
@@ -152,7 +158,8 @@ define(["jquery", "app/common", "app/libft", "footable"],
ui.getHistory = function () {
$("#refresh, #updateHistory").attr("disabled", true);
- common.query("history", {
+ const histTo = histFrom - 1 + histCount;
+ common.query(`history?from=${histFrom}&to=${histTo}`, {
success: function (req_data) {
function differentVersions(neighbours_data) {
const dv = neighbours_data.some((e) => e.version !== neighbours_data[0].version);
@@ -192,8 +199,10 @@ define(["jquery", "app/common", "app/libft", "footable"],
// Is there a way to get an event when the table is destroyed?
setTimeout(() => {
libft.initHistoryTable(data, items, "history", get_history_columns(data), false,
- () => $("#refresh, #updateHistory, #history .ft-columns-dropdown .btn-dropdown-apply")
- .removeAttr("disabled"));
+ () => {
+ $("#history .ft-columns-dropdown .btn-dropdown-apply").removeAttr("disabled");
+ ui.updateHistoryControlsState();
+ });
}, 200);
}
prevVersion = version;
@@ -201,7 +210,7 @@ define(["jquery", "app/common", "app/libft", "footable"],
libft.destroyTable("history");
}
},
- error: () => $("#refresh, #updateHistory").removeAttr("disabled"),
+ error: () => ui.updateHistoryControlsState(),
errorMessage: "Cannot receive history",
});
};
@@ -282,6 +291,46 @@ define(["jquery", "app/common", "app/libft", "footable"],
});
};
+ ui.updateHistoryControlsState = function () {
+ const from = parseInt($("#history-from").val(), 10);
+ const count = parseInt($("#history-count").val(), 10);
+ const valid = !(isNaN(from) || from < 0 || isNaN(count) || count < 1);
+
+ if (valid) {
+ $("#refresh, #updateHistory").removeAttr("disabled").removeClass("disabled");
+ } else {
+ $("#refresh, #updateHistory").attr("disabled", true).addClass("disabled");
+ }
+ };
+
+ function validateAndClampInput(el) {
+ const min = el.id === "history-from" ? 0 : 1;
+ let v = parseInt(el.value, 10);
+ if (isNaN(v) || v < min) {
+ v = min;
+ $(el).addClass("is-invalid");
+ } else {
+ $(el).removeClass("is-invalid");
+ }
+ return v;
+ }
+
+ $("#history-from").val(histFrom);
+ $("#history-count").val(histCount);
+ $("#history-from, #history-count").on("input", (e) => {
+ validateAndClampInput(e.currentTarget);
+ ui.updateHistoryControlsState();
+ });
+ $("#history-from, #history-count").on("blur", (e) => {
+ const el = e.currentTarget;
+ const v = validateAndClampInput(el);
+ $(el).val(v).removeClass("is-invalid");
+ ui.updateHistoryControlsState();
+ });
+ $("#history-from,#history-count").on("change", () => {
+ histFrom = parseInt($("#history-from").val(), 10) || histFromDef;
+ histCount = parseInt($("#history-count").val(), 10) || historyCountDef;
+ });
libft.set_page_size("history", $("#history_page_size").val());
libft.bindHistoryTableEventHandlers("history", 8);
diff --git a/interface/js/app/rspamd.js b/interface/js/app/rspamd.js
index cb7fb8ace..4b154c2ae 100644
--- a/interface/js/app/rspamd.js
+++ b/interface/js/app/rspamd.js
@@ -198,6 +198,8 @@ define(["jquery", "app/common", "stickytabs", "visibility",
$(".preset").hide();
$(".history").show();
$(".dynamic").hide();
+
+ module.updateHistoryControlsState();
});
break;
case "#disconnect":
@@ -348,6 +350,8 @@ define(["jquery", "app/common", "stickytabs", "visibility",
let selected_locale = null;
let custom_locale = null;
const localeTextbox = ".popover #settings-popover #locale";
+ const historyCountDef = 1000;
+ const historyCountSelector = ".popover #settings-popover #settings-history-count";
function validateLocale(saveToLocalStorage) {
function toggle_form_group_class(remove, add) {
@@ -406,6 +410,8 @@ define(["jquery", "app/common", "stickytabs", "visibility",
$(localeTextbox).val(custom_locale);
ajaxSetup(localStorage.getItem("ajax_timeout"), true);
+
+ $(historyCountSelector).val(parseInt(localStorage.getItem("historyCount"), 10) || historyCountDef);
});
$(document).on("change", '.popover #settings-popover input:radio[name="locale"]', function () {
selected_locale = this.value;
@@ -423,6 +429,21 @@ define(["jquery", "app/common", "stickytabs", "visibility",
ajaxSetup(null, true, true);
});
+ $(document).on("input", historyCountSelector, (e) => {
+ const v = parseInt($(e.currentTarget).val(), 10);
+ if (v > 0) {
+ localStorage.setItem("historyCount", v);
+ $(e.currentTarget).removeClass("is-invalid");
+ $("#history-count").val(v).trigger("change");
+ } else {
+ $(e.currentTarget).addClass("is-invalid");
+ }
+ });
+ $(document).on("click", ".popover #settings-popover #settings-history-count-restore", () => {
+ localStorage.removeItem("historyCount");
+ $(historyCountSelector).val(historyCountDef);
+ });
+
// Dismiss Bootstrap popover by clicking outside
$("body").on("click", (e) => {
$(".popover").each(function () {
diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx
index 1b369ed17..c53e3c05e 100644
--- a/src/libserver/css/css.cxx
+++ b/src/libserver/css/css.cxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector
auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block *
{
- std::optional<std::string_view> id_comp, class_comp;
rspamd::html::html_block *res = nullptr;
if (!tag) {
@@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa
}
/* First, find id in a tag and a class */
- for (const auto &param: tag->components) {
- if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) {
- id_comp = param.value;
- }
- else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
- class_comp = param.value;
- }
- }
+ auto id_comp = tag->find_id();
+ auto class_comp = tag->find_class();
/* ID part */
if (id_comp && !pimpl->id_selectors.empty()) {
@@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool,
return std::make_pair(nullptr, parse_res.error());
}
-}// namespace rspamd::css \ No newline at end of file
+}// namespace rspamd::css
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index 93d1fdf91..78a6a975c 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -39,6 +39,7 @@
#include "contrib/frozen/include/frozen/string.h"
#include "contrib/fmt/include/fmt/core.h"
+#include <functional>
#include <unicode/uversion.h>
namespace rspamd::html {
@@ -47,23 +48,88 @@ static const unsigned int max_tags = 8192; /* Ignore tags if this maximum is rea
static const html_tags_storage html_tags_defs;
-auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>(
+auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_enum_type>(
{
- {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME},
- {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
- {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
- {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF},
- {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR},
- {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
- {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE},
- {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS},
- {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH},
- {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT},
- {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE},
- {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL},
- {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT},
- {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID},
- {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN},
+ {"name", html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME},
+ {"href", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"src", html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC},
+ {"action", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF},
+ {"color", html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR},
+ {"bgcolor", html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR},
+ {"style", html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE},
+ {"class", html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS},
+ {"width", html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH},
+ {"height", html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT},
+ {"size", html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE},
+ {"rel", html_component_enum_type::RSPAMD_HTML_COMPONENT_REL},
+ {"alt", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT},
+ {"id", html_component_enum_type::RSPAMD_HTML_COMPONENT_ID},
+ {"hidden", html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN},
+ // Typography
+ {"font-family", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY},
+ {"font-size", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE},
+ {"font-weight", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT},
+ {"font-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE},
+ {"text-align", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN},
+ {"text-decoration", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION},
+ {"line-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT},
+ // Layout & positioning
+ {"margin", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN},
+ {"margin-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP},
+ {"margin-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM},
+ {"margin-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT},
+ {"margin-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT},
+ {"padding", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING},
+ {"padding-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP},
+ {"padding-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM},
+ {"padding-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT},
+ {"padding-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT},
+ {"border", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER},
+ {"border-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR},
+ {"border-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH},
+ {"border-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE},
+ // Display & visibility
+ {"display", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY},
+ {"visibility", html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY},
+ {"opacity", html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY},
+ // Dimensions
+ {"min-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH},
+ {"max-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH},
+ {"min-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT},
+ {"max-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT},
+ // Table attributes
+ {"cellpadding", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING},
+ {"cellspacing", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING},
+ {"valign", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN},
+ {"align", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN},
+ // Form attributes
+ {"type", html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE},
+ {"value", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE},
+ {"placeholder", html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER},
+ {"disabled", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED},
+ {"readonly", html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY},
+ {"checked", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED},
+ {"selected", html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED},
+ // Link & media
+ {"target", html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET},
+ {"title", html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE},
+ // Meta & document
+ {"charset", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET},
+ {"content", html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT},
+ {"http-equiv", html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV},
+ // Accessibility
+ {"role", html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE},
+ {"tabindex", html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX},
+ // Background
+ {"background", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND},
+ {"background-image", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE},
+ {"background-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR},
+ {"background-repeat", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT},
+ {"background-position", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION},
+ // Email-specific tracking
+ {"data-track", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK},
+ {"data-id", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID},
+ {"data-url", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL},
});
#define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL, \
@@ -199,18 +265,608 @@ html_check_balance(struct html_content *hc,
return nullptr;
}
-auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>
+auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component
{
- auto known_component_it = html_components_map.find(st);
+ auto known_component_it = html_components_map.find(name);
if (known_component_it != html_components_map.end()) {
- return known_component_it->second;
+ switch (known_component_it->second) {
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME:
+ return html_component_name{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF:
+ return html_component_href{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR:
+ return html_component_color{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR:
+ return html_component_bgcolor{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE:
+ return html_component_style{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS:
+ return html_component_class{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH:
+ return html_component_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT:
+ return html_component_height{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE:
+ return html_component_size{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_REL:
+ return html_component_rel{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT:
+ return html_component_alt{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ID:
+ return html_component_id{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN:
+ return html_component_hidden{};
+ // Typography
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY:
+ return html_component_font_family{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE:
+ return html_component_font_size{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT:
+ return html_component_font_weight{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE:
+ return html_component_font_style{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN:
+ return html_component_text_align{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION:
+ return html_component_text_decoration{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT:
+ return html_component_line_height{value};
+ // Layout
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN:
+ return html_component_margin{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP:
+ return html_component_margin_top{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM:
+ return html_component_margin_bottom{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT:
+ return html_component_margin_left{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT:
+ return html_component_margin_right{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING:
+ return html_component_padding{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP:
+ return html_component_padding_top{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM:
+ return html_component_padding_bottom{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT:
+ return html_component_padding_left{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT:
+ return html_component_padding_right{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER:
+ return html_component_border{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR:
+ return html_component_border_color{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH:
+ return html_component_border_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE:
+ return html_component_border_style{value};
+ // Display
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY:
+ return html_component_display{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY:
+ return html_component_visibility{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY:
+ return html_component_opacity{value};
+ // Dimensions
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH:
+ return html_component_min_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH:
+ return html_component_max_width{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT:
+ return html_component_min_height{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT:
+ return html_component_max_height{value};
+ // Table
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING:
+ return html_component_cellpadding{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING:
+ return html_component_cellspacing{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN:
+ return html_component_valign{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN:
+ return html_component_align{value};
+ // Form
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE:
+ return html_component_type{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE:
+ return html_component_value{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER:
+ return html_component_placeholder{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED:
+ return html_component_disabled{};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY:
+ return html_component_readonly{};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED:
+ return html_component_checked{};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED:
+ return html_component_selected{};
+ // Link & media
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET:
+ return html_component_target{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE:
+ return html_component_title{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC:
+ return html_component_src{value};
+ // Meta
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET:
+ return html_component_charset{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT:
+ return html_component_content{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV:
+ return html_component_http_equiv{value};
+ // Accessibility
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE:
+ return html_component_role{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX:
+ return html_component_tabindex{value};
+ // Background
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND:
+ return html_component_background{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE:
+ return html_component_background_image{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR:
+ return html_component_background_color{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT:
+ return html_component_background_repeat{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION:
+ return html_component_background_position{value};
+ // Email tracking
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK:
+ return html_component_data_track{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID:
+ return html_component_data_id{value};
+ case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL:
+ return html_component_data_url{value};
+ default:
+ return html_component_unknown{name, value};
+ }
}
else {
- return std::nullopt;
+ return html_component_unknown{name, value};
}
}
+using component_extractor_func = std::function<std::optional<std::string_view>(const html_tag *)>;
+static const auto component_extractors = frozen::make_unordered_map<frozen::string, component_extractor_func>(
+ {
+ // Basic components
+ {"name", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_name>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"href", [](const html_tag *tag) { return tag->find_href(); }},
+ {"src", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_src>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"class", [](const html_tag *tag) { return tag->find_class(); }},
+ {"id", [](const html_tag *tag) { return tag->find_id(); }},
+ {"style", [](const html_tag *tag) { return tag->find_style(); }},
+ {"alt", [](const html_tag *tag) { return tag->find_alt(); }},
+ {"rel", [](const html_tag *tag) { return tag->find_rel(); }},
+ {"color", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_color>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"bgcolor", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_bgcolor>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Numeric components (return string representation)
+ {"width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"size", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_size>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Boolean components
+ {"hidden", [](const html_tag *tag) -> std::optional<std::string_view> {
+ return tag->is_hidden() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }},
+
+ // Typography components
+ {"font-family", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_family>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"font-size", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_size>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"font-weight", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_weight>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"font-style", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_font_style>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"text-align", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_text_align>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"text-decoration", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_text_decoration>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"line-height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_line_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Layout components
+ {"margin", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-top", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_top>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-bottom", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_bottom>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-left", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_left>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"margin-right", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_margin_right>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-top", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_top>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-bottom", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_bottom>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-left", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_left>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"padding-right", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_padding_right>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"border", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"border-color", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border_color>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"border-width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"border-style", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_border_style>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Display components
+ {"display", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_display>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"visibility", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_visibility>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"opacity", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_opacity>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Additional dimensions
+ {"min-width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_min_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"max-width", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_max_width>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"min-height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_min_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"max-height", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_max_height>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Table components
+ {"cellpadding", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_cellpadding>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"cellspacing", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_cellspacing>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+ {"valign", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_valign>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"align", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_align>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Form components
+ {"type", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_type>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"value", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_value>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"placeholder", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_placeholder>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"disabled", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_disabled>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+ {"readonly", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_readonly>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+ {"checked", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_checked>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+ {"selected", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_selected>()) {
+ return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt;
+ }
+ return std::nullopt;
+ }},
+
+ // Link & media components
+ {"target", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_target>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"title", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_title>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Meta components
+ {"charset", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_charset>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"content", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_content>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"http-equiv", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_http_equiv>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Accessibility components
+ {"role", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_role>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"tabindex", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_tabindex>()) {
+ return comp.value()->get_string_value();
+ }
+ return std::nullopt;
+ }},
+
+ // Background components
+ {"background", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-image", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_image>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-color", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_color>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-repeat", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_repeat>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"background-position", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_background_position>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+
+ // Email tracking components
+ {"data-track", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_data_track>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"data-id", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_data_id>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ {"data-url", [](const html_tag *tag) -> std::optional<std::string_view> {
+ if (auto comp = tag->find_component<html_component_data_url>()) {
+ return comp.value()->value;
+ }
+ return std::nullopt;
+ }},
+ });
+
+auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>
+{
+	// Names missing from the extractor table are resolved among the unknown components
+	const auto extractor = component_extractors.find(attr_name);
+	if (extractor == component_extractors.end()) {
+		return find_unknown_component(attr_name);
+	}
+
+	return extractor->second(this);
+}
+
+auto html_tag::get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>
+{
+	std::vector<std::pair<std::string_view, std::string_view>> attrs;
+
+	// Known attributes: run every extractor and keep the ones that yield a value
+	for (const auto &entry: component_extractors) {
+		const auto extracted = entry.second(this);
+		if (!extracted) {
+			continue;
+		}
+		// Map keys are frozen strings, so expose them as plain string views
+		attrs.emplace_back(std::string_view{entry.first.data(), entry.first.size()}, *extracted);
+	}
+
+	// Unknown attributes are appended after the known ones
+	const auto extras = get_unknown_components();
+	for (const auto &[extra_name, extra_value]: extras) {
+		attrs.emplace_back(extra_name, extra_value);
+	}
+
+	return attrs;
+}
+
enum tag_parser_state {
parse_start = 0,
parse_name,
@@ -234,13 +890,13 @@ enum tag_parser_state {
 struct tag_content_parser_state {
 tag_parser_state cur_state = parse_start;
 std::string buf;
- std::optional<html_component_type> cur_component;
+ std::string attr_name; // Name of the attribute currently being parsed; emptied by reset()
 void reset()
 {
 cur_state = parse_start;
 buf.clear();
- cur_component = std::nullopt;
+ attr_name.clear();
 }
 };
@@ -254,56 +910,50 @@ html_parse_tag_content(rspamd_mempool_t *pool,
auto state = parser_env.cur_state;
/*
- * Stores tag component if it doesn't exist, performing copy of the
- * value + decoding of the entities
- * Parser env is set to clear the current html attribute fields (saved_p and
- * cur_component)
+ * Stores tag component creating the appropriate variant type
+ * Parser env is cleared after storing
*/
auto store_component_value = [&]() -> void {
- if (parser_env.cur_component) {
+ if (!parser_env.attr_name.empty()) {
+ std::string_view attr_name_view, value_view;
- if (parser_env.buf.empty()) {
- tag->components.emplace_back(parser_env.cur_component.value(),
- std::string_view{});
+ // Store attribute name in persistent memory
+ if (!parser_env.attr_name.empty()) {
+ auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size());
+ memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size());
+ attr_name_view = {name_storage, parser_env.attr_name.size()};
}
- else {
- /* We need to copy buf to a persistent storage */
- auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
- if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID ||
- parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) {
- /* Lowercase */
- rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size());
+ // Store value in persistent memory if not empty
+ if (!parser_env.buf.empty()) {
+ auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size());
+
+ // Lowercase for id and class attributes
+ if (parser_env.attr_name == "id" || parser_env.attr_name == "class") {
+ rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size());
}
else {
- memcpy(s, parser_env.buf.data(), parser_env.buf.size());
+ memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size());
}
- auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size());
- tag->components.emplace_back(parser_env.cur_component.value(),
- std::string_view{s, sz});
+ auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size());
+ value_view = {value_storage, sz};
}
+
+ // Create the appropriate component variant
+ auto component = html_component_from_string(attr_name_view, value_view);
+ tag->components.emplace_back(std::move(component));
}
parser_env.buf.clear();
- parser_env.cur_component = std::nullopt;
+ parser_env.attr_name.clear();
};
auto store_component_name = [&]() -> bool {
decode_html_entitles_inplace(parser_env.buf);
- auto known_component_it = html_components_map.find(std::string_view{parser_env.buf});
+ parser_env.attr_name = parser_env.buf;
parser_env.buf.clear();
-
- if (known_component_it != html_components_map.end()) {
- parser_env.cur_component = known_component_it->second;
-
- return true;
- }
- else {
- parser_env.cur_component = std::nullopt;
- }
-
- return false;
+ return true;
};
auto store_value_character = [&](bool lc) -> void {
@@ -471,6 +1121,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
case parse_start_dquote:
if (*in == '"') {
+ store_component_value();
state = spaces_after_param;
}
else {
@@ -481,6 +1132,7 @@ html_parse_tag_content(rspamd_mempool_t *pool,
case parse_start_squote:
if (*in == '\'') {
+ store_component_value();
state = spaces_after_param;
}
else {
@@ -620,7 +1272,7 @@ html_process_url_tag(rspamd_mempool_t *pool,
struct html_tag *tag,
struct html_content *hc) -> std::optional<struct rspamd_url *>
{
- auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF);
+ auto found_href_maybe = tag->find_href();
if (found_href_maybe) {
/* Check base url */
@@ -816,130 +1468,126 @@ html_process_img_tag(rspamd_mempool_t *pool,
img = rspamd_mempool_alloc0_type(pool, struct html_image);
img->tag = tag;
- for (const auto &param: tag->components) {
+ // Process SRC component (preferred for img tags) or HREF component (fallback)
+ std::optional<std::string_view> href_value;
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) {
- /* Check base url */
- const auto &href_value = param.value;
+ // Try SRC first (standard for img tags)
+ if (auto src_comp = tag->find_component<html_component_src>()) {
+ href_value = src_comp.value()->value;
+ }
+ // Fallback to HREF (for backward compatibility or non-standard usage)
+ else if (auto href_comp = tag->find_href()) {
+ href_value = href_comp;
+ }
- if (href_value.size() > 0) {
- rspamd_ftok_t fstr;
- fstr.begin = href_value.data();
- fstr.len = href_value.size();
- img->src = rspamd_mempool_ftokdup(pool, &fstr);
+ if (href_value && href_value->size() > 0) {
+ rspamd_ftok_t fstr;
+ fstr.begin = href_value->data();
+ fstr.len = href_value->size();
+ img->src = rspamd_mempool_ftokdup(pool, &fstr);
- if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(),
- "cid:", sizeof("cid:") - 1) == 0) {
- /* We have an embedded image */
- img->src += sizeof("cid:") - 1;
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
- }
- else {
- if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(),
- "data:", sizeof("data:") - 1) == 0) {
- /* We have an embedded image in HTML tag */
- img->flags |=
- (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
- html_process_data_image(pool, img, href_value);
- hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
- }
- else {
- img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
- if (img->src) {
-
- std::string_view cpy{href_value};
- auto maybe_url = html_process_url(pool, cpy);
-
- if (maybe_url) {
- img->url = maybe_url.value();
- struct rspamd_url *existing;
-
- img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
- existing = rspamd_url_set_add_or_return(url_set,
- img->url);
-
- if (existing && existing != img->url) {
- /*
- * We have some other URL that could be
- * found, e.g. from another part. However,
- * we still want to set an image flag on it
- */
- existing->flags |= img->url->flags;
- existing->count++;
- }
- else if (part_urls) {
- /* New url */
- g_ptr_array_add(part_urls, img->url);
- }
- }
- }
- }
- }
- }
+ if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(),
+ "cid:", sizeof("cid:") - 1) == 0) {
+ /* We have an embedded image */
+ img->src += sizeof("cid:") - 1;
+ img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED;
}
+ else {
+ if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(),
+ "data:", sizeof("data:") - 1) == 0) {
+ /* We have an embedded image in HTML tag */
+ img->flags |=
+ (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA);
+ html_process_data_image(pool, img, *href_value);
+ hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS;
+ }
+ else {
+ img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL;
+ if (img->src) {
+ std::string_view cpy{*href_value};
+ auto maybe_url = html_process_url(pool, cpy);
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) {
- unsigned long val;
+ if (maybe_url) {
+ img->url = maybe_url.value();
+ struct rspamd_url *existing;
- rspamd_strtoul(param.value.data(), param.value.size(), &val);
- img->height = val;
- }
+ img->url->flags |= RSPAMD_URL_FLAG_IMAGE;
+ existing = rspamd_url_set_add_or_return(url_set,
+ img->url);
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) {
- unsigned long val;
-
- rspamd_strtoul(param.value.data(), param.value.size(), &val);
- img->width = val;
+ if (existing && existing != img->url) {
+ /*
+ * We have some other URL that could be
+ * found, e.g. from another part. However,
+ * we still want to set an image flag on it
+ */
+ existing->flags |= img->url->flags;
+ existing->count++;
+ }
+ else if (part_urls) {
+ /* New url */
+ g_ptr_array_add(part_urls, img->url);
+ }
+ }
+ }
+ }
}
+ }
- /* TODO: rework to css at some time */
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
- if (img->height == 0) {
- auto style_st = param.value;
- auto pos = rspamd_substring_search_caseless(style_st.data(),
- style_st.size(),
- "height", sizeof("height") - 1);
- if (pos != -1) {
- auto substr = style_st.substr(pos + sizeof("height") - 1);
+ // Process numeric dimensions using the new helper methods
+ if (auto height = tag->find_height()) {
+ img->height = height.value();
+ }
- for (auto i = 0; i < substr.size(); i++) {
- auto t = substr[i];
- if (g_ascii_isdigit(t)) {
- unsigned long val;
- rspamd_strtoul(substr.data(),
- substr.size(), &val);
- img->height = val;
- break;
- }
- else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
- /* Fallback */
- break;
- }
+ if (auto width = tag->find_width()) {
+ img->width = width.value();
+ }
+
+ // Process style component for dimensions
+ if (auto style_value = tag->find_style()) {
+ if (img->height == 0) {
+ auto pos = rspamd_substring_search_caseless(style_value->data(),
+ style_value->size(),
+ "height", sizeof("height") - 1);
+ if (pos != -1) {
+ auto substr = style_value->substr(pos + sizeof("height") - 1);
+
+ for (auto i = 0; i < substr.size(); i++) {
+ auto t = substr[i];
+ if (g_ascii_isdigit(t)) {
+ unsigned long val;
+ rspamd_strtoul(substr.data(),
+ substr.size(), &val);
+ img->height = val;
+ break;
+ }
+ else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+ /* Fallback */
+ break;
}
}
}
- if (img->width == 0) {
- auto style_st = param.value;
- auto pos = rspamd_substring_search_caseless(style_st.data(),
- style_st.size(),
- "width", sizeof("width") - 1);
- if (pos != -1) {
- auto substr = style_st.substr(pos + sizeof("width") - 1);
-
- for (auto i = 0; i < substr.size(); i++) {
- auto t = substr[i];
- if (g_ascii_isdigit(t)) {
- unsigned long val;
- rspamd_strtoul(substr.data(),
- substr.size(), &val);
- img->width = val;
- break;
- }
- else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
- /* Fallback */
- break;
- }
+ }
+ if (img->width == 0) {
+ auto pos = rspamd_substring_search_caseless(style_value->data(),
+ style_value->size(),
+ "width", sizeof("width") - 1);
+ if (pos != -1) {
+ auto substr = style_value->substr(pos + sizeof("width") - 1);
+
+ for (auto i = 0; i < substr.size(); i++) {
+ auto t = substr[i];
+ if (g_ascii_isdigit(t)) {
+ unsigned long val;
+ rspamd_strtoul(substr.data(),
+ substr.size(), &val);
+ img->width = val;
+ break;
+ }
+ else if (!g_ascii_isspace(t) && t != '=' && t != ':') {
+ /* Fallback */
+ break;
}
}
}
@@ -968,7 +1616,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag,
khash_t(rspamd_url_hash) * url_set,
GPtrArray *part_urls) -> void
{
- auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL);
+ auto found_rel_maybe = tag->find_rel();
if (found_rel_maybe) {
if (found_rel_maybe.value() == "icon") {
@@ -984,24 +1632,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag,
std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor;
bool hidden = false;
- for (const auto &param: tag->components) {
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) {
- maybe_fgcolor = css::css_value::maybe_color_from_string(param.value);
- }
-
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) {
- maybe_bgcolor = css::css_value::maybe_color_from_string(param.value);
- }
+ // Process color components
+ if (auto color_comp = tag->find_component<html_component_color>()) {
+ maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value);
+ }
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) {
- tag->block = rspamd::css::parse_css_declaration(pool, param.value);
- }
+ if (auto bgcolor_comp = tag->find_component<html_component_bgcolor>()) {
+ maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value);
+ }
- if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) {
- hidden = true;
- }
+ // Process style component
+ if (auto style_value = tag->find_style()) {
+ tag->block = rspamd::css::parse_css_declaration(pool, *style_value);
}
+ // Check if hidden
+ hidden = tag->is_hidden();
+
if (!tag->block) {
tag->block = html_block::undefined_html_block_pool(pool);
}
@@ -1284,7 +1931,7 @@ html_append_tag_content(rspamd_mempool_t *pool,
}
else if (tag->id == Tag_IMG) {
/* Process ALT if presented */
- auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT);
+ auto maybe_alt = tag->find_alt();
if (maybe_alt) {
if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) {
@@ -1384,9 +2031,7 @@ auto html_process_input(struct rspamd_task *task,
overflow_input = true;
}
- auto new_tag = [&](int flags = 0) -> struct html_tag *
- {
-
+ auto new_tag = [&](int flags = 0) -> struct html_tag * {
if (hc->all_tags.size() > rspamd::html::max_tags) {
hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS;
@@ -2151,7 +2796,7 @@ auto html_process_input(struct rspamd_task *task,
/* Leftover after content */
switch (state) {
case tags_limit_overflow:
- html_append_parsed(hc, {c, (std::size_t)(end - c)},
+ html_append_parsed(hc, {c, (std::size_t) (end - c)},
false, end - start, hc->parsed);
break;
default:
@@ -2390,4 +3035,4 @@ gsize rspamd_html_get_tags_count(void *html_content)
}
return hc->all_tags.size();
-} \ No newline at end of file
+}
diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx
index 309d76177..6d41f1337 100644
--- a/src/libserver/html/html_tag.hxx
+++ b/src/libserver/html/html_tag.hxx
@@ -1,11 +1,11 @@
-/*-
- * Copyright 2021 Vsevolod Stakhov
+/*
+ * Copyright 2025 Vsevolod Stakhov
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -26,6 +26,7 @@
#include <cstdint>
#include "html_tags.h"
+#include "libutil/str_util.h"
struct rspamd_url;
struct html_image;
@@ -34,7 +35,8 @@ namespace rspamd::html {
struct html_content; /* Forward declaration */
-enum class html_component_type : std::uint8_t {
+// Internal enum for mapping (not exposed in public API)
+enum class html_component_enum_type : std::uint8_t {
RSPAMD_HTML_COMPONENT_NAME = 0,
RSPAMD_HTML_COMPONENT_HREF,
RSPAMD_HTML_COMPONENT_COLOR,
@@ -48,8 +50,1214 @@ enum class html_component_type : std::uint8_t {
RSPAMD_HTML_COMPONENT_ALT,
RSPAMD_HTML_COMPONENT_ID,
RSPAMD_HTML_COMPONENT_HIDDEN,
+ // Typography
+ RSPAMD_HTML_COMPONENT_FONT_FAMILY,
+ RSPAMD_HTML_COMPONENT_FONT_SIZE,
+ RSPAMD_HTML_COMPONENT_FONT_WEIGHT,
+ RSPAMD_HTML_COMPONENT_FONT_STYLE,
+ RSPAMD_HTML_COMPONENT_TEXT_ALIGN,
+ RSPAMD_HTML_COMPONENT_TEXT_DECORATION,
+ RSPAMD_HTML_COMPONENT_LINE_HEIGHT,
+ // Layout & positioning
+ RSPAMD_HTML_COMPONENT_MARGIN,
+ RSPAMD_HTML_COMPONENT_MARGIN_TOP,
+ RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM,
+ RSPAMD_HTML_COMPONENT_MARGIN_LEFT,
+ RSPAMD_HTML_COMPONENT_MARGIN_RIGHT,
+ RSPAMD_HTML_COMPONENT_PADDING,
+ RSPAMD_HTML_COMPONENT_PADDING_TOP,
+ RSPAMD_HTML_COMPONENT_PADDING_BOTTOM,
+ RSPAMD_HTML_COMPONENT_PADDING_LEFT,
+ RSPAMD_HTML_COMPONENT_PADDING_RIGHT,
+ RSPAMD_HTML_COMPONENT_BORDER,
+ RSPAMD_HTML_COMPONENT_BORDER_COLOR,
+ RSPAMD_HTML_COMPONENT_BORDER_WIDTH,
+ RSPAMD_HTML_COMPONENT_BORDER_STYLE,
+ // Display & visibility
+ RSPAMD_HTML_COMPONENT_DISPLAY,
+ RSPAMD_HTML_COMPONENT_VISIBILITY,
+ RSPAMD_HTML_COMPONENT_OPACITY,
+ // Dimensions
+ RSPAMD_HTML_COMPONENT_MIN_WIDTH,
+ RSPAMD_HTML_COMPONENT_MAX_WIDTH,
+ RSPAMD_HTML_COMPONENT_MIN_HEIGHT,
+ RSPAMD_HTML_COMPONENT_MAX_HEIGHT,
+ // Table attributes
+ RSPAMD_HTML_COMPONENT_CELLPADDING,
+ RSPAMD_HTML_COMPONENT_CELLSPACING,
+ RSPAMD_HTML_COMPONENT_VALIGN,
+ RSPAMD_HTML_COMPONENT_ALIGN,
+ // Form attributes
+ RSPAMD_HTML_COMPONENT_TYPE,
+ RSPAMD_HTML_COMPONENT_VALUE,
+ RSPAMD_HTML_COMPONENT_PLACEHOLDER,
+ RSPAMD_HTML_COMPONENT_DISABLED,
+ RSPAMD_HTML_COMPONENT_READONLY,
+ RSPAMD_HTML_COMPONENT_CHECKED,
+ RSPAMD_HTML_COMPONENT_SELECTED,
+ // Link & media
+ RSPAMD_HTML_COMPONENT_TARGET,
+ RSPAMD_HTML_COMPONENT_TITLE,
+ RSPAMD_HTML_COMPONENT_SRC,
+ // Meta & document
+ RSPAMD_HTML_COMPONENT_CHARSET,
+ RSPAMD_HTML_COMPONENT_CONTENT,
+ RSPAMD_HTML_COMPONENT_HTTP_EQUIV,
+ // Accessibility
+ RSPAMD_HTML_COMPONENT_ROLE,
+ RSPAMD_HTML_COMPONENT_TABINDEX,
+ // Background
+ RSPAMD_HTML_COMPONENT_BACKGROUND,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT,
+ RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION,
+ // Email-specific tracking
+ RSPAMD_HTML_COMPONENT_DATA_TRACK,
+ RSPAMD_HTML_COMPONENT_DATA_ID,
+ RSPAMD_HTML_COMPONENT_DATA_URL,
};
+// Forward declarations for component types
+struct html_component_name;
+struct html_component_href;
+struct html_component_color;
+struct html_component_bgcolor;
+struct html_component_style;
+struct html_component_class;
+struct html_component_width;
+struct html_component_height;
+struct html_component_size;
+struct html_component_rel;
+struct html_component_alt;
+struct html_component_id;
+struct html_component_hidden;
+struct html_component_unknown;
+
+// Base interface for all components: each concrete component reports its value as a string view
+struct html_component_base {
+ virtual ~html_component_base() = default; // virtual, so deleting through a base pointer is well-defined
+ virtual constexpr std::string_view get_string_value() const = 0; // string form of the component value; implemented by every component
+};
+
+// String-based components
+struct html_component_name : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_name(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_href : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_href(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_style : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_style(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_class : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_class(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_rel : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_rel(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_alt : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_alt(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_id : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_id(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+// Color components (could be extended to parse actual colors)
+struct html_component_color : html_component_base {
+	// Keeps a view of the given text; no colour parsing happens here
+	explicit constexpr html_component_color(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_bgcolor : html_component_base {
+	// Keeps a view of the given text; no colour parsing happens here
+	explicit constexpr html_component_bgcolor(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+// Numeric components
+struct html_component_width : html_component_base {
+	// Attribute text exactly as given to the constructor
+	std::string_view raw_value;
+	// Parsed number; left empty when rspamd_strtoul() rejects the text or
+	// the result does not fit into 32 bits
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_width(const std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		// unsigned long may be wider than 32 bits: skip out-of-range values
+		// instead of letting the cast silently wrap them around
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	// Returns the unparsed attribute text
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Returns the parsed number, if any
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+struct html_component_height : html_component_base {
+	// Attribute text exactly as given to the constructor
+	std::string_view raw_value;
+	// Parsed number; left empty when rspamd_strtoul() rejects the text or
+	// the result does not fit into 32 bits
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_height(const std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		// unsigned long may be wider than 32 bits: skip out-of-range values
+		// instead of letting the cast silently wrap them around
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	// Returns the unparsed attribute text
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Returns the parsed number, if any
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+struct html_component_size : html_component_base {
+	// Attribute text exactly as given to the constructor
+	std::string_view raw_value;
+	// Parsed number; left empty when rspamd_strtoul() rejects the text or
+	// the result does not fit into 32 bits
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_size(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		// unsigned long may be wider than 32 bits: skip out-of-range values
+		// instead of letting the cast silently wrap them around
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	// Returns the unparsed attribute text
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Returns the parsed number, if any
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Boolean/flag component
+struct html_component_hidden : html_component_base {
+	// Flag component: a constructed instance always starts as present
+	explicit constexpr html_component_hidden()
+		: present{true}
+	{
+	}
+	// Spells the flag out as the literal "true" or "false"
+	constexpr std::string_view get_string_value() const override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+	constexpr bool is_present() const
+	{
+		return present;
+	}
+	bool present;
+};
+
+// Unknown component with both name and value
+struct html_component_unknown : html_component_base {
+	// Keeps views of both the attribute name and its value; nothing is copied
+	constexpr html_component_unknown(std::string_view attr_name, std::string_view attr_value)
+		: name{attr_name}, value{attr_value}
+	{
+	}
+	// Returns the value part only
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Returns the name part
+	constexpr std::string_view get_name() const
+	{
+		return name;
+	}
+
+	std::string_view name;
+	std::string_view value;
+};
+
+// Typography components
+struct html_component_font_family : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_font_family(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+struct html_component_font_size : html_component_base {
+	// Attribute text exactly as given to the constructor
+	std::string_view raw_value;
+	// Parsed number; left empty when rspamd_strtoul() rejects the text or
+	// the result does not fit into 32 bits
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_font_size(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val;
+		// unsigned long may be wider than 32 bits: skip out-of-range values
+		// instead of letting the cast silently wrap them around
+		if (rspamd_strtoul(v.data(), v.size(), &val) && val <= UINT32_MAX) {
+			numeric_value = static_cast<std::uint32_t>(val);
+		}
+	}
+
+	// Returns the unparsed attribute text
+	constexpr std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Returns the parsed number, if any
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+struct html_component_font_weight : html_component_base {
+	// Keeps a view of the given text; the characters themselves are not copied
+	explicit constexpr html_component_font_weight(std::string_view text)
+		: value{text}
+	{
+	}
+	// Returns the stored text unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+	// Raw attribute text
+	std::string_view value;
+};
+
+// Font style component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_font_style : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_font_style(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Text alignment component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_text_align : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_text_align(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Text decoration component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_text_decoration : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_text_decoration(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Line height component: keeps the raw text and, when rspamd_strtoul accepts
+// it and the result fits into 32 bits, the parsed number.
+struct html_component_line_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_line_height(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Layout components (most are string-based for flexibility)
+// Margin component: stores the value text as a non-owning std::string_view
+// (the viewed buffer must outlive the component).
+struct html_component_margin : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Top margin component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_margin_top : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_top(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Bottom margin component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_margin_bottom : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_bottom(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Left margin component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_margin_left : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_left(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Right margin component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_margin_right : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_margin_right(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Padding component: stores the value text as a non-owning std::string_view
+// (the viewed buffer must outlive the component).
+struct html_component_padding : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Top padding component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_padding_top : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_top(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Bottom padding component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_padding_bottom : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_bottom(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Left padding component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_padding_left : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_left(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Right padding component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_padding_right : html_component_base {
+	std::string_view value;
+
+	explicit constexpr html_component_padding_right(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	constexpr std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Border component: stores the value text as a non-owning std::string_view
+// (the viewed buffer must outlive the component).
+struct html_component_border : html_component_base {
+	std::string_view value;
+
+	explicit html_component_border(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Border color component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_border_color : html_component_base {
+	std::string_view value;
+
+	explicit html_component_border_color(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Border width component: keeps the raw text and, when rspamd_strtoul accepts
+// it and the result fits into 32 bits, the parsed number.
+struct html_component_border_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_border_width(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Border style component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_border_style : html_component_base {
+	std::string_view value;
+
+	explicit html_component_border_style(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Display components
+// Display component: stores the value text as a non-owning std::string_view
+// (the viewed buffer must outlive the component).
+struct html_component_display : html_component_base {
+	std::string_view value;
+
+	explicit html_component_display(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Visibility component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_visibility : html_component_base {
+	std::string_view value;
+
+	explicit html_component_visibility(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Opacity component: keeps the raw text and, when it starts with a number in
+// the inclusive range [0, 1], the parsed float.
+struct html_component_opacity : html_component_base {
+	std::string_view raw_value;
+	std::optional<float> numeric_value;
+
+	explicit html_component_opacity(std::string_view v)
+		: raw_value(v)
+	{
+		// std::strtof requires a NUL-terminated string, but a string_view carries
+		// no terminator: parsing v.data() directly could read past the end of
+		// the view. Parse a bounded, terminated copy instead; only the first
+		// sizeof(buf) - 1 characters of the value are considered.
+		char buf[64];
+		const std::size_t max_len = sizeof(buf) - 1;
+		const std::size_t len = v.size() < max_len ? v.size() : max_len;
+
+		for (std::size_t i = 0; i < len; i++) {
+			buf[i] = v[i];
+		}
+		buf[len] = '\0';
+
+		char *endptr = nullptr;
+		auto val = std::strtof(buf, &endptr);
+		// endptr == buf means that no characters were converted at all
+		if (endptr != buf && val >= 0.0f && val <= 1.0f) {
+			numeric_value = val;
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed opacity, or std::nullopt when the text is not a number in [0, 1]
+	std::optional<float> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Additional dimension components
+// Minimum width component: keeps the raw text and, when rspamd_strtoul
+// accepts it and the result fits into 32 bits, the parsed number.
+struct html_component_min_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_min_width(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Maximum width component: keeps the raw text and, when rspamd_strtoul
+// accepts it and the result fits into 32 bits, the parsed number.
+struct html_component_max_width : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_max_width(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Minimum height component: keeps the raw text and, when rspamd_strtoul
+// accepts it and the result fits into 32 bits, the parsed number.
+struct html_component_min_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_min_height(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Maximum height component: keeps the raw text and, when rspamd_strtoul
+// accepts it and the result fits into 32 bits, the parsed number.
+struct html_component_max_height : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_max_height(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Table components
+// Cell padding component: keeps the raw text and, when rspamd_strtoul accepts
+// it and the result fits into 32 bits, the parsed number.
+struct html_component_cellpadding : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_cellpadding(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Cell spacing component: keeps the raw text and, when rspamd_strtoul accepts
+// it and the result fits into 32 bits, the parsed number.
+struct html_component_cellspacing : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::uint32_t> numeric_value;
+
+	explicit html_component_cellspacing(std::string_view v)
+		: raw_value(v)
+	{
+		unsigned long val = 0;
+		if (rspamd_strtoul(v.data(), v.size(), &val)) {
+			// unsigned long can be wider than 32 bits: accept the value only if
+			// it survives the narrowing round trip instead of silently wrapping
+			const auto narrowed = static_cast<std::uint32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::uint32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Vertical alignment component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_valign : html_component_base {
+	std::string_view value;
+
+	explicit html_component_valign(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Alignment component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_align : html_component_base {
+	std::string_view value;
+
+	explicit html_component_align(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Form components
+// Form `type` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_type : html_component_base {
+	std::string_view value;
+
+	explicit html_component_type(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Form `value` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_value : html_component_base {
+	std::string_view value;
+
+	explicit html_component_value(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Form placeholder component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_placeholder : html_component_base {
+	std::string_view value;
+
+	explicit html_component_placeholder(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Boolean form components
+// Flag component for `disabled`: carries no text, only a presence bit that
+// the constructor always sets.
+struct html_component_disabled : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_disabled()
+		: present{true}
+	{
+	}
+
+	// Textual form of the flag: "true" when set, "false" otherwise
+	constexpr std::string_view get_string_value() const override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr bool is_present() const
+	{
+		return present;
+	}
+};
+
+// Flag component for `readonly`: carries no text, only a presence bit that
+// the constructor always sets.
+struct html_component_readonly : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_readonly()
+		: present{true}
+	{
+	}
+
+	// Textual form of the flag: "true" when set, "false" otherwise
+	constexpr std::string_view get_string_value() const override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr bool is_present() const
+	{
+		return present;
+	}
+};
+
+// Flag component for `checked`: carries no text, only a presence bit that
+// the constructor always sets.
+struct html_component_checked : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_checked()
+		: present{true}
+	{
+	}
+
+	// Textual form of the flag: "true" when set, "false" otherwise
+	constexpr std::string_view get_string_value() const override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr bool is_present() const
+	{
+		return present;
+	}
+};
+
+// Flag component for `selected`: carries no text, only a presence bit that
+// the constructor always sets.
+struct html_component_selected : html_component_base {
+	bool present;
+
+	explicit constexpr html_component_selected()
+		: present{true}
+	{
+	}
+
+	// Textual form of the flag: "true" when set, "false" otherwise
+	constexpr std::string_view get_string_value() const override
+	{
+		if (present) {
+			return "true";
+		}
+		return "false";
+	}
+
+	constexpr bool is_present() const
+	{
+		return present;
+	}
+};
+
+// Link & media components
+// Link `target` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_target : html_component_base {
+	std::string_view value;
+
+	explicit html_component_target(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// `title` component: stores the value text as a non-owning std::string_view
+// (the viewed buffer must outlive the component).
+struct html_component_title : html_component_base {
+	std::string_view value;
+
+	explicit html_component_title(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Media `src` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_src : html_component_base {
+	std::string_view value;
+
+	explicit html_component_src(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Meta components
+// Meta `charset` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_charset : html_component_base {
+	std::string_view value;
+
+	explicit html_component_charset(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Meta `content` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_content : html_component_base {
+	std::string_view value;
+
+	explicit html_component_content(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Meta http-equiv component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_http_equiv : html_component_base {
+	std::string_view value;
+
+	explicit html_component_http_equiv(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Accessibility components
+// Accessibility `role` component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_role : html_component_base {
+	std::string_view value;
+
+	explicit html_component_role(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Tab index component: keeps the raw text and, when rspamd_strtol accepts it
+// and the result fits into a signed 32 bit integer, the parsed number.
+struct html_component_tabindex : html_component_base {
+	std::string_view raw_value;
+	std::optional<std::int32_t> numeric_value;
+
+	explicit html_component_tabindex(std::string_view v)
+		: raw_value(v)
+	{
+		long val = 0;
+		if (rspamd_strtol(v.data(), v.size(), &val)) {
+			// long can be wider than 32 bits: accept the value only if it
+			// survives the narrowing round trip instead of silently truncating
+			const auto narrowed = static_cast<std::int32_t>(val);
+			if (narrowed == val) {
+				numeric_value = narrowed;
+			}
+		}
+	}
+
+	// Raw text exactly as passed to the constructor
+	std::string_view get_string_value() const override
+	{
+		return raw_value;
+	}
+	// Parsed number, or std::nullopt when the text did not parse or does not fit
+	std::optional<std::int32_t> get_numeric_value() const
+	{
+		return numeric_value;
+	}
+};
+
+// Background components
+// Background component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_background : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Background image component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_background_image : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_image(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Background color component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_background_color : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_color(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Background repeat component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_background_repeat : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_repeat(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Background position component: stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_background_position : html_component_base {
+	std::string_view value;
+
+	explicit html_component_background_position(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Email tracking components
+// Tracking data component (data_track): stores the value text as a
+// non-owning std::string_view (the viewed buffer must outlive the component).
+struct html_component_data_track : html_component_base {
+	std::string_view value;
+
+	explicit html_component_data_track(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Tracking data component (data_id): stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_data_id : html_component_base {
+	std::string_view value;
+
+	explicit html_component_data_id(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// Tracking data component (data_url): stores the value text as a non-owning
+// std::string_view (the viewed buffer must outlive the component).
+struct html_component_data_url : html_component_base {
+	std::string_view value;
+
+	explicit html_component_data_url(std::string_view v)
+		: value{v}
+	{
+	}
+
+	// Stored text, unchanged
+	std::string_view get_string_value() const override
+	{
+		return value;
+	}
+};
+
+// The variant type that holds all possible components; exactly one alternative is active per component, and the order of this list determines the value reported by std::variant::index()
+using html_tag_component = std::variant<
+ html_component_name,
+ html_component_href,
+ html_component_color,
+ html_component_bgcolor,
+ html_component_style,
+ html_component_class,
+ html_component_width,
+ html_component_height,
+ html_component_size,
+ html_component_rel,
+ html_component_alt,
+ html_component_id,
+ html_component_hidden,
+ // Typography
+ html_component_font_family,
+ html_component_font_size,
+ html_component_font_weight,
+ html_component_font_style,
+ html_component_text_align,
+ html_component_text_decoration,
+ html_component_line_height,
+ // Layout
+ html_component_margin,
+ html_component_margin_top,
+ html_component_margin_bottom,
+ html_component_margin_left,
+ html_component_margin_right,
+ html_component_padding,
+ html_component_padding_top,
+ html_component_padding_bottom,
+ html_component_padding_left,
+ html_component_padding_right,
+ html_component_border,
+ html_component_border_color,
+ html_component_border_width,
+ html_component_border_style,
+ // Display
+ html_component_display,
+ html_component_visibility,
+ html_component_opacity,
+ // Dimensions
+ html_component_min_width,
+ html_component_max_width,
+ html_component_min_height,
+ html_component_max_height,
+ // Table
+ html_component_cellpadding,
+ html_component_cellspacing,
+ html_component_valign,
+ html_component_align,
+ // Form
+ html_component_type,
+ html_component_value,
+ html_component_placeholder,
+ html_component_disabled,
+ html_component_readonly,
+ html_component_checked,
+ html_component_selected,
+ // Link & media
+ html_component_target,
+ html_component_title,
+ html_component_src,
+ // Meta
+ html_component_charset,
+ html_component_content,
+ html_component_http_equiv,
+ // Accessibility
+ html_component_role,
+ html_component_tabindex,
+ // Background
+ html_component_background,
+ html_component_background_image,
+ html_component_background_color,
+ html_component_background_repeat,
+ html_component_background_position,
+ // Email tracking
+ html_component_data_track,
+ html_component_data_id,
+ html_component_data_url,
+ // Unknown: the only alternative that stores the attribute name next to its value
+ html_component_unknown>;
+
+/**
+ * Returns component variant from a string
+ * @param name attribute name
+ * @param value attribute value
+ * @return variant component
+ */
+auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component;
+
/* Public tags flags */
/* XML tag */
#define FL_XML (1u << CM_USER_SHIFT)
@@ -62,23 +1270,7 @@ enum class html_component_type : std::uint8_t {
#define FL_COMMENT (1 << (CM_USER_SHIFT + 6))
#define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7))
-/**
- * Returns component type from a string
- * @param st
- * @return
- */
-auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>;
-
using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>;
-struct html_tag_component {
- html_component_type type;
- std::string_view value;
-
- html_tag_component(html_component_type type, std::string_view value)
- : type(type), value(value)
- {
- }
-};
/* Pairing closing tag representation */
struct html_closing_tag {
@@ -105,26 +1297,128 @@ struct html_tag {
std::vector<struct html_tag *> children;
struct html_tag *parent;
- auto find_component(html_component_type what) const -> std::optional<std::string_view>
+ // Template method to find component by type: returns a pointer to the first element of `components` whose active alternative is T, or std::nullopt when no such element exists
+ template<typename T>
+ auto find_component() const -> std::optional<const T *>
 {
 for (const auto &comp: components) {
- if (comp.type == what) {
- return comp.value;
+ if (std::holds_alternative<T>(comp)) {
+ return &std::get<T>(comp);
 }
 }
+ return std::nullopt;
+ }
+ // Helper methods for common component access
+	// Value text of the first href component, or std::nullopt if there is none
+	auto find_href() const -> std::optional<std::string_view>
+	{
+		const auto found = find_component<html_component_href>();
+		if (found.has_value()) {
+			return (*found)->value;
+		}
 return std::nullopt;
 }
- auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view>
+ auto find_class() const -> std::optional<std::string_view> // value text of the first class component, or std::nullopt if there is none
 {
- if (what) {
- return find_component(what.value());
+ if (auto comp = find_component<html_component_class>()) {
+ return comp.value()->value;
 }
+ return std::nullopt;
+ }
+	// Value text of the first id component, or std::nullopt if there is none
+	auto find_id() const -> std::optional<std::string_view>
+	{
+		const auto found = find_component<html_component_id>();
+		if (!found.has_value()) {
+			return std::nullopt;
+		}
+		return (*found)->value;
+	}
+
+	// Numeric value of the first width component; std::nullopt when the tag has
+	// no width component or that component holds no numeric value
+	auto find_width() const -> std::optional<std::uint32_t>
+	{
+		const auto found = find_component<html_component_width>();
+		if (!found.has_value()) {
+			return std::nullopt;
+		}
+		return (*found)->get_numeric_value();
+	}
+
+	// Numeric value of the first height component; std::nullopt when the tag has
+	// no height component or that component holds no numeric value
+	auto find_height() const -> std::optional<std::uint32_t>
+	{
+		const auto found = find_component<html_component_height>();
+		if (found.has_value()) {
+			return (*found)->get_numeric_value();
+		}
 return std::nullopt;
 }
+	// Value text of the first style component, or std::nullopt if there is none
+	auto find_style() const -> std::optional<std::string_view>
+	{
+		const auto found = find_component<html_component_style>();
+		if (!found.has_value()) {
+			return std::nullopt;
+		}
+		return (*found)->value;
+	}
+
+	// Value text of the first alt component, or std::nullopt if there is none
+	auto find_alt() const -> std::optional<std::string_view>
+	{
+		const auto found = find_component<html_component_alt>();
+		if (!found.has_value()) {
+			return std::nullopt;
+		}
+		return (*found)->value;
+	}
+
+	// Value text of the first rel component, or std::nullopt if there is none
+	auto find_rel() const -> std::optional<std::string_view>
+	{
+		const auto found = find_component<html_component_rel>();
+		if (!found.has_value()) {
+			return std::nullopt;
+		}
+		return (*found)->value;
+	}
+
+	// True when the tag carries a hidden component; only the component's
+	// existence is checked, its `present` field is not consulted
+	auto is_hidden() const -> bool
+	{
+		const auto found = find_component<html_component_hidden>();
+		return static_cast<bool>(found);
+	}
+
+	// Looks up an unknown component by exact name match and returns the value of
+	// the first hit, or std::nullopt when no unknown component has that name
+	auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view>
+	{
+		for (const auto &comp: components) {
+			const auto *candidate = std::get_if<html_component_unknown>(&comp);
+			if (candidate == nullptr) {
+				continue;
+			}
+			if (candidate->name == attr_name) {
+				return candidate->value;
+			}
+		}
+		return std::nullopt;
+	}
+
+	// Collects (name, value) pairs of every unknown component, in the order in
+	// which they are stored in `components`
+	auto get_unknown_components() const -> std::vector<std::pair<std::string_view, std::string_view>>
+	{
+		std::vector<std::pair<std::string_view, std::string_view>> collected;
+		for (const auto &comp: components) {
+			if (const auto *candidate = std::get_if<html_component_unknown>(&comp)) {
+				collected.emplace_back(candidate->name, candidate->value);
+			}
+		}
+		return collected;
+	}
+
+	// Generic visitor method for processing all components: applies `visitor`
+	// to every element of `components`, in storage order, via std::visit
+	template<typename Visitor>
+	auto visit_components(Visitor &&visitor) const
+	{
+		for (const auto &comp: components) {
+			// The visitor is deliberately passed as an lvalue: forwarding it here
+			// would cast it to an rvalue again on every iteration, which lets an
+			// rvalue-qualified call operator consume its state on the first
+			// component and then be invoked again on a moved-from object.
+			std::visit(visitor, comp);
+		}
+	}
+
+ // Find any component by attribute name
+ auto find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>;
+
+ // Get all attributes as name-value pairs
+ auto get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>;
+
auto clear(void) -> void
{
id = Tag_UNKNOWN;
@@ -137,7 +1431,7 @@ struct html_tag {
closing.clear();
}
- constexpr auto get_content_length() const -> std::size_t
+ auto get_content_length() const -> std::size_t
{
if (flags & (FL_IGNORE | CM_HEAD)) {
return 0;
diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx
index 090e2af55..9b0deed45 100644
--- a/src/lua/lua_html.cxx
+++ b/src/lua/lua_html.cxx
@@ -179,6 +179,44 @@ LUA_FUNCTION_DEF(html_tag, get_style);
*/
LUA_FUNCTION_DEF(html_tag, get_attribute);
+/***
+ * @method html_tag:get_all_attributes()
+ * Returns table of all attributes for the element
+ * @return {table} table with attribute names as keys and values as strings
+ */
+LUA_FUNCTION_DEF(html_tag, get_all_attributes);
+
+/***
+ * @method html_tag:get_unknown_attributes()
+ * Returns table of unknown/unrecognized attributes for the element
+ * @return {table} table with unknown attribute names as keys and values as strings
+ */
+LUA_FUNCTION_DEF(html_tag, get_unknown_attributes);
+
+/***
+ * @method html_tag:get_children()
+ * Returns array of child tags for the element
+ * @return {table} array of child html_tag objects
+ */
+LUA_FUNCTION_DEF(html_tag, get_children);
+
+/***
+ * @method html_tag:has_attribute(name)
+ * Checks if element has the specified attribute
+ * @param {string} name attribute name to check
+ * @return {boolean} true if attribute exists
+ */
+LUA_FUNCTION_DEF(html_tag, has_attribute);
+
+/***
+ * @method html_tag:get_numeric_attribute(name)
+ * Returns numeric value of attribute (if supported and parseable)
+ * Works for attributes like width, height, font-size, etc.
+ * @param {string} name attribute name
+ * @return {number|nil} numeric value or nil if not numeric/parseable
+ */
+LUA_FUNCTION_DEF(html_tag, get_numeric_attribute);
+
static const struct luaL_reg taglib_m[] = {
LUA_INTERFACE_DEF(html_tag, get_type),
LUA_INTERFACE_DEF(html_tag, get_extra),
@@ -188,6 +226,11 @@ static const struct luaL_reg taglib_m[] = {
LUA_INTERFACE_DEF(html_tag, get_content_length),
LUA_INTERFACE_DEF(html_tag, get_style),
LUA_INTERFACE_DEF(html_tag, get_attribute),
+ LUA_INTERFACE_DEF(html_tag, get_all_attributes),
+ LUA_INTERFACE_DEF(html_tag, get_unknown_attributes),
+ LUA_INTERFACE_DEF(html_tag, get_children),
+ LUA_INTERFACE_DEF(html_tag, has_attribute),
+ LUA_INTERFACE_DEF(html_tag, get_numeric_attribute),
{"__tostring", rspamd_lua_class_tostring},
{NULL, NULL}};
@@ -704,6 +747,29 @@ lua_html_tag_get_style(lua_State *L)
}
static int
+lua_html_tag_get_all_attributes(lua_State *L)
+{
+ LUA_TRACE_POINT;
+ struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+
+ if (ltag) {
+ auto all_attrs = ltag->tag->get_all_attributes();
+ lua_createtable(L, 0, all_attrs.size());
+
+ for (const auto &[name, value]: all_attrs) {
+ lua_pushlstring(L, name.data(), name.size());
+ lua_pushlstring(L, value.data(), value.size());
+ lua_settable(L, -3);
+ }
+ }
+ else {
+ return luaL_error(L, "invalid arguments");
+ }
+
+ return 1;
+}
+
+static int
lua_html_tag_get_attribute(lua_State *L)
{
LUA_TRACE_POINT;
@@ -712,8 +778,7 @@ lua_html_tag_get_attribute(lua_State *L)
const char *attr_name = luaL_checklstring(L, 2, &slen);
if (ltag && attr_name) {
- auto maybe_attr = ltag->tag->find_component(
- rspamd::html::html_component_from_string({attr_name, slen}));
+ auto maybe_attr = ltag->tag->find_component_by_name({attr_name, slen});
if (maybe_attr) {
lua_pushlstring(L, maybe_attr->data(), maybe_attr->size());
@@ -729,6 +794,206 @@ lua_html_tag_get_attribute(lua_State *L)
return 1;
}
+static int
+lua_html_tag_get_unknown_attributes(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+
+	if (!ltag) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	// Result is a hash-like table: unknown attribute name -> attribute value
+	auto attrs = ltag->tag->get_unknown_components();
+	lua_createtable(L, 0, attrs.size());
+
+	for (const auto &attr: attrs) {
+		lua_pushlstring(L, attr.first.data(), attr.first.size());
+		lua_pushlstring(L, attr.second.data(), attr.second.size());
+		lua_settable(L, -3);
+	}
+
+	return 1;
+}
+
+static int
+lua_html_tag_get_children(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+
+	if (!ltag) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	const auto &children = ltag->tag->children;
+	// lua_createtable takes an int size hint, so the narrowing is spelled out
+	lua_createtable(L, static_cast<int>(children.size()), 0);
+
+	// Iterate over the vector directly instead of comparing a signed index with
+	// the unsigned size(); Lua array slots are numbered from 1
+	int lua_idx = 1;
+	for (auto *child: children) {
+		auto *child_tag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag)));
+		child_tag->tag = child;
+		// every child wrapper refers to the same html object as its parent
+		child_tag->html = ltag->html;
+		rspamd_lua_setclass(L, rspamd_html_tag_classname, -1);
+		lua_rawseti(L, -2, lua_idx++);
+	}
+
+	return 1;
+}
+
+static int
+lua_html_tag_has_attribute(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+	gsize slen;
+	const char *attr_name = luaL_checklstring(L, 2, &slen);
+
+	if (!ltag || !attr_name) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	// An attribute counts as present when the by-name lookup yields a value
+	const bool found = ltag->tag->find_component_by_name({attr_name, slen}).has_value();
+	lua_pushboolean(L, found);
+
+	return 1;
+}
+
+static int
+lua_html_tag_get_numeric_attribute(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	struct lua_html_tag *ltag = lua_check_html_tag(L, 1);
+	gsize slen;
+	const char *attr_name = luaL_checklstring(L, 2, &slen);
+
+	if (!ltag || !attr_name) {
+		return luaL_error(L, "invalid arguments");
+	}
+
+	namespace hc = rspamd::html;
+	const std::string_view name_view{attr_name, slen};
+
+	// Shared tail of every integer-valued branch: given the result of
+	// find_component<T>(), pushes the component's numeric value as a Lua integer.
+	// Returns false (pushing nothing) when the component is missing or holds no
+	// numeric value.
+	auto push_integer = [L](const auto &maybe_comp) -> bool {
+		if (maybe_comp) {
+			if (auto numeric_val = maybe_comp.value()->get_numeric_value()) {
+				lua_pushinteger(L, numeric_val.value());
+				return true;
+			}
+		}
+		return false;
+	};
+
+	bool pushed = false;
+
+	// Attribute names are matched exactly as given
+	if (name_view == "width") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_width>());
+	}
+	else if (name_view == "height") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_height>());
+	}
+	else if (name_view == "size") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_size>());
+	}
+	else if (name_view == "font-size") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_font_size>());
+	}
+	else if (name_view == "line-height") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_line_height>());
+	}
+	else if (name_view == "border-width") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_border_width>());
+	}
+	else if (name_view == "opacity") {
+		// opacity is the only fractional attribute, so it is pushed as a Lua
+		// number rather than an integer
+		if (auto comp = ltag->tag->find_component<hc::html_component_opacity>()) {
+			if (auto numeric_val = comp.value()->get_numeric_value()) {
+				lua_pushnumber(L, numeric_val.value());
+				pushed = true;
+			}
+		}
+	}
+	else if (name_view == "min-width") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_min_width>());
+	}
+	else if (name_view == "max-width") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_max_width>());
+	}
+	else if (name_view == "min-height") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_min_height>());
+	}
+	else if (name_view == "max-height") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_max_height>());
+	}
+	else if (name_view == "cellpadding") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_cellpadding>());
+	}
+	else if (name_view == "cellspacing") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_cellspacing>());
+	}
+	else if (name_view == "tabindex") {
+		pushed = push_integer(ltag->tag->find_component<hc::html_component_tabindex>());
+	}
+
+	// Unsupported name, missing component or unparseable value: return nil
+	if (!pushed) {
+		lua_pushnil(L);
+	}
+
+	return 1;
+}
+
void luaopen_html(lua_State *L)
{
rspamd_lua_new_class(L, rspamd_html_classname, htmllib_m);
diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c
index 39e1b0317..eb7fa6bf5 100644
--- a/src/lua/lua_parsers.c
+++ b/src/lua/lua_parsers.c
@@ -46,6 +46,14 @@
*/
/***
+ * @function parsers.parse_html_content(input, mempool)
+ * Parses HTML and returns the HTML content object for structure analysis
+ * @param {string|text} input input HTML
+ * @param {rspamd_mempool} mempool memory pool for HTML content management
+ * @return {html_content} HTML content object with tag structure
+ */
+LUA_FUNCTION_DEF(parsers, parse_html_content);
+/***
* @function parsers.parse_mail_address(str, [pool])
* Parses email address and returns a table of tables in the following format:
*
@@ -93,6 +101,7 @@
static const struct luaL_reg parserslib_f[] = {
LUA_INTERFACE_DEF(parsers, tokenize_text),
LUA_INTERFACE_DEF(parsers, parse_html),
+ LUA_INTERFACE_DEF(parsers, parse_html_content),
LUA_INTERFACE_DEF(parsers, parse_mail_address),
LUA_INTERFACE_DEF(parsers, parse_content_type),
LUA_INTERFACE_DEF(parsers, parse_smtp_date),
@@ -242,6 +251,62 @@ int lua_parsers_parse_html(lua_State *L)
return 1;
}
+/*
+ * Lua binding: parsers.parse_html_content(input, mempool)
+ *
+ * Argument 1 is the HTML source, either a Lua string or a text userdata.
+ * Argument 2 must be a mempool userdata; a Lua error is raised otherwise.
+ *
+ * Pushes exactly one value: a userdata of class rspamd_html_classname that
+ * wraps the pointer returned by rspamd_html_process_part(), or nil when no
+ * input could be extracted from argument 1 or when parsing returned NULL.
+ *
+ * NOTE(review): the wrapped pointer presumably lives in `pool`, so the Lua
+ * object should not outlive that pool -- confirm against
+ * rspamd_html_process_part().
+ */
+static int lua_parsers_parse_html_content(lua_State *L)
+{
+	LUA_TRACE_POINT;
+	const char *html = NULL;
+	gsize html_len;
+	struct rspamd_lua_text *txt;
+	rspamd_mempool_t *pool;
+	GByteArray *buf;
+	void *content;
+	void **pcontent;
+
+	/* Extract the raw bytes from whichever input form was supplied */
+	switch (lua_type(L, 1)) {
+	case LUA_TUSERDATA:
+		txt = lua_check_text(L, 1);
+
+		if (txt != NULL) {
+			html = txt->start;
+			html_len = txt->len;
+		}
+		break;
+	case LUA_TSTRING:
+		html = luaL_checklstring(L, 1, &html_len);
+		break;
+	default:
+		/* Any other type leaves html == NULL, which yields nil below */
+		break;
+	}
+
+	/* The pool is validated before the input is used, even if input is missing */
+	if (lua_type(L, 2) != LUA_TUSERDATA) {
+		return luaL_error(L, "invalid arguments: mempool expected as second argument");
+	}
+
+	pool = rspamd_lua_check_mempool(L, 2);
+	if (pool == NULL) {
+		return luaL_error(L, "invalid mempool argument");
+	}
+
+	if (html == NULL) {
+		lua_pushnil(L);
+		return 1;
+	}
+
+	/* rspamd_html_process_part() takes a GByteArray, so copy the input into one */
+	buf = g_byte_array_sized_new(html_len);
+	g_byte_array_append(buf, html, html_len);
+
+	content = rspamd_html_process_part(pool, buf);
+
+	if (content == NULL) {
+		lua_pushnil(L);
+	}
+	else {
+		/* Box the pointer in a userdata tagged with the html class */
+		pcontent = lua_newuserdata(L, sizeof(void *));
+		*pcontent = content;
+		rspamd_lua_setclass(L, rspamd_html_classname, -1);
+	}
+
+	/* The temporary copy is released only after the result has been pushed */
+	g_byte_array_free(buf, TRUE);
+
+	return 1;
+}
+
int lua_parsers_parse_mail_address(lua_State *L)
{
LUA_TRACE_POINT;
@@ -409,4 +474,4 @@ lua_load_parsers(lua_State *L)
void luaopen_parsers(lua_State *L)
{
rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers);
-} \ No newline at end of file
+}
diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua
index 81c52ec1b..1802dc984 100644
--- a/test/lua/unit/html.lua
+++ b/test/lua/unit/html.lua
@@ -1,11 +1,10 @@
context("HTML processing", function()
local rspamd_util = require("rspamd_util")
- local logger = require("rspamd_logger")
local cases = {
- -- Entities
- {[[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
- [[.firebaseapp.com]]},
- {[[
+ -- Entities
+ { [[<html><body>.&#102;&#105;&#114;&#101;&#98;&#97;&#115;&#101;&#97;&#112;&#112;.&#99;&#111;&#109;</body></html>]],
+ [[.firebaseapp.com]] },
+ { [[
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
@@ -22,8 +21,8 @@ context("HTML processing", function()
</p>
</body>
- </html>]], 'Hello, world!\n'},
- {[[
+ </html>]], 'Hello, world!\n' },
+ { [[
<!DOCTYPE html>
<html lang="en">
<head>
@@ -39,8 +38,8 @@ context("HTML processing", function()
Hello, world!
</body>
</html>
- ]], 'Hello, world!'},
- {[[
+ ]], 'Hello, world!' },
+ { [[
<html lang="en">
<head>
<meta charset="utf-8">
@@ -56,8 +55,8 @@ context("HTML processing", function()
</div>
</body>
</html>
- ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'},
- {[[
+ ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n' },
+ { [[
<html lang="en">
<head>
<meta charset="utf-8">
@@ -83,8 +82,8 @@ context("HTML processing", function()
</body>
</html>
- ]], 'content\nheada headb\ndata1 data2\n'},
- {[[
+ ]], 'content\nheada headb\ndata1 data2\n' },
+ { [[
<html lang="en">
<head>
<meta charset="utf-8">
@@ -97,17 +96,398 @@ context("HTML processing", function()
a&nbsp;b a &gt; b a &lt; b a &amp; b &apos;a &quot;a&quot;
</body>
</html>
- ]], 'a b a > b a < b a & b \'a "a"'},
+ ]], 'a b a > b a < b a & b \'a "a"' },
}
- for i,c in ipairs(cases) do
+ for i, c in ipairs(cases) do
test("Extract text from HTML " .. tostring(i), function()
local t = rspamd_util.parse_html(c[1])
assert_not_nil(t)
assert_equal(c[2], tostring(t), string.format("'%s' doesn't match with '%s'",
- c[2], t))
-
+ c[2], t))
end)
end
+
+ -- Test cases for new HTML tag API methods
+  -- Parses `html_content` with rspamd_parsers.parse_html_content (using `pool`)
+  -- and gathers every tag handed to the foreach_tag("any", ...) callback.
+  -- Returns two values: the parsed object (falsy when parsing produced nothing)
+  -- and the list of gathered tags (empty in that case).
+  local function parse_html_and_extract_tags(html_content, pool)
+    local parsers = require("rspamd_parsers")
+    local collected = {}
+    local doc = parsers.parse_html_content(html_content, pool)
+
+    if not doc then
+      return doc, collected
+    end
+
+    -- The callback's remaining arguments are not needed here.
+    doc:foreach_tag("any", function(tag)
+      collected[#collected + 1] = tag
+      -- presumably `false` tells foreach_tag to keep iterating -- TODO confirm
+      return false
+    end)
+
+    return doc, collected
+  end
+
+  -- get_all_attributes() must return every attribute of the tag, keyed by
+  -- name, with the value exactly as written in the markup (as a string).
+  test("HTML tag get_all_attributes basic test", function()
+    local pool = require("rspamd_mempool").create()
+    local html = [[<div class="test-class" id="test-id" style="color: red;" width="100">content</div>]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+    assert_true(#tags > 0)
+
+    -- Pick the first <div> among the collected tags
+    local div_tag
+    for idx = 1, #tags do
+      if tags[idx]:get_type() == "div" then
+        div_tag = tags[idx]
+        break
+      end
+    end
+    assert_not_nil(div_tag)
+
+    local attrs = div_tag:get_all_attributes()
+    assert_not_nil(attrs)
+
+    -- { attribute name, expected value } pairs, checked in markup order
+    local expected = {
+      { "class", "test-class" },
+      { "id", "test-id" },
+      { "style", "color: red;" },
+      { "width", "100" },
+    }
+    for _, pair in ipairs(expected) do
+      assert_equal(pair[2], attrs[pair[1]])
+    end
+
+    pool:destroy()
+  end)
+
+  -- has_attribute() must be true for every attribute present on the tag
+  -- (including the value-less `hidden`) and false for anything else.
+  test("HTML tag has_attribute test", function()
+    local pool = require("rspamd_mempool").create()
+    local html = [[<img src="test.jpg" width="100" height="50" alt="Test image" hidden />]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Pick the first <img> among the collected tags
+    local img_tag
+    for idx = 1, #tags do
+      if tags[idx]:get_type() == "img" then
+        img_tag = tags[idx]
+        break
+      end
+    end
+    assert_not_nil(img_tag)
+
+    -- Attributes that appear in the markup
+    for _, present in ipairs({ "src", "width", "height", "alt", "hidden" }) do
+      assert_true(img_tag:has_attribute(present))
+    end
+
+    -- Attributes that do not appear, including the empty name
+    for _, absent in ipairs({ "nonexistent", "class", "" }) do
+      assert_false(img_tag:has_attribute(absent))
+    end
+
+    pool:destroy()
+  end)
+
+  -- get_numeric_attribute() must return numbers for numeric attributes and
+  -- nil for an attribute that is not present.
+  test("HTML tag get_numeric_attribute test", function()
+    local pool = require("rspamd_mempool").create()
+    local html = [[<div width="200" height="150" font-size="14" opacity="0.8" tabindex="5">content</div>]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Pick the first <div> among the collected tags
+    local div_tag
+    for idx = 1, #tags do
+      if tags[idx]:get_type() == "div" then
+        div_tag = tags[idx]
+        break
+      end
+    end
+    assert_not_nil(div_tag)
+
+    -- Integral values are compared exactly
+    local exact = {
+      { "width", 200 },
+      { "height", 150 },
+      { "font-size", 14 },
+    }
+    for _, pair in ipairs(exact) do
+      assert_equal(pair[2], div_tag:get_numeric_attribute(pair[1]))
+    end
+
+    -- Opacity is fractional, so compare with a tolerance instead of equality
+    local opacity = div_tag:get_numeric_attribute("opacity")
+    assert_not_nil(opacity)
+    local close_enough = math.abs(opacity - 0.8) < 0.01
+    assert_true(close_enough, string.format("Expected opacity ~0.8, got %f", opacity))
+
+    assert_equal(5, div_tag:get_numeric_attribute("tabindex"))
+
+    -- An attribute that is absent from the tag yields nil
+    assert_nil(div_tag:get_numeric_attribute("nonexistent"))
+
+    pool:destroy()
+  end)
+
+  -- get_unknown_attributes() must return only the attributes the parser does
+  -- not recognise, keyed by name with their raw string values.
+  test("HTML tag get_unknown_attributes test", function()
+    local rspamd_mempool = require("rspamd_mempool")
+    local pool = rspamd_mempool.create()
+
+    local html = [[<div class="known" data-track="analytics" unknown-attr="test-value" custom-id="12345">content</div>]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    local div_tag = nil
+    for _, tag in ipairs(tags) do
+      if tag:get_type() == "div" then
+        div_tag = tag
+        break
+      end
+    end
+
+    assert_not_nil(div_tag)
+
+    local unknown_attrs = div_tag:get_unknown_attributes()
+    assert_not_nil(unknown_attrs)
+
+    -- Unrecognised attributes must be reported with their values
+    assert_not_nil(unknown_attrs["unknown-attr"])
+    assert_equal("test-value", unknown_attrs["unknown-attr"])
+    assert_not_nil(unknown_attrs["custom-id"])
+    assert_equal("12345", unknown_attrs["custom-id"])
+
+    -- A known attribute such as "class" must not leak into the unknown set.
+    -- This exclusion was previously only described in a comment, never checked.
+    assert_nil(unknown_attrs["class"])
+
+    -- "data-track" is deliberately left unasserted: this test does not pin
+    -- down whether the parser classifies it as known or unknown.
+
+    pool:destroy()
+  end)
+
+  -- get_children() on the parent <div> must return exactly its three child
+  -- tags: <p>, <span> and <img>.
+  test("HTML tag get_children test", function()
+    local rspamd_mempool = require("rspamd_mempool")
+    local pool = rspamd_mempool.create()
+
+    local html = [[
+ <div id="parent">
+ <p>First child</p>
+ <span>Second child</span>
+ <img src="test.jpg" />
+ </div>
+ ]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Locate the <div> whose id is "parent"
+    local parent_div = nil
+    for _, tag in ipairs(tags) do
+      if tag:get_type() == "div" and tag:has_attribute("id") and tag:get_attribute("id") == "parent" then
+        parent_div = tag
+        break
+      end
+    end
+
+    assert_not_nil(parent_div)
+
+    local children = parent_div:get_children()
+    assert_not_nil(children)
+    assert_equal(3, #children)
+
+    -- Record the child tag names as a set so each expected name is matched
+    -- exactly. A substring search over the joined names is not sufficient:
+    -- "p" is also a substring of "span", so such a check would still pass
+    -- when the <p> child is missing.
+    local seen = {}
+    for _, child in ipairs(children) do
+      seen[child:get_type()] = true
+    end
+
+    for _, name in ipairs({ "p", "span", "img" }) do
+      assert_true(seen[name] == true, string.format("child <%s> is missing", name))
+    end
+
+    pool:destroy()
+  end)
+
+  -- Every name/value pair reported by get_all_attributes() must match what
+  -- get_attribute() returns for that name.
+  test("HTML tag get_attribute vs get_all_attributes consistency", function()
+    local pool = require("rspamd_mempool").create()
+    local html = [[<a href="https://example.com" class="link" target="_blank" title="Example Link">Link</a>]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Pick the first <a> among the collected tags
+    local a_tag
+    for idx = 1, #tags do
+      if tags[idx]:get_type() == "a" then
+        a_tag = tags[idx]
+        break
+      end
+    end
+    assert_not_nil(a_tag)
+
+    -- Cross-check the bulk accessor against the single-attribute accessor
+    local all_attrs = a_tag:get_all_attributes()
+    for attr_name, attr_value in pairs(all_attrs) do
+      local single = a_tag:get_attribute(attr_name)
+      local mismatch = string.format(
+        "Attribute '%s' mismatch: get_attribute='%s', get_all_attributes='%s'",
+        attr_name, single or "nil", attr_value)
+      assert_equal(attr_value, single, mismatch)
+    end
+
+    -- The values written in the markup, checked one by one
+    local expected = {
+      { "href", "https://example.com" },
+      { "class", "link" },
+      { "target", "_blank" },
+      { "title", "Example Link" },
+    }
+    for _, pair in ipairs(expected) do
+      assert_equal(pair[2], a_tag:get_attribute(pair[1]))
+    end
+
+    pool:destroy()
+  end)
+
+
+
+  -- Edge cases: empty value, value-less attribute, whitespace-only value,
+  -- numeric zero, non-numeric text in a numeric attribute, missing attribute.
+  test("HTML tag attribute edge cases", function()
+    local pool = require("rspamd_mempool").create()
+    local html = [[<div class="" hidden style=" " width="0" height="abc">content</div>]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Pick the first <div> among the collected tags
+    local el
+    for idx = 1, #tags do
+      if tags[idx]:get_type() == "div" then
+        el = tags[idx]
+        break
+      end
+    end
+    assert_not_nil(el)
+
+    -- class="" is present and its value is the empty string
+    assert_true(el:has_attribute("class"))
+    assert_equal("", el:get_attribute("class"))
+
+    -- `hidden` carries no value but still counts as present
+    assert_true(el:has_attribute("hidden"))
+
+    -- A whitespace-only value is returned as-is
+    assert_true(el:has_attribute("style"))
+    assert_equal(" ", el:get_attribute("style"))
+
+    -- Zero is a valid number; "abc" is not numeric, so nil is expected
+    assert_equal(0, el:get_numeric_attribute("width"))
+    assert_nil(el:get_numeric_attribute("height"))
+
+    -- All three accessors agree that a missing attribute is missing
+    local missing = "nonexistent"
+    assert_false(el:has_attribute(missing))
+    assert_nil(el:get_attribute(missing))
+    assert_nil(el:get_numeric_attribute(missing))
+
+    pool:destroy()
+  end)
+
+  -- Attribute accessors and get_children() on a nested table layout: checks
+  -- the <table> itself and the first <img> found anywhere in the document.
+  test("HTML tag complex nested structure", function()
+    local pool = require("rspamd_mempool").create()
+
+    local html = [[
+ <table cellpadding="5" cellspacing="2" border="1">
+ <tr>
+ <td align="center" valign="top" width="100">
+ <img src="image1.jpg" width="80" height="60" alt="Image 1" />
+ </td>
+ <td align="left" valign="middle">
+ <p font-size="12">Text content</p>
+ </td>
+ </tr>
+ </table>
+ ]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Returns the first collected tag whose type equals `wanted`, or nil
+    local function first_of_type(wanted)
+      for _, candidate in ipairs(tags) do
+        if candidate:get_type() == wanted then
+          return candidate
+        end
+      end
+      return nil
+    end
+
+    local table_tag = first_of_type("table")
+    assert_not_nil(table_tag)
+
+    -- cellpadding/cellspacing are read as numbers, border as a raw string
+    assert_equal(5, table_tag:get_numeric_attribute("cellpadding"))
+    assert_equal(2, table_tag:get_numeric_attribute("cellspacing"))
+    assert_equal("1", table_tag:get_attribute("border"))
+
+    -- The table must report at least one child
+    local children = table_tag:get_children()
+    assert_not_nil(children)
+    assert_true(#children > 0)
+
+    local img_tag = first_of_type("img")
+    assert_not_nil(img_tag)
+
+    -- String and numeric views of the image attributes
+    assert_equal("image1.jpg", img_tag:get_attribute("src"))
+    assert_equal(80, img_tag:get_numeric_attribute("width"))
+    assert_equal(60, img_tag:get_numeric_attribute("height"))
+    assert_equal("Image 1", img_tag:get_attribute("alt"))
+
+    pool:destroy()
+  end)
+
+  -- get_all_attributes() must list known and unknown attributes alike, while
+  -- get_unknown_attributes() must list only the unrecognised ones.
+  test("HTML tag with mixed known and unknown attributes", function()
+    local pool = require("rspamd_mempool").create()
+    local html = [[<div class="container" data-analytics="track" custom-attr="value" style="color: blue;" unknown123="test">content</div>]]
+    local parsed, tags = parse_html_and_extract_tags(html, pool)
+
+    assert_not_nil(parsed)
+
+    -- Pick the first <div> among the collected tags
+    local div_tag
+    for idx = 1, #tags do
+      if tags[idx]:get_type() == "div" then
+        div_tag = tags[idx]
+        break
+      end
+    end
+    assert_not_nil(div_tag)
+
+    local all_attrs = div_tag:get_all_attributes()
+    local unknown_attrs = div_tag:get_unknown_attributes()
+
+    local known_names = { "class", "style" }
+    local unknown_names = { "custom-attr", "unknown123" }
+
+    -- The full view contains both groups
+    for _, name in ipairs(known_names) do
+      assert_not_nil(all_attrs[name])
+    end
+    for _, name in ipairs(unknown_names) do
+      assert_not_nil(all_attrs[name])
+    end
+
+    -- The unknown view omits the known group and keeps the unknown one
+    for _, name in ipairs(known_names) do
+      assert_nil(unknown_attrs[name])
+    end
+    for _, name in ipairs(unknown_names) do
+      assert_not_nil(unknown_attrs[name])
+    end
+
+    assert_equal("value", unknown_attrs["custom-attr"])
+    assert_equal("test", unknown_attrs["unknown123"])
+
+    pool:destroy()
+  end)
end)