diff options
-rw-r--r-- | interface/css/rspamd.css | 4 | ||||
-rw-r--r-- | interface/index.html | 18 | ||||
-rw-r--r-- | interface/js/app/history.js | 57 | ||||
-rw-r--r-- | interface/js/app/rspamd.js | 21 | ||||
-rw-r--r-- | src/libserver/css/css.cxx | 19 | ||||
-rw-r--r-- | src/libserver/html/html.cxx | 1017 | ||||
-rw-r--r-- | src/libserver/html/html_tag.hxx | 1348 | ||||
-rw-r--r-- | src/lua/lua_html.cxx | 269 | ||||
-rw-r--r-- | src/lua/lua_parsers.c | 67 | ||||
-rw-r--r-- | test/lua/unit/html.lua | 414 |
10 files changed, 2981 insertions, 253 deletions
diff --git a/interface/css/rspamd.css b/interface/css/rspamd.css index 9f97a668b..54310049b 100644 --- a/interface/css/rspamd.css +++ b/interface/css/rspamd.css @@ -420,8 +420,10 @@ table#symbolsTable input[type="number"] { display: none; } +#history-from, +#history-count, #history_page_size { - width: 6em !important; + width: 6em; text-align: center; } diff --git a/interface/index.html b/interface/index.html index 30181e788..165eae200 100644 --- a/interface/index.html +++ b/interface/index.html @@ -124,6 +124,16 @@ </div> </div> </div> + + <div class="card mt-1"> + <div class="card-body"> + <h6 class="card-title fw-bolder">History rows per load</h6> + <div class="input-group input-group-sm was-validated"> + <input type="number" id="settings-history-count" class="form-control" min="1" step="1" placeholder="1000"> + <button id="settings-history-count-restore" class="btn btn-secondary">Restore default</button> + </div> + </div> + </div> </div> </div> </form> @@ -681,9 +691,13 @@ <option value="score">Score value</option> <option value="name">Name</option> </select> - <label for="history_page_size" class="ms-2">Rows per page:</label> + <label for="history-from" class="ms-3" title="Start from this row number">Offset:</label> + <input type="number" id="history-from" class="form-control ms-1" value="0" min="0" step="1" title="Start from this row number"> + <label for="history-count" class="ms-2" title="Number of rows to load">Count:</label> + <input type="number" id="history-count" class="form-control ms-1" value="1000" min="1" step="1" title="Number of rows to load"> + <label for="history_page_size" class="ms-2">Rows/page:</label> <input id="history_page_size" class="form-control ms-1" value="25" min="1" type="number"> - <button class="btn btn-outline-secondary btn-sm ms-2 d-flex align-items-center dropdown-toggle ft-columns-btn" type="button" data-bs-toggle="dropdown" data-bs-auto-close="outside" aria-expanded="false" disabled> + <button class="btn btn-outline-secondary btn-sm ms-3 d-flex align-items-center dropdown-toggle ft-columns-btn" type="button" data-bs-toggle="dropdown" data-bs-auto-close="outside" aria-expanded="false" disabled> <i class="fas fa-columns me-1"></i>Columns </button> <div class="dropdown-menu ft-columns-dropdown p-2"></div> diff --git a/interface/js/app/history.js b/interface/js/app/history.js index 185922087..bf1dbae53 100644 --- a/interface/js/app/history.js +++ b/interface/js/app/history.js @@ -30,6 +30,12 @@ define(["jquery", "app/common", "app/libft", "footable"], const ui = {}; let prevVersion = null; + // History range: offset and count + const histFromDef = 0; + const historyCountDef = 1000; + let histFrom = histFromDef; + let histCount = parseInt(localStorage.getItem("historyCount"), 10) || historyCountDef; + function process_history_legacy(data) { const items = []; @@ -152,7 +158,8 @@ define(["jquery", "app/common", "app/libft", "footable"], ui.getHistory = function () { $("#refresh, #updateHistory").attr("disabled", true); - common.query("history", { + const histTo = histFrom - 1 + histCount; + common.query(`history?from=${histFrom}&to=${histTo}`, { success: function (req_data) { function differentVersions(neighbours_data) { const dv = neighbours_data.some((e) => e.version !== neighbours_data[0].version); @@ -192,8 +199,10 @@ define(["jquery", "app/common", "app/libft", "footable"], // Is there a way to get an event when the table is destroyed? setTimeout(() => { libft.initHistoryTable(data, items, "history", get_history_columns(data), false, - () => $("#refresh, #updateHistory, #history .ft-columns-dropdown .btn-dropdown-apply") - .removeAttr("disabled")); + () => { + $("#history .ft-columns-dropdown .btn-dropdown-apply").removeAttr("disabled"); + ui.updateHistoryControlsState(); + }); }, 200); } prevVersion = version; @@ -201,7 +210,7 @@ define(["jquery", "app/common", "app/libft", "footable"], libft.destroyTable("history"); } }, - error: () => $("#refresh, #updateHistory").removeAttr("disabled"), + error: () => ui.updateHistoryControlsState(), errorMessage: "Cannot receive history", }); }; @@ -282,6 +291,46 @@ define(["jquery", "app/common", "app/libft", "footable"], }); }; + ui.updateHistoryControlsState = function () { + const from = parseInt($("#history-from").val(), 10); + const count = parseInt($("#history-count").val(), 10); + const valid = !(isNaN(from) || from < 0 || isNaN(count) || count < 1); + + if (valid) { + $("#refresh, #updateHistory").removeAttr("disabled").removeClass("disabled"); + } else { + $("#refresh, #updateHistory").attr("disabled", true).addClass("disabled"); + } + }; + + function validateAndClampInput(el) { + const min = el.id === "history-from" ? 0 : 1; + let v = parseInt(el.value, 10); + if (isNaN(v) || v < min) { + v = min; + $(el).addClass("is-invalid"); + } else { + $(el).removeClass("is-invalid"); + } + return v; + } + + $("#history-from").val(histFrom); + $("#history-count").val(histCount); + $("#history-from, #history-count").on("input", (e) => { + validateAndClampInput(e.currentTarget); + ui.updateHistoryControlsState(); + }); + $("#history-from, #history-count").on("blur", (e) => { + const el = e.currentTarget; + const v = validateAndClampInput(el); + $(el).val(v).removeClass("is-invalid"); + ui.updateHistoryControlsState(); + }); + $("#history-from,#history-count").on("change", () => { + histFrom = parseInt($("#history-from").val(), 10) || histFromDef; + histCount = parseInt($("#history-count").val(), 10) || historyCountDef; + }); libft.set_page_size("history", $("#history_page_size").val()); libft.bindHistoryTableEventHandlers("history", 8); diff --git a/interface/js/app/rspamd.js b/interface/js/app/rspamd.js index cb7fb8ace..4b154c2ae 100644 --- a/interface/js/app/rspamd.js +++ b/interface/js/app/rspamd.js @@ -198,6 +198,8 @@ define(["jquery", "app/common", "stickytabs", "visibility", $(".preset").hide(); $(".history").show(); $(".dynamic").hide(); + + module.updateHistoryControlsState(); }); break; case "#disconnect": @@ -348,6 +350,8 @@ define(["jquery", "app/common", "stickytabs", "visibility", let selected_locale = null; let custom_locale = null; const localeTextbox = ".popover #settings-popover #locale"; + const historyCountDef = 1000; + const historyCountSelector = ".popover #settings-popover #settings-history-count"; function validateLocale(saveToLocalStorage) { function toggle_form_group_class(remove, add) { @@ -406,6 +410,8 @@ define(["jquery", "app/common", "stickytabs", "visibility", $(localeTextbox).val(custom_locale); ajaxSetup(localStorage.getItem("ajax_timeout"), true); + + $(historyCountSelector).val(parseInt(localStorage.getItem("historyCount"), 10) || historyCountDef); }); $(document).on("change", '.popover #settings-popover input:radio[name="locale"]', function () { selected_locale = this.value; @@ -423,6 +429,21 @@ define(["jquery", "app/common", "stickytabs", "visibility", ajaxSetup(null, true, true); }); + $(document).on("input", historyCountSelector, (e) => { + const v = parseInt($(e.currentTarget).val(), 10); + if (v > 0) { + localStorage.setItem("historyCount", v); + $(e.currentTarget).removeClass("is-invalid"); + $("#history-count").val(v).trigger("change"); + } else { + $(e.currentTarget).addClass("is-invalid"); + } + }); + $(document).on("click", ".popover #settings-popover #settings-history-count-restore", () => { + localStorage.removeItem("historyCount"); + $(historyCountSelector).val(historyCountDef); + }); + // Dismiss Bootstrap popover by clicking outside $("body").on("click", (e) => { $(".popover").each(function () { diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx index 1b369ed17..c53e3c05e 100644 --- a/src/libserver/css/css.cxx +++ b/src/libserver/css/css.cxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -107,7 +107,6 @@ auto css_style_sheet::add_selector_rule(std::unique_ptr<css_selector> &&selector auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspamd::html::html_block * { - std::optional<std::string_view> id_comp, class_comp; rspamd::html::html_block *res = nullptr; if (!tag) { @@ -115,14 +114,8 @@ auto css_style_sheet::check_tag_block(const rspamd::html::html_tag *tag) -> rspa } /* First, find id in a tag and a class */ - for (const auto ¶m: tag->components) { - if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_ID) { - id_comp = param.value; - } - else if (param.type == html::html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - class_comp = param.value; - } - } + auto id_comp = tag->find_id(); + auto class_comp = tag->find_class(); /* ID part */ if (id_comp && !pimpl->id_selectors.empty()) { @@ -224,4 +217,4 @@ auto css_parse_style(rspamd_mempool_t *pool, return std::make_pair(nullptr, parse_res.error()); } -}// namespace rspamd::css
\ No newline at end of file +}// namespace rspamd::css diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index 93d1fdf91..78a6a975c 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -39,6 +39,7 @@ #include "contrib/frozen/include/frozen/string.h" #include "contrib/fmt/include/fmt/core.h" +#include <functional> #include <unicode/uversion.h> namespace rspamd::html { @@ -47,23 +48,88 @@ static const unsigned int max_tags = 8192; /* Ignore tags if this maximum is rea static const html_tags_storage html_tags_defs; -auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_type>( +auto html_components_map = frozen::make_unordered_map<frozen::string, html_component_enum_type>( { - {"name", html_component_type::RSPAMD_HTML_COMPONENT_NAME}, - {"href", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"src", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"action", html_component_type::RSPAMD_HTML_COMPONENT_HREF}, - {"color", html_component_type::RSPAMD_HTML_COMPONENT_COLOR}, - {"bgcolor", html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, - {"style", html_component_type::RSPAMD_HTML_COMPONENT_STYLE}, - {"class", html_component_type::RSPAMD_HTML_COMPONENT_CLASS}, - {"width", html_component_type::RSPAMD_HTML_COMPONENT_WIDTH}, - {"height", html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT}, - {"size", html_component_type::RSPAMD_HTML_COMPONENT_SIZE}, - {"rel", html_component_type::RSPAMD_HTML_COMPONENT_REL}, - {"alt", html_component_type::RSPAMD_HTML_COMPONENT_ALT}, - {"id", html_component_type::RSPAMD_HTML_COMPONENT_ID}, - {"hidden", html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + {"name", html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME}, + {"href", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF}, + {"src", html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC}, + {"action", html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF}, + {"color", html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR}, + {"bgcolor", html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR}, + {"style", html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE}, + {"class", html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS}, + {"width", html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH}, + {"height", html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT}, + {"size", html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE}, + {"rel", html_component_enum_type::RSPAMD_HTML_COMPONENT_REL}, + {"alt", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT}, + {"id", html_component_enum_type::RSPAMD_HTML_COMPONENT_ID}, + {"hidden", html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN}, + // Typography + {"font-family", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY}, + {"font-size", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE}, + {"font-weight", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT}, + {"font-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE}, + {"text-align", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN}, + {"text-decoration", html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION}, + {"line-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT}, + // Layout & positioning + {"margin", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN}, + {"margin-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP}, + {"margin-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM}, + {"margin-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT}, + {"margin-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT}, + {"padding", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING}, + {"padding-top", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP}, + {"padding-bottom", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM}, + {"padding-left", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT}, + {"padding-right", html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT}, + {"border", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER}, + {"border-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR}, + {"border-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH}, + {"border-style", html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE}, + // Display & visibility + {"display", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY}, + {"visibility", html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY}, + {"opacity", html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY}, + // Dimensions + {"min-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH}, + {"max-width", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH}, + {"min-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT}, + {"max-height", html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT}, + // Table attributes + {"cellpadding", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING}, + {"cellspacing", html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING}, + {"valign", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN}, + {"align", html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN}, + // Form attributes + {"type", html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE}, + {"value", html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE}, + {"placeholder", html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER}, + {"disabled", html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED}, + {"readonly", html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY}, + {"checked", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED}, + {"selected", html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED}, + // Link & media + {"target", html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET}, + {"title", html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE}, + // Meta & document + {"charset", html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET}, + {"content", html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT}, + {"http-equiv", html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV}, + // Accessibility + {"role", html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE}, + {"tabindex", html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX}, + // Background + {"background", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND}, + {"background-image", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE}, + {"background-color", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR}, + {"background-repeat", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT}, + {"background-position", html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION}, + // Email-specific tracking + {"data-track", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK}, + {"data-id", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID}, + {"data-url", html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL}, }); #define msg_debug_html(...) rspamd_conditional_debug_fast(NULL, NULL, \ @@ -199,18 +265,608 @@ html_check_balance(struct html_content *hc, return nullptr; } -auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type> +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component { - auto known_component_it = html_components_map.find(st); + auto known_component_it = html_components_map.find(name); if (known_component_it != html_components_map.end()) { - return known_component_it->second; + switch (known_component_it->second) { + case html_component_enum_type::RSPAMD_HTML_COMPONENT_NAME: + return html_component_name{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HREF: + return html_component_href{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_COLOR: + return html_component_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BGCOLOR: + return html_component_bgcolor{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_STYLE: + return html_component_style{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CLASS: + return html_component_class{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_WIDTH: + return html_component_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HEIGHT: + return html_component_height{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SIZE: + return html_component_size{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_REL: + return html_component_rel{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALT: + return html_component_alt{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ID: + return html_component_id{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HIDDEN: + return html_component_hidden{}; + // Typography + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_FAMILY: + return html_component_font_family{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_SIZE: + return html_component_font_size{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_WEIGHT: + return html_component_font_weight{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_FONT_STYLE: + return html_component_font_style{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_ALIGN: + return html_component_text_align{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TEXT_DECORATION: + return html_component_text_decoration{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_LINE_HEIGHT: + return html_component_line_height{value}; + // Layout + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN: + return html_component_margin{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_TOP: + return html_component_margin_top{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM: + return html_component_margin_bottom{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_LEFT: + return html_component_margin_left{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MARGIN_RIGHT: + return html_component_margin_right{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING: + return html_component_padding{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_TOP: + return html_component_padding_top{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_BOTTOM: + return html_component_padding_bottom{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_LEFT: + return html_component_padding_left{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PADDING_RIGHT: + return html_component_padding_right{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER: + return html_component_border{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_COLOR: + return html_component_border_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_WIDTH: + return html_component_border_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BORDER_STYLE: + return html_component_border_style{value}; + // Display + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISPLAY: + return html_component_display{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VISIBILITY: + return html_component_visibility{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_OPACITY: + return html_component_opacity{value}; + // Dimensions + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_WIDTH: + return html_component_min_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_WIDTH: + return html_component_max_width{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MIN_HEIGHT: + return html_component_min_height{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_MAX_HEIGHT: + return html_component_max_height{value}; + // Table + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLPADDING: + return html_component_cellpadding{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CELLSPACING: + return html_component_cellspacing{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALIGN: + return html_component_valign{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ALIGN: + return html_component_align{value}; + // Form + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TYPE: + return html_component_type{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_VALUE: + return html_component_value{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_PLACEHOLDER: + return html_component_placeholder{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DISABLED: + return html_component_disabled{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_READONLY: + return html_component_readonly{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHECKED: + return html_component_checked{}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SELECTED: + return html_component_selected{}; + // Link & media + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TARGET: + return html_component_target{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TITLE: + return html_component_title{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_SRC: + return html_component_src{value}; + // Meta + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CHARSET: + return html_component_charset{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_CONTENT: + return html_component_content{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_HTTP_EQUIV: + return html_component_http_equiv{value}; + // Accessibility + case html_component_enum_type::RSPAMD_HTML_COMPONENT_ROLE: + return html_component_role{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_TABINDEX: + return html_component_tabindex{value}; + // Background + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND: + return html_component_background{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE: + return html_component_background_image{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR: + return html_component_background_color{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT: + return html_component_background_repeat{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION: + return html_component_background_position{value}; + // Email tracking + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_TRACK: + return html_component_data_track{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_ID: + return html_component_data_id{value}; + case html_component_enum_type::RSPAMD_HTML_COMPONENT_DATA_URL: + return html_component_data_url{value}; + default: + return html_component_unknown{name, value}; + } } else { - return std::nullopt; + return html_component_unknown{name, value}; } } +using component_extractor_func = std::function<std::optional<std::string_view>(const html_tag *)>; +static const auto component_extractors = frozen::make_unordered_map<frozen::string, component_extractor_func>( + { + // Basic components + {"name", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_name>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"href", [](const html_tag *tag) { return tag->find_href(); }}, + {"src", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_src>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"class", [](const html_tag *tag) { return tag->find_class(); }}, + {"id", [](const html_tag *tag) { return tag->find_id(); }}, + {"style", [](const html_tag *tag) { return tag->find_style(); }}, + {"alt", [](const html_tag *tag) { return tag->find_alt(); }}, + {"rel", [](const html_tag *tag) { return tag->find_rel(); }}, + {"color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"bgcolor", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_bgcolor>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Numeric components (return string representation) + {"width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"size", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_size>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Boolean components + {"hidden", [](const html_tag *tag) -> std::optional<std::string_view> { + return tag->is_hidden() ? std::optional<std::string_view>{"true"} : std::nullopt; + }}, + + // Typography components + {"font-family", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_family>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"font-size", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_size>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"font-weight", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_weight>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"font-style", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_font_style>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"text-align", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_text_align>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"text-decoration", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_text_decoration>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"line-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_line_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Layout components + {"margin", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-top", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_top>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-bottom", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_bottom>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-left", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_left>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"margin-right", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_margin_right>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-top", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_top>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-bottom", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_bottom>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-left", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_left>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"padding-right", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_padding_right>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border-color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"border-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"border-style", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_border_style>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Display components + {"display", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_display>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"visibility", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_visibility>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"opacity", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_opacity>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Additional dimensions + {"min-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_min_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"max-width", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_max_width>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"min-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_min_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"max-height", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_max_height>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Table components + {"cellpadding", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_cellpadding>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"cellspacing", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_cellspacing>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + {"valign", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_valign>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"align", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_align>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Form components + {"type", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_type>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"value", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_value>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"placeholder", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_placeholder>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"disabled", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_disabled>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"readonly", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_readonly>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"checked", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_checked>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + {"selected", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_selected>()) { + return comp.value()->is_present() ? std::optional<std::string_view>{"true"} : std::nullopt; + } + return std::nullopt; + }}, + + // Link & media components + {"target", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_target>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"title", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_title>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Meta components + {"charset", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_charset>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"content", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_content>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"http-equiv", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_http_equiv>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Accessibility components + {"role", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_role>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"tabindex", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_tabindex>()) { + return comp.value()->get_string_value(); + } + return std::nullopt; + }}, + + // Background components + {"background", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-image", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_image>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-color", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_color>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-repeat", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_repeat>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"background-position", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_background_position>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + + // Email tracking components + {"data-track", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_track>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"data-id", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_id>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + {"data-url", [](const html_tag *tag) -> std::optional<std::string_view> { + if (auto comp = tag->find_component<html_component_data_url>()) { + return comp.value()->value; + } + return std::nullopt; + }}, + }); + +auto html_tag::find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view> +{ + auto it = component_extractors.find(attr_name); + if (it != component_extractors.end()) { + return it->second(this); + } + + // Fallback to unknown components + return find_unknown_component(attr_name); +} + +auto html_tag::get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>> +{ + std::vector<std::pair<std::string_view, std::string_view>> result; + + // First, get all known attributes using the component_extractors map + for (const auto &[attr_name, extractor_func]: component_extractors) { + if (auto value = extractor_func(this)) { + // Convert frozen::string to std::string_view for the key + std::string_view name_view{attr_name.data(), attr_name.size()}; + result.emplace_back(name_view, value.value()); + } + } + + // Then add all unknown attributes + auto unknown_attrs = get_unknown_components(); + for (const auto &[name, value]: unknown_attrs) { + result.emplace_back(name, value); + } + + return result; +} + enum tag_parser_state { parse_start = 0, parse_name, @@ -234,13 +890,13 @@ enum tag_parser_state { struct tag_content_parser_state { tag_parser_state cur_state = parse_start; std::string buf; - std::optional<html_component_type> cur_component; + std::string attr_name;// Store current attribute name void reset() { cur_state = parse_start; buf.clear(); - cur_component = std::nullopt; + attr_name.clear(); } }; @@ -254,56 +910,50 @@ html_parse_tag_content(rspamd_mempool_t *pool, auto state = parser_env.cur_state; /* - * Stores tag component if it doesn't exist, performing copy of the - * value + decoding of the entities - * Parser env is set to clear the current html attribute fields (saved_p and - * cur_component) + * Stores tag component creating the appropriate variant type + * Parser env is cleared after storing */ auto store_component_value = [&]() -> void { - if (parser_env.cur_component) { + if (!parser_env.attr_name.empty()) { + std::string_view attr_name_view, value_view; - if (parser_env.buf.empty()) { - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{}); + // Store attribute name in persistent memory + if (!parser_env.attr_name.empty()) { + auto *name_storage = rspamd_mempool_alloc_buffer(pool, parser_env.attr_name.size()); + memcpy(name_storage, parser_env.attr_name.data(), parser_env.attr_name.size()); + attr_name_view = {name_storage, parser_env.attr_name.size()}; } - else { - /* We need to copy buf to a persistent storage */ - auto *s = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); - if (parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_ID || - parser_env.cur_component.value() == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - /* Lowercase */ - rspamd_str_copy_lc(parser_env.buf.data(), s, parser_env.buf.size()); + // Store value in persistent memory if not empty + if (!parser_env.buf.empty()) { + auto *value_storage = rspamd_mempool_alloc_buffer(pool, parser_env.buf.size()); + + // Lowercase for id and class attributes + if (parser_env.attr_name == "id" || parser_env.attr_name == "class") { + rspamd_str_copy_lc(parser_env.buf.data(), value_storage, parser_env.buf.size()); } else { - memcpy(s, parser_env.buf.data(), parser_env.buf.size()); + memcpy(value_storage, parser_env.buf.data(), parser_env.buf.size()); } - auto sz = rspamd_html_decode_entitles_inplace(s, parser_env.buf.size()); - tag->components.emplace_back(parser_env.cur_component.value(), - std::string_view{s, sz}); + auto sz = rspamd_html_decode_entitles_inplace(value_storage, parser_env.buf.size()); + value_view = {value_storage, sz}; } + + // Create the appropriate component variant + auto component = html_component_from_string(attr_name_view, value_view); + tag->components.emplace_back(std::move(component)); } parser_env.buf.clear(); - parser_env.cur_component = std::nullopt; + parser_env.attr_name.clear(); }; auto store_component_name = [&]() -> bool { decode_html_entitles_inplace(parser_env.buf); - auto known_component_it = html_components_map.find(std::string_view{parser_env.buf}); + parser_env.attr_name = parser_env.buf; parser_env.buf.clear(); - - if (known_component_it != html_components_map.end()) { - parser_env.cur_component = known_component_it->second; - - return true; - } - else { - parser_env.cur_component = std::nullopt; - } - - return false; + return true; }; auto store_value_character = [&](bool lc) -> void { @@ -471,6 +1121,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, case parse_start_dquote: if (*in == '"') { + store_component_value(); state = spaces_after_param; } else { @@ -481,6 +1132,7 @@ html_parse_tag_content(rspamd_mempool_t *pool, case parse_start_squote: if (*in == '\'') { + store_component_value(); state = spaces_after_param; } else { @@ -620,7 +1272,7 @@ html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) -> std::optional<struct rspamd_url *> { - auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + auto found_href_maybe = tag->find_href(); if (found_href_maybe) { /* Check base url */ @@ -816,130 +1468,126 @@ html_process_img_tag(rspamd_mempool_t *pool, img = rspamd_mempool_alloc0_type(pool, struct html_image); img->tag = tag; - for (const auto ¶m: tag->components) { + // Process SRC component (preferred for img tags) or HREF component (fallback) + std::optional<std::string_view> href_value; - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { - /* Check base url */ - const auto &href_value = param.value; + // Try SRC first (standard for img tags) + if (auto src_comp = tag->find_component<html_component_src>()) { + href_value = src_comp.value()->value; + } + // Fallback to HREF (for backward compatibility or non-standard usage) + else if (auto href_comp = tag->find_href()) { + href_value = href_comp; + } - if (href_value.size() > 0) { - rspamd_ftok_t fstr; - fstr.begin = href_value.data(); - fstr.len = href_value.size(); - img->src = rspamd_mempool_ftokdup(pool, &fstr); + if (href_value && href_value->size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value->data(); + fstr.len = href_value->size(); + img->src = rspamd_mempool_ftokdup(pool, &fstr); - if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), - "cid:", sizeof("cid:") - 1) == 0) { - /* We have an embedded image */ - img->src += sizeof("cid:") - 1; - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; - } - else { - if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), - "data:", sizeof("data:") - 1) == 0) { - /* We have an embedded image in HTML tag */ - img->flags |= - (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); - html_process_data_image(pool, img, href_value); - hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; - } - else { - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; - if (img->src) { - - std::string_view cpy{href_value}; - auto maybe_url = html_process_url(pool, cpy); - - if (maybe_url) { - img->url = maybe_url.value(); - struct rspamd_url *existing; - - img->url->flags |= RSPAMD_URL_FLAG_IMAGE; - existing = rspamd_url_set_add_or_return(url_set, - img->url); - - if (existing && existing != img->url) { - /* - * We have some other URL that could be - * found, e.g. from another part. However, - * we still want to set an image flag on it - */ - existing->flags |= img->url->flags; - existing->count++; - } - else if (part_urls) { - /* New url */ - g_ptr_array_add(part_urls, img->url); - } - } - } - } - } - } + if (href_value->size() > sizeof("cid:") - 1 && memcmp(href_value->data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->src += sizeof("cid:") - 1; + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; } + else { + if (href_value->size() > sizeof("data:") - 1 && memcmp(href_value->data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, *href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + } + else { + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { + std::string_view cpy{*href_value}; + auto maybe_url = html_process_url(pool, cpy); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { - unsigned long val; + if (maybe_url) { + img->url = maybe_url.value(); + struct rspamd_url *existing; - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->height = val; - } + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, + img->url); - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { - unsigned long val; - - rspamd_strtoul(param.value.data(), param.value.size(), &val); - img->width = val; + if (existing && existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); + } + } + } + } } + } - /* TODO: rework to css at some time */ - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - if (img->height == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "height", sizeof("height") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("height") - 1); + // Process numeric dimensions using the new helper methods + if (auto height = tag->find_height()) { + img->height = height.value(); + } - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->height = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + if (auto width = tag->find_width()) { + img->width = width.value(); + } + + // Process style component for dimensions + if (auto style_value = tag->find_style()) { + if (img->height == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } - if (img->width == 0) { - auto style_st = param.value; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "width", sizeof("width") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("width") - 1); - - for (auto i = 0; i < substr.size(); i++) { - auto t = substr[i]; - if (g_ascii_isdigit(t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->width = val; - break; - } - else if (!g_ascii_isspace(t) && t != '=' && t != ':') { - /* Fallback */ - break; - } + } + if (img->width == 0) { + auto pos = rspamd_substring_search_caseless(style_value->data(), + style_value->size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_value->substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit(t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace(t) && t != '=' && t != ':') { + /* Fallback */ + break; } } } @@ -968,7 +1616,7 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, khash_t(rspamd_url_hash) * url_set, GPtrArray *part_urls) -> void { - auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); + auto found_rel_maybe = tag->find_rel(); if (found_rel_maybe) { if (found_rel_maybe.value() == "icon") { @@ -984,24 +1632,23 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; bool hidden = false; - for (const auto ¶m: tag->components) { - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { - maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); - } - - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { - maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); - } + // Process color components + if (auto color_comp = tag->find_component<html_component_color>()) { + maybe_fgcolor = css::css_value::maybe_color_from_string(color_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - tag->block = rspamd::css::parse_css_declaration(pool, param.value); - } + if (auto bgcolor_comp = tag->find_component<html_component_bgcolor>()) { + maybe_bgcolor = css::css_value::maybe_color_from_string(bgcolor_comp.value()->value); + } - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HIDDEN) { - hidden = true; - } + // Process style component + if (auto style_value = tag->find_style()) { + tag->block = rspamd::css::parse_css_declaration(pool, *style_value); } + // Check if hidden + hidden = tag->is_hidden(); + if (!tag->block) { tag->block = html_block::undefined_html_block_pool(pool); } @@ -1284,7 +1931,7 @@ html_append_tag_content(rspamd_mempool_t *pool, } else if (tag->id == Tag_IMG) { /* Process ALT if presented */ - auto maybe_alt = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_ALT); + auto maybe_alt = tag->find_alt(); if (maybe_alt) { if (!hc->parsed.empty() && !g_ascii_isspace(hc->parsed.back())) { @@ -1384,9 +2031,7 @@ auto html_process_input(struct rspamd_task *task, overflow_input = true; } - auto new_tag = [&](int flags = 0) -> struct html_tag * - { - + auto new_tag = [&](int flags = 0) -> struct html_tag * { if (hc->all_tags.size() > rspamd::html::max_tags) { hc->flags |= RSPAMD_HTML_FLAG_TOO_MANY_TAGS; @@ -2151,7 +2796,7 @@ auto html_process_input(struct rspamd_task *task, /* Leftover after content */ switch (state) { case tags_limit_overflow: - html_append_parsed(hc, {c, (std::size_t)(end - c)}, + html_append_parsed(hc, {c, (std::size_t) (end - c)}, false, end - start, hc->parsed); break; default: @@ -2390,4 +3035,4 @@ gsize rspamd_html_get_tags_count(void *html_content) } return hc->all_tags.size(); -}
\ No newline at end of file +} diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 309d76177..6d41f1337 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -1,11 +1,11 @@ -/*- - * Copyright 2021 Vsevolod Stakhov +/* + * Copyright 2025 Vsevolod Stakhov * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, @@ -26,6 +26,7 @@ #include <cstdint> #include "html_tags.h" +#include "libutil/str_util.h" struct rspamd_url; struct html_image; @@ -34,7 +35,8 @@ namespace rspamd::html { struct html_content; /* Forward declaration */ -enum class html_component_type : std::uint8_t { +// Internal enum for mapping (not exposed in public API) +enum class html_component_enum_type : std::uint8_t { RSPAMD_HTML_COMPONENT_NAME = 0, RSPAMD_HTML_COMPONENT_HREF, RSPAMD_HTML_COMPONENT_COLOR, @@ -48,8 +50,1214 @@ enum class html_component_type : std::uint8_t { RSPAMD_HTML_COMPONENT_ALT, RSPAMD_HTML_COMPONENT_ID, RSPAMD_HTML_COMPONENT_HIDDEN, + // Typography + RSPAMD_HTML_COMPONENT_FONT_FAMILY, + RSPAMD_HTML_COMPONENT_FONT_SIZE, + RSPAMD_HTML_COMPONENT_FONT_WEIGHT, + RSPAMD_HTML_COMPONENT_FONT_STYLE, + RSPAMD_HTML_COMPONENT_TEXT_ALIGN, + RSPAMD_HTML_COMPONENT_TEXT_DECORATION, + RSPAMD_HTML_COMPONENT_LINE_HEIGHT, + // Layout & positioning + RSPAMD_HTML_COMPONENT_MARGIN, + RSPAMD_HTML_COMPONENT_MARGIN_TOP, + RSPAMD_HTML_COMPONENT_MARGIN_BOTTOM, + RSPAMD_HTML_COMPONENT_MARGIN_LEFT, + RSPAMD_HTML_COMPONENT_MARGIN_RIGHT, + RSPAMD_HTML_COMPONENT_PADDING, + RSPAMD_HTML_COMPONENT_PADDING_TOP, + RSPAMD_HTML_COMPONENT_PADDING_BOTTOM, + RSPAMD_HTML_COMPONENT_PADDING_LEFT, + RSPAMD_HTML_COMPONENT_PADDING_RIGHT, + RSPAMD_HTML_COMPONENT_BORDER, + RSPAMD_HTML_COMPONENT_BORDER_COLOR, + RSPAMD_HTML_COMPONENT_BORDER_WIDTH, + RSPAMD_HTML_COMPONENT_BORDER_STYLE, + // Display & visibility + RSPAMD_HTML_COMPONENT_DISPLAY, + RSPAMD_HTML_COMPONENT_VISIBILITY, + RSPAMD_HTML_COMPONENT_OPACITY, + // Dimensions + RSPAMD_HTML_COMPONENT_MIN_WIDTH, + RSPAMD_HTML_COMPONENT_MAX_WIDTH, + RSPAMD_HTML_COMPONENT_MIN_HEIGHT, + RSPAMD_HTML_COMPONENT_MAX_HEIGHT, + // Table attributes + RSPAMD_HTML_COMPONENT_CELLPADDING, + RSPAMD_HTML_COMPONENT_CELLSPACING, + RSPAMD_HTML_COMPONENT_VALIGN, + RSPAMD_HTML_COMPONENT_ALIGN, + // Form attributes + RSPAMD_HTML_COMPONENT_TYPE, + RSPAMD_HTML_COMPONENT_VALUE, + RSPAMD_HTML_COMPONENT_PLACEHOLDER, + RSPAMD_HTML_COMPONENT_DISABLED, + RSPAMD_HTML_COMPONENT_READONLY, + RSPAMD_HTML_COMPONENT_CHECKED, + RSPAMD_HTML_COMPONENT_SELECTED, + // Link & media + RSPAMD_HTML_COMPONENT_TARGET, + RSPAMD_HTML_COMPONENT_TITLE, + RSPAMD_HTML_COMPONENT_SRC, + // Meta & document + RSPAMD_HTML_COMPONENT_CHARSET, + RSPAMD_HTML_COMPONENT_CONTENT, + RSPAMD_HTML_COMPONENT_HTTP_EQUIV, + // Accessibility + RSPAMD_HTML_COMPONENT_ROLE, + RSPAMD_HTML_COMPONENT_TABINDEX, + // Background + RSPAMD_HTML_COMPONENT_BACKGROUND, + RSPAMD_HTML_COMPONENT_BACKGROUND_IMAGE, + RSPAMD_HTML_COMPONENT_BACKGROUND_COLOR, + RSPAMD_HTML_COMPONENT_BACKGROUND_REPEAT, + RSPAMD_HTML_COMPONENT_BACKGROUND_POSITION, + // Email-specific tracking + RSPAMD_HTML_COMPONENT_DATA_TRACK, + RSPAMD_HTML_COMPONENT_DATA_ID, + RSPAMD_HTML_COMPONENT_DATA_URL, }; +// Forward declarations for component types +struct html_component_name; +struct html_component_href; +struct html_component_color; +struct html_component_bgcolor; +struct html_component_style; +struct html_component_class; +struct html_component_width; +struct html_component_height; +struct html_component_size; +struct html_component_rel; +struct html_component_alt; +struct html_component_id; +struct html_component_hidden; +struct html_component_unknown; + +// Base interface for all components +struct html_component_base { + virtual ~html_component_base() = default; + virtual constexpr std::string_view get_string_value() const = 0; +}; + +// String-based components +struct html_component_name : html_component_base { + std::string_view value; + explicit constexpr html_component_name(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_href : html_component_base { + std::string_view value; + explicit constexpr html_component_href(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_style : html_component_base { + std::string_view value; + explicit constexpr html_component_style(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_class : html_component_base { + std::string_view value; + explicit constexpr html_component_class(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_rel : html_component_base { + std::string_view value; + explicit constexpr html_component_rel(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_alt : html_component_base { + std::string_view value; + explicit constexpr html_component_alt(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_id : html_component_base { + std::string_view value; + explicit constexpr html_component_id(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +// Color components (could be extended to parse actual colors) +struct html_component_color : html_component_base { + std::string_view value; + explicit constexpr html_component_color(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_bgcolor : html_component_base { + std::string_view value; + explicit constexpr html_component_bgcolor(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +// Numeric components +struct html_component_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_width(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_height(const std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_size : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Boolean/flag component +struct html_component_hidden : html_component_base { + bool present; + explicit constexpr html_component_hidden() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +// Unknown component with both name and value +struct html_component_unknown : html_component_base { + std::string_view name; + std::string_view value; + + constexpr html_component_unknown(std::string_view n, std::string_view v) + : name(n), value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } + constexpr std::string_view get_name() const + { + return name; + } +}; + +// Typography components +struct html_component_font_family : html_component_base { + std::string_view value; + explicit constexpr html_component_font_family(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_font_size : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_font_size(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + constexpr std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_font_weight : html_component_base { + std::string_view value; + explicit constexpr html_component_font_weight(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_font_style : html_component_base { + std::string_view value; + explicit constexpr html_component_font_style(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_text_align : html_component_base { + std::string_view value; + explicit constexpr html_component_text_align(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_text_decoration : html_component_base { + std::string_view value; + explicit constexpr html_component_text_decoration(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_line_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_line_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Layout components (most are string-based for flexibility) +struct html_component_margin : html_component_base { + std::string_view value; + explicit constexpr html_component_margin(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_top : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_top(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_bottom : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_bottom(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_left : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_left(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_margin_right : html_component_base { + std::string_view value; + explicit constexpr html_component_margin_right(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding : html_component_base { + std::string_view value; + explicit constexpr html_component_padding(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_top : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_top(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_bottom : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_bottom(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_left : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_left(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_padding_right : html_component_base { + std::string_view value; + explicit constexpr html_component_padding_right(std::string_view v) + : value(v) + { + } + constexpr std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border : html_component_base { + std::string_view value; + explicit html_component_border(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border_color : html_component_base { + std::string_view value; + explicit html_component_border_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_border_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_border_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_border_style : html_component_base { + std::string_view value; + explicit html_component_border_style(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Display components +struct html_component_display : html_component_base { + std::string_view value; + explicit html_component_display(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_visibility : html_component_base { + std::string_view value; + explicit html_component_visibility(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_opacity : html_component_base { + std::string_view raw_value; + std::optional<float> numeric_value; + + explicit html_component_opacity(std::string_view v) + : raw_value(v) + { + char *endptr; + auto val = std::strtof(v.data(), &endptr); + if (endptr != v.data() && val >= 0.0f && val <= 1.0f) { + numeric_value = val; + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<float> get_numeric_value() const + { + return numeric_value; + } +}; + +// Additional dimension components +struct html_component_min_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_min_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_max_width : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_max_width(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_min_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_min_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_max_height : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_max_height(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Table components +struct html_component_cellpadding : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_cellpadding(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_cellspacing : html_component_base { + std::string_view raw_value; + std::optional<std::uint32_t> numeric_value; + + explicit html_component_cellspacing(std::string_view v) + : raw_value(v) + { + unsigned long val; + if (rspamd_strtoul(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::uint32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::uint32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +struct html_component_valign : html_component_base { + std::string_view value; + explicit html_component_valign(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_align : html_component_base { + std::string_view value; + explicit html_component_align(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Form components +struct html_component_type : html_component_base { + std::string_view value; + explicit html_component_type(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_value : html_component_base { + std::string_view value; + explicit html_component_value(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_placeholder : html_component_base { + std::string_view value; + explicit html_component_placeholder(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Boolean form components +struct html_component_disabled : html_component_base { + bool present; + explicit constexpr html_component_disabled() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_readonly : html_component_base { + bool present; + explicit constexpr html_component_readonly() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_checked : html_component_base { + bool present; + explicit constexpr html_component_checked() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +struct html_component_selected : html_component_base { + bool present; + explicit constexpr html_component_selected() + : present(true) + { + } + constexpr std::string_view get_string_value() const override + { + return present ? "true" : "false"; + } + constexpr bool is_present() const + { + return present; + } +}; + +// Link & media components +struct html_component_target : html_component_base { + std::string_view value; + explicit html_component_target(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_title : html_component_base { + std::string_view value; + explicit html_component_title(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_src : html_component_base { + std::string_view value; + explicit html_component_src(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Meta components +struct html_component_charset : html_component_base { + std::string_view value; + explicit html_component_charset(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_content : html_component_base { + std::string_view value; + explicit html_component_content(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_http_equiv : html_component_base { + std::string_view value; + explicit html_component_http_equiv(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Accessibility components +struct html_component_role : html_component_base { + std::string_view value; + explicit html_component_role(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_tabindex : html_component_base { + std::string_view raw_value; + std::optional<std::int32_t> numeric_value; + + explicit html_component_tabindex(std::string_view v) + : raw_value(v) + { + long val; + if (rspamd_strtol(v.data(), v.size(), &val)) { + numeric_value = static_cast<std::int32_t>(val); + } + } + + std::string_view get_string_value() const override + { + return raw_value; + } + std::optional<std::int32_t> get_numeric_value() const + { + return numeric_value; + } +}; + +// Background components +struct html_component_background : html_component_base { + std::string_view value; + explicit html_component_background(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_image : html_component_base { + std::string_view value; + explicit html_component_background_image(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_color : html_component_base { + std::string_view value; + explicit html_component_background_color(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_repeat : html_component_base { + std::string_view value; + explicit html_component_background_repeat(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_background_position : html_component_base { + std::string_view value; + explicit html_component_background_position(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// Email tracking components +struct html_component_data_track : html_component_base { + std::string_view value; + explicit html_component_data_track(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_data_id : html_component_base { + std::string_view value; + explicit html_component_data_id(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +struct html_component_data_url : html_component_base { + std::string_view value; + explicit html_component_data_url(std::string_view v) + : value(v) + { + } + std::string_view get_string_value() const override + { + return value; + } +}; + +// The variant type that holds all possible components +using html_tag_component = std::variant< + html_component_name, + html_component_href, + html_component_color, + html_component_bgcolor, + html_component_style, + html_component_class, + html_component_width, + html_component_height, + html_component_size, + html_component_rel, + html_component_alt, + html_component_id, + html_component_hidden, + // Typography + html_component_font_family, + html_component_font_size, + html_component_font_weight, + html_component_font_style, + html_component_text_align, + html_component_text_decoration, + html_component_line_height, + // Layout + html_component_margin, + html_component_margin_top, + html_component_margin_bottom, + html_component_margin_left, + html_component_margin_right, + html_component_padding, + html_component_padding_top, + html_component_padding_bottom, + html_component_padding_left, + html_component_padding_right, + html_component_border, + html_component_border_color, + html_component_border_width, + html_component_border_style, + // Display + html_component_display, + html_component_visibility, + html_component_opacity, + // Dimensions + html_component_min_width, + html_component_max_width, + html_component_min_height, + html_component_max_height, + // Table + html_component_cellpadding, + html_component_cellspacing, + html_component_valign, + html_component_align, + // Form + html_component_type, + html_component_value, + html_component_placeholder, + html_component_disabled, + html_component_readonly, + html_component_checked, + html_component_selected, + // Link & media + html_component_target, + html_component_title, + html_component_src, + // Meta + html_component_charset, + html_component_content, + html_component_http_equiv, + // Accessibility + html_component_role, + html_component_tabindex, + // Background + html_component_background, + html_component_background_image, + html_component_background_color, + html_component_background_repeat, + html_component_background_position, + // Email tracking + html_component_data_track, + html_component_data_id, + html_component_data_url, + // Unknown + html_component_unknown>; + +/** + * Returns component variant from a string + * @param name attribute name + * @param value attribute value + * @return variant component + */ +auto html_component_from_string(std::string_view name, std::string_view value) -> html_tag_component; + /* Public tags flags */ /* XML tag */ #define FL_XML (1u << CM_USER_SHIFT) @@ -62,23 +1270,7 @@ enum class html_component_type : std::uint8_t { #define FL_COMMENT (1 << (CM_USER_SHIFT + 6)) #define FL_VIRTUAL (1 << (CM_USER_SHIFT + 7)) -/** - * Returns component type from a string - * @param st - * @return - */ -auto html_component_from_string(const std::string_view &st) -> std::optional<html_component_type>; - using html_tag_extra_t = std::variant<std::monostate, struct rspamd_url *, struct html_image *>; -struct html_tag_component { - html_component_type type; - std::string_view value; - - html_tag_component(html_component_type type, std::string_view value) - : type(type), value(value) - { - } -}; /* Pairing closing tag representation */ struct html_closing_tag { @@ -105,26 +1297,128 @@ struct html_tag { std::vector<struct html_tag *> children; struct html_tag *parent; - auto find_component(html_component_type what) const -> std::optional<std::string_view> + // Template method to find component by type + template<typename T> + auto find_component() const -> std::optional<const T *> { for (const auto &comp: components) { - if (comp.type == what) { - return comp.value; + if (std::holds_alternative<T>(comp)) { + return &std::get<T>(comp); } } + return std::nullopt; + } + // Helper methods for common component access + auto find_href() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_href>()) { + return comp.value()->value; + } return std::nullopt; } - auto find_component(std::optional<html_component_type> what) const -> std::optional<std::string_view> + auto find_class() const -> std::optional<std::string_view> { - if (what) { - return find_component(what.value()); + if (auto comp = find_component<html_component_class>()) { + return comp.value()->value; } + return std::nullopt; + } + auto find_id() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_id>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_width() const -> std::optional<std::uint32_t> + { + if (auto comp = find_component<html_component_width>()) { + return comp.value()->get_numeric_value(); + } + return std::nullopt; + } + + auto find_height() const -> std::optional<std::uint32_t> + { + if (auto comp = find_component<html_component_height>()) { + return comp.value()->get_numeric_value(); + } return std::nullopt; } + auto find_style() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_style>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_alt() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_alt>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto find_rel() const -> std::optional<std::string_view> + { + if (auto comp = find_component<html_component_rel>()) { + return comp.value()->value; + } + return std::nullopt; + } + + auto is_hidden() const -> bool + { + return find_component<html_component_hidden>().has_value(); + } + + auto find_unknown_component(std::string_view attr_name) const -> std::optional<std::string_view> + { + for (const auto &comp: components) { + if (std::holds_alternative<html_component_unknown>(comp)) { + const auto &unknown = std::get<html_component_unknown>(comp); + if (unknown.name == attr_name) { + return unknown.value; + } + } + } + return std::nullopt; + } + + auto get_unknown_components() const -> std::vector<std::pair<std::string_view, std::string_view>> + { + std::vector<std::pair<std::string_view, std::string_view>> unknown_attrs; + for (const auto &comp: components) { + if (std::holds_alternative<html_component_unknown>(comp)) { + const auto &unknown = std::get<html_component_unknown>(comp); + unknown_attrs.emplace_back(unknown.name, unknown.value); + } + } + return unknown_attrs; + } + + // Generic visitor method for processing all components + template<typename Visitor> + auto visit_components(Visitor &&visitor) const + { + for (const auto &comp: components) { + std::visit(std::forward<Visitor>(visitor), comp); + } + } + + // Find any component by attribute name + auto find_component_by_name(std::string_view attr_name) const -> std::optional<std::string_view>; + + // Get all attributes as name-value pairs + auto get_all_attributes() const -> std::vector<std::pair<std::string_view, std::string_view>>; + auto clear(void) -> void { id = Tag_UNKNOWN; @@ -137,7 +1431,7 @@ struct html_tag { closing.clear(); } - constexpr auto get_content_length() const -> std::size_t + auto get_content_length() const -> std::size_t { if (flags & (FL_IGNORE | CM_HEAD)) { return 0; diff --git a/src/lua/lua_html.cxx b/src/lua/lua_html.cxx index 090e2af55..9b0deed45 100644 --- a/src/lua/lua_html.cxx +++ b/src/lua/lua_html.cxx @@ -179,6 +179,44 @@ LUA_FUNCTION_DEF(html_tag, get_style); */ LUA_FUNCTION_DEF(html_tag, get_attribute); +/*** + * @method html_tag:get_all_attributes() + * Returns table of all attributes for the element + * @return {table} table with attribute names as keys and values as strings + */ +LUA_FUNCTION_DEF(html_tag, get_all_attributes); + +/*** + * @method html_tag:get_unknown_attributes() + * Returns table of unknown/unrecognized attributes for the element + * @return {table} table with unknown attribute names as keys and values as strings + */ +LUA_FUNCTION_DEF(html_tag, get_unknown_attributes); + +/*** + * @method html_tag:get_children() + * Returns array of child tags for the element + * @return {table} array of child html_tag objects + */ +LUA_FUNCTION_DEF(html_tag, get_children); + +/*** + * @method html_tag:has_attribute(name) + * Checks if element has the specified attribute + * @param {string} name attribute name to check + * @return {boolean} true if attribute exists + */ +LUA_FUNCTION_DEF(html_tag, has_attribute); + +/*** + * @method html_tag:get_numeric_attribute(name) + * Returns numeric value of attribute (if supported and parseable) + * Works for attributes like width, height, font-size, etc. + * @param {string} name attribute name + * @return {number|nil} numeric value or nil if not numeric/parseable + */ +LUA_FUNCTION_DEF(html_tag, get_numeric_attribute); + static const struct luaL_reg taglib_m[] = { LUA_INTERFACE_DEF(html_tag, get_type), LUA_INTERFACE_DEF(html_tag, get_extra), @@ -188,6 +226,11 @@ static const struct luaL_reg taglib_m[] = { LUA_INTERFACE_DEF(html_tag, get_content_length), LUA_INTERFACE_DEF(html_tag, get_style), LUA_INTERFACE_DEF(html_tag, get_attribute), + LUA_INTERFACE_DEF(html_tag, get_all_attributes), + LUA_INTERFACE_DEF(html_tag, get_unknown_attributes), + LUA_INTERFACE_DEF(html_tag, get_children), + LUA_INTERFACE_DEF(html_tag, has_attribute), + LUA_INTERFACE_DEF(html_tag, get_numeric_attribute), {"__tostring", rspamd_lua_class_tostring}, {NULL, NULL}}; @@ -704,6 +747,29 @@ lua_html_tag_get_style(lua_State *L) } static int +lua_html_tag_get_all_attributes(lua_State *L) +{ + LUA_TRACE_POINT; + struct lua_html_tag *ltag = lua_check_html_tag(L, 1); + + if (ltag) { + auto all_attrs = ltag->tag->get_all_attributes(); + lua_createtable(L, 0, all_attrs.size()); + + for (const auto &[name, value]: all_attrs) { + lua_pushlstring(L, name.data(), name.size()); + lua_pushlstring(L, value.data(), value.size()); + lua_settable(L, -3); + } + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + +static int lua_html_tag_get_attribute(lua_State *L) { LUA_TRACE_POINT; @@ -712,8 +778,7 @@ lua_html_tag_get_attribute(lua_State *L) const char *attr_name = luaL_checklstring(L, 2, &slen); if (ltag && attr_name) { - auto maybe_attr = ltag->tag->find_component( - rspamd::html::html_component_from_string({attr_name, slen})); + auto maybe_attr = ltag->tag->find_component_by_name({attr_name, slen}); if (maybe_attr) { lua_pushlstring(L, maybe_attr->data(), maybe_attr->size()); @@ -729,6 +794,206 @@ lua_html_tag_get_attribute(lua_State *L) return 1; } +static int +lua_html_tag_get_unknown_attributes(lua_State *L) +{ + LUA_TRACE_POINT; + struct lua_html_tag *ltag = lua_check_html_tag(L, 1); + + if (ltag) { + auto unknown_attrs = ltag->tag->get_unknown_components(); + lua_createtable(L, 0, unknown_attrs.size()); + + for (const auto &[name, value]: unknown_attrs) { + lua_pushlstring(L, name.data(), name.size()); + lua_pushlstring(L, value.data(), value.size()); + lua_settable(L, -3); + } + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + +static int +lua_html_tag_get_children(lua_State *L) +{ + LUA_TRACE_POINT; + struct lua_html_tag *ltag = lua_check_html_tag(L, 1); + + if (ltag) { + lua_createtable(L, ltag->tag->children.size(), 0); + + for (int i = 0; i < ltag->tag->children.size(); i++) { + auto *child_tag = static_cast<lua_html_tag *>(lua_newuserdata(L, sizeof(lua_html_tag))); + child_tag->tag = ltag->tag->children[i]; + child_tag->html = ltag->html; + rspamd_lua_setclass(L, rspamd_html_tag_classname, -1); + lua_rawseti(L, -2, i + 1); + } + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + +static int +lua_html_tag_has_attribute(lua_State *L) +{ + LUA_TRACE_POINT; + struct lua_html_tag *ltag = lua_check_html_tag(L, 1); + gsize slen; + const char *attr_name = luaL_checklstring(L, 2, &slen); + + if (ltag && attr_name) { + auto maybe_attr = ltag->tag->find_component_by_name({attr_name, slen}); + lua_pushboolean(L, maybe_attr.has_value()); + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + +static int +lua_html_tag_get_numeric_attribute(lua_State *L) +{ + LUA_TRACE_POINT; + struct lua_html_tag *ltag = lua_check_html_tag(L, 1); + gsize slen; + const char *attr_name = luaL_checklstring(L, 2, &slen); + + if (ltag && attr_name) { + std::string_view name_view{attr_name, slen}; + + // Check for numeric components + if (name_view == "width") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_width>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "height") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_height>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "size") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_size>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "font-size") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_font_size>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "line-height") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_line_height>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "border-width") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_border_width>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "opacity") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_opacity>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushnumber(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "min-width") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_min_width>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "max-width") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_max_width>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "min-height") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_min_height>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "max-height") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_max_height>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "cellpadding") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_cellpadding>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "cellspacing") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_cellspacing>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + else if (name_view == "tabindex") { + if (auto comp = ltag->tag->find_component<rspamd::html::html_component_tabindex>()) { + if (auto numeric_val = comp.value()->get_numeric_value()) { + lua_pushinteger(L, numeric_val.value()); + return 1; + } + } + } + + lua_pushnil(L); + } + else { + return luaL_error(L, "invalid arguments"); + } + + return 1; +} + void luaopen_html(lua_State *L) { rspamd_lua_new_class(L, rspamd_html_classname, htmllib_m); diff --git a/src/lua/lua_parsers.c b/src/lua/lua_parsers.c index 39e1b0317..eb7fa6bf5 100644 --- a/src/lua/lua_parsers.c +++ b/src/lua/lua_parsers.c @@ -46,6 +46,14 @@ */ /*** + * @function parsers.parse_html_content(input, mempool) + * Parses HTML and returns the HTML content object for structure analysis + * @param {string|text} in input HTML + * @param {rspamd_mempool} mempool memory pool for HTML content management + * @return {html_content} HTML content object with tag structure + */ +LUA_FUNCTION_DEF(parsers, parse_html_content); +/*** * @function parsers.parse_mail_address(str, [pool]) * Parses email address and returns a table of tables in the following format: * @@ -93,6 +101,7 @@ static const struct luaL_reg parserslib_f[] = { LUA_INTERFACE_DEF(parsers, tokenize_text), LUA_INTERFACE_DEF(parsers, parse_html), + LUA_INTERFACE_DEF(parsers, parse_html_content), LUA_INTERFACE_DEF(parsers, parse_mail_address), LUA_INTERFACE_DEF(parsers, parse_content_type), LUA_INTERFACE_DEF(parsers, parse_smtp_date), @@ -242,6 +251,62 @@ int lua_parsers_parse_html(lua_State *L) return 1; } +static int lua_parsers_parse_html_content(lua_State *L) +{ + LUA_TRACE_POINT; + struct rspamd_lua_text *t; + const char *start = NULL; + gsize len; + GByteArray *in; + rspamd_mempool_t *pool; + void *hc; + void **phc; + + if (lua_type(L, 1) == LUA_TUSERDATA) { + t = lua_check_text(L, 1); + + if (t != NULL) { + start = t->start; + len = t->len; + } + } + else if (lua_type(L, 1) == LUA_TSTRING) { + start = luaL_checklstring(L, 1, &len); + } + + if (lua_type(L, 2) != LUA_TUSERDATA) { + return luaL_error(L, "invalid arguments: mempool expected as second argument"); + } + + pool = rspamd_lua_check_mempool(L, 2); + if (!pool) { + return luaL_error(L, "invalid mempool argument"); + } + + if (start != NULL) { + in = g_byte_array_sized_new(len); + g_byte_array_append(in, start, len); + + hc = rspamd_html_process_part(pool, in); + + if (hc) { + phc = lua_newuserdata(L, sizeof(void *)); + *phc = hc; + rspamd_lua_setclass(L, rspamd_html_classname, -1); + } + else { + lua_pushnil(L); + } + + g_byte_array_free(in, TRUE); + } + else { + lua_pushnil(L); + } + + return 1; +} + int lua_parsers_parse_mail_address(lua_State *L) { LUA_TRACE_POINT; @@ -409,4 +474,4 @@ lua_load_parsers(lua_State *L) void luaopen_parsers(lua_State *L) { rspamd_lua_add_preload(L, "rspamd_parsers", lua_load_parsers); -}
\ No newline at end of file +} diff --git a/test/lua/unit/html.lua b/test/lua/unit/html.lua index 81c52ec1b..1802dc984 100644 --- a/test/lua/unit/html.lua +++ b/test/lua/unit/html.lua @@ -1,11 +1,10 @@ context("HTML processing", function() local rspamd_util = require("rspamd_util") - local logger = require("rspamd_logger") local cases = { - -- Entities - {[[<html><body>.firebaseapp.com</body></html>]], - [[.firebaseapp.com]]}, - {[[ + -- Entities + { [[<html><body>.firebaseapp.com</body></html>]], + [[.firebaseapp.com]] }, + { [[ <?xml version="1.0" encoding="iso-8859-1"?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" @@ -22,8 +21,8 @@ context("HTML processing", function() </p> </body> - </html>]], 'Hello, world!\n'}, - {[[ + </html>]], 'Hello, world!\n' }, + { [[ <!DOCTYPE html> <html lang="en"> <head> @@ -39,8 +38,8 @@ context("HTML processing", function() Hello, world! </body> </html> - ]], 'Hello, world!'}, - {[[ + ]], 'Hello, world!' }, + { [[ <html lang="en"> <head> <meta charset="utf-8"> @@ -56,8 +55,8 @@ context("HTML processing", function() </div> </body> </html> - ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n'}, - {[[ + ]], 'Hello, world!\ntest\ncontentmore content\ncontent inside div\n' }, + { [[ <html lang="en"> <head> <meta charset="utf-8"> @@ -83,8 +82,8 @@ context("HTML processing", function() </body> </html> - ]], 'content\nheada headb\ndata1 data2\n'}, - {[[ + ]], 'content\nheada headb\ndata1 data2\n' }, + { [[ <html lang="en"> <head> <meta charset="utf-8"> @@ -97,17 +96,398 @@ context("HTML processing", function() a b a > b a < b a & b 'a "a" </body> </html> - ]], 'a b a > b a < b a & b \'a "a"'}, + ]], 'a b a > b a < b a & b \'a "a"' }, } - for i,c in ipairs(cases) do + for i, c in ipairs(cases) do test("Extract text from HTML " .. tostring(i), function() local t = rspamd_util.parse_html(c[1]) assert_not_nil(t) assert_equal(c[2], tostring(t), string.format("'%s' doesn't match with '%s'", - c[2], t)) - + c[2], t)) end) end + + -- Test cases for new HTML tag API methods + local function parse_html_and_extract_tags(html_content, pool) + local rspamd_parsers = require("rspamd_parsers") + + local parsed = rspamd_parsers.parse_html_content(html_content, pool) + local tags = {} + + if parsed then + parsed:foreach_tag("any", function(tag, content_length, is_leaf) + table.insert(tags, tag) + return false + end) + end + + return parsed, tags + end + + test("HTML tag get_all_attributes basic test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div class="test-class" id="test-id" style="color: red;" width="100">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + assert_true(#tags > 0) + + -- Find the div tag + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + local attrs = div_tag:get_all_attributes() + assert_not_nil(attrs) + + -- Check that we have the expected attributes + assert_equal("test-class", attrs["class"]) + assert_equal("test-id", attrs["id"]) + assert_equal("color: red;", attrs["style"]) + assert_equal("100", attrs["width"]) + + pool:destroy() + end) + + test("HTML tag has_attribute test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<img src="test.jpg" width="100" height="50" alt="Test image" hidden />]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local img_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "img" then + img_tag = tag + break + end + end + + assert_not_nil(img_tag) + + -- Test existing attributes + assert_true(img_tag:has_attribute("src")) + assert_true(img_tag:has_attribute("width")) + assert_true(img_tag:has_attribute("height")) + assert_true(img_tag:has_attribute("alt")) + assert_true(img_tag:has_attribute("hidden")) + + -- Test non-existing attributes + assert_false(img_tag:has_attribute("nonexistent")) + assert_false(img_tag:has_attribute("class")) + assert_false(img_tag:has_attribute("")) + + pool:destroy() + end) + + test("HTML tag get_numeric_attribute test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div width="200" height="150" font-size="14" opacity="0.8" tabindex="5">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + -- Test numeric attributes + assert_equal(200, div_tag:get_numeric_attribute("width")) + assert_equal(150, div_tag:get_numeric_attribute("height")) + assert_equal(14, div_tag:get_numeric_attribute("font-size")) + + -- Test opacity with floating-point tolerance + local opacity = div_tag:get_numeric_attribute("opacity") + assert_not_nil(opacity) + assert_true(math.abs(opacity - 0.8) < 0.01, string.format("Expected opacity ~0.8, got %f", opacity)) + + assert_equal(5, div_tag:get_numeric_attribute("tabindex")) + + -- Test non-numeric attributes + assert_nil(div_tag:get_numeric_attribute("nonexistent")) + + pool:destroy() + end) + + test("HTML tag get_unknown_attributes test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div class="known" data-track="analytics" unknown-attr="test-value" custom-id="12345">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + local unknown_attrs = div_tag:get_unknown_attributes() + assert_not_nil(unknown_attrs) + + -- Should include unknown attributes but not known ones like "class" + assert_not_nil(unknown_attrs["unknown-attr"]) + assert_equal("test-value", unknown_attrs["unknown-attr"]) + assert_not_nil(unknown_attrs["custom-id"]) + assert_equal("12345", unknown_attrs["custom-id"]) + + -- data-track should be recognized as a known attribute now + -- but if not, it would appear in unknown attributes + + pool:destroy() + end) + + test("HTML tag get_children test", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[ + <div id="parent"> + <p>First child</p> + <span>Second child</span> + <img src="test.jpg" /> + </div> + ]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local parent_div = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" and tag:has_attribute("id") and tag:get_attribute("id") == "parent" then + parent_div = tag + break + end + end + + assert_not_nil(parent_div) + + local children = parent_div:get_children() + assert_not_nil(children) + assert_equal(3, #children) + + -- Check child types + local child_types = {} + for _, child in ipairs(children) do + table.insert(child_types, child:get_type()) + end + + -- Should contain p, span, and img + local child_types_str = table.concat(child_types, ",") + assert_true(child_types_str:find("p") ~= nil) + assert_true(child_types_str:find("span") ~= nil) + assert_true(child_types_str:find("img") ~= nil) + + pool:destroy() + end) + + test("HTML tag get_attribute vs get_all_attributes consistency", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<a href="https://example.com" class="link" target="_blank" title="Example Link">Link</a>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local a_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "a" then + a_tag = tag + break + end + end + + assert_not_nil(a_tag) + + local all_attrs = a_tag:get_all_attributes() + + -- Test that individual get_attribute calls match get_all_attributes + for attr_name, attr_value in pairs(all_attrs) do + assert_equal(attr_value, a_tag:get_attribute(attr_name), + string.format("Attribute '%s' mismatch: get_attribute='%s', get_all_attributes='%s'", + attr_name, a_tag:get_attribute(attr_name) or "nil", attr_value)) + end + + -- Test specific expected attributes + assert_equal("https://example.com", a_tag:get_attribute("href")) + assert_equal("link", a_tag:get_attribute("class")) + assert_equal("_blank", a_tag:get_attribute("target")) + assert_equal("Example Link", a_tag:get_attribute("title")) + + pool:destroy() + end) + + + + test("HTML tag attribute edge cases", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[<div class="" hidden style=" " width="0" height="abc">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + -- Test empty attribute value + assert_true(div_tag:has_attribute("class")) + assert_equal("", div_tag:get_attribute("class")) + + -- Test boolean attribute (hidden) + assert_true(div_tag:has_attribute("hidden")) + + -- Test whitespace-only attribute + assert_true(div_tag:has_attribute("style")) + assert_equal(" ", div_tag:get_attribute("style")) + + -- Test numeric attributes with edge cases + assert_equal(0, div_tag:get_numeric_attribute("width")) + assert_nil(div_tag:get_numeric_attribute("height")) -- "abc" is not numeric + + -- Test non-existent attribute + assert_false(div_tag:has_attribute("nonexistent")) + assert_nil(div_tag:get_attribute("nonexistent")) + assert_nil(div_tag:get_numeric_attribute("nonexistent")) + + pool:destroy() + end) + + test("HTML tag complex nested structure", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = [[ + <table cellpadding="5" cellspacing="2" border="1"> + <tr> + <td align="center" valign="top" width="100"> + <img src="image1.jpg" width="80" height="60" alt="Image 1" /> + </td> + <td align="left" valign="middle"> + <p font-size="12">Text content</p> + </td> + </tr> + </table> + ]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + -- Find table tag + local table_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "table" then + table_tag = tag + break + end + end + + assert_not_nil(table_tag) + + -- Test table attributes + assert_equal(5, table_tag:get_numeric_attribute("cellpadding")) + assert_equal(2, table_tag:get_numeric_attribute("cellspacing")) + assert_equal("1", table_tag:get_attribute("border")) + + -- Test that table has children + local children = table_tag:get_children() + assert_not_nil(children) + assert_true(#children > 0) + + -- Find img tag + local img_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "img" then + img_tag = tag + break + end + end + + assert_not_nil(img_tag) + + -- Test img attributes + assert_equal("image1.jpg", img_tag:get_attribute("src")) + assert_equal(80, img_tag:get_numeric_attribute("width")) + assert_equal(60, img_tag:get_numeric_attribute("height")) + assert_equal("Image 1", img_tag:get_attribute("alt")) + + pool:destroy() + end) + + test("HTML tag with mixed known and unknown attributes", function() + local rspamd_mempool = require("rspamd_mempool") + local pool = rspamd_mempool.create() + + local html = + [[<div class="container" data-analytics="track" custom-attr="value" style="color: blue;" unknown123="test">content</div>]] + local parsed, tags = parse_html_and_extract_tags(html, pool) + + assert_not_nil(parsed) + + local div_tag = nil + for _, tag in ipairs(tags) do + if tag:get_type() == "div" then + div_tag = tag + break + end + end + + assert_not_nil(div_tag) + + local all_attrs = div_tag:get_all_attributes() + local unknown_attrs = div_tag:get_unknown_attributes() + + -- All attributes should include both known and unknown + assert_not_nil(all_attrs["class"]) -- known + assert_not_nil(all_attrs["style"]) -- known + assert_not_nil(all_attrs["custom-attr"]) -- unknown + assert_not_nil(all_attrs["unknown123"]) -- unknown + + -- Unknown attributes should only include unrecognized ones + assert_nil(unknown_attrs["class"]) -- known, shouldn't be here + assert_nil(unknown_attrs["style"]) -- known, shouldn't be here + assert_not_nil(unknown_attrs["custom-attr"]) -- unknown, should be here + assert_not_nil(unknown_attrs["unknown123"]) -- unknown, should be here + + assert_equal("value", unknown_attrs["custom-attr"]) + assert_equal("test", unknown_attrs["unknown123"]) + + pool:destroy() + end) end) |