From: Vsevolod Stakhov Date: Mon, 7 Jun 2021 20:57:36 +0000 (+0100) Subject: [Rework] Html: Make parameters as a vector again X-Git-Tag: 3.0~337 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=7d0bb5ce599c01ed4da6e4204acc63d32bfca853;p=rspamd.git [Rework] Html: Make parameters as a vector again --- diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index f041f45b7..7ae748dff 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -300,16 +300,12 @@ html_parse_tag_content(rspamd_mempool_t *pool, in > parser_env.saved_p) { /* We ignore repeated attributes */ - auto found_it = tag->parameters.find(parser_env.cur_component.value()); - - if (found_it == tag->parameters.end()) { auto sz = (std::size_t)(in - parser_env.saved_p); auto *s = rspamd_mempool_alloc_buffer(pool, sz); memcpy(s, parser_env.saved_p, sz); sz = rspamd_html_decode_entitles_inplace(s, in - parser_env.saved_p); - tag->parameters.emplace(parser_env.cur_component.value(), + tag->parameters.emplace_back(parser_env.cur_component.value(), std::string_view{s, sz}); - } } parser_env.saved_p = nullptr; @@ -635,11 +631,11 @@ html_process_url_tag(rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) -> std::optional { - auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF); + auto found_href_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_HREF); - if (found_href_it != tag->parameters.end()) { + if (found_href_maybe) { /* Check base url */ - auto &href_value = found_href_it->second; + auto &href_value = found_href_maybe.value(); if (hc && hc->base_url && href_value.size() > 2) { /* @@ -823,148 +819,145 @@ html_process_img_tag(rspamd_mempool_t *pool, img->tag = tag; tag->flags |= FL_IMAGE; - auto found_href_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HREF); - if (found_href_it != tag->parameters.end()) { - /* Check base url */ - const auto &href_value = 
found_href_it->second; + for (const auto &param : tag->parameters) { - if (href_value.size() > 0) { - rspamd_ftok_t fstr; - fstr.begin = href_value.data(); - fstr.len = href_value.size(); - img->src = rspamd_mempool_ftokdup (pool, &fstr); + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HREF) { + /* Check base url */ + const auto &href_value = param.value; - if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), - "cid:", sizeof("cid:") - 1) == 0) { - /* We have an embedded image */ - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; - } - else { - if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), - "data:", sizeof("data:") - 1) == 0) { - /* We have an embedded image in HTML tag */ - img->flags |= - (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); - html_process_data_image(pool, img, href_value); - hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + if (href_value.size() > 0) { + rspamd_ftok_t fstr; + fstr.begin = href_value.data(); + fstr.len = href_value.size(); + img->src = rspamd_mempool_ftokdup (pool, &fstr); + + if (href_value.size() > sizeof("cid:") - 1 && memcmp(href_value.data(), + "cid:", sizeof("cid:") - 1) == 0) { + /* We have an embedded image */ + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EMBEDDED; } else { - img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; - if (img->src) { + if (href_value.size() > sizeof("data:") - 1 && memcmp(href_value.data(), + "data:", sizeof("data:") - 1) == 0) { + /* We have an embedded image in HTML tag */ + img->flags |= + (RSPAMD_HTML_FLAG_IMAGE_EMBEDDED | RSPAMD_HTML_FLAG_IMAGE_DATA); + html_process_data_image(pool, img, href_value); + hc->flags |= RSPAMD_HTML_FLAG_HAS_DATA_URLS; + } + else { + img->flags |= RSPAMD_HTML_FLAG_IMAGE_EXTERNAL; + if (img->src) { - std::string_view cpy{href_value}; - auto maybe_url = html_process_url(pool, cpy); + std::string_view cpy{href_value}; + auto maybe_url = html_process_url(pool, cpy); - if (maybe_url) { - img->url = 
maybe_url.value(); - struct rspamd_url *existing; + if (maybe_url) { + img->url = maybe_url.value(); + struct rspamd_url *existing; - img->url->flags |= RSPAMD_URL_FLAG_IMAGE; - existing = rspamd_url_set_add_or_return(url_set, img->url); + img->url->flags |= RSPAMD_URL_FLAG_IMAGE; + existing = rspamd_url_set_add_or_return(url_set, img->url); - if (existing != img->url) { - /* - * We have some other URL that could be - * found, e.g. from another part. However, - * we still want to set an image flag on it - */ - existing->flags |= img->url->flags; - existing->count++; - } - else if (part_urls) { - /* New url */ - g_ptr_array_add(part_urls, img->url); + if (existing != img->url) { + /* + * We have some other URL that could be + * found, e.g. from another part. However, + * we still want to set an image flag on it + */ + existing->flags |= img->url->flags; + existing->count++; + } + else if (part_urls) { + /* New url */ + g_ptr_array_add(part_urls, img->url); + } } } } } } } - } - auto found_height_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT); - if (found_height_it != tag->parameters.end()) { - unsigned long val; + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_HEIGHT) { + unsigned long val; - rspamd_strtoul(found_height_it->second.data(), found_height_it->second.size(), &val); - img->height = val; - } + rspamd_strtoul(param.value.data(), param.value.size(), &val); + img->height = val; + } - auto found_width_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_WIDTH); - if (found_width_it != tag->parameters.end()) { - unsigned long val; + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_WIDTH) { + unsigned long val; - rspamd_strtoul(found_width_it->second.data(), found_width_it->second.size(), &val); - img->width = val; - } + rspamd_strtoul(param.value.data(), param.value.size(), &val); + img->width = val; + } - /* TODO: rework to css at some time */ - auto found_style_it = 
tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_STYLE); - if (found_style_it != tag->parameters.end()) { - if (found_height_it == tag->parameters.end()) { - auto style_st = found_style_it->second; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "height", sizeof("height") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("height") - 1); - - for (auto i = 0; i < substr.size(); i ++) { - auto t = substr[i]; - if (g_ascii_isdigit (t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->height = val; - break; - } - else if (!g_ascii_isspace (t) && t != '=' && t != ':') { - /* Fallback */ - break; + /* TODO: rework to css at some time */ + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { + if (img->height == 0) { + auto style_st = param.value; + auto pos = rspamd_substring_search_caseless(style_st.data(), + style_st.size(), + "height", sizeof("height") - 1); + if (pos != -1) { + auto substr = style_st.substr(pos + sizeof("height") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit (t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->height = val; + break; + } + else if (!g_ascii_isspace (t) && t != '=' && t != ':') { + /* Fallback */ + break; + } } } } - } - if (found_width_it == tag->parameters.end()) { - auto style_st = found_style_it->second; - auto pos = rspamd_substring_search_caseless(style_st.data(), - style_st.size(), - "width", sizeof("width") - 1); - if (pos != -1) { - auto substr = style_st.substr(pos + sizeof("width") - 1); - - for (auto i = 0; i < substr.size(); i ++) { - auto t = substr[i]; - if (g_ascii_isdigit (t)) { - unsigned long val; - rspamd_strtoul(substr.data(), - substr.size(), &val); - img->width = val; - break; - } - else if (!g_ascii_isspace (t) && t != '=' && t != ':') { - /* Fallback */ - break; + if (img->width == 0) { + auto style_st 
= param.value; + auto pos = rspamd_substring_search_caseless(style_st.data(), + style_st.size(), + "width", sizeof("width") - 1); + if (pos != -1) { + auto substr = style_st.substr(pos + sizeof("width") - 1); + + for (auto i = 0; i < substr.size(); i++) { + auto t = substr[i]; + if (g_ascii_isdigit (t)) { + unsigned long val; + rspamd_strtoul(substr.data(), + substr.size(), &val); + img->width = val; + break; + } + else if (!g_ascii_isspace (t) && t != '=' && t != ':') { + /* Fallback */ + break; + } } } } } - } - - auto found_alt_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_ALT); - if (found_alt_it != tag->parameters.end()) { - if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) { - /* Add a space */ - hc->parsed += ' '; - } - hc->parsed.append(found_alt_it->second); + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_ALT) { + if (!hc->parsed.empty() && !g_ascii_isspace (hc->parsed.back())) { + /* Add a space */ + hc->parsed += ' '; + } + hc->parsed.append(param.value); - if (!g_ascii_isspace (hc->parsed.back())) { - /* Add a space */ - hc->parsed += ' '; + if (!g_ascii_isspace (hc->parsed.back())) { + /* Add a space */ + hc->parsed += ' '; + } } } @@ -987,10 +980,10 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls) -> void { - auto found_rel_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_REL); + auto found_rel_maybe = tag->find_component(html_component_type::RSPAMD_HTML_COMPONENT_REL); - if (found_rel_it != tag->parameters.end()) { - if (found_rel_it->second == "icon") { + if (found_rel_maybe) { + if (found_rel_maybe.value() == "icon") { html_process_img_tag(pool, tag, hc, url_set, part_urls); } } @@ -1484,45 +1477,41 @@ html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, bl->font_size = (guint) -1; bl->font_color.d.comp.alpha = 255; - auto found_color_it = 
tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_COLOR); - - if (found_color_it != tag->parameters.end()) { - html_process_color(found_color_it->second, &bl->font_color); - msg_debug_html ("tag %*s; got color: %xd", - (int)tag->name.size(), tag->name.data(), - bl->font_color.d.val); - } - - auto found_bgcolor_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR); + for (const auto &param : tag->parameters) { + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { + html_process_color(param.value, &bl->font_color); + msg_debug_html ("tag %*s; got color: %xd", + (int) tag->name.size(), tag->name.data(), + bl->font_color.d.val); + } - if (found_bgcolor_it != tag->parameters.end()) { - html_process_color(found_bgcolor_it->second, &bl->background_color); - msg_debug_html ("tag %*s; got bgcolor: %xd", - (int)tag->name.size(), tag->name.data(), - bl->background_color.d.val); - if (tag->id == Tag_BODY) { - /* Set global background color */ - memcpy(&hc->bgcolor, &bl->background_color, - sizeof(hc->bgcolor)); + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { + html_process_color(param.value, &bl->background_color); + msg_debug_html ("tag %*s; got bgcolor: %xd", + (int) tag->name.size(), tag->name.data(), + bl->background_color.d.val); + if (tag->id == Tag_BODY) { + /* Set global background color */ + memcpy(&hc->bgcolor, &bl->background_color, + sizeof(hc->bgcolor)); + } } - } - auto found_style_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_STYLE); - if (found_style_it != tag->parameters.end()) { - html_process_style(pool, bl, hc, found_style_it->second); - msg_debug_html ("tag: %*s; got style: %*s", - (int)tag->name.size(), tag->name.data(), - (int) bl->style.len, bl->style.begin); - } + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { + html_process_style(pool, bl, hc, param.value); + msg_debug_html ("tag: %*s; got style: %*s", + (int) tag->name.size(), 
tag->name.data(), + (int) bl->style.len, bl->style.begin); + } - auto found_class_it = tag->parameters.find(html_component_type::RSPAMD_HTML_COMPONENT_CLASS); - if (found_class_it != tag->parameters.end()) { - rspamd_ftok_t fstr; - fstr.begin = found_class_it->second.data(); - fstr.len = found_class_it->second.size(); - bl->html_class = rspamd_mempool_ftokdup (pool, &fstr); - msg_debug_html ("tag: %*s; got class: %s", - (int)tag->name.size(), tag->name.data(), bl->html_class); + if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { + rspamd_ftok_t fstr; + fstr.begin = param.value.data(); + fstr.len = param.value.size(); + bl->html_class = rspamd_mempool_ftokdup (pool, &fstr); + msg_debug_html ("tag: %*s; got class: %s", + (int) tag->name.size(), tag->name.data(), bl->html_class); + } } hc->blocks.push_back(bl); diff --git a/src/libserver/html/html_tag.hxx b/src/libserver/html/html_tag.hxx index 4aba9af41..251ba148c 100644 --- a/src/libserver/html/html_tag.hxx +++ b/src/libserver/html/html_tag.hxx @@ -22,7 +22,7 @@ #include #include #include -#include +#include namespace rspamd::html { @@ -41,20 +41,38 @@ enum class html_component_type : std::uint8_t { }; using html_tag_extra_t = std::variant; +struct html_tag_component { + html_component_type type; + std::string_view value; + + html_tag_component(html_component_type type, std::string_view value) + : type(type), value(value) {} +}; struct html_tag { - gint id = -1; - gint flags = 0; - mutable guint content_length = 0; /* Allow content length propagation */ - goffset content_offset = 0; + int id = -1; + unsigned int flags = 0; + mutable unsigned int content_length = 0; /* Allow content length propagation */ + unsigned int content_offset = 0; std::string_view name; - robin_hood::unordered_flat_map<html_component_type, std::string_view> parameters; + std::vector<html_tag_component> parameters; html_tag_extra_t extra; struct html_block *block = nullptr; /* TODO: temporary, must be handled by css */ std::vector children; struct html_tag *parent; + + auto 
find_component(html_component_type what) const -> std::optional<std::string_view> + { + for (const auto &comp : parameters) { + if (comp.type == what) { + return comp.value; + } + } + + return std::nullopt; + } }; }