From: Vsevolod Stakhov Date: Thu, 10 Jun 2021 16:21:09 +0000 (+0100) Subject: [Project] Html/CSS: Switch styles parsing to css parser X-Git-Tag: 3.0~323 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=e02a4f2831af83eec951b98cc93823568c226f4f;p=rspamd.git [Project] Html/CSS: Switch styles parsing to css parser --- diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx index 774a65cfc..34d65aadc 100644 --- a/src/libserver/css/css_parser.cxx +++ b/src/libserver/css/css_parser.cxx @@ -801,6 +801,20 @@ auto parse_css(rspamd_mempool_t *pool, const std::string_view &st, "cannot parse input"}); } +auto +parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st) + -> rspamd::html::html_block * +{ + auto &&res = process_declaration_tokens(pool, + get_rules_parser_functor(pool, st)); + + if (res) { + return res->compile_to_block(pool); + } + + return nullptr; /* process_declaration_tokens produced no result */ +} + TEST_SUITE("css parser") { TEST_CASE("parse colors") { const std::vector cases{ diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx index ec6d5159a..1e0762d78 100644 --- a/src/libserver/css/css_parser.hxx +++ b/src/libserver/css/css_parser.hxx @@ -30,6 +30,10 @@ #include "contrib/expected/expected.hpp" #include "logger.h" +/* Forward declaration */ +namespace rspamd::html { +struct html_block; +} namespace rspamd::css { @@ -205,6 +209,15 @@ auto get_selectors_parser_functor(rspamd_mempool_t *pool, auto get_rules_parser_functor(rspamd_mempool_t *pool, const std::string_view &st) -> blocks_gen_functor; +/** + * Parses a css declaration (e.g.
embedded css) and returns a completed html block + * @param pool memory pool passed to the parser and to compile_to_block + * @param st text of the css declaration to parse + * @return compiled html block, or nullptr when parsing produced no result + */ +auto parse_css_declaration(rspamd_mempool_t *pool, const std::string_view &st) + -> rspamd::html::html_block *; + } #endif //RSPAMD_CSS_PARSER_HXX diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index e867cce6d..1d13c2466 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -990,610 +990,36 @@ html_process_link_tag(rspamd_mempool_t *pool, struct html_tag *tag, } } -static void -html_process_color(std::string_view input, struct html_color *cl) -{ - const gchar *p = input.data(), *end = input.data() + input.size(); - char hexbuf[7]; - - memset(cl, 0, sizeof(*cl)); - - if (*p == '#') { - /* HEX color */ - p++; - rspamd_strlcpy(hexbuf, p, MIN ((gint) sizeof(hexbuf), end - p + 1)); - cl->d.val = strtoul(hexbuf, NULL, 16); - cl->d.comp.alpha = 255; - cl->valid = TRUE; - } - else if (input.size() > 4 && rspamd_lc_cmp(p, "rgb", 3) == 0) { - /* We have something like rgba(x,x,x,x) or rgb(x,x,x) */ - enum { - obrace, - num1, - num2, - num3, - num4, - skip_spaces - } state = skip_spaces, next_state = obrace; - gulong r = 0, g = 0, b = 0, opacity = 255; - const gchar *c; - gboolean valid = FALSE; - - p += 3; - - if (*p == 'a') { - p++; - } - - c = p; - - while (p < end) { - switch (state) { - case obrace: - if (*p == '(') { - p++; - state = skip_spaces; - next_state = num1; - } - else if (g_ascii_isspace (*p)) { - state = skip_spaces; - next_state = obrace; - } - else { - goto stop; - } - break; - case num1: - if (*p == ',') { - if (!rspamd_strtoul(c, p - c, &r)) { - goto stop; - } - - p++; - state = skip_spaces; - next_state = num2; - } - else if (!g_ascii_isdigit (*p)) { - goto stop; - } - else { - p++; - } - break; - case num2: - if (*p == ',') { - if (!rspamd_strtoul(c, p - c, &g)) { - goto stop; - } - - p++; - state = skip_spaces; - next_state = num3; - } - else if (!g_ascii_isdigit (*p)) { - goto stop; - } - else { - p++; - } -
break; - case num3: - if (*p == ',') { - if (!rspamd_strtoul(c, p - c, &b)) { - goto stop; - } - - valid = TRUE; - p++; - state = skip_spaces; - next_state = num4; - } - else if (*p == ')') { - if (!rspamd_strtoul(c, p - c, &b)) { - goto stop; - } - - valid = TRUE; - goto stop; - } - else if (!g_ascii_isdigit (*p)) { - goto stop; - } - else { - p++; - } - break; - case num4: - if (*p == ',') { - if (!rspamd_strtoul(c, p - c, &opacity)) { - goto stop; - } - - valid = TRUE; - goto stop; - } - else if (*p == ')') { - if (!rspamd_strtoul(c, p - c, &opacity)) { - goto stop; - } - - valid = TRUE; - goto stop; - } - else if (!g_ascii_isdigit (*p)) { - goto stop; - } - else { - p++; - } - break; - case skip_spaces: - if (!g_ascii_isspace (*p)) { - c = p; - state = next_state; - } - else { - p++; - } - break; - } - } - -stop: - - if (valid) { - cl->d.comp.r = r; - cl->d.comp.g = g; - cl->d.comp.b = b; - cl->d.comp.alpha = opacity; - cl->valid = TRUE; - } - } - else { - auto maybe_color_value = - rspamd::css::css_value::maybe_color_from_string(input); - - if (maybe_color_value.has_value()) { - auto color = maybe_color_value->to_color().value(); - cl->d.val = color.to_number(); - cl->d.comp.alpha = 255; /* Non transparent */ - } - } -} - -/* - * Target is used for in and out if this function returns TRUE - */ -static auto -html_process_css_size(const gchar *suffix, gsize len, - double &tgt) -> bool -{ - gdouble sz = tgt; - gboolean ret = FALSE; - - if (len >= 2) { - if (memcmp(suffix, "px", 2) == 0) { - sz = (guint) sz; /* Round to number */ - ret = TRUE; - } - else if (memcmp(suffix, "em", 2) == 0) { - /* EM is 16 px, so multiply and round */ - sz = (guint) (sz * 16.0); - ret = TRUE; - } - else if (len >= 3 && memcmp(suffix, "rem", 3) == 0) { - /* equal to EM in our case */ - sz = (guint) (sz * 16.0); - ret = TRUE; - } - else if (memcmp(suffix, "ex", 2) == 0) { - /* - * Represents the x-height of the element's font. 
- * On fonts with the "x" letter, this is generally the height - * of lowercase letters in the font; 1ex = 0.5em in many fonts. - */ - sz = (guint) (sz * 8.0); - ret = TRUE; - } - else if (memcmp(suffix, "vw", 2) == 0) { - /* - * Vewport width in percentages: - * we assume 1% of viewport width as 8px - */ - sz = (guint) (sz * 8.0); - ret = TRUE; - } - else if (memcmp(suffix, "vh", 2) == 0) { - /* - * Vewport height in percentages - * we assume 1% of viewport width as 6px - */ - sz = (guint) (sz * 6.0); - ret = TRUE; - } - else if (len >= 4 && memcmp(suffix, "vmax", 4) == 0) { - /* - * Vewport width in percentages - * we assume 1% of viewport width as 6px - */ - sz = (guint) (sz * 8.0); - ret = TRUE; - } - else if (len >= 4 && memcmp(suffix, "vmin", 4) == 0) { - /* - * Vewport height in percentages - * we assume 1% of viewport width as 6px - */ - sz = (guint) (sz * 6.0); - ret = TRUE; - } - else if (memcmp(suffix, "pt", 2) == 0) { - sz = (guint) (sz * 96.0 / 72.0); /* One point. 1pt = 1/72nd of 1in */ - ret = TRUE; - } - else if (memcmp(suffix, "cm", 2) == 0) { - sz = (guint) (sz * 96.0 / 2.54); /* 96px/2.54 */ - ret = TRUE; - } - else if (memcmp(suffix, "mm", 2) == 0) { - sz = (guint) (sz * 9.6 / 2.54); /* 9.6px/2.54 */ - ret = TRUE; - } - else if (memcmp(suffix, "in", 2) == 0) { - sz = (guint) (sz * 96.0); /* 96px */ - ret = TRUE; - } - else if (memcmp(suffix, "pc", 2) == 0) { - sz = (guint) (sz * 96.0 / 6.0); /* 1pc = 12pt = 1/6th of 1in. 
*/ - ret = TRUE; - } - } - else if (suffix[0] == '%') { - /* Percentages from 16 px */ - sz = (guint) (sz / 100.0 * 16.0); - ret = TRUE; - } - - if (ret) { - tgt = sz; - } - - return ret; -} - -static auto -html_process_font_size(const gchar *line, guint len, guint &fs, - gboolean is_css) -> void -{ - const gchar *p = line, *end = line + len; - gchar *err = NULL, numbuf[64]; - gdouble sz = 0; - gboolean failsafe = FALSE; - - while (p < end && g_ascii_isspace (*p)) { - p++; - len--; - } - - if (g_ascii_isdigit (*p)) { - rspamd_strlcpy(numbuf, p, MIN (sizeof(numbuf), len + 1)); - sz = strtod(numbuf, &err); - - /* Now check leftover */ - if (sz < 0) { - sz = 0; - } - } - else { - /* Ignore the rest */ - failsafe = TRUE; - sz = is_css ? 16 : 1; - /* TODO: add textual fonts descriptions */ - } - - if (err && *err != '\0') { - const gchar *e = err; - gsize slen; - - /* Skip spaces */ - while (*e && g_ascii_isspace (*e)) { - e++; - } - - /* Lowercase */ - slen = strlen(e); - rspamd_str_lc((gchar *) e, slen); - - if (!html_process_css_size(e, slen, sz)) { - failsafe = TRUE; - } - } - else { - /* Failsafe naked number */ - failsafe = TRUE; - } - - if (failsafe) { - if (is_css) { - /* - * In css mode we usually ignore sizes, but let's treat - * small sizes specially - */ - if (sz < 1) { - sz = 0; - } - else { - sz = 16; /* Ignore */ - } - } - else { - /* In non-css mode we have to check legacy size */ - sz = sz >= 1 ? 
sz * 16 : 16; - } - } - - if (sz > 32) { - sz = 32; - } - - fs = sz; -} - -static void -html_process_style(rspamd_mempool_t *pool, struct html_block *bl, - struct html_content *hc, - std::string_view style) -{ - const gchar *p, *c, *end, *key = NULL; - enum { - read_key, - read_colon, - read_value, - skip_spaces, - } state = skip_spaces, next_state = read_key; - guint klen = 0; - gdouble opacity = 1.0; - - p = style.data(); - c = p; - end = p + style.size(); - - while (p <= end) { - switch (state) { - case read_key: - if (p == end || *p == ':') { - key = c; - klen = p - c; - state = skip_spaces; - next_state = read_value; - } - else if (g_ascii_isspace (*p)) { - key = c; - klen = p - c; - state = skip_spaces; - next_state = read_colon; - } - - p++; - break; - - case read_colon: - if (p == end || *p == ':') { - state = skip_spaces; - next_state = read_value; - } - - p++; - break; - - case read_value: - if (p == end || *p == ';') { - if (key && klen && p - c > 0) { - if ((klen == 5 && g_ascii_strncasecmp(key, "color", 5) == 0) - || (klen == 10 && g_ascii_strncasecmp(key, "font-color", 10) == 0)) { - - html_process_color({c, (std::size_t)(p - c)}, &bl->font_color); - msg_debug_html ("got color: %xd", bl->font_color.d.val); - } - else if ((klen == 16 && g_ascii_strncasecmp(key, - "background-color", 16) == 0) || - (klen == 10 && g_ascii_strncasecmp(key, - "background", 10) == 0)) { - - html_process_color({c, (std::size_t)(p - c)}, &bl->background_color); - msg_debug_html ("got bgcolor: %xd", bl->background_color.d.val); - } - else if (klen == 7 && g_ascii_strncasecmp(key, "display", 7) == 0) { - if (p - c >= 4 && rspamd_substring_search_caseless(c, p - c, - "none", 4) != -1) { - bl->visible = FALSE; - msg_debug_html ("tag is not visible"); - } - } - else if (klen == 9 && - g_ascii_strncasecmp(key, "font-size", 9) == 0) { - html_process_font_size(c, p - c, - bl->font_size, TRUE); - msg_debug_html ("got font size: %ud", bl->font_size); - } - else if (klen == 7 && - 
g_ascii_strncasecmp(key, "opacity", 7) == 0) { - gchar numbuf[64]; - - rspamd_strlcpy(numbuf, c, - MIN (sizeof(numbuf), p - c + 1)); - opacity = strtod(numbuf, NULL); - - if (opacity > 1) { - opacity = 1; - } - else if (opacity < 0) { - opacity = 0; - } - - bl->font_color.d.comp.alpha = (guint8) (opacity * 255.0); - } - else if (klen == 10 && - g_ascii_strncasecmp(key, "visibility", 10) == 0) { - if (p - c >= 6 && rspamd_substring_search_caseless(c, - p - c, - "hidden", 6) != -1) { - bl->visible = FALSE; - msg_debug_html ("tag is not visible"); - } - } - } - - key = NULL; - klen = 0; - state = skip_spaces; - next_state = read_key; - } - - p++; - break; - - case skip_spaces: - if (p < end && !g_ascii_isspace (*p)) { - c = p; - state = next_state; - } - else { - p++; - } - - break; - } - } -} - static auto html_process_block_tag(rspamd_mempool_t *pool, struct html_tag *tag, struct html_content *hc) -> void { - auto *bl = rspamd_mempool_alloc0_type (pool, struct html_block); - bl->tag = tag; - bl->visible = TRUE; - bl->font_size = (guint) -1; - bl->font_color.d.comp.alpha = 255; + std::optional<css::css_value> maybe_fgcolor, maybe_bgcolor; /* colours given by the COLOR/BGCOLOR components, if any */ for (const auto &param : tag->parameters) { if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_COLOR) { - html_process_color(param.value, &bl->font_color); - msg_debug_html ("tag %*s; got color: %xd", - (int) tag->name.size(), tag->name.data(), - bl->font_color.d.val); + maybe_fgcolor = css::css_value::maybe_color_from_string(param.value); } if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_BGCOLOR) { - html_process_color(param.value, &bl->background_color); - msg_debug_html ("tag %*s; got bgcolor: %xd", - (int) tag->name.size(), tag->name.data(), - bl->background_color.d.val); - if (tag->id == Tag_BODY) { - /* Set global background color */ - memcpy(&hc->bgcolor, &bl->background_color, - sizeof(hc->bgcolor)); - } + maybe_bgcolor = css::css_value::maybe_color_from_string(param.value); } if (param.type ==
html_component_type::RSPAMD_HTML_COMPONENT_STYLE) { - html_process_style(pool, bl, hc, param.value); - msg_debug_html ("tag: %*s; got style: %*s", - (int) tag->name.size(), tag->name.data(), - (int) bl->style.len, bl->style.begin); - } - - if (param.type == html_component_type::RSPAMD_HTML_COMPONENT_CLASS) { - rspamd_ftok_t fstr; - fstr.begin = param.value.data(); - fstr.len = param.value.size(); - bl->html_class = rspamd_mempool_ftokdup (pool, &fstr); - msg_debug_html ("tag: %*s; got class: %s", - (int) tag->name.size(), tag->name.data(), bl->html_class); + tag->block = rspamd::css::parse_css_declaration(pool, param.value); } } - hc->blocks.push_back(bl); - tag->block = bl; -} - -static auto -html_propagate_style(struct html_content *hc, - struct html_tag *tag, - struct html_block *bl, - std::vector &blocks) -> void -{ - gboolean push_block = FALSE; - - if (blocks.empty()) { - /* No blocks to propagate */ - return; - } - /* Propagate from the parent if needed */ - auto *bl_parent = blocks.back(); - - if (!bl->background_color.valid) { - /* Try to propagate background color from parent nodes */ - if (bl_parent->background_color.valid) { - memcpy(&bl->background_color, &bl_parent->background_color, - sizeof(bl->background_color)); - } - } - else { - push_block = TRUE; + if (!tag->block) { + tag->block = html_block::undefined_html_block_pool(pool); /* no block came from a STYLE component: use an undefined one */ } - if (!bl->font_color.valid) { - /* Try to propagate background color from parent nodes */ - if (bl_parent->font_color.valid) { - memcpy(&bl->font_color, &bl_parent->font_color, - sizeof(bl->font_color)); - } - } - else { - push_block = TRUE; - } - - /* Propagate font size */ - if (bl->font_size == (guint) -1) { - if (bl_parent->font_size != (guint) -1) { - bl->font_size = bl_parent->font_size; - } - } - else { - push_block = TRUE; + if (maybe_fgcolor) { + tag->block->set_fgcolor(maybe_fgcolor->to_color().value()); } - /* Set bgcolor to the html bgcolor and font color to black as a last resort */ - if
(!bl->font_color.valid) { - /* Don't touch opacity as it can be set separately */ - bl->font_color.d.comp.r = 0; - bl->font_color.d.comp.g = 0; - bl->font_color.d.comp.b = 0; - bl->font_color.valid = TRUE; - } - else { - push_block = TRUE; - } - - if (!bl->background_color.valid) { - memcpy(&bl->background_color, &hc->bgcolor, sizeof(hc->bgcolor)); - } - else { - push_block = TRUE; - } - - if (bl->font_size == (guint) -1) { - bl->font_size = 16; /* Default for browsers */ - } - else { - push_block = TRUE; - } - - if (push_block && !(tag->flags & FL_CLOSED)) { - blocks.push_back(bl); + if (maybe_bgcolor) { + tag->block->set_bgcolor(maybe_bgcolor->to_color().value()); /* background must come from maybe_bgcolor, the optional checked above */ } } @@ -2186,7 +1612,8 @@ html_process_input(rspamd_mempool_t *pool, html_process_link_tag(pool, cur_tag, hc, url_set, part_urls); } - else if (cur_tag->flags & FL_BLOCK) { + + if (cur_tag->flags & FL_BLOCK) { struct html_block *bl; if (cur_tag->flags & FL_CLOSING) { @@ -2197,27 +1624,6 @@ html_process_input(rspamd_mempool_t *pool, } else { html_process_block_tag(pool, cur_tag, hc); - bl = cur_tag->block; - - if (bl) { - html_propagate_style(hc, cur_tag, - bl, blocks_stack); - - /* Check visibility */ - if (bl->font_size < 3 || - bl->font_color.d.comp.alpha < 10) { - - bl->visible = FALSE; - msg_debug_html ("tag is not visible: font size: " - "%d, alpha: %d", - (int)bl->font_size, - (int)bl->font_color.d.comp.alpha); - } - - if (!bl->visible) { - state = content_ignore; - } - } } } } diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index fc1dda141..c75d84ea4 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -40,7 +40,6 @@ struct html_content { guint total_tags = 0; std::vector tags_seen; std::vector images; - std::vector blocks; std::vector> all_tags; std::string parsed; void *css_style; @@ -48,7 +47,6 @@ struct html_content { /* Preallocate and reserve all internal structures */ html_content() { tags_seen.resize(N_TAGS, false); - blocks.reserve(128);
all_tags.reserve(128); parsed.reserve(256); } diff --git a/src/libserver/html/html_block.hxx b/src/libserver/html/html_block.hxx index 3978bcf1e..913480864 100644 --- a/src/libserver/html/html_block.hxx +++ b/src/libserver/html/html_block.hxx @@ -171,6 +171,17 @@ struct html_block { rspamd::css::css_display_value::DISPLAY_NORMAL, 12}; } + /** + * Produces html block with no defined values allocated from the pool + * @param pool memory pool that provides the storage for the block + * @return the allocated block; only its mask is written here (set to 0) + */ + static auto undefined_html_block_pool(rspamd_mempool_t *pool) -> html_block* { + auto *bl = rspamd_mempool_alloc_type(pool, html_block); + bl->mask = 0; /* NOTE(review): presumably marks every value as not set - confirm against the mask users */ + + return bl; + } }; }