From 427f8879360595ff48b77400b6b02b5a6968c4d1 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Thu, 21 Jan 2021 15:45:21 +0000 Subject: [PATCH] [Project] Add some methods for css parser --- src/libserver/css/CMakeLists.txt | 1 + src/libserver/css/css.cxx | 2 + src/libserver/css/css.h | 5 +- src/libserver/css/css.hxx | 12 ++ src/libserver/css/css_parser.cxx | 238 ++++++++++++++++++++++++++++++ src/libserver/css/css_parser.hxx | 34 +++++ src/libserver/css/parse_error.hxx | 3 +- 7 files changed, 293 insertions(+), 2 deletions(-) create mode 100644 src/libserver/css/css_parser.cxx create mode 100644 src/libserver/css/css_parser.hxx diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt index f5d5affdb..c8f7921b1 100644 --- a/src/libserver/css/CMakeLists.txt +++ b/src/libserver/css/CMakeLists.txt @@ -14,6 +14,7 @@ SET(LIBCSSSRC "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx" "${RAGEL_ragel_css_selector_parser_OUTPUTS}" "${RAGEL_ragel_css_rule_parser_OUTPUTS}" PARENT_SCOPE) diff --git a/src/libserver/css/css.cxx b/src/libserver/css/css.cxx index 68ebfeefa..bd148cecd 100644 --- a/src/libserver/css/css.cxx +++ b/src/libserver/css/css.cxx @@ -29,6 +29,8 @@ rspamd_css_parse_style (const guchar *begin, gsize len, GError **err) namespace rspamd::css { +INIT_LOG_MODULE_PUBLIC(css); + class css_style_sheet::impl { }; diff --git a/src/libserver/css/css.h b/src/libserver/css/css.h index a87f4424d..169bcf58c 100644 --- a/src/libserver/css/css.h +++ b/src/libserver/css/css.h @@ -18,13 +18,16 @@ #define RSPAMD_CSS_H #include "config.h" +#include "mem_pool.h" #ifdef __cplusplus extern "C" { #endif typedef void * rspamd_css; -rspamd_css rspamd_css_parse_style (const guchar *begin, gsize len, GError **err); +rspamd_css rspamd_css_parse_style (rspamd_mempool_t *pool, + const guchar 
*begin, + gsize len, GError **err); #ifdef __cplusplus } #endif diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx index 78e0d0f73..d258b35c9 100644 --- a/src/libserver/css/css.hxx +++ b/src/libserver/css/css.hxx @@ -18,9 +18,21 @@ #include #include +#include "logger.h" namespace rspamd::css { +extern unsigned int rspamd_css_log_id; + +#define msg_debug_css(...) rspamd_conditional_debug_fast (NULL, NULL, \ + rspamd_css_log_id, "css", pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) +#define msg_err_css(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \ + "css", pool->tag.uid, \ + G_STRFUNC, \ + __VA_ARGS__) + class css_style_sheet { public: css_style_sheet(); diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx new file mode 100644 index 000000000..9f2023e50 --- /dev/null +++ b/src/libserver/css/css_parser.cxx @@ -0,0 +1,238 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "css_parser.hxx"
+#include <unicode/utf8.h>
+
+
+namespace rspamd::css {
+
+/*
+ * CSS parser state holder.
+ * At this stage `consume_input` only unescapes its input: nothing in this
+ * file moves the state to `parse_done`, so `get_object_maybe` yields the
+ * stored (default constructed) error.
+ */
+class css_parser {
+public:
+	css_parser(void) = delete; /* Require mempool to be set for logging */
+	explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
+
+	/* Feeds a chunk of css text to the parser */
+	bool consume_input(const std::string_view &sv);
+
+	/*
+	 * Hands over the parsed style sheet and resets the parser if parsing is
+	 * done, returns the stored error otherwise
+	 */
+	auto get_object_maybe(void) -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> {
+		if (state == parser_state::parse_done) {
+			state = parser_state::initial_state;
+			return std::move (style_object);
+		}
+
+		return tl::make_unexpected (error);
+	}
+
+private:
+	enum class parser_state {
+		initial_state,
+		skip_spaces,
+		parse_selector,
+		ignore_selector, /* e.g. media or namespace */
+		parse_done,
+	};
+	parser_state state = parser_state::initial_state;
+	std::unique_ptr<css_style_sheet> style_object;
+	css_parse_error error;
+	/* Used for allocations and as the tag source in log messages */
+	rspamd_mempool_t *pool;
+
+	/* Helper parser methods */
+	bool need_unescape(const std::string_view &sv);
+
+	std::string_view unescape_css(const std::string_view &sv);
+};
+
+/*
+ * Find if we need to unescape css: returns true as soon as a backslash is
+ * met outside of a quoted string; backslashes inside quotes are only used
+ * to detect an escaped closing quote
+ */
+bool
+css_parser::need_unescape(const std::string_view &sv)
+{
+	bool in_quote = false;
+	/* quote_char is meaningful only while in_quote is set */
+	char quote_char = 0, prev_c = 0;
+
+	for (const auto c : sv) {
+		if (!in_quote) {
+			if (c == '"' || c == '\'') {
+				in_quote = true;
+				quote_char = c;
+			}
+			else if (c == '\\') {
+				return true;
+			}
+		}
+		else {
+			if (c == quote_char) {
+				if (prev_c != '\\') {
+					in_quote = false;
+				}
+			}
+			prev_c = c;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Unescape css escapes
+ * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
+ * \0020AC : must be 6 digits long, no space needed (but can be included)
+ *
+ * Quoted strings are copied verbatim. The result is a view over a buffer
+ * allocated from the parser's memory pool.
+ */
+std::string_view
+css_parser::unescape_css(const std::string_view &sv)
+{
+	/* The output buffer has the size of the input, nleft tracks the free space */
+	auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
+	auto *d = nspace;
+	auto nleft = sv.length ();
+
+	enum {
+		normal = 0,
+		quoted,
+		escape,
+		skip_spaces,
+	} state = normal;
+
+	char quote_char = 0, prev_c = 0;
+	auto escape_offset = 0, i = 0;
+
+	/*
+	 * Handles a character met outside of quotes and escapes: it either
+	 * opens a quoted string, starts an escape (the backslash itself is not
+	 * copied) or is copied to the output as is
+	 */
+	auto maybe_consume_char = [&](char c) {
+		if (c == '"' || c == '\'') {
+			state = quoted;
+			quote_char = c;
+			nleft--;
+			*d++ = c;
+		}
+		else if (c == '\\') {
+			escape_offset = i;
+			state = escape;
+		}
+		else {
+			state = normal;
+			nleft--;
+			*d++ = c;
+		}
+	};
+
+	for (const auto c : sv) {
+		if (nleft == 0) {
+			msg_err_css("cannot unescape css: truncated buffer of size %d",
+					(int)sv.length());
+			break;
+		}
+		switch (state) {
+		case normal:
+			maybe_consume_char(c);
+			break;
+		case quoted:
+			if (c == quote_char) {
+				if (prev_c != '\\') {
+					state = normal;
+				}
+			}
+			prev_c = c;
+			nleft --;
+			*d++ = c;
+			break;
+		case escape:
+			/* Hex digits are just skipped until the escape terminator is met */
+			if (!g_ascii_isxdigit(c)) {
+				if (i > escape_offset + 1) {
+					/* Try to decode an escape */
+					const auto *escape_start = &sv[escape_offset + 1];
+					unsigned long val;
+
+					if (!rspamd_xstrtoul (escape_start, i - escape_offset - 1, &val)) {
+						msg_debug_css("invalid broken escape found at pos %d",
+								escape_offset);
+					}
+					else {
+						if (val < 0x80) {
+							/* Trivial case: ascii character */
+							*d++ = (unsigned char)val;
+							nleft --;
+						}
+						else {
+							UChar32 uc = val;
+							auto off = d - nspace;
+							/*
+							 * The offset is relative to the start of the
+							 * buffer, so the buffer start (and not the
+							 * current write position) must be the base
+							 */
+							UTF8_APPEND_CHAR_SAFE(reinterpret_cast<uint8_t *>(nspace),
+									off, sv.length (), uc);
+							d = nspace + off;
+							nleft = sv.length () - off;
+						}
+					}
+				}
+				else {
+					/* Empty escape, ignore it */
+					msg_debug_css("invalid empty escape found at pos %d",
+							escape_offset);
+				}
+
+				if (nleft == 0) {
+					/* No room left for the character that terminated the escape */
+					msg_err_css("cannot unescape css: truncated buffer of size %d",
+							(int)sv.length());
+				}
+				else {
+					/* Escape is done, advance forward */
+					if (g_ascii_isspace (c)) {
+						state = skip_spaces;
+					}
+					else {
+						maybe_consume_char(c);
+					}
+				}
+			}
+			break;
+		case skip_spaces:
+			if (!g_ascii_isspace(c)) {
+				maybe_consume_char(c);
+			}
+			/* Ignore spaces */
+			break;
+		}
+
+		i ++;
+	}
+
+	/*
+	 * NOTE(review): an escape is decoded only when its terminating character
+	 * is seen, so an escape that runs up to the very end of the input is
+	 * dropped here
+	 */
+
+	return std::string_view{nspace, sv.size() - nleft};
+}
+
+/*
+ * Accepts css text, unescaping it first if that is needed. For now the
+ * (possibly unescaped) text is not processed any further and true is
+ * always returned.
+ */
+bool css_parser::consume_input(const std::string_view &sv)
+{
+	auto our_sv = sv;
+
+	if (need_unescape(sv)) {
+		our_sv = unescape_css(sv);
+		msg_debug_css("unescaped css: input size %d, unescaped size %d",
+				(int)sv.size(), (int)our_sv.size());
+	}
+
+	return true;
+}
+
+/*
+ * Wrapper for the parser: runs a parser bound to `pool` over `st` and
+ * returns either the resulting style sheet or the parser's error
+ */
+auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
+	tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>
+{
+	css_parser parser(pool);
+
+	parser.consume_input(st);
+
+	return parser.get_object_maybe();
+}
+
+}
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
new file mode 100644
index 000000000..8d1468a01
--- /dev/null
+++ b/src/libserver/css/css_parser.hxx
@@ -0,0 +1,34 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CSS_PARSER_HXX
+#define RSPAMD_CSS_PARSER_HXX
+
+#include "css.hxx"
+#include "parse_error.hxx"
+#include "contrib/expected/expected.hpp"
+#include "logger.h"
+
+namespace rspamd::css {
+
+/* Parses css text `st` using `pool` (memory pool, also required for logging);
+ * returns the resulting style sheet or a parse error */
+auto parse_css (rspamd_mempool_t *pool, const std::string_view &st) ->
+	tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>;
+
+}
+
+#endif //RSPAMD_CSS_PARSER_HXX
diff --git a/src/libserver/css/parse_error.hxx b/src/libserver/css/parse_error.hxx
index 60b229181..12ad697eb 100644
--- a/src/libserver/css/parse_error.hxx
+++ b/src/libserver/css/parse_error.hxx
@@ -34,13 +34,14 @@ enum class css_parse_error_type {
 };
 
 struct css_parse_error {
-	css_parse_error_type type;
+	css_parse_error_type type = css_parse_error_type::PARSE_ERROR_UNKNOWN_ERROR;
 	std::optional<std::string> description;
 
 	explicit css_parse_error (css_parse_error_type type, const std::string &description) :
 		type(type), description(description) {}
 	explicit css_parse_error (css_parse_error_type type) :
 		type(type) {}
+	css_parse_error() = default;
 };
 
 }
-- 
2.39.5