From: Vsevolod Stakhov Date: Mon, 25 Jan 2021 14:35:41 +0000 (+0000) Subject: [Project] Css: rework tokeniser X-Git-Tag: 3.0~731 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=50e3e98a741cf2946ec0b3e4cf396d53cc9e4ae4;p=rspamd.git [Project] Css: rework tokeniser --- diff --git a/src/libserver/css/CMakeLists.txt b/src/libserver/css/CMakeLists.txt index c8f7921b1..84ed2cf8b 100644 --- a/src/libserver/css/CMakeLists.txt +++ b/src/libserver/css/CMakeLists.txt @@ -14,6 +14,8 @@ SET(LIBCSSSRC "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_tokeniser.cxx" + "${CMAKE_CURRENT_SOURCE_DIR}/css_util.cxx" "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx" "${RAGEL_ragel_css_selector_parser_OUTPUTS}" "${RAGEL_ragel_css_rule_parser_OUTPUTS}" diff --git a/src/libserver/css/css.hxx b/src/libserver/css/css.hxx index 8f2550d7b..1a511dcfd 100644 --- a/src/libserver/css/css.hxx +++ b/src/libserver/css/css.hxx @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ +#pragma once + #ifndef RSPAMD_CSS_HXX #define RSPAMD_CSS_HXX diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx index 4134b933c..207cfcb9d 100644 --- a/src/libserver/css/css_parser.cxx +++ b/src/libserver/css/css_parser.cxx @@ -15,6 +15,7 @@ */ #include "css_parser.hxx" +#include "css_tokeniser.hxx" #include @@ -36,9 +37,6 @@ public: return tl::make_unexpected (error); } - /* Public for unit tests */ - std::string_view unescape_css(const std::string_view &sv); - private: enum class parser_state { initial_state, @@ -49,6 +47,7 @@ private: }; parser_state state = parser_state::initial_state; std::unique_ptr style_object; + css_parse_error error; rspamd_mempool_t *pool; @@ -88,136 +87,26 @@ css_parser::need_unescape(const std::string_view &sv) return false; } -/* - * Unescape css escapes - * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9 - * \0020AC : must be 6 digits long, no space needed (but can be included) - */ -std::string_view -css_parser::unescape_css(const std::string_view &sv) -{ - auto *nspace = reinterpret_cast(rspamd_mempool_alloc(pool, sv.length ())); - auto *d = nspace; - auto nleft = sv.length (); - enum { - normal = 0, - quoted, - escape, - skip_spaces, - } state = normal; - - char quote_char, prev_c = 0; - auto escape_offset = 0, i = 0; - -#define MAYBE_CONSUME_CHAR(c) do { \ - if (c == '"' || c == '\'') { \ - state = quoted; \ - quote_char = c; \ - nleft--; \ - *d++ = c; \ - } \ - else if (c == '\\') { \ - escape_offset = i; \ - state = escape; \ - } \ - else { \ - state = normal; \ - nleft--; \ - *d++ = c; \ - } \ -} while (0) - - for (const auto c : sv) { - if (nleft == 0) { - msg_err_css("cannot unescape css: truncated buffer of size %d", - (int)sv.length()); - break; - } - switch (state) { - case normal: - MAYBE_CONSUME_CHAR(c); - break; - case quoted: - if (c == quote_char) { - if (prev_c != '\\') { - state = normal; - } - } - prev_c = c; - nleft --; - *d++ = c; - break; - 
case escape: - if (!g_ascii_isxdigit(c)) { - if (i > escape_offset + 1) { - /* Try to decode an escape */ - const auto *escape_start = &sv[escape_offset + 1]; - unsigned long val; +bool css_parser::consume_input(const std::string_view &sv) +{ + bool eof = false; + css_tokeniser css_tokeniser(pool, sv); - if (!rspamd_xstrtoul(escape_start, i - escape_offset - 1, &val)) { - msg_debug_css("invalid broken escape found at pos %d", - escape_offset); - } - else { - if (val < 0x80) { - /* Trivial case: ascii character */ - *d++ = (unsigned char)val; - nleft --; - } - else { - UChar32 uc = val; - auto off = 0; - UTF8_APPEND_CHAR_SAFE((uint8_t *) d, off, - sv.length (), uc); - d += off; - nleft -= off; - } - } - } - else { - /* Empty escape, ignore it */ - msg_debug_css("invalid empty escape found at pos %d", - escape_offset); - } + while (!eof) { + auto token_pair = css_tokeniser.next_token(); - if (nleft <= 0) { - msg_err_css("cannot unescape css: truncated buffer of size %d", - (int)sv.length()); - } - else { - /* Escape is done, advance forward */ - if (g_ascii_isspace (c)) { - state = skip_spaces; - } - else { - MAYBE_CONSUME_CHAR(c); - } - } - } + /* Top level parser */ + switch (token_pair.first) { + case css_parser_token::eof_token: + eof = true; break; - case skip_spaces: - if (!g_ascii_isspace(c)) { - MAYBE_CONSUME_CHAR(c); - } - /* Ignore spaces */ + case css_parser_token::whitespace_token: + case css_parser_token::cdc_token: + case css_parser_token::cdo_token: + /* Ignore tokens */ break; } - - i ++; - } - - return std::string_view{nspace, sv.size() - nleft}; -}; - -bool css_parser::consume_input(const std::string_view &sv) -{ - auto our_sv = sv; - - if (need_unescape(sv)) { - our_sv = unescape_css(sv); - msg_debug_css("unescaped css: input size %d, unescaped size %d", - (int)sv.size(), (int)our_sv.size()); } return true; @@ -237,20 +126,3 @@ auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) -> } } - -/* C API */ -const gchar *rspamd_css_unescape 
(rspamd_mempool_t *pool, - const guchar *begin, - gsize len, - gsize *outlen) -{ - rspamd::css::css_parser parser(pool); - auto sv = parser.unescape_css({(const char*)begin, len}); - const auto *v = sv.begin(); - - if (outlen) { - *outlen = sv.size(); - } - - return v; -} diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx index 8d1468a01..e009fef70 100644 --- a/src/libserver/css/css_parser.hxx +++ b/src/libserver/css/css_parser.hxx @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #ifndef RSPAMD_CSS_PARSER_HXX #define RSPAMD_CSS_PARSER_HXX diff --git a/src/libserver/css/css_property.hxx b/src/libserver/css/css_property.hxx index 06a345ad4..2e668c640 100644 --- a/src/libserver/css/css_property.hxx +++ b/src/libserver/css/css_property.hxx @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once #ifndef RSPAMD_CSS_PROPERTY_HXX #define RSPAMD_CSS_PROPERTY_HXX diff --git a/src/libserver/css/css_rule.hxx b/src/libserver/css/css_rule.hxx index 878322f78..6afaa8bc6 100644 --- a/src/libserver/css/css_rule.hxx +++ b/src/libserver/css/css_rule.hxx @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#pragma once #ifndef RSPAMD_CSS_RULE_HXX #define RSPAMD_CSS_RULE_HXX diff --git a/src/libserver/css/css_selector.hxx b/src/libserver/css/css_selector.hxx index 4c12b3b41..c9f3046d5 100644 --- a/src/libserver/css/css_selector.hxx +++ b/src/libserver/css/css_selector.hxx @@ -14,6 +14,7 @@ * limitations under the License. */ +#pragma once #ifndef RSPAMD_CSS_SELECTOR_HXX #define RSPAMD_CSS_SELECTOR_HXX diff --git a/src/libserver/css/css_style.hxx b/src/libserver/css/css_style.hxx index f3d1e664d..2a97f8f0e 100644 --- a/src/libserver/css/css_style.hxx +++ b/src/libserver/css/css_style.hxx @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#pragma once + #ifndef RSPAMD_CSS_STYLE_HXX #define RSPAMD_CSS_STYLE_HXX diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx new file mode 100644 index 000000000..40f202b01 --- /dev/null +++ b/src/libserver/css/css_tokeniser.cxx @@ -0,0 +1,183 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_tokeniser.hxx" +#include "css_util.hxx" + +namespace rspamd::css { + + +auto css_tokeniser::next_token (void) -> std::pair +{ + /* Helpers */ + + /* + * This lambda eats comment handling nested comments; + * offset is set to the next character after a comment (or eof) + * Nothing is returned + */ + auto consume_comment = [this] () { + auto i = offset; + auto nested = 0; + + /* We handle nested comments just because they can exist... */ + while (i < input.size () - 1) { + auto c = input[i]; + if (c == '*' && input[i + 1] == '/') { + if (nested == 0) { + offset = i + 2; + return; + } + else { + nested--; + i += 2; + continue; + } + } + else if (c == '/' && input[i + 1] == '*') { + nested++; + i += 2; + continue; + } + + i++; + } + + offset = i; + }; + + /* + * Consume quoted string, returns a string_view over a string, offset + * is set one character after the string. 
Css unescaping is done automatically + * Accepts a quote char to find end of string + */ + auto consume_string = [this] (auto quote_char) -> auto { + auto i = offset; + bool need_unescape = false; + + while (i < input.size ()) { + auto c = input[i]; + + if (c == '\\') { + if (i + 1 < input.size ()) { + need_unescape = true; + } + else { + /* \ at the end -> ignore */ + + } + } + else if (c == quote_char) { + /* End of string */ + std::string_view res{&input[offset], i - offset}; + + if (need_unescape) { + res = rspamd::css::unescape_css(pool, res); + } + + offset = i + 1; + + return res; + } + else if (c == '\n') { + /* Should be a error, but we ignore it for now */ + } + } + + /* EOF with no quote character, consider it fine */ + std::string_view res{&input[offset], i - offset}; + + if (need_unescape) { + res = rspamd::css::unescape_css(pool, res); + } + + offset = i; + + return res; + }; + + /* Main tokenisation loop */ + for (auto i = offset; i < input.size (); ++i) { + auto c = input[i]; + + switch (c) { + case '/': + if (i + 1 < input.size () && input[i + 1] == '*') { + offset = i + 2; + consume_comment (); /* Consume comment and go forward */ + return next_token (); /* Tail call */ + } + else { + offset = i + 1; + return std::make_pair (css_parser_token::delim_token, + std::string_view (&input[offset - 1], 1)); + } + break; + case ' ': + case '\t': + case '\n': + case '\r': + case '\v': { + /* Consume as much space as we can */ + do { + c = input[++i]; + } while (i < input.size () && g_ascii_isspace (c)); + + auto ret = std::make_pair (css_parser_token::whitespace_token, + std::string_view (&input[offset], i - offset)); + offset = i; + return ret; + } + case '"': + case '\'': + offset = i + 1; + return std::make_pair (css_parser_token::string_token, + consume_string (c)); + case '(': + offset = i + 1; + return std::make_pair (css_parser_token::obrace_token, + std::string_view (&input[offset - 1], 1)); + case ')': + offset = i + 1; + return std::make_pair 
(css_parser_token::ebrace_token, + std::string_view (&input[offset - 1], 1)); + case ',': + offset = i + 1; + return std::make_pair (css_parser_token::comma_token, + std::string_view (&input[offset - 1], 1)); + case '<': + /* Maybe an xml like comment */ + if (i + 3 < input.size () && input[i + 1] == '!' + && input[i + 2] == '-' && input[i + 3] == '-') { + offset += 3; + + return std::make_pair (css_parser_token::cdo_token, + std::string_view (&input[offset - 3], 3)); + } + else { + offset = i + 1; + return std::make_pair (css_parser_token::delim_token, + std::string_view (&input[offset - 1], 1)); + } + break; + } + + } + + return std::make_pair (css_parser_token::eof_token, std::string_view ()); +} + +} \ No newline at end of file diff --git a/src/libserver/css/css_tokeniser.hxx b/src/libserver/css/css_tokeniser.hxx new file mode 100644 index 000000000..4c6824389 --- /dev/null +++ b/src/libserver/css/css_tokeniser.hxx @@ -0,0 +1,68 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#ifndef RSPAMD_CSS_TOKENISER_HXX
+#define RSPAMD_CSS_TOKENISER_HXX
+
+#include <string_view>
+#include <utility>
+#include "mem_pool.h"
+
+namespace rspamd::css {
+
+enum class css_parser_token { /* token kinds returned by css_tokeniser::next_token */
+	whitespace_token,
+	ident_token,
+	function_token,
+	at_keyword_token,
+	hash_token,
+	string_token,
+	number_token,
+	url_token,
+	dimension_token,
+	percentage_token,
+	cdo_token, /* xml open comment */
+	cdc_token, /* xml close comment */
+	delim_token,
+	obrace_token, /* ( */
+	ebrace_token, /* ) */
+	osqbrace_token, /* [ */
+	esqbrace_token, /* ] */
+	comma_token,
+	colon_token,
+	semicolon_token,
+	eof_token,
+};
+
+class css_tokeniser { /* yields the tokens of a css text, one per next_token() call */
+public:
+	css_tokeniser() = delete;
+	css_tokeniser(rspamd_mempool_t *pool, const std::string_view &sv) :
+			input(sv), offset(0), pool(pool) {}
+
+	auto next_token(void) -> std::pair<css_parser_token, std::string_view>;
+private:
+	std::string_view input; /* text being tokenised, not owned */
+	std::size_t offset; /* position of the next unread character */
+	rspamd_mempool_t *pool; /* handed to unescape_css for string tokens */
+};
+
+}
+
+
+#endif //RSPAMD_CSS_TOKENISER_HXX
diff --git a/src/libserver/css/css_util.cxx b/src/libserver/css/css_util.cxx
new file mode 100644
index 000000000..7388e49fd
--- /dev/null
+++ b/src/libserver/css/css_util.cxx
@@ -0,0 +1,167 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_util.hxx"
+#include "css.hxx"
+#include <unicode/utf8.h>
+
+namespace rspamd::css {
+
+std::string_view unescape_css(rspamd_mempool_t *pool,
+		const std::string_view &sv)
+{
+	auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
+	auto *d = nspace;
+	auto nleft = sv.length ();
+
+	enum {
+		normal = 0,
+		quoted,
+		escape,
+		skip_spaces,
+	} state = normal;
+
+	char quote_char = 0, prev_c = 0;
+	auto escape_offset = 0, i = 0;
+
+#define MAYBE_CONSUME_CHAR(c) do { \
+	if (c == '"' || c == '\'') { \
+		state = quoted; \
+		quote_char = c; \
+		nleft--; \
+		*d++ = c; \
+	} \
+	else if (c == '\\') { \
+		escape_offset = i; \
+		state = escape; \
+	} \
+	else { \
+		state = normal; \
+		nleft--; \
+		*d++ = c; \
+	} \
+} while (0)
+
+	/* Decodes the hex digits in sv[escape_offset + 1, end) and appends the result */
+	auto flush_escape = [&] (int end) {
+		if (end > escape_offset + 1) {
+			/* Try to decode an escape */
+			const auto *escape_start = &sv[escape_offset + 1];
+			unsigned long val;
+
+			if (!rspamd_xstrtoul(escape_start, end - escape_offset - 1, &val)) {
+				msg_debug_css("invalid broken escape found at pos %d",
+						escape_offset);
+			}
+			else if (val < 0x80) {
+				/* Trivial case: ascii character */
+				*d++ = (unsigned char)val;
+				nleft --;
+			}
+			else {
+				UChar32 uc = val;
+				auto off = 0;
+				/* Capacity is the space left after d, not the whole buffer */
+				UTF8_APPEND_CHAR_SAFE((uint8_t *) d, off, nleft, uc);
+				d += off;
+				nleft -= off;
+			}
+		}
+		else {
+			/* Empty escape, ignore it */
+			msg_debug_css("invalid empty escape found at pos %d",
+					escape_offset);
+		}
+	};
+
+	for (const auto c : sv) {
+		if (nleft == 0) {
+			msg_err_css("cannot unescape css: truncated buffer of size %d",
+					(int)sv.length());
+			break;
+		}
+		switch (state) {
+		case normal:
+			MAYBE_CONSUME_CHAR(c);
+			break;
+		case quoted:
+			if (c == quote_char) {
+				if (prev_c != '\\') {
+					state = normal;
+				}
+			}
+			prev_c = c;
+			nleft --;
+			*d++ = c;
+			break;
+		case escape:
+			if (!g_ascii_isxdigit(c)) {
+				flush_escape (i);
+
+				/* nleft is unsigned, so it can only be tested against zero */
+				if (nleft == 0) {
+					msg_err_css("cannot unescape css: truncated buffer of size %d",
+							(int)sv.length());
+				}
+				else {
+					/* Escape is done, advance forward */
+					if (g_ascii_isspace (c)) {
+						state = skip_spaces;
+					}
+					else {
+						MAYBE_CONSUME_CHAR(c);
+					}
+				}
+			}
+			break;
+		case skip_spaces:
+			if (!g_ascii_isspace(c)) {
+				MAYBE_CONSUME_CHAR(c);
+			}
+			/* Ignore spaces */
+			break;
+		}
+
+		i ++;
+	}
+
+	if (state == escape && nleft > 0) {
+		/* Input ended inside an escape: decode the digits collected so far */
+		flush_escape (i);
+	}
+
+#undef MAYBE_CONSUME_CHAR
+
+	return std::string_view{nspace, sv.size() - nleft};
+}
+
+}
+
+/* C API */
+const gchar *rspamd_css_unescape (rspamd_mempool_t *pool,
+		const guchar *begin,
+		gsize len,
+		gsize *outlen)
+{
+	auto sv = rspamd::css::unescape_css(pool, {(const char*)begin, len});
+	const auto *v = sv.data();
+
+	if (outlen) {
+		*outlen = sv.size();
+	}
+
+	return v;
+}
\ No newline at end of file
diff --git a/src/libserver/css/css_util.hxx b/src/libserver/css/css_util.hxx
new file mode 100644
index 000000000..5daf9ee82
--- /dev/null
+++ b/src/libserver/css/css_util.hxx
@@ -0,0 +1,37 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#ifndef RSPAMD_CSS_UTIL_HXX
+#define RSPAMD_CSS_UTIL_HXX
+
+#include <string_view>
+#include "mem_pool.h"
+
+namespace rspamd::css {
+
+/*
+ * Unescape css escapes in sv; the returned view points to memory allocated from pool
+ * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
+ * \0020AC : must be 6 digits long, no space needed (but can be included)
+ */
+std::string_view unescape_css(rspamd_mempool_t *pool,
+		const std::string_view &sv);
+
+}
+
+#endif //RSPAMD_CSS_UTIL_HXX
diff --git a/src/libserver/css/css_value.hxx b/src/libserver/css/css_value.hxx index fca0e0433..302eb945b 100644 --- a/src/libserver/css/css_value.hxx +++ b/src/libserver/css/css_value.hxx @@ -14,6 +14,8 @@ * limitations under the License. */ +#pragma once + #ifndef RSPAMD_CSS_VALUE_HXX #define RSPAMD_CSS_VALUE_HXX diff --git a/src/libserver/css/parse_error.hxx b/src/libserver/css/parse_error.hxx index 12ad697eb..0a2cbc750 100644 --- a/src/libserver/css/parse_error.hxx +++ b/src/libserver/css/parse_error.hxx @@ -14,6 +14,7 @@ * limitations under the License. */ +#pragma once #ifndef RSPAMD_PARSE_ERROR_HXX #define RSPAMD_PARSE_ERROR_HXX