diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-01-25 14:35:41 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-01-25 16:35:38 +0000 |
commit | 50e3e98a741cf2946ec0b3e4cf396d53cc9e4ae4 (patch) | |
tree | dda32e0cdc28e10ad32abc51a688b8bfa29aff59 /src/libserver/css/css_tokeniser.cxx | |
parent | c3b1d136a7f547896e8c9b83a2160f40477c1fa4 (diff) | |
download | rspamd-50e3e98a741cf2946ec0b3e4cf396d53cc9e4ae4.tar.gz rspamd-50e3e98a741cf2946ec0b3e4cf396d53cc9e4ae4.zip |
[Project] Css: rework tokeniser
Diffstat (limited to 'src/libserver/css/css_tokeniser.cxx')
-rw-r--r-- | src/libserver/css/css_tokeniser.cxx | 183 |
1 files changed, 183 insertions, 0 deletions
diff --git a/src/libserver/css/css_tokeniser.cxx b/src/libserver/css/css_tokeniser.cxx new file mode 100644 index 000000000..40f202b01 --- /dev/null +++ b/src/libserver/css/css_tokeniser.cxx @@ -0,0 +1,183 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "css_tokeniser.hxx" +#include "css_util.hxx" + +namespace rspamd::css { + + +auto css_tokeniser::next_token (void) -> std::pair<css_parser_token, std::string_view> +{ + /* Helpers */ + + /* + * This lambda eats comment handling nested comments; + * offset is set to the next character after a comment (or eof) + * Nothing is returned + */ + auto consume_comment = [this] () { + auto i = offset; + auto nested = 0; + + /* We handle nested comments just because they can exist... */ + while (i < input.size () - 1) { + auto c = input[i]; + if (c == '*' && input[i + 1] == '/') { + if (nested == 0) { + offset = i + 2; + return; + } + else { + nested--; + i += 2; + continue; + } + } + else if (c == '/' && input[i + 1] == '*') { + nested++; + i += 2; + continue; + } + + i++; + } + + offset = i; + }; + + /* + * Consume quoted string, returns a string_view over a string, offset + * is set one character after the string. Css unescaping is done automatically + * Accepts a quote char to find end of string + */ + auto consume_string = [this] (auto quote_char) -> auto { + auto i = offset; + bool need_unescape = false; + + while (i < input.size ()) { + auto c = input[i]; + + if (c == '\\') { + if (i + 1 < input.size ()) { + need_unescape = true; + } + else { + /* \ at the end -> ignore */ + + } + } + else if (c == quote_char) { + /* End of string */ + std::string_view res{&input[offset], i - offset}; + + if (need_unescape) { + res = rspamd::css::unescape_css(pool, res); + } + + offset = i + 1; + + return res; + } + else if (c == '\n') { + /* Should be a error, but we ignore it for now */ + } + } + + /* EOF with no quote character, consider it fine */ + std::string_view res{&input[offset], i - offset}; + + if (need_unescape) { + res = rspamd::css::unescape_css(pool, res); + } + + offset = i; + + return res; + }; + + /* Main tokenisation loop */ + for (auto i = offset; i < input.size (); ++i) { + auto c = input[i]; + + switch (c) { + case '/': + if (i + 1 < input.size () && input[i + 1] == '*') { + offset = i + 2; + consume_comment (); /* Consume comment and go forward */ + return next_token (); /* Tail call */ + } + else { + offset = i + 1; + return std::make_pair (css_parser_token::delim_token, + std::string_view (&input[offset - 1], 1)); + } + break; + case ' ': + case '\t': + case '\n': + case '\r': + case '\v': { + /* Consume as much space as we can */ + do { + c = input[++i]; + } while (i < input.size () && g_ascii_isspace (c)); + + auto ret = std::make_pair (css_parser_token::whitespace_token, + std::string_view (&input[offset], i - offset)); + offset = i; + return ret; + } + case '"': + case '\'': + offset = i + 1; + return std::make_pair (css_parser_token::string_token, + consume_string (c)); + case '(': + offset = i + 1; + return std::make_pair (css_parser_token::obrace_token, + std::string_view (&input[offset - 1], 1)); + case ')': + offset = i + 1; + return std::make_pair (css_parser_token::ebrace_token, + std::string_view (&input[offset - 1], 1)); + case ',': + offset = i + 1; + return std::make_pair (css_parser_token::comma_token, + std::string_view (&input[offset - 1], 1)); + case '<': + /* Maybe an xml like comment */ + if (i + 3 < input.size () && input[i + 1] == '!' + && input[i + 2] == '-' && input[i + 3] == '-') { + offset += 3; + + return std::make_pair (css_parser_token::cdo_token, + std::string_view (&input[offset - 3], 3)); + } + else { + offset = i + 1; + return std::make_pair (css_parser_token::delim_token, + std::string_view (&input[offset - 1], 1)); + } + break; + } + + } + + return std::make_pair (css_parser_token::eof_token, std::string_view ()); +} + +}
\ No newline at end of file |