123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836 |
- /*-
- * Copyright 2021 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
- #include "css_tokeniser.hxx"
- #include "css_util.hxx"
- #include "css.hxx"
- #include "frozen/unordered_map.h"
- #include "frozen/string.h"
- #include <string>
- #include <cmath>
-
- namespace rspamd::css {
-
- /* Helpers to create tokens */
-
- /*
- * This helper is intended to create tokens either with a tag and value
- * or with just a tag.
- */
- template<css_parser_token::token_type T, class Arg>
- auto make_token(const Arg &arg) -> css_parser_token;
-
- template<>
- auto make_token<css_parser_token::token_type::string_token, std::string_view>(const std::string_view &s)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::string_token, s};
- }
-
- template<>
- auto make_token<css_parser_token::token_type::ident_token, std::string_view>(const std::string_view &s)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::ident_token, s};
- }
-
- template<>
- auto make_token<css_parser_token::token_type::function_token, std::string_view>(const std::string_view &s)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::function_token, s};
- }
-
- template<>
- auto make_token<css_parser_token::token_type::url_token, std::string_view>(const std::string_view &s)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::url_token, s};
- }
-
- template<>
- auto make_token<css_parser_token::token_type::whitespace_token, std::string_view>(const std::string_view &s)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::whitespace_token, s};
- }
-
- template<>
- auto make_token<css_parser_token::token_type::delim_token, char>(const char &c)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::delim_token, c};
- }
-
- template<>
- auto make_token<css_parser_token::token_type::number_token, float>(const float &d)
- -> css_parser_token
- {
- return css_parser_token{css_parser_token::token_type::number_token, d};
- }
-
- /*
- * Generic tokens with no value (non-terminals)
- */
- template<css_parser_token::token_type T>
- auto make_token(void) -> css_parser_token
- {
- return css_parser_token{T, css_parser_token_placeholder()};
- }
-
- static constexpr inline auto is_plain_ident_start(char c) -> bool
- {
- if ((c & 0x80) || g_ascii_isalpha(c) || c == '_') {
- return true;
- }
-
- return false;
- };
-
- static constexpr inline auto is_plain_ident(char c) -> bool
- {
- if (is_plain_ident_start(c) || c == '-' || g_ascii_isdigit(c)) {
- return true;
- }
-
- return false;
- };
-
- struct css_dimension_data {
- css_parser_token::dim_type dtype;
- double mult;
- };
-
- /*
- * Maps from css dimensions to the multipliers that look reasonable in email
- */
- constexpr const auto max_dims = static_cast<int>(css_parser_token::dim_type::dim_max);
- constexpr frozen::unordered_map<frozen::string, css_dimension_data, max_dims> dimensions_map{
- {"px", {css_parser_token::dim_type::dim_px, 1.0}},
- /* EM/REM are 16 px, so multiply and round */
- {"em", {css_parser_token::dim_type::dim_em, 16.0}},
- {"rem", {css_parser_token::dim_type::dim_rem, 16.0}},
- /*
- * Represents the x-height of the element's font.
- * On fonts with the "x" letter, this is generally the height
- * of lowercase letters in the font; 1ex = 0.5em in many fonts.
- */
- {"ex", {css_parser_token::dim_type::dim_ex, 8.0}},
- {"wv", {css_parser_token::dim_type::dim_wv, 8.0}},
- {"wh", {css_parser_token::dim_type::dim_wh, 6.0}},
- {"vmax", {css_parser_token::dim_type::dim_vmax, 8.0}},
- {"vmin", {css_parser_token::dim_type::dim_vmin, 6.0}},
- /* One point. 1pt = 1/72nd of 1in */
- {"pt", {css_parser_token::dim_type::dim_pt, 96.0 / 72.0}},
- /* 96px/2.54 */
- {"cm", {css_parser_token::dim_type::dim_cm, 96.0 / 2.54}},
- {"mm", {css_parser_token::dim_type::dim_mm, 9.60 / 2.54}},
- {"in", {css_parser_token::dim_type::dim_in, 96.0}},
- /* 1pc = 12pt = 1/6th of 1in. */
- {"pc", {css_parser_token::dim_type::dim_pc, 96.0 / 6.0}}};
-
- auto css_parser_token::adjust_dim(const css_parser_token &dim_token) -> bool
- {
- if (!std::holds_alternative<float>(value) ||
- !std::holds_alternative<std::string_view>(dim_token.value)) {
- /* Invalid tokens */
- return false;
- }
-
- auto num = std::get<float>(value);
- auto sv = std::get<std::string_view>(dim_token.value);
-
- auto dim_found = find_map(dimensions_map, sv);
-
- if (dim_found) {
- auto dim_elt = dim_found.value().get();
- dimension_type = dim_elt.dtype;
- flags |= css_parser_token::number_dimension;
- num *= dim_elt.mult;
- }
- else {
- flags |= css_parser_token::flag_bad_dimension;
-
- return false;
- }
-
- value = num;
-
- return true;
- }
-
-
- /*
- * Consume functions: return a token and advance lexer offset
- */
- auto css_tokeniser::consume_ident(bool allow_number) -> struct css_parser_token {
- auto i = offset;
- auto need_escape = false;
- auto allow_middle_minus = false;
-
- auto maybe_escape_sv = [&](auto cur_pos, auto tok_type) -> auto {
- if (need_escape) {
- auto escaped = rspamd::css::unescape_css(pool, {&input[offset],
- cur_pos - offset});
- offset = cur_pos;
-
- return css_parser_token{tok_type, escaped};
- }
-
- auto result = std::string_view{&input[offset], cur_pos - offset};
- offset = cur_pos;
-
- return css_parser_token{tok_type, result};
- };
-
- /* Ident token can start from `-` or `--` */
- if (input[i] == '-') {
- i++;
-
- if (i < input.size() && input[i] == '-') {
- i++;
- allow_middle_minus = true;
- }
- }
-
- while (i < input.size()) {
- auto c = input[i];
-
- auto is_plain_c = (allow_number || allow_middle_minus) ? is_plain_ident(c) : is_plain_ident_start(c);
- if (!is_plain_c) {
- if (c == '\\' && i + 1 < input.size()) {
- /* Escape token */
- need_escape = true;
- auto nhex = 0;
-
- /* Need to find an escape end */
- do {
- c = input[++i];
- if (g_ascii_isxdigit(c)) {
- nhex++;
-
- if (nhex > 6) {
- /* End of the escape */
- break;
- }
- }
- else if (nhex > 0 && c == ' ') {
- /* \[hex]{1,6} */
- i++; /* Skip one space */
- break;
- }
- else {
- /* Single \ + char */
- break;
- }
- } while (i < input.size());
- }
- else if (c == '(') {
- /* Function or url token */
- auto j = i + 1;
-
- while (j < input.size() && g_ascii_isspace(input[j])) {
- j++;
- }
-
- if (input.size() - offset > 3 && input.substr(offset, 3) == "url") {
- if (j < input.size() && (input[j] == '"' || input[j] == '\'')) {
- /* Function token */
- auto ret = maybe_escape_sv(i,
- css_parser_token::token_type::function_token);
- return ret;
- }
- else {
- /* Consume URL token */
- while (j < input.size() && input[j] != ')') {
- j++;
- }
-
- if (j < input.size() && input[j] == ')') {
- /* Valid url token */
- auto ret = maybe_escape_sv(j + 1,
- css_parser_token::token_type::url_token);
- return ret;
- }
- else {
- /* Incomplete url token */
- auto ret = maybe_escape_sv(j,
- css_parser_token::token_type::url_token);
-
- ret.flags |= css_parser_token::flag_bad_string;
- return ret;
- }
- }
- }
- else {
- auto ret = maybe_escape_sv(i,
- css_parser_token::token_type::function_token);
- return ret;
- }
- }
- else if (c == '-' && allow_middle_minus) {
- i++;
- continue;
- }
- else {
- break; /* Not an ident token */
- }
- } /* !plain ident */
- else {
- allow_middle_minus = true;
- }
-
- i++;
- }
-
- return maybe_escape_sv(i, css_parser_token::token_type::ident_token);
- }
-
- auto
- css_tokeniser::consume_number() -> struct css_parser_token {
- auto i = offset;
- auto seen_dot = false, seen_exp = false;
-
- if (input[i] == '-' || input[i] == '+') {
- i++;
- }
- if (input[i] == '.' && i < input.size()) {
- seen_dot = true;
- i++;
- }
-
- while (i < input.size()) {
- auto c = input[i];
-
- if (!g_ascii_isdigit(c)) {
- if (c == '.') {
- if (!seen_dot) {
- seen_dot = true;
- }
- else {
- break;
- }
- }
- else if (c == 'e' || c == 'E') {
- if (!seen_exp) {
- seen_exp = true;
- seen_dot = true; /* dots are not allowed after e */
-
- if (i + 1 < input.size()) {
- auto next_c = input[i + 1];
- if (next_c == '+' || next_c == '-') {
- i++;
- }
- else if (!g_ascii_isdigit(next_c)) {
- /* Not an exponent */
- break;
- }
- }
- else {
- /* Not an exponent */
- break;
- }
- }
- else {
- break;
- }
- }
- else {
- break;
- }
- }
-
- i++;
- }
-
- if (i > offset) {
- /* I wish it was supported properly */
- //auto conv_res = std::from_chars(&input[offset], &input[i], num);
- char numbuf[128], *endptr = nullptr;
- rspamd_strlcpy(numbuf, &input[offset], MIN(i - offset + 1, sizeof(numbuf)));
- auto num = g_ascii_strtod(numbuf, &endptr);
- offset = i;
-
- if (fabs(num) >= G_MAXFLOAT || std::isnan(num)) {
- msg_debug_css("invalid number: %s", numbuf);
- return make_token<css_parser_token::token_type::delim_token>(input[i - 1]);
- }
- else {
-
- auto ret = make_token<css_parser_token::token_type::number_token>(static_cast<float>(num));
-
- if (i < input.size()) {
- if (input[i] == '%') {
- ret.flags |= css_parser_token::number_percent;
- i++;
-
- offset = i;
- }
- else if (is_plain_ident_start(input[i])) {
- auto dim_token = consume_ident();
-
- if (dim_token.type == css_parser_token::token_type::ident_token) {
- if (!ret.adjust_dim(dim_token)) {
- auto sv = std::get<std::string_view>(dim_token.value);
- msg_debug_css("cannot apply dimension from the token %*s; number value = %.1f",
- (int) sv.size(), sv.begin(), num);
- /* Unconsume ident */
- offset = i;
- }
- }
- else {
- /* We have no option but to uncosume ident token in this case */
- msg_debug_css("got invalid ident like token after number, unconsume it");
- }
- }
- else {
- /* Plain number, nothing to do */
- }
- }
-
- return ret;
- }
- }
- else {
- msg_err_css("internal error: invalid number, empty token");
- i++;
- }
-
- offset = i;
- /* Should not happen */
- return make_token<css_parser_token::token_type::delim_token>(input[i - 1]);
- }
-
- /*
- * Main routine to produce lexer tokens
- */
- auto
- css_tokeniser::next_token(void) -> struct css_parser_token {
- /* Check pushback queue */
- if (!backlog.empty()) {
- auto tok = backlog.front();
- backlog.pop_front();
-
- return tok;
- }
- /* Helpers */
-
- /*
- * This lambda eats comment handling nested comments;
- * offset is set to the next character after a comment (or eof)
- * Nothing is returned
- */
- auto consume_comment = [this]() {
- auto i = offset;
- auto nested = 0;
-
- if (input.empty()) {
- /* Nothing to consume */
- return;
- }
-
- /* We handle nested comments just because they can exist... */
- while (i < input.size() - 1) {
- auto c = input[i];
- if (c == '*' && input[i + 1] == '/') {
- if (nested == 0) {
- offset = i + 2;
- return;
- }
- else {
- nested--;
- i += 2;
- continue;
- }
- }
- else if (c == '/' && input[i + 1] == '*') {
- nested++;
- i += 2;
- continue;
- }
-
- i++;
- }
-
- offset = i;
- };
-
- /*
- * Consume quoted string, returns a string_view over a string, offset
- * is set one character after the string. Css unescaping is done automatically
- * Accepts a quote char to find end of string
- */
- auto consume_string = [this](auto quote_char) -> auto {
- auto i = offset;
- bool need_unescape = false;
-
- while (i < input.size()) {
- auto c = input[i];
-
- if (c == '\\') {
- if (i + 1 < input.size()) {
- need_unescape = true;
- }
- else {
- /* \ at the end -> ignore */
- }
- }
- else if (c == quote_char) {
- /* End of string */
- std::string_view res{&input[offset], i - offset};
-
- if (need_unescape) {
- res = rspamd::css::unescape_css(pool, res);
- }
-
- offset = i + 1;
-
- return res;
- }
- else if (c == '\n') {
- /* Should be a error, but we ignore it for now */
- }
-
- i++;
- }
-
- /* EOF with no quote character, consider it fine */
- std::string_view res{&input[offset], i - offset};
-
- if (need_unescape) {
- res = rspamd::css::unescape_css(pool, res);
- }
-
- offset = i;
-
- return res;
- };
-
- /* Main tokenisation loop */
- for (auto i = offset; i < input.size(); ++i) {
- auto c = input[i];
-
- switch (c) {
- case '/':
- if (i + 1 < input.size() && input[i + 1] == '*') {
- offset = i + 2;
- consume_comment(); /* Consume comment and go forward */
- return next_token(); /* Tail call */
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- break;
- case ' ':
- case '\t':
- case '\n':
- case '\r':
- case '\f': {
- /* Consume as much space as we can */
- while (i < input.size() && g_ascii_isspace(input[i])) {
- i++;
- }
-
- auto ret = make_token<css_parser_token::token_type::whitespace_token>(
- std::string_view(&input[offset], i - offset));
- offset = i;
- return ret;
- }
- case '"':
- case '\'':
- offset = i + 1;
- if (offset < input.size()) {
- return make_token<css_parser_token::token_type::string_token>(consume_string(c));
- }
- else {
- /* Unpaired quote at the end of the rule */
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- case '(':
- offset = i + 1;
- return make_token<css_parser_token::token_type::obrace_token>();
- case ')':
- offset = i + 1;
- return make_token<css_parser_token::token_type::ebrace_token>();
- case '[':
- offset = i + 1;
- return make_token<css_parser_token::token_type::osqbrace_token>();
- case ']':
- offset = i + 1;
- return make_token<css_parser_token::token_type::esqbrace_token>();
- case '{':
- offset = i + 1;
- return make_token<css_parser_token::token_type::ocurlbrace_token>();
- case '}':
- offset = i + 1;
- return make_token<css_parser_token::token_type::ecurlbrace_token>();
- case ',':
- offset = i + 1;
- return make_token<css_parser_token::token_type::comma_token>();
- case ';':
- offset = i + 1;
- return make_token<css_parser_token::token_type::semicolon_token>();
- case ':':
- offset = i + 1;
- return make_token<css_parser_token::token_type::colon_token>();
- case '<':
- /* Maybe an xml like comment */
- if (i + 3 < input.size() && input[i + 1] == '!' && input[i + 2] == '-' && input[i + 3] == '-') {
- offset += 3;
-
- return make_token<css_parser_token::token_type::cdo_token>();
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- break;
- case '-':
- if (i + 1 < input.size()) {
- auto next_c = input[i + 1];
-
- if (g_ascii_isdigit(next_c)) {
- /* negative number */
- return consume_number();
- }
- else if (next_c == '-') {
- if (i + 2 < input.size() && input[i + 2] == '>') {
- /* XML like comment */
- offset += 3;
-
- return make_token<css_parser_token::token_type::cdc_token>();
- }
- }
- }
- /* No other options, a delimiter - */
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
-
- break;
- case '+':
- case '.':
- /* Maybe number */
- if (i + 1 < input.size()) {
- auto next_c = input[i + 1];
-
- if (g_ascii_isdigit(next_c)) {
- /* Numeric token */
- return consume_number();
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- }
- /* No other options, a delimiter - */
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
-
- break;
- case '\\':
- if (i + 1 < input.size()) {
- if (input[i + 1] == '\n' || input[i + 1] == '\r') {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- else {
- /* Valid escape, assume ident */
- return consume_ident();
- }
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- break;
- case '@':
- if (i + 3 < input.size()) {
- if (is_plain_ident_start(input[i + 1]) &&
- is_plain_ident(input[i + 2]) && is_plain_ident(input[i + 3])) {
- offset = i + 1;
- auto ident_token = consume_ident();
-
- if (ident_token.type == css_parser_token::token_type::ident_token) {
- /* Update type */
- ident_token.type = css_parser_token::token_type::at_keyword_token;
- }
-
- return ident_token;
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- break;
- case '#':
- /* TODO: make it more conformant */
- if (i + 2 < input.size()) {
- auto next_c = input[i + 1], next_next_c = input[i + 2];
- if ((is_plain_ident(next_c) || next_c == '-') &&
- (is_plain_ident(next_next_c) || next_next_c == '-')) {
- offset = i + 1;
- /* We consume indent, but we allow numbers there */
- auto ident_token = consume_ident(true);
-
- if (ident_token.type == css_parser_token::token_type::ident_token) {
- /* Update type */
- ident_token.type = css_parser_token::token_type::hash_token;
- }
-
- return ident_token;
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- break;
- default:
- /* Generic parsing code */
-
- if (g_ascii_isdigit(c)) {
- return consume_number();
- }
- else if (is_plain_ident_start(c)) {
- return consume_ident();
- }
- else {
- offset = i + 1;
- return make_token<css_parser_token::token_type::delim_token>(c);
- }
- break;
- }
- }
-
- return make_token<css_parser_token::token_type::eof_token>();
- }
-
- constexpr auto
- css_parser_token::get_token_type() -> const char *
- {
- const char *ret = "unknown";
-
- switch (type) {
- case token_type::whitespace_token:
- ret = "whitespace";
- break;
- case token_type::ident_token:
- ret = "ident";
- break;
- case token_type::function_token:
- ret = "function";
- break;
- case token_type::at_keyword_token:
- ret = "atkeyword";
- break;
- case token_type::hash_token:
- ret = "hash";
- break;
- case token_type::string_token:
- ret = "string";
- break;
- case token_type::number_token:
- ret = "number";
- break;
- case token_type::url_token:
- ret = "url";
- break;
- case token_type::cdo_token: /* xml open comment */
- ret = "cdo";
- break;
- case token_type::cdc_token: /* xml close comment */
- ret = "cdc";
- break;
- case token_type::delim_token:
- ret = "delim";
- break;
- case token_type::obrace_token: /* ( */
- ret = "obrace";
- break;
- case token_type::ebrace_token: /* ) */
- ret = "ebrace";
- break;
- case token_type::osqbrace_token: /* [ */
- ret = "osqbrace";
- break;
- case token_type::esqbrace_token: /* ] */
- ret = "esqbrace";
- break;
- case token_type::ocurlbrace_token: /* { */
- ret = "ocurlbrace";
- break;
- case token_type::ecurlbrace_token: /* } */
- ret = "ecurlbrace";
- break;
- case token_type::comma_token:
- ret = "comma";
- break;
- case token_type::colon_token:
- ret = "colon";
- break;
- case token_type::semicolon_token:
- ret = "semicolon";
- break;
- case token_type::eof_token:
- ret = "eof";
- break;
- }
-
- return ret;
- }
-
-
- auto css_parser_token::debug_token_str() -> std::string
- {
- const auto *token_type_str = get_token_type();
- std::string ret = token_type_str;
-
- std::visit([&](auto arg) -> auto {
- using T = std::decay_t<decltype(arg)>;
-
- if constexpr (std::is_same_v<T, std::string_view> || std::is_same_v<T, char>) {
- ret += "; value=";
- ret += arg;
- }
- else if constexpr (std::is_same_v<T, double>) {
- ret += "; value=";
- ret += std::to_string(arg);
- }
- },
- value);
-
- if ((flags & (~number_dimension)) != default_flags) {
- ret += "; flags=" + std::to_string(flags);
- }
-
- if (flags & number_dimension) {
- ret += "; dim=" + std::to_string(static_cast<int>(dimension_type));
- }
-
- return ret; /* Copy elision */
- }
-
- }// namespace rspamd::css
|