source.dussan.org Git - rspamd.git/commitdiff
[Project] Add some methods for css parser
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 21 Jan 2021 15:45:21 +0000 (15:45 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Fri, 22 Jan 2021 15:58:05 +0000 (15:58 +0000)
src/libserver/css/CMakeLists.txt
src/libserver/css/css.cxx
src/libserver/css/css.h
src/libserver/css/css.hxx
src/libserver/css/css_parser.cxx [new file with mode: 0644]
src/libserver/css/css_parser.hxx [new file with mode: 0644]
src/libserver/css/parse_error.hxx

index f5d5affdb6bee4f8c93f8ff619e283fb287b4238..c8f7921b1491ab9786f09e2d3a3981936ee38e3c 100644 (file)
@@ -14,6 +14,7 @@ SET(LIBCSSSRC    "${CMAKE_CURRENT_SOURCE_DIR}/css.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_property.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_value.cxx"
                  "${CMAKE_CURRENT_SOURCE_DIR}/css_selector.cxx"
+                 "${CMAKE_CURRENT_SOURCE_DIR}/css_parser.cxx"
                  "${RAGEL_ragel_css_selector_parser_OUTPUTS}"
                  "${RAGEL_ragel_css_rule_parser_OUTPUTS}"
                  PARENT_SCOPE)
index 68ebfeefa6729bb5555abecca9cb836a8ca37fb3..bd148cecda6b083410199202af34fb0048605b63 100644 (file)
@@ -29,6 +29,8 @@ rspamd_css_parse_style (const guchar *begin, gsize len, GError **err)
 
 namespace rspamd::css {
 
+INIT_LOG_MODULE_PUBLIC(css);
+
 class css_style_sheet::impl {
 
 };
index a87f4424d36b21369d025eba290f391a82e7b735..169bcf58c001a51e04591b4c61c3aefca2b29c75 100644 (file)
 #define RSPAMD_CSS_H
 
 #include "config.h"
+#include "mem_pool.h"
 
 #ifdef  __cplusplus
 extern "C" {
 #endif
 typedef void * rspamd_css;
 
-rspamd_css rspamd_css_parse_style (const guchar *begin, gsize len, GError **err);
+rspamd_css rspamd_css_parse_style (rspamd_mempool_t *pool,
+                                                                  const guchar *begin,
+                                                                  gsize len, GError **err);
 #ifdef  __cplusplus
 }
 #endif
index 78e0d0f732477cf45dfb74942eb79586d56c6504..d258b35c94438923ea12e2d3dbb3b195344a653a 100644 (file)
 
 #include <string>
 #include <memory>
+#include "logger.h"
 
 namespace rspamd::css {
 
+extern unsigned int rspamd_css_log_id;
+
+#define msg_debug_css(...)  rspamd_conditional_debug_fast (NULL, NULL, \
+        rspamd_css_log_id, "css", pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+#define msg_err_css(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
+        "css", pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+
 class css_style_sheet {
 public:
        css_style_sheet();
diff --git a/src/libserver/css/css_parser.cxx b/src/libserver/css/css_parser.cxx
new file mode 100644 (file)
index 0000000..9f2023e
--- /dev/null
@@ -0,0 +1,238 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "css_parser.hxx"
+#include <unicode/utf8.h>
+
+
+namespace rspamd::css {
+
+class css_parser { /*
+ * CSS parser front end. Keeps the parser state, the style sheet being built,
+ * the error to report and the memory pool that unescape_css() allocates from
+ * and that the msg_*_css logging macros read their tag from.
+ */
+public:
+       css_parser(void) = delete; /* Require mempool to be set for logging */
+       explicit css_parser(rspamd_mempool_t *pool) : pool (pool) {}
+
+       bool consume_input(const std::string_view &sv); /* Takes raw css text, unescaping it when needed */
+
+       auto get_object_maybe(void) -> tl::expected<std::unique_ptr<css_style_sheet>, css_parse_error> { /*
+                * Hands the style sheet over to the caller when the parser is in
+                * parse_done (and rewinds it to initial_state); in any other state
+                * the stored error is returned instead.
+                * NOTE(review): no code visible in this file moves the parser to
+                * parse_done yet - confirm against later revisions.
+                */
+               if (state == parser_state::parse_done) {
+                       state = parser_state::initial_state;
+                       return std::move (style_object); /* member, so an explicit move is needed */
+               }
+
+               return tl::make_unexpected (error);
+       }
+
+private:
+       enum class parser_state {
+               initial_state,
+               skip_spaces,
+               parse_selector,
+               ignore_selector, /* e.g. media or namespace */
+               parse_done,
+       };
+       parser_state state = parser_state::initial_state;
+       std::unique_ptr<css_style_sheet> style_object; /* Result, moved out by get_object_maybe() */
+       css_parse_error error; /* Returned by get_object_maybe() while parsing is not done */
+       rspamd_mempool_t *pool; /* Pool given to the constructor; used for allocation and log tags */
+
+       /* Helper parser methods */
+       bool need_unescape(const std::string_view &sv); /* True if a backslash occurs outside of quotes */
+
+       std::string_view unescape_css(const std::string_view &sv); /* Returns a pool-allocated unescaped copy */
+};
+
+/*
+ * Find if we need to unescape css
+ */
+bool
+css_parser::need_unescape(const std::string_view &sv)
+{
+       bool in_quote = false;
+       char quote_char, prev_c = 0;
+
+       for (const auto c : sv) {
+               if (!in_quote) {
+                       if (c == '"' || c == '\'') {
+                               in_quote = true;
+                               quote_char = c;
+                       }
+                       else if (c == '\\') {
+                               return true;
+                       }
+               }
+               else {
+                       if (c == quote_char) {
+                               if (prev_c != '\\') {
+                                       in_quote = false;
+                               }
+                       }
+                       prev_c = c;
+               }
+       }
+
+       return false;
+}
+
+/*
+ * Unescape css escapes
+ * \20AC : must be followed by a space if the next character is one of a-f, A-F, 0-9
+ * \0020AC : must be 6 digits long, no space needed (but can be included)
+ */
+std::string_view
+css_parser::unescape_css(const std::string_view &sv)
+{
+       auto *nspace = reinterpret_cast<char *>(rspamd_mempool_alloc(pool, sv.length ()));
+       auto *d = nspace;
+       auto nleft = sv.length ();
+
+       enum {
+               normal = 0,
+               quoted,
+               escape,
+               skip_spaces,
+       } state = normal;
+
+       char quote_char, prev_c = 0;
+       auto escape_offset = 0, i = 0;
+
+#define MAYBE_CONSUME_CHAR(c) do { \
+    if (c == '"' || c == '\'') { \
+        state = quoted; \
+        quote_char = c; \
+        nleft--; \
+        *d++ = c; \
+    } \
+    else if (c == '\\') { \
+        escape_offset = i; \
+        state = escape; \
+    } \
+    else { \
+        state = normal; \
+        nleft--; \
+        *d++ = c; \
+    } \
+} while (0)
+
+       for (const auto c : sv) {
+               if (nleft == 0) {
+                       msg_err_css("cannot unescape css: truncated buffer of size %d",
+                                       (int)sv.length());
+                       break;
+               }
+               switch (state) {
+               case normal:
+                       MAYBE_CONSUME_CHAR(c);
+                       break;
+               case quoted:
+                       if (c == quote_char) {
+                               if (prev_c != '\\') {
+                                       state = normal;
+                               }
+                       }
+                       prev_c = c;
+                       nleft --;
+                       *d++ = c;
+                       break;
+               case escape:
+                       if (!g_ascii_isxdigit(c)) {
+                               if (i > escape_offset + 1) {
+                                       /* Try to decode an escape */
+                                       const auto *escape_start = &sv[escape_offset + 1];
+                                       unsigned long val;
+
+                                       if (!rspamd_xstrtoul (escape_start, i - escape_offset - 1, &val)) {
+                                               msg_debug_css("invalid broken escape found at pos %d",
+                                                               escape_offset);
+                                       }
+                                       else {
+                                               if (val < 0x1f) {
+                                                       /* Trivial case: ascii character */
+                                                       *d++ = (unsigned char)val;
+                                                       nleft --;
+                                               }
+                                               else {
+                                                       UChar32 uc = val;
+                                                       auto off = d - nspace;
+                                                       UTF8_APPEND_CHAR_SAFE((uint8_t *) d, off,
+                                                                       sv.length (), uc);
+                                                       d = nspace + off;
+                                                       nleft = sv.length () - off;
+                                               }
+                                       }
+                               }
+                               else {
+                                       /* Empty escape, ignore it */
+                                       msg_debug_css("invalid empty escape found at pos %d",
+                                                       escape_offset);
+                               }
+
+                               if (nleft > 0) {
+                                       msg_err_css("cannot unescape css: truncated buffer of size %d",
+                                                       (int)sv.length());
+                               }
+                               else {
+                                       /* Escape is done, advance forward */
+                                       if (g_ascii_isspace (c)) {
+                                               state = skip_spaces;
+                                       }
+                                       else {
+                                               MAYBE_CONSUME_CHAR(c);
+                                       }
+                               }
+                       }
+                       break;
+               case skip_spaces:
+                       if (!g_ascii_isspace(c)) {
+                               MAYBE_CONSUME_CHAR(c);
+                       }
+                       /* Ignore spaces */
+                       break;
+               }
+
+               i ++;
+       }
+
+       return std::string_view{nspace, sv.size() - nleft};
+};
+
+bool css_parser::consume_input(const std::string_view &sv)
+{
+       auto our_sv = sv;
+
+       if (need_unescape(sv)) {
+               our_sv = unescape_css(sv);
+               msg_debug_css("unescaped css: input size %d, unescaped size %d",
+                               (int)sv.size(), (int)our_sv.size());
+       }
+
+       return true;
+}
+
+/*
+ * Wrapper for the parser: creates a css_parser bound to the given pool, feeds
+ * it the whole input and returns what get_object_maybe() yields, i.e. either
+ * the resulting style sheet or a parse error.
+ * The boolean result of consume_input() is not checked here.
+ */
+auto parse_css(rspamd_mempool_t *pool, const std::string_view &st) ->
+       tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>
+{
+       css_parser parser(pool);
+
+       parser.consume_input(st);
+
+       return parser.get_object_maybe();
+}
+
+}
diff --git a/src/libserver/css/css_parser.hxx b/src/libserver/css/css_parser.hxx
new file mode 100644 (file)
index 0000000..8d1468a
--- /dev/null
@@ -0,0 +1,34 @@
+/*-
+ * Copyright 2021 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef RSPAMD_CSS_PARSER_HXX
+#define RSPAMD_CSS_PARSER_HXX
+
+#include "css.hxx"
+#include "parse_error.hxx"
+#include "contrib/expected/expected.hpp"
+#include "logger.h"
+
+namespace rspamd::css {
+
+/*
+ * Parses css text into a style sheet.
+ * pool: memory pool handed to the parser (scratch buffers and log tags)
+ * st: css text to parse
+ * Returns the parsed css_style_sheet on success or a css_parse_error otherwise.
+ *
+ * No log module is registered in this header: the css module id is defined
+ * once in css.cxx and declared in css.hxx as rspamd_css_log_id.
+ */
+auto parse_css (rspamd_mempool_t *pool, const std::string_view &st) ->
+               tl::expected<std::unique_ptr<css_style_sheet>,css_parse_error>;
+
+}
+
+#endif //RSPAMD_CSS_PARSER_HXX
index 60b2291816e9f6e2bd2bc5e1395e7dfb03a4a9c0..12ad697eb1fa2a3b61828ad761f6ae1675f0d974 100644 (file)
@@ -34,13 +34,14 @@ enum class css_parse_error_type {
 };
 
 struct css_parse_error {
-       css_parse_error_type type;
+       css_parse_error_type type = css_parse_error_type::PARSE_ERROR_UNKNOWN_ERROR;
        std::optional<std::string> description;
 
        explicit css_parse_error (css_parse_error_type type, const std::string &description) :
                type(type), description(description) {}
        explicit css_parse_error (css_parse_error_type type) :
                        type(type) {}
+       css_parse_error() = default;
 };
 
 }