From d7fde715073a96dbcff7ecc69fe7b5ada3a1d045 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 29 Sep 2021 18:00:03 +0100 Subject: [PATCH] [Project] Add helper library to handle mime strings in a more safe matter --- src/libmime/CMakeLists.txt | 3 +- src/libmime/mime_string.cxx | 99 ++++++++++++ src/libmime/mime_string.hxx | 292 ++++++++++++++++++++++++++++++++++++ 3 files changed, 393 insertions(+), 1 deletion(-) create mode 100644 src/libmime/mime_string.cxx create mode 100644 src/libmime/mime_string.hxx diff --git a/src/libmime/CMakeLists.txt b/src/libmime/CMakeLists.txt index a011dff07..878ac8149 100644 --- a/src/libmime/CMakeLists.txt +++ b/src/libmime/CMakeLists.txt @@ -10,6 +10,7 @@ SET(LIBRSPAMDMIMESRC ${CMAKE_CURRENT_SOURCE_DIR}/mime_headers.c ${CMAKE_CURRENT_SOURCE_DIR}/mime_parser.c ${CMAKE_CURRENT_SOURCE_DIR}/mime_encoding.c - ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c) + ${CMAKE_CURRENT_SOURCE_DIR}/lang_detection.c + ${CMAKE_CURRENT_SOURCE_DIR}/mime_string.cxx) SET(RSPAMD_MIME ${LIBRSPAMDMIMESRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/libmime/mime_string.cxx b/src/libmime/mime_string.cxx new file mode 100644 index 000000000..1785e9188 --- /dev/null +++ b/src/libmime/mime_string.cxx @@ -0,0 +1,99 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#define DOCTEST_CONFIG_IMPLEMENTATION_IN_DLL +#include "doctest/doctest.h" +#include "mime_string.hxx" +#include "unicode/uchar.h" + +TEST_SUITE("mime_string") { /* doctest suite for rspamd::mime_string construction and assignment */ +TEST_CASE("mime_string unfiltered ctors") +{ /* constructors called without a filter callback */ + SUBCASE("empty") { + rspamd::mime_string st; + CHECK(st.size() == 0); + CHECK(st == ""); + } + SUBCASE("unfiltered valid") { + rspamd::mime_string st{std::string_view("abcd")}; + CHECK(st == "abcd"); + } + SUBCASE("unfiltered zero character") { /* length 5 is passed explicitly so the embedded NUL is part of the input; it is expected to be dropped and reported by has_zeroes() */ + rspamd::mime_string st{"abc\0d", 5}; + CHECK(st.has_zeroes()); + CHECK(st == "abcd"); + } + SUBCASE("unfiltered invalid character - middle") { /* the byte \234 is expected to be reported by has_invalid() and replaced with U+FFFD; the next two subcases repeat this at the end and at the start of the input */ + rspamd::mime_string st{std::string("abc\234d")}; + CHECK(st.has_invalid()); + CHECK(st == "abc\uFFFDd"); + } + SUBCASE("unfiltered invalid character - end") { + rspamd::mime_string st{std::string("abc\234")}; + CHECK(st.has_invalid()); + CHECK(st == "abc\uFFFD"); + } + SUBCASE("unfiltered invalid character - start") { + rspamd::mime_string st{std::string("\234abc")}; + CHECK(st.has_invalid()); + CHECK(st == "\uFFFDabc"); + } +} + +TEST_CASE("mime_string filtered ctors") +{ /* constructors that take a per-code-point filter */ + auto print_filter = [](UChar32 inp) -> UChar32 { /* returns 0 for code points that u_isprint() rejects, otherwise the code point unchanged */ + if (!u_isprint(inp)) { + return 0; + } + + return inp; + }; + + auto tolower_filter = [](UChar32 inp) -> UChar32 { /* maps each code point through u_tolower() */ + return u_tolower(inp); + }; + + SUBCASE("empty") { + rspamd::mime_string st{std::string_view(""), tolower_filter}; + CHECK(st.size() == 0); + CHECK(st == ""); + } + SUBCASE("filtered valid") { + rspamd::mime_string st{std::string("AbCdУ"), tolower_filter}; + CHECK(st == "abcdу"); + } + SUBCASE("filtered invalid + filtered") { /* \234 is expected to become U+FFFD, and \1 (turned into 0 by print_filter) is expected to vanish from the result */ + rspamd::mime_string st{std::string("abcd\234\1"), print_filter}; + CHECK(st == "abcd\uFFFD"); + } +} +TEST_CASE("mime_string assign") +{ + SUBCASE("assign from valid") { + rspamd::mime_string st; + + CHECK(st.assign_if_valid(std::string("test"))); + CHECK(st == "test"); + } + SUBCASE("assign from invalid") { /* assign_if_valid() is expected to refuse the input and leave st empty */ + rspamd::mime_string st; + + CHECK(!st.assign_if_valid(std::string("test\234t"))); + CHECK(st == 
""); + } +} +} \ No newline at end of file diff --git a/src/libmime/mime_string.hxx b/src/libmime/mime_string.hxx new file mode 100644 index 000000000..4e25f6170 --- /dev/null +++ b/src/libmime/mime_string.hxx @@ -0,0 +1,292 @@ +/*- + * Copyright 2021 Vsevolod Stakhov + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef RSPAMD_MIME_STRING_HXX +#define RSPAMD_MIME_STRING_HXX +#pragma once + +#include /* NOTE(review): this and the six following bare #include lines carry no header name - the operands look lost in extraction; restore them from the upstream file */ +#include +#include +#include +#include +#include +#include +#include "function2/function2.hpp" +#include "unicode/utf8.h" +#include "contrib/fastutf8/fastutf8.h" + +namespace rspamd { +/* + * The motivation for another string is to have utf8 valid string replacing + * all bad things with FFFFD replacement character and filtering \0 and other + * strange stuff defined by policies + * This string always exclude \0 characters and ignore them! 
This is how MUA acts, + * and we also store a flag about bad characters + */ +template> class basic_mime_string; + +using mime_string = basic_mime_string; + +/* Helpers for type safe flags */ +enum class mime_string_flags : std::uint8_t { + MIME_STRING_DEFAULT = 0, + MIME_STRING_SEEN_ZEROES = 0x1 << 0, + MIME_STRING_SEEN_INVALID = 0x1 << 1, +}; + +mime_string_flags operator |(mime_string_flags lhs, mime_string_flags rhs) +{ + using ut = std::underlying_type::type; + return static_cast(static_cast(lhs) | static_cast(rhs)); +} + +mime_string_flags operator &(mime_string_flags lhs, mime_string_flags rhs) +{ + using ut = std::underlying_type::type; + return static_cast(static_cast(lhs) & static_cast(rhs)); +} + +bool operator !(mime_string_flags fl) +{ + return fl == mime_string_flags::MIME_STRING_DEFAULT; +} + +template +class basic_mime_string : private Allocator { +public: + using storage_type = std::basic_string, Allocator>; + using view_type = std::basic_string_view>; + using filter_type = fu2::function_view; + /* Ctors */ + basic_mime_string() noexcept : Allocator() {} + explicit basic_mime_string(const Allocator& alloc) noexcept : Allocator(alloc) {} + + basic_mime_string(const T* str, std::size_t sz, const Allocator& alloc = Allocator()) noexcept : + Allocator(alloc) + { + append_c_string_unfiltered(str, sz); + } + + basic_mime_string(const storage_type &st, + const Allocator& alloc = Allocator()) noexcept : + basic_mime_string(st.data(), st.size(), alloc) {} + + basic_mime_string(const view_type &st, + const Allocator& alloc = Allocator()) noexcept : + basic_mime_string(st.data(), st.size(), alloc) {} + + basic_mime_string(const T* str, std::size_t sz, + filter_type &&filt, + const Allocator& alloc = Allocator()) noexcept : + Allocator(alloc), + filter_func(std::forward(filt)) + { + append_c_string_filtered(str, sz); + } + + basic_mime_string(const storage_type &st, + filter_type &&filt, + const Allocator& alloc = Allocator()) noexcept : + 
basic_mime_string(st.data(), st.size(), std::forward(filt), alloc) {} + basic_mime_string(const view_type &st, + filter_type &&filt, + const Allocator& alloc = Allocator()) noexcept : + basic_mime_string(st.data(), st.size(), std::forward(filt), alloc) {} + + auto size() const -> std::size_t { + return storage.size(); + } + + auto data() const -> const T* { + return storage.data(); + } + + constexpr auto has_zeroes() const -> bool { + return !!(flags & mime_string_flags::MIME_STRING_SEEN_ZEROES); + } + + constexpr auto has_invalid() const -> bool { + return !!(flags & mime_string_flags::MIME_STRING_SEEN_INVALID); + } + + /** + * Assign mime string from another string using move operation if a source string + * is utf8 valid. + * If this function returns false, then ownership has not been transferred + * and the `other` string is unmodified as well as the storage + * @param other + * @return + */ + [[nodiscard]] auto assign_if_valid(storage_type &&other) -> bool { + if (filter_func.has_value()) { + /* No way */ + return false; + } + if (rspamd_fast_utf8_validate((const unsigned char *)other.data(), other.size()) == 0) { + std::swap(storage, other); + + return true; + } + + return false; + } + + /** + * Copy to the internal storage discarding the contained value + * @param other + * @return + */ + auto assign_copy(const storage_type &other) { + storage.clear(); + + if (filter_func.has_value()) { + append_c_string_filtered(other.data(), other.size()); + } + else { + append_c_string_unfiltered(other.data(), other.size()); + } + } + + auto append(const T* str, std::size_t size) -> std::size_t { + if (filter_func.has_value()) { + return append_c_string_filtered(str, size); + } + else { + return append_c_string_unfiltered(str, size); + } + } + auto append(const storage_type &other) -> std::size_t { + return append(other.data(), other.size()); + } + auto append(const view_type &other) -> std::size_t { + return append(other.data(), other.size()); + } + + auto operator 
==(const basic_mime_string &other) { + return other.storage == storage; + } + auto operator ==(const storage_type &other) { + return other == storage; + } + auto operator ==(const view_type &other) { + return other == storage; + } + auto operator ==(const T* other) { + if (other == NULL) { + return false; + } + auto olen = strlen(other); + if (storage.size() == olen) { + return memcmp(storage.data(), other, olen) == 0; + } + + return false; + } + + friend std::ostream& operator<< (std::ostream& os, const T& value) { + os << value.storage; + return os; + } +private: + mime_string_flags flags = mime_string_flags::MIME_STRING_DEFAULT; + storage_type storage; + std::optional filter_func; + + auto append_c_string_unfiltered(const T* str, std::size_t len) -> std::size_t { + /* This is fast path */ + const auto *p = str; + const auto *end = str + len; + std::ptrdiff_t err_offset; + auto orig_size = storage.size(); + + storage.reserve(len + storage.size()); + + if (memchr(str, 0, len) != NULL) { + /* Fallback to slow path */ + flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES; + return append_c_string_filtered(str, len); + } + + while (p < end && len > 0 && + (err_offset = rspamd_fast_utf8_validate((const unsigned char *)p, len)) > 0) { + auto cur_offset = err_offset - 1; + storage.append(p, cur_offset); + + while (cur_offset < len) { + auto tmp = cur_offset; + UChar32 uc; + + U8_NEXT(p, cur_offset, len, uc); + + if (uc < 0) { + storage.append("\uFFFD"); + flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID; + } + else { + cur_offset = tmp; + break; + } + } + + p += cur_offset; + len = end - p; + } + + storage.append(p, len); + return storage.size() - orig_size; + } + + auto append_c_string_filtered(const T* str, std::size_t len) -> std::size_t { + std::ptrdiff_t i = 0, o = 0; + UChar32 uc; + char tmp[4]; + auto orig_size = storage.size(); + /* Slow path */ + + storage.reserve(len + storage.size()); + + while (i < len) { + U8_NEXT(str, i, len, uc); + + if 
(uc < 0) { + /* Replace with 0xFFFD */ + storage.append("\uFFFD"); + flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID; + } + else { + if (filter_func.has_value()) { + uc = filter_func.value()(uc); + } + + if (uc == 0) { + /* Special case, ignore it */ + flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES; + } + else { + o = 0; + U8_APPEND_UNSAFE(tmp, o, uc); + storage.append(tmp, o); + } + } + } + + return storage.size() - orig_size; + } +}; + +} + +#endif //RSPAMD_MIME_STRING_HXX -- 2.39.5