Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

mime_string.hxx 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_MIME_STRING_HXX
  17. #define RSPAMD_MIME_STRING_HXX
  18. #pragma once
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iosfwd>
#include <iterator>
#include <memory>
#include <string>
#include <string_view>
#include <type_traits>
#include <utility>
#include "libutil/mem_pool.h"
#include "function2/function2.hpp"
#include "unicode/utf8.h"
#include "contrib/fastutf8/fastutf8.h"
  30. namespace rspamd::mime {
/*
 * The motivation for another string type is to have a utf8 valid string,
 * replacing all bad things with the U+FFFD replacement character and filtering
 * \0 and other strange stuff defined by policies.
 * This string always excludes \0 characters and ignores them! This is how MUAs
 * act, and we also store a flag about bad characters.
 * Mime string iterators are always const, so the underlying storage should not
 * be modified externally.
 */
  40. template<class T=char, class Allocator = std::allocator<T>,
  41. class Functor = fu2::function_view<UChar32(UChar32)>> class basic_mime_string;
  42. using mime_string = basic_mime_string<char>;
  43. using mime_pool_string = basic_mime_string<char, mempool_allocator<char>>;
  44. /* Helpers for type safe flags */
  45. enum class mime_string_flags : std::uint8_t {
  46. MIME_STRING_DEFAULT = 0,
  47. MIME_STRING_SEEN_ZEROES = 0x1 << 0,
  48. MIME_STRING_SEEN_INVALID = 0x1 << 1,
  49. };
  50. constexpr mime_string_flags operator |(mime_string_flags lhs, mime_string_flags rhs)
  51. {
  52. using ut = std::underlying_type<mime_string_flags>::type;
  53. return static_cast<mime_string_flags>(static_cast<ut>(lhs) | static_cast<ut>(rhs));
  54. }
  55. constexpr mime_string_flags operator &(mime_string_flags lhs, mime_string_flags rhs)
  56. {
  57. using ut = std::underlying_type<mime_string_flags>::type;
  58. return static_cast<mime_string_flags>(static_cast<ut>(lhs) & static_cast<ut>(rhs));
  59. }
  60. constexpr bool operator !(mime_string_flags fl)
  61. {
  62. return fl == mime_string_flags::MIME_STRING_DEFAULT;
  63. }
  64. // Codepoint iterator base class
  65. template<typename Container, bool Raw = false>
  66. struct iterator_base
  67. {
  68. template<typename, typename, typename>
  69. friend class basic_mime_string;
  70. public:
  71. using value_type = typename Container::value_type;
  72. using difference_type = typename Container::difference_type;
  73. using codepoint_type = typename Container::codepoint_type;
  74. using reference_type = codepoint_type;
  75. using iterator_category = std::bidirectional_iterator_tag;
  76. bool operator==(const iterator_base &it) const noexcept
  77. {
  78. return idx == it.idx;
  79. }
  80. bool operator!=(const iterator_base &it) const noexcept
  81. {
  82. return idx != it.idx;
  83. }
  84. iterator_base(difference_type index, Container *instance) noexcept:
  85. idx(index), cont_instance(instance) {}
  86. iterator_base() noexcept = default;
  87. iterator_base(const iterator_base &) noexcept = default;
  88. iterator_base &operator=(const iterator_base &) noexcept = default;
  89. Container *get_instance() const noexcept
  90. {
  91. return cont_instance;
  92. }
  93. codepoint_type get_value() const noexcept {
  94. auto i = idx;
  95. codepoint_type uc;
  96. U8_NEXT_UNSAFE(cont_instance->data(), i, uc);
  97. return uc;
  98. }
  99. protected:
  100. difference_type idx;
  101. Container* cont_instance = nullptr;
  102. protected:
  103. void advance(difference_type n) noexcept {
  104. if (n > 0) {
  105. U8_FWD_N_UNSAFE(cont_instance->data(), idx, n);
  106. }
  107. else if (n < 0) {
  108. U8_BACK_N_UNSAFE(cont_instance->data(), idx, (-n));
  109. }
  110. }
  111. void increment() noexcept {
  112. codepoint_type uc;
  113. U8_NEXT_UNSAFE(cont_instance->data(), idx, uc);
  114. }
  115. void decrement() noexcept {
  116. codepoint_type uc;
  117. U8_PREV_UNSAFE(cont_instance->data(), idx, uc);
  118. }
  119. };
  120. // Partial spec for raw Byte-based iterator base
  121. template<typename Container>
  122. struct iterator_base<Container, true>
  123. {
  124. template<typename, typename, typename>
  125. friend class basic_string;
  126. public:
  127. using value_type = typename Container::value_type;
  128. using difference_type = typename Container::difference_type;
  129. using reference_type = value_type;
  130. using iterator_category = std::bidirectional_iterator_tag;
  131. bool operator==( const iterator_base& it ) const noexcept { return idx == it.idx; }
  132. bool operator!=( const iterator_base& it ) const noexcept { return idx != it.idx; }
  133. iterator_base(difference_type index, Container *instance) noexcept:
  134. idx(index), cont_instance(instance) {}
  135. iterator_base() noexcept = default;
  136. iterator_base( const iterator_base& ) noexcept = default;
  137. iterator_base& operator=( const iterator_base& ) noexcept = default;
  138. Container* get_instance() const noexcept { return cont_instance; }
  139. value_type get_value() const noexcept { return cont_instance->get_storage().at(idx); }
  140. protected:
  141. difference_type idx;
  142. Container* cont_instance = nullptr;
  143. protected:
  144. //! Advance the iterator n times (negative values allowed!)
  145. void advance( difference_type n ) noexcept {
  146. idx += n;
  147. }
  148. void increment() noexcept { idx ++; }
  149. void decrement() noexcept { idx --; }
  150. };
  151. template<typename Container, bool Raw> struct iterator;
  152. template<typename Container, bool Raw> struct const_iterator;
  153. template<typename Container, bool Raw = false>
  154. struct iterator : iterator_base<Container, Raw> {
  155. iterator(typename iterator_base<Container, Raw>::difference_type index, Container *instance) noexcept:
  156. iterator_base<Container, Raw>(index, instance)
  157. {
  158. }
  159. iterator() noexcept = default;
  160. iterator(const iterator &) noexcept = default;
  161. iterator &operator=(const iterator &) noexcept = default;
  162. /* Disallow creating from const_iterator */
  163. iterator(const const_iterator<Container, Raw> &) = delete;
  164. /* Prefix */
  165. iterator &operator++() noexcept
  166. {
  167. this->increment();
  168. return *this;
  169. }
  170. /* Postfix */
  171. iterator operator++(int) noexcept
  172. {
  173. iterator tmp{this->idx, this->cont_instance};
  174. this->increment();
  175. return tmp;
  176. }
  177. /* Prefix */
  178. iterator &operator--() noexcept
  179. {
  180. this->decrement();
  181. return *this;
  182. }
  183. /* Postfix */
  184. iterator operator--(int) noexcept
  185. {
  186. iterator tmp{this->idx, this->cont_instance};
  187. this->decrement();
  188. return tmp;
  189. }
  190. iterator operator+(typename iterator_base<Container, Raw>::difference_type n) const noexcept
  191. {
  192. iterator it{*this};
  193. it.advance(n);
  194. return it;
  195. }
  196. iterator &operator+=(typename iterator_base<Container, Raw>::difference_type n) noexcept
  197. {
  198. this->advance(n);
  199. return *this;
  200. }
  201. iterator operator-(typename iterator_base<Container, Raw>::difference_type n) const noexcept
  202. {
  203. iterator it{*this};
  204. it.advance(-n);
  205. return it;
  206. }
  207. iterator &operator-=(typename iterator_base<Container, Raw>::difference_type n) noexcept
  208. {
  209. this->advance(-n);
  210. return *this;
  211. }
  212. typename iterator::reference_type operator*() const noexcept
  213. {
  214. return this->get_value();
  215. }
  216. };
  217. template<class CharT, class Allocator, class Functor>
  218. class basic_mime_string : private Allocator {
  219. public:
  220. using storage_type = std::basic_string<CharT, std::char_traits<CharT>, Allocator>;
  221. using view_type = std::basic_string_view<CharT, std::char_traits<CharT>>;
  222. using filter_type = Functor;
  223. using codepoint_type = UChar32;
  224. using value_type = CharT;
  225. using difference_type = std::ptrdiff_t;
  226. using iterator = rspamd::mime::iterator<basic_mime_string, false>;
  227. using raw_iterator = rspamd::mime::iterator<basic_mime_string, true>;
  228. /* Ctors */
  229. basic_mime_string() noexcept : Allocator() {}
  230. explicit basic_mime_string(const Allocator& alloc) noexcept : Allocator(alloc) {}
  231. explicit basic_mime_string(filter_type &&filt, const Allocator& alloc = Allocator()) noexcept :
  232. Allocator(alloc), filter_func(std::move(filt)) {}
  233. basic_mime_string(const CharT* str, std::size_t sz, const Allocator& alloc = Allocator()) noexcept :
  234. Allocator(alloc)
  235. {
  236. append_c_string_unfiltered(str, sz);
  237. }
  238. basic_mime_string(const storage_type &st,
  239. const Allocator& alloc = Allocator()) noexcept :
  240. basic_mime_string(st.data(), st.size(), alloc) {}
  241. basic_mime_string(const view_type &st,
  242. const Allocator& alloc = Allocator()) noexcept :
  243. basic_mime_string(st.data(), st.size(), alloc) {}
  244. /* Explicit move ctor */
  245. basic_mime_string(basic_mime_string &&other) noexcept {
  246. *this = std::move(other);
  247. }
  248. /**
  249. * Creates a string with a filter function. It is calee responsibility to
  250. * ensure that the filter functor survives long enough to work with a string
  251. * @param str
  252. * @param sz
  253. * @param filt
  254. * @param alloc
  255. */
  256. basic_mime_string(const CharT* str, std::size_t sz,
  257. filter_type &&filt,
  258. const Allocator& alloc = Allocator()) noexcept :
  259. Allocator(alloc),
  260. filter_func(std::move(filt))
  261. {
  262. append_c_string_filtered(str, sz);
  263. }
  264. basic_mime_string(const storage_type &st,
  265. filter_type &&filt,
  266. const Allocator& alloc = Allocator()) noexcept :
  267. basic_mime_string(st.data(), st.size(), std::move(filt), alloc) {}
  268. basic_mime_string(const view_type &st,
  269. filter_type &&filt,
  270. const Allocator& alloc = Allocator()) noexcept :
  271. basic_mime_string(st.data(), st.size(), std::move(filt), alloc) {}
  272. /* It seems some libc++ implementations still perform copy, this might fix them */
  273. basic_mime_string& operator=(basic_mime_string &&other) {
  274. storage = std::move(other.storage);
  275. filter_func = std::move(other.filter_func);
  276. return *this;
  277. }
  278. constexpr auto size() const noexcept -> std::size_t {
  279. return storage.size();
  280. }
  281. constexpr auto data() const noexcept -> const CharT* {
  282. return storage.data();
  283. }
  284. constexpr auto has_zeroes() const noexcept -> bool {
  285. return !!(flags & mime_string_flags::MIME_STRING_SEEN_ZEROES);
  286. }
  287. constexpr auto has_invalid() const noexcept -> bool {
  288. return !!(flags & mime_string_flags::MIME_STRING_SEEN_INVALID);
  289. }
  290. /**
  291. * Assign mime string from another string using move operation if a source string
  292. * is utf8 valid.
  293. * If this function returns false, then ownership has not been transferred
  294. * and the `other` string is unmodified as well as the storage
  295. * @param other
  296. * @return
  297. */
  298. [[nodiscard]] auto assign_if_valid(storage_type &&other) -> bool {
  299. if (filter_func) {
  300. /* No way */
  301. return false;
  302. }
  303. if (rspamd_fast_utf8_validate((const unsigned char *)other.data(), other.size()) == 0) {
  304. std::swap(storage, other);
  305. return true;
  306. }
  307. return false;
  308. }
  309. /**
  310. * Copy to the internal storage discarding the contained value
  311. * @param other
  312. * @return
  313. */
  314. auto assign_copy(const view_type &other) {
  315. storage.clear();
  316. if (filter_func) {
  317. append_c_string_filtered(other.data(), other.size());
  318. }
  319. else {
  320. append_c_string_unfiltered(other.data(), other.size());
  321. }
  322. }
  323. auto assign_copy(const storage_type &other) {
  324. storage.clear();
  325. if (filter_func) {
  326. append_c_string_filtered(other.data(), other.size());
  327. }
  328. else {
  329. append_c_string_unfiltered(other.data(), other.size());
  330. }
  331. }
  332. auto assign_copy(const basic_mime_string &other) {
  333. storage.clear();
  334. if (filter_func) {
  335. append_c_string_filtered(other.data(), other.size());
  336. }
  337. else {
  338. append_c_string_unfiltered(other.data(), other.size());
  339. }
  340. }
  341. /* Mutators */
  342. auto append(const CharT* str, std::size_t size) -> std::size_t {
  343. if (filter_func) {
  344. return append_c_string_filtered(str, size);
  345. }
  346. else {
  347. return append_c_string_unfiltered(str, size);
  348. }
  349. }
  350. auto append(const storage_type &other) -> std::size_t {
  351. return append(other.data(), other.size());
  352. }
  353. auto append(const view_type &other) -> std::size_t {
  354. return append(other.data(), other.size());
  355. }
  356. auto ltrim(const view_type &what) -> void
  357. {
  358. auto it = std::find_if(storage.begin(), storage.end(),
  359. [&what](CharT c) {
  360. return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; });
  361. });
  362. storage.erase(storage.begin(), it);
  363. }
  364. auto rtrim(const view_type &what) -> void
  365. {
  366. auto it = std::find_if(storage.rbegin(), storage.rend(),
  367. [&what](CharT c) {
  368. return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; });
  369. });
  370. storage.erase(it.base(), storage.end());
  371. }
  372. auto trim(const view_type &what) -> void {
  373. ltrim(what);
  374. rtrim(what);
  375. }
  376. /* Comparison */
  377. auto operator ==(const basic_mime_string &other) {
  378. return other.storage == storage;
  379. }
  380. auto operator ==(const storage_type &other) {
  381. return other == storage;
  382. }
  383. auto operator ==(const view_type &other) {
  384. return other == storage;
  385. }
  386. auto operator ==(const CharT* other) {
  387. if (other == NULL) {
  388. return false;
  389. }
  390. auto olen = strlen(other);
  391. if (storage.size() == olen) {
  392. return memcmp(storage.data(), other, olen) == 0;
  393. }
  394. return false;
  395. }
  396. /* Iterators */
  397. inline auto begin() noexcept -> iterator
  398. {
  399. return {0, this};
  400. }
  401. inline auto raw_begin() noexcept -> raw_iterator
  402. {
  403. return {0, this};
  404. }
  405. inline auto end() noexcept -> iterator
  406. {
  407. return {(difference_type) size(), this};
  408. }
  409. inline auto raw_end() noexcept -> raw_iterator
  410. {
  411. return {(difference_type) size(), this};
  412. }
  413. /* Utility */
  414. inline auto get_storage() const noexcept -> const storage_type &
  415. {
  416. return storage;
  417. }
  418. inline auto as_view() const noexcept -> view_type {
  419. return view_type{storage};
  420. }
  421. constexpr CharT operator[](std::size_t pos) const noexcept {
  422. return storage[pos];
  423. }
  424. constexpr CharT at(std::size_t pos) const {
  425. return storage.at(pos);
  426. }
  427. constexpr bool empty() const noexcept {
  428. return storage.empty();
  429. }
  430. /* For doctest stringify */
  431. friend std::ostream& operator<< (std::ostream& os, const CharT& value) {
  432. os << value.storage;
  433. return os;
  434. }
  435. private:
  436. mime_string_flags flags = mime_string_flags::MIME_STRING_DEFAULT;
  437. storage_type storage;
  438. filter_type filter_func;
  439. auto append_c_string_unfiltered(const CharT* str, std::size_t len) -> std::size_t {
  440. /* This is fast path */
  441. const auto *p = str;
  442. const auto *end = str + len;
  443. std::int32_t err_offset; // We have to use int32_t here as old libicu is brain-damaged
  444. auto orig_size = storage.size();
  445. storage.reserve(len + storage.size());
  446. if (memchr(str, 0, len) != NULL) {
  447. /* Fallback to slow path */
  448. flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES;
  449. return append_c_string_filtered(str, len);
  450. }
  451. while (p < end && len > 0 &&
  452. (err_offset = rspamd_fast_utf8_validate((const unsigned char *)p, len)) > 0) {
  453. auto cur_offset = err_offset - 1;
  454. storage.append(p, cur_offset);
  455. while (cur_offset < len) {
  456. auto tmp = cur_offset;
  457. UChar32 uc;
  458. U8_NEXT(p, cur_offset, len, uc);
  459. if (uc < 0) {
  460. storage.append("\uFFFD");
  461. flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID;
  462. }
  463. else {
  464. cur_offset = tmp;
  465. break;
  466. }
  467. }
  468. p += cur_offset;
  469. len = end - p;
  470. }
  471. storage.append(p, len);
  472. return storage.size() - orig_size;
  473. }
  474. auto append_c_string_filtered(const CharT* str, std::size_t len) -> std::size_t {
  475. std::int32_t i = 0; // We have to use int32_t here as old libicu is brain-damaged
  476. UChar32 uc;
  477. char tmp[4];
  478. auto orig_size = storage.size();
  479. /* Slow path */
  480. storage.reserve(len + storage.size());
  481. while (i < len) {
  482. U8_NEXT(str, i, len, uc);
  483. if (uc < 0) {
  484. /* Replace with 0xFFFD */
  485. storage.append("\uFFFD");
  486. flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID;
  487. }
  488. else {
  489. if (filter_func) {
  490. uc = filter_func(uc);
  491. }
  492. if (uc == 0) {
  493. /* Special case, ignore it */
  494. flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES;
  495. }
  496. else {
  497. std::int32_t o = 0;
  498. U8_APPEND_UNSAFE(tmp, o, uc);
  499. storage.append(tmp, o);
  500. }
  501. }
  502. }
  503. return storage.size() - orig_size;
  504. }
  505. };
  506. }
  507. #endif //RSPAMD_MIME_STRING_HXX