Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

mime_string.hxx 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670
  1. /*-
  2. * Copyright 2021 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef RSPAMD_MIME_STRING_HXX
  17. #define RSPAMD_MIME_STRING_HXX
  18. #pragma once
  #include <algorithm>
  #include <cstddef>
  #include <cstdint>
  #include <cstdlib>
  #include <cstring>
  #include <iosfwd>
  #include <iterator>
  #include <memory>
  #include <string>
  #include <string_view>
  #include <type_traits>
  #include <utility>
  #include "libutil/mem_pool.h"
  #include "function2/function2.hpp"
  #include "unicode/utf8.h"
  #include "contrib/fastutf8/fastutf8.h"
  31. namespace rspamd::mime {
  32. /*
  33. * The motivation for another string is to have utf8 valid string replacing
  34. * all bad things with FFFFD replacement character and filtering \0 and other
  35. * strange stuff defined by policies.
  36. * This string always exclude \0 characters and ignore them! This is how MUA acts,
  37. * and we also store a flag about bad characters.
  38. * Mime string iterators are always const, so the underlying storage should not
  39. * be modified externally.
  40. */
  41. template<class T = char, class Allocator = std::allocator<T>,
  42. class Functor = fu2::function_view<UChar32(UChar32)>>
  43. class basic_mime_string;
  44. using mime_string = basic_mime_string<char>;
  45. using mime_pool_string = basic_mime_string<char, mempool_allocator<char>>;
  /* Type safe bit flags describing what has been seen while building a string */
  enum class mime_string_flags : std::uint8_t {
      MIME_STRING_DEFAULT = 0,
      MIME_STRING_SEEN_ZEROES = 0x1 << 0,
      MIME_STRING_SEEN_INVALID = 0x1 << 1,
  };

  /* Bitwise union of two flag sets */
  constexpr mime_string_flags operator|(mime_string_flags lhs, mime_string_flags rhs)
  {
      using raw_type = std::underlying_type_t<mime_string_flags>;

      return static_cast<mime_string_flags>(
          static_cast<raw_type>(lhs) | static_cast<raw_type>(rhs));
  }

  /* Bitwise intersection of two flag sets */
  constexpr mime_string_flags operator&(mime_string_flags lhs, mime_string_flags rhs)
  {
      using raw_type = std::underlying_type_t<mime_string_flags>;

      return static_cast<mime_string_flags>(
          static_cast<raw_type>(lhs) & static_cast<raw_type>(rhs));
  }

  /* True when no flag bit is set, so `!!(flags & X)` tests for X */
  constexpr bool operator!(mime_string_flags fl)
  {
      return mime_string_flags::MIME_STRING_DEFAULT == fl;
  }
  /**
   * Code point iterator base class.
   *
   * Keeps an offset (`idx`) into `cont_instance->data()` and walks it with the
   * ICU `U8_*_UNSAFE` macros, so dereferencing yields whole code points.
   * The `_UNSAFE` macro family performs no validation; the container is
   * expected to hold well-formed UTF-8.
   *
   * Equality compares offsets only, the container pointer is not considered.
   *
   * @tparam Container type providing `value_type`, `difference_type`,
   *                   `codepoint_type` and `data()`
   * @tparam Raw       false selects this code point flavour
   */
  template<typename Container, bool Raw = false>
  struct iterator_base {
      template<typename, typename, typename>
      friend class basic_mime_string;

  public:
      using value_type = typename Container::value_type;
      using difference_type = typename Container::difference_type;
      using codepoint_type = typename Container::codepoint_type;
      using reference_type = codepoint_type;
      using iterator_category = std::bidirectional_iterator_tag;

      bool operator==(const iterator_base &it) const noexcept
      {
          return idx == it.idx;
      }
      bool operator!=(const iterator_base &it) const noexcept
      {
          return idx != it.idx;
      }

      iterator_base(difference_type index, Container *instance) noexcept
          : idx(index), cont_instance(instance)
      {
      }
      iterator_base() noexcept = default;
      iterator_base(const iterator_base &) noexcept = default;
      iterator_base &operator=(const iterator_base &) noexcept = default;

      /** @return the container this iterator walks over (may be nullptr) */
      Container *get_instance() const noexcept
      {
          return cont_instance;
      }

      /** @return the code point starting at the current offset; the offset is not moved */
      codepoint_type get_value() const noexcept
      {
          auto i = idx; /* the macro advances its offset argument, so work on a copy */
          codepoint_type uc;
          U8_NEXT_UNSAFE(cont_instance->data(), i, uc);
          return uc;
      }

  protected:
      /*
       * Zero-initialised: a default constructed iterator used to carry an
       * indeterminate offset, which made even operator== undefined behaviour.
       */
      difference_type idx = 0;
      Container *cont_instance = nullptr;

  protected:
      /** Move by n code points; negative n moves backwards, zero is a no-op */
      void advance(difference_type n) noexcept
      {
          if (n > 0) {
              U8_FWD_N_UNSAFE(cont_instance->data(), idx, n);
          }
          else if (n < 0) {
              U8_BACK_N_UNSAFE(cont_instance->data(), idx, (-n));
          }
      }
      /** Step over one code point forwards (the decoded value is discarded) */
      void increment() noexcept
      {
          codepoint_type uc;
          U8_NEXT_UNSAFE(cont_instance->data(), idx, uc);
      }
      /** Step over one code point backwards (the decoded value is discarded) */
      void decrement() noexcept
      {
          codepoint_type uc;
          U8_PREV_UNSAFE(cont_instance->data(), idx, uc);
      }
  };

  /**
   * Partial specialisation for the raw, byte based iterator base.
   *
   * Here `idx` is a plain element index: every step moves by exactly one
   * `value_type` and dereferencing returns a single storage element.
   */
  template<typename Container>
  struct iterator_base<Container, true> {
      /* Was `basic_string`, which named a non-existent template; aligned with the primary template */
      template<typename, typename, typename>
      friend class basic_mime_string;

  public:
      using value_type = typename Container::value_type;
      using difference_type = typename Container::difference_type;
      using reference_type = value_type;
      using iterator_category = std::bidirectional_iterator_tag;

      bool operator==(const iterator_base &it) const noexcept
      {
          return idx == it.idx;
      }
      bool operator!=(const iterator_base &it) const noexcept
      {
          return idx != it.idx;
      }

      iterator_base(difference_type index, Container *instance) noexcept
          : idx(index), cont_instance(instance)
      {
      }
      iterator_base() noexcept = default;
      iterator_base(const iterator_base &) noexcept = default;
      iterator_base &operator=(const iterator_base &) noexcept = default;

      /** @return the container this iterator walks over (may be nullptr) */
      Container *get_instance() const noexcept
      {
          return cont_instance;
      }

      /**
       * @return the storage element at the current index
       * @note uses the checked `at()` accessor inside a noexcept function, so
       *       an out-of-range index terminates the program instead of throwing
       */
      value_type get_value() const noexcept
      {
          return cont_instance->get_storage().at(idx);
      }

  protected:
      /* Zero-initialised for the same reason as in the code point flavour */
      difference_type idx = 0;
      Container *cont_instance = nullptr;

  protected:
      //! Advance the iterator n times (negative values allowed!)
      void advance(difference_type n) noexcept
      {
          idx += n;
      }
      void increment() noexcept
      {
          idx++;
      }
      void decrement() noexcept
      {
          idx--;
      }
  };
  178. template<typename Container, bool Raw>
  179. struct iterator;
  180. template<typename Container, bool Raw>
  181. struct const_iterator;
  182. template<typename Container, bool Raw = false>
  183. struct iterator : iterator_base<Container, Raw> {
  184. iterator(typename iterator_base<Container, Raw>::difference_type index, Container *instance) noexcept
  185. : iterator_base<Container, Raw>(index, instance)
  186. {
  187. }
  188. iterator() noexcept = default;
  189. iterator(const iterator &) noexcept = default;
  190. iterator &operator=(const iterator &) noexcept = default;
  191. /* Disallow creating from const_iterator */
  192. iterator(const const_iterator<Container, Raw> &) = delete;
  193. /* Prefix */
  194. iterator &operator++() noexcept
  195. {
  196. this->increment();
  197. return *this;
  198. }
  199. /* Postfix */
  200. iterator operator++(int) noexcept
  201. {
  202. iterator tmp{this->idx, this->cont_instance};
  203. this->increment();
  204. return tmp;
  205. }
  206. /* Prefix */
  207. iterator &operator--() noexcept
  208. {
  209. this->decrement();
  210. return *this;
  211. }
  212. /* Postfix */
  213. iterator operator--(int) noexcept
  214. {
  215. iterator tmp{this->idx, this->cont_instance};
  216. this->decrement();
  217. return tmp;
  218. }
  219. iterator operator+(typename iterator_base<Container, Raw>::difference_type n) const noexcept
  220. {
  221. iterator it{*this};
  222. it.advance(n);
  223. return it;
  224. }
  225. iterator &operator+=(typename iterator_base<Container, Raw>::difference_type n) noexcept
  226. {
  227. this->advance(n);
  228. return *this;
  229. }
  230. iterator operator-(typename iterator_base<Container, Raw>::difference_type n) const noexcept
  231. {
  232. iterator it{*this};
  233. it.advance(-n);
  234. return it;
  235. }
  236. iterator &operator-=(typename iterator_base<Container, Raw>::difference_type n) noexcept
  237. {
  238. this->advance(-n);
  239. return *this;
  240. }
  241. typename iterator::reference_type operator*() const noexcept
  242. {
  243. return this->get_value();
  244. }
  245. };
  246. template<class CharT, class Allocator, class Functor>
  247. class basic_mime_string : private Allocator {
  248. public:
  249. using storage_type = std::basic_string<CharT, std::char_traits<CharT>, Allocator>;
  250. using view_type = std::basic_string_view<CharT, std::char_traits<CharT>>;
  251. using filter_type = Functor;
  252. using codepoint_type = UChar32;
  253. using value_type = CharT;
  254. using difference_type = std::ptrdiff_t;
  255. using iterator = rspamd::mime::iterator<basic_mime_string, false>;
  256. using raw_iterator = rspamd::mime::iterator<basic_mime_string, true>;
  257. /* Ctors */
  258. basic_mime_string() noexcept
  259. : Allocator()
  260. {
  261. }
  262. explicit basic_mime_string(const Allocator &alloc) noexcept
  263. : Allocator(alloc)
  264. {
  265. }
  266. explicit basic_mime_string(filter_type &&filt, const Allocator &alloc = Allocator()) noexcept
  267. : Allocator(alloc), filter_func(std::move(filt))
  268. {
  269. }
  270. basic_mime_string(const CharT *str, std::size_t sz, const Allocator &alloc = Allocator()) noexcept
  271. : Allocator(alloc)
  272. {
  273. append_c_string_unfiltered(str, sz);
  274. }
  275. basic_mime_string(const storage_type &st,
  276. const Allocator &alloc = Allocator()) noexcept
  277. : basic_mime_string(st.data(), st.size(), alloc)
  278. {
  279. }
  280. basic_mime_string(const view_type &st,
  281. const Allocator &alloc = Allocator()) noexcept
  282. : basic_mime_string(st.data(), st.size(), alloc)
  283. {
  284. }
  285. /* Explicit move ctor */
  286. basic_mime_string(basic_mime_string &&other) noexcept
  287. {
  288. *this = std::move(other);
  289. }
  290. /**
  291. * Creates a string with a filter function. It is calee responsibility to
  292. * ensure that the filter functor survives long enough to work with a string
  293. * @param str
  294. * @param sz
  295. * @param filt
  296. * @param alloc
  297. */
  298. basic_mime_string(const CharT *str, std::size_t sz,
  299. filter_type &&filt,
  300. const Allocator &alloc = Allocator()) noexcept
  301. : Allocator(alloc),
  302. filter_func(std::move(filt))
  303. {
  304. append_c_string_filtered(str, sz);
  305. }
  306. basic_mime_string(const storage_type &st,
  307. filter_type &&filt,
  308. const Allocator &alloc = Allocator()) noexcept
  309. : basic_mime_string(st.data(), st.size(), std::move(filt), alloc)
  310. {
  311. }
  312. basic_mime_string(const view_type &st,
  313. filter_type &&filt,
  314. const Allocator &alloc = Allocator()) noexcept
  315. : basic_mime_string(st.data(), st.size(), std::move(filt), alloc)
  316. {
  317. }
  318. /* It seems some libc++ implementations still perform copy, this might fix them */
  319. basic_mime_string &operator=(basic_mime_string &&other)
  320. {
  321. storage = std::move(other.storage);
  322. filter_func = std::move(other.filter_func);
  323. return *this;
  324. }
  325. constexpr auto size() const noexcept -> std::size_t
  326. {
  327. return storage.size();
  328. }
  329. constexpr auto data() const noexcept -> const CharT *
  330. {
  331. return storage.data();
  332. }
  333. constexpr auto has_zeroes() const noexcept -> bool
  334. {
  335. return !!(flags & mime_string_flags::MIME_STRING_SEEN_ZEROES);
  336. }
  337. constexpr auto has_invalid() const noexcept -> bool
  338. {
  339. return !!(flags & mime_string_flags::MIME_STRING_SEEN_INVALID);
  340. }
  341. /**
  342. * Assign mime string from another string using move operation if a source string
  343. * is utf8 valid.
  344. * If this function returns false, then ownership has not been transferred
  345. * and the `other` string is unmodified as well as the storage
  346. * @param other
  347. * @return
  348. */
  349. [[nodiscard]] auto assign_if_valid(storage_type &&other) -> bool
  350. {
  351. if (filter_func) {
  352. /* No way */
  353. return false;
  354. }
  355. if (rspamd_fast_utf8_validate((const unsigned char *) other.data(), other.size()) == 0) {
  356. std::swap(storage, other);
  357. return true;
  358. }
  359. return false;
  360. }
  361. /**
  362. * Copy to the internal storage discarding the contained value
  363. * @param other
  364. * @return
  365. */
  366. auto assign_copy(const view_type &other)
  367. {
  368. storage.clear();
  369. if (filter_func) {
  370. append_c_string_filtered(other.data(), other.size());
  371. }
  372. else {
  373. append_c_string_unfiltered(other.data(), other.size());
  374. }
  375. }
  376. auto assign_copy(const storage_type &other)
  377. {
  378. storage.clear();
  379. if (filter_func) {
  380. append_c_string_filtered(other.data(), other.size());
  381. }
  382. else {
  383. append_c_string_unfiltered(other.data(), other.size());
  384. }
  385. }
  386. auto assign_copy(const basic_mime_string &other)
  387. {
  388. storage.clear();
  389. if (filter_func) {
  390. append_c_string_filtered(other.data(), other.size());
  391. }
  392. else {
  393. append_c_string_unfiltered(other.data(), other.size());
  394. }
  395. }
  396. /* Mutators */
  397. auto append(const CharT *str, std::size_t size) -> std::size_t
  398. {
  399. if (filter_func) {
  400. return append_c_string_filtered(str, size);
  401. }
  402. else {
  403. return append_c_string_unfiltered(str, size);
  404. }
  405. }
  406. auto append(const storage_type &other) -> std::size_t
  407. {
  408. return append(other.data(), other.size());
  409. }
  410. auto append(const view_type &other) -> std::size_t
  411. {
  412. return append(other.data(), other.size());
  413. }
  414. auto ltrim(const view_type &what) -> void
  415. {
  416. auto it = std::find_if(storage.begin(), storage.end(),
  417. [&what](CharT c) {
  418. return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; });
  419. });
  420. storage.erase(storage.begin(), it);
  421. }
  422. auto rtrim(const view_type &what) -> void
  423. {
  424. auto it = std::find_if(storage.rbegin(), storage.rend(),
  425. [&what](CharT c) {
  426. return !std::any_of(what.begin(), what.end(), [&c](CharT sc) { return sc == c; });
  427. });
  428. storage.erase(it.base(), storage.end());
  429. }
  430. auto trim(const view_type &what) -> void
  431. {
  432. ltrim(what);
  433. rtrim(what);
  434. }
  435. /* Comparison */
  436. auto operator==(const basic_mime_string &other)
  437. {
  438. return other.storage == storage;
  439. }
  440. auto operator==(const storage_type &other)
  441. {
  442. return other == storage;
  443. }
  444. auto operator==(const view_type &other)
  445. {
  446. return other == storage;
  447. }
  448. auto operator==(const CharT *other)
  449. {
  450. if (other == NULL) {
  451. return false;
  452. }
  453. auto olen = strlen(other);
  454. if (storage.size() == olen) {
  455. return memcmp(storage.data(), other, olen) == 0;
  456. }
  457. return false;
  458. }
  459. /* Iterators */
  460. inline auto begin() noexcept -> iterator
  461. {
  462. return {0, this};
  463. }
  464. inline auto raw_begin() noexcept -> raw_iterator
  465. {
  466. return {0, this};
  467. }
  468. inline auto end() noexcept -> iterator
  469. {
  470. return {(difference_type) size(), this};
  471. }
  472. inline auto raw_end() noexcept -> raw_iterator
  473. {
  474. return {(difference_type) size(), this};
  475. }
  476. /* Utility */
  477. inline auto get_storage() const noexcept -> const storage_type &
  478. {
  479. return storage;
  480. }
  481. inline auto as_view() const noexcept -> view_type
  482. {
  483. return view_type{storage};
  484. }
  485. constexpr CharT operator[](std::size_t pos) const noexcept
  486. {
  487. return storage[pos];
  488. }
  489. constexpr CharT at(std::size_t pos) const
  490. {
  491. return storage.at(pos);
  492. }
  493. constexpr bool empty() const noexcept
  494. {
  495. return storage.empty();
  496. }
  497. /* For doctest stringify */
  498. friend std::ostream &operator<<(std::ostream &os, const CharT &value)
  499. {
  500. os << value.storage;
  501. return os;
  502. }
  503. private:
  504. mime_string_flags flags = mime_string_flags::MIME_STRING_DEFAULT;
  505. storage_type storage;
  506. filter_type filter_func;
  507. auto append_c_string_unfiltered(const CharT *str, std::size_t len) -> std::size_t
  508. {
  509. /* This is fast path */
  510. const auto *p = str;
  511. const auto *end = str + len;
  512. std::int32_t err_offset;// We have to use int32_t here as old libicu is brain-damaged
  513. auto orig_size = storage.size();
  514. storage.reserve(len + storage.size());
  515. if (memchr(str, 0, len) != NULL) {
  516. /* Fallback to slow path */
  517. flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES;
  518. return append_c_string_filtered(str, len);
  519. }
  520. while (p < end && len > 0 &&
  521. (err_offset = rspamd_fast_utf8_validate((const unsigned char *) p, len)) > 0) {
  522. auto cur_offset = err_offset - 1;
  523. storage.append(p, cur_offset);
  524. while (cur_offset < len) {
  525. auto tmp = cur_offset;
  526. UChar32 uc;
  527. U8_NEXT(p, cur_offset, len, uc);
  528. if (uc < 0) {
  529. storage.append("\uFFFD");
  530. flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID;
  531. }
  532. else {
  533. cur_offset = tmp;
  534. break;
  535. }
  536. }
  537. p += cur_offset;
  538. len = end - p;
  539. }
  540. storage.append(p, len);
  541. return storage.size() - orig_size;
  542. }
  543. auto append_c_string_filtered(const CharT *str, std::size_t len) -> std::size_t
  544. {
  545. std::int32_t i = 0;// We have to use int32_t here as old libicu is brain-damaged
  546. UChar32 uc;
  547. char tmp[4];
  548. auto orig_size = storage.size();
  549. /* Slow path */
  550. storage.reserve(len + storage.size());
  551. while (i < len) {
  552. U8_NEXT(str, i, len, uc);
  553. if (uc < 0) {
  554. /* Replace with 0xFFFD */
  555. storage.append("\uFFFD");
  556. flags = flags | mime_string_flags::MIME_STRING_SEEN_INVALID;
  557. }
  558. else {
  559. if (filter_func) {
  560. uc = filter_func(uc);
  561. }
  562. if (uc == 0) {
  563. /* Special case, ignore it */
  564. flags = flags | mime_string_flags::MIME_STRING_SEEN_ZEROES;
  565. }
  566. else {
  567. std::int32_t o = 0;
  568. U8_APPEND_UNSAFE(tmp, o, uc);
  569. storage.append(tmp, o);
  570. }
  571. }
  572. }
  573. return storage.size() - orig_size;
  574. }
  575. };
  576. }// namespace rspamd::mime
  577. #endif//RSPAMD_MIME_STRING_HXX