You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

str_util.h 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef SRC_LIBUTIL_STR_UTIL_H_
  17. #define SRC_LIBUTIL_STR_UTIL_H_
  18. #include "config.h"
  19. #include "ucl.h"
  20. #include "fstring.h"
  21. #include <stdalign.h>
  22. #ifdef __cplusplus
  23. extern "C" {
  24. #endif
  25. enum rspamd_newlines_type { /* newline style selector; taken as the `how` argument by the folding encoders declared in this header */
  26. RSPAMD_TASK_NEWLINES_CR = 0, /* presumably a bare CR ("\r") -- confirm against the implementation */
  27. RSPAMD_TASK_NEWLINES_LF, /* presumably a bare LF ("\n") */
  28. RSPAMD_TASK_NEWLINES_CRLF, /* presumably CR LF ("\r\n") */
  29. RSPAMD_TASK_NEWLINES_MAX /* one past the last style; looks like a count/sentinel rather than a style -- confirm */
  30. };
  31. /**
  32. * Compare two memory regions of size `l` using case insensitive matching
  33. */
  34. int rspamd_lc_cmp(const char *s, const char *d, gsize l);
  35. /**
  36. * Convert string to lowercase in-place using ASCII conversion
  37. */
  38. unsigned int rspamd_str_lc(char *str, unsigned int size);
  39. /**
  40. * Performs ascii copy & lowercase
  41. * @param src
  42. * @param size
  43. * @return
  44. */
  45. gsize rspamd_str_copy_lc(const char *src, char *dst, gsize size);
  46. /**
  47. * Convert string to lowercase in-place using utf (limited) conversion
  48. */
  49. unsigned int rspamd_str_lc_utf8(char *str, unsigned int size);
  50. /*
  51. * Hash table utility functions for case insensitive hashing
  52. */
  53. uint64_t rspamd_icase_hash(const char *in, gsize len, uint64_t seed);
  54. unsigned int rspamd_strcase_hash(gconstpointer key);
  55. gboolean rspamd_strcase_equal(gconstpointer v, gconstpointer v2);
  56. /*
  57. * Hash table utility functions for case sensitive hashing
  58. */
  59. unsigned int rspamd_str_hash(gconstpointer key);
  60. gboolean rspamd_str_equal(gconstpointer v, gconstpointer v2);
  61. /*
  62. * Hash table utility functions for hashing fixed strings
  63. */
  64. unsigned int rspamd_ftok_icase_hash(gconstpointer key);
  65. gboolean rspamd_ftok_icase_equal(gconstpointer v, gconstpointer v2);
  66. /* Use in khash for speed: macro hash/equality helpers; both expect pointers to objects with `begin` and `len` members */
  67. #define rspamd_ftok_hash(key) _wyhash32((key)->begin, (key)->len, 0) /* seed is fixed to 0 */
  68. #define rspamd_ftok_equal(v1, v2) ((v1)->len == (v2)->len && memcmp((v1)->begin, (v2)->begin, (v1)->len) == 0) /* byte-exact (case sensitive) comparison; v1 and v2 are evaluated more than once */
  69. unsigned int rspamd_gstring_icase_hash(gconstpointer key);
  70. gboolean rspamd_gstring_icase_equal(gconstpointer v, gconstpointer v2);
  71. /**
  72. * Copy src to dst, limited to siz bytes. In contrast to the standard strlcpy(3), rspamd strlcpy does not
  73. * traverse the whole string, so it can be used for strings that are not NUL-terminated. This is
  74. * more like memccpy(dst, src, size, '\0')
  75. *
  76. * @param dst destination string
  77. * @param src source string
  78. * @param siz length of destination buffer
  79. * @return bytes copied
  80. */
  81. gsize rspamd_strlcpy_fast(char *dst, const char *src, gsize siz);
  82. gsize rspamd_strlcpy_safe(char *dst, const char *src, gsize siz);
  83. #if defined(__has_feature) /* choose what rspamd_strlcpy expands to: the _safe variant under AddressSanitizer, the _fast one otherwise */
  84. #if __has_feature(address_sanitizer)
  85. #define rspamd_strlcpy rspamd_strlcpy_safe
  86. #else
  87. #ifdef __SANITIZE_ADDRESS__ /* __has_feature exists but did not report ASan: also check the predefined macro */
  88. #define rspamd_strlcpy rspamd_strlcpy_safe
  89. #else
  90. #define rspamd_strlcpy rspamd_strlcpy_fast
  91. #endif
  92. #endif
  93. #else /* no __has_feature available: only __SANITIZE_ADDRESS__ can be checked */
  94. #ifdef __SANITIZE_ADDRESS__
  95. #define rspamd_strlcpy rspamd_strlcpy_safe
  96. #else
  97. #define rspamd_strlcpy rspamd_strlcpy_fast
  98. #endif
  99. #endif
  100. /**
  101. * Copies `srclen` characters from `src` to `dst` ignoring \0
  102. * @param src
  103. * @param srclen
  104. * @param dest
  105. * @param destlen
  106. * @return number of bytes copied
  107. */
  108. gsize rspamd_null_safe_copy(const char *src, gsize srclen,
  109. char *dest, gsize destlen);
  110. /*
  111. * Try to convert a string of length `len` to long
  112. */
  113. gboolean rspamd_strtol(const char *s, gsize len, glong *value);
  114. /*
  115. * Try to convert a string of length `len` to unsigned long
  116. */
  117. gboolean rspamd_strtoul(const char *s, gsize len, gulong *value);
  118. gboolean rspamd_strtou64(const char *s, gsize len, uint64_t *value);
  119. /*
  120. * Try to convert a hex string of length `len` to unsigned long
  121. */
  122. gboolean rspamd_xstrtoul(const char *s, gsize len, gulong *value);
  123. /**
  124. * Utility function to provide mem_pool copy for rspamd_hash_table_copy function
  125. * @param data string to copy
  126. * @param ud memory pool to use
  127. * @return
  128. */
  129. gpointer rspamd_str_pool_copy(gconstpointer data, gpointer ud);
  130. /**
  131. * Encode string using hex encoding
  132. * @param in input
  133. * @param inlen input length
  134. * @return freshly allocated hex encoding of a specified string
  135. */
  136. char *rspamd_encode_hex(const unsigned char *in, gsize inlen);
  137. /**
  138. * Decode string using hex encoding
  139. * @param in input
  140. * @param inlen input length
  141. * @return freshly allocated hex decoded value or NULL if input is invalid
  142. */
  143. unsigned char *rspamd_decode_hex(const char *in, gsize inlen);
  144. enum rspamd_base32_type { /* base32 alphabet/variant selector taken by the base32 encode/decode functions below */
  145. RSPAMD_BASE32_DEFAULT = 0, /* same value as RSPAMD_BASE32_ZBASE, i.e. zbase is the default */
  146. RSPAMD_BASE32_ZBASE = 0,
  147. RSPAMD_BASE32_BLEACH, /* = 1 */
  148. RSPAMD_BASE32_RFC, /* = 2 */
  149. RSPAMD_BASE32_INVALID = -1, /* documented result of rspamd_base32_decode_type_from_str() when the string is not a known type */
  150. };
  151. /**
  152. * Returns base32 type from a string or RSPAMD_BASE32_INVALID
  153. * @param str
  154. * @return
  155. */
  156. enum rspamd_base32_type rspamd_base32_decode_type_from_str(const char *str);
  157. /**
  158. * Encode string using base32 encoding
  159. * @param in input
  160. * @param inlen input length
  161. * @return freshly allocated base32 encoding of a specified string
  162. */
  163. char *rspamd_encode_base32(const unsigned char *in, gsize inlen,
  164. enum rspamd_base32_type type);
  165. /**
  166. * Decode string using base32 encoding
  167. * @param in input
  168. * @param inlen input length
  169. * @return freshly allocated base32 decoded value or NULL if input is invalid
  170. */
  171. unsigned char *rspamd_decode_base32(const char *in, gsize inlen, gsize *outlen, enum rspamd_base32_type type);
  172. /**
  173. * Encode string using base32 encoding
  174. * @param in input
  175. * @param inlen input length
  176. * @param out output buf
  177. * @param outlen output buf len
  178. * @return encoded len if `outlen` is enough to encode `inlen`
  179. */
  180. int rspamd_encode_base32_buf(const unsigned char *in, gsize inlen, char *out,
  181. gsize outlen, enum rspamd_base32_type type);
  182. /**
  183. * Decode string using base32 encoding
  184. * @param in input
  185. * @param inlen input length
  186. * @param out output buf (may overlap with `in`)
  187. * @param outlen output buf len
  188. * @return decoded len if in is valid base32 and `outlen` is enough to encode `inlen`
  189. */
  190. int rspamd_decode_base32_buf(const char *in, gsize inlen, unsigned char *out,
  191. gsize outlen, enum rspamd_base32_type type);
  192. /**
  193. * Encode string using hex encoding
  194. * @param in input
  195. * @param inlen input length
  196. * @param out output buf
  197. * @param outlen output buf len
  198. * @return encoded len if `outlen` is enough to encode `inlen`
  199. */
  200. int rspamd_encode_hex_buf(const unsigned char *in, gsize inlen, char *out,
  201. gsize outlen);
  202. /**
  203. * Decode string using hex encoding
  204. * @param in input
  205. * @param inlen input length
  206. * @param out output buf (may overlap with `in`)
  207. * @param outlen output buf len
  208. * @return decoded len if in is valid hex and `outlen` is enough to encode `inlen`
  209. */
  210. gssize rspamd_decode_hex_buf(const char *in, gsize inlen,
  211. unsigned char *out, gsize outlen);
  212. /**
  213. * Common version of base64 encoder
  214. * @param in
  215. * @param inlen
  216. * @param str_len
  217. * @param outlen
  218. * @param fold
  219. * @param how
  220. * @return
  221. */
  222. char *
  223. rspamd_encode_base64_common(const unsigned char *in,
  224. gsize inlen,
  225. int str_len,
  226. gsize *outlen,
  227. gboolean fold,
  228. enum rspamd_newlines_type how);
  229. /**
  230. * Encode string using base64 encoding
  231. * @param in input
  232. * @param inlen input length
  233. * @param str_len maximum string length (if <= 0 then no lines are split)
  234. * @return freshly allocated base64 encoded value or NULL if input is invalid
  235. */
  236. char *rspamd_encode_base64(const unsigned char *in, gsize inlen, int str_len,
  237. gsize *outlen);
  238. /**
  239. * Encode and fold string using base64 encoding
  240. * @param in input
  241. * @param inlen input length
  242. * @param str_len maximum string length (if <= 0 then no lines are split)
  243. * @return freshly allocated base64 encoded value or NULL if input is invalid
  244. */
  245. char *rspamd_encode_base64_fold(const unsigned char *in, gsize inlen, int str_len,
  246. gsize *outlen, enum rspamd_newlines_type how);
  247. /**
  248. * Encode and fold string using quoted printable encoding
  249. * @param in input
  250. * @param inlen input length
  251. * @param str_len maximum string length (if <= 0 then no lines are split)
  252. * @return freshly allocated quoted-printable encoded value or NULL if input is invalid
  253. */
  254. char *rspamd_encode_qp_fold(const unsigned char *in, gsize inlen, int str_len,
  255. gsize *outlen, enum rspamd_newlines_type how);
  256. /**
  257. * Decode quoted-printable encoded buffer, input and output must not overlap
  258. * @param in input
  259. * @param inlen length of input
  260. * @param out output
  261. * @param outlen length of output
  262. * @return real size of decoded output or (-1) if outlen is not enough
  263. */
  264. gssize rspamd_decode_qp_buf(const char *in, gsize inlen,
  265. char *out, gsize outlen);
  266. /**
  267. * Decode uuencode encoded buffer, input and output must not overlap
  268. * @param in input
  269. * @param inlen length of input
  270. * @param out output
  271. * @param outlen length of output
  272. * @return real size of decoded output or (-1) if outlen is not enough
  273. */
  274. gssize rspamd_decode_uue_buf(const char *in, gsize inlen,
  275. char *out, gsize outlen);
  276. /**
  277. * Decode quoted-printable encoded buffer using rfc2047 format, input and output must not overlap
  278. * @param in input
  279. * @param inlen length of input
  280. * @param out output
  281. * @param outlen length of output
  282. * @return real size of decoded output or (-1) if outlen is not enough
  283. */
  284. gssize rspamd_decode_qp2047_buf(const char *in, gsize inlen,
  285. char *out, gsize outlen);
  286. /**
  287. * Encode quoted-printable buffer using rfc2047 format, input and output must not overlap
  288. * @param in
  289. * @param inlen
  290. * @param out
  291. * @param outlen
  292. * @return
  293. */
  294. gssize rspamd_encode_qp2047_buf(const char *in, gsize inlen,
  295. char *out, gsize outlen);
  296. #ifndef g_tolower /* fallback used only when g_tolower is not already defined as a macro */
  297. #define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) /* ASCII only; x is evaluated more than once, so pass no side effects */
  298. #endif
  299. /**
  300. * Return Levenshtein distance between two strings
  301. * @param s1
  302. * @param s1len
  303. * @param s2
  304. * @param s2len
  305. * @return
  306. */
  307. int rspamd_strings_levenshtein_distance(const char *s1, gsize s1len,
  308. const char *s2, gsize s2len, unsigned int replace_cost);
  309. /**
  310. * Fold header using rfc822 rules, return new GString from the previous one
  311. * @param name name of header (used just for folding)
  312. * @param value value of header
  313. * @param fold_max
  314. * @param how
  315. * @param fold_on_chars
  316. * @return new GString with the folded value
  317. */
  318. GString *rspamd_header_value_fold(const char *name,
  319. gsize name_len,
  320. const char *value,
  321. gsize value_len,
  322. unsigned int fold_max,
  323. enum rspamd_newlines_type how,
  324. const char *fold_on_chars);
  325. /**
  326. * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm
  327. * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
  328. * @param in input
  329. * @param inlen input len
  330. * @param srch search string
  331. * @param srchlen length of the search string
  332. * @return position of the first substring match or (-1) if not found
  333. */
  334. goffset rspamd_substring_search(const char *in, gsize inlen,
  335. const char *srch, gsize srchlen);
  336. /**
  337. * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in a caseless manner (ASCII only)
  338. * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
  339. * @param in input
  340. * @param inlen input len
  341. * @param srch search string
  342. * @param srchlen length of the search string
  343. * @return position of the first substring match or (-1) if not found
  344. */
  345. goffset rspamd_substring_search_caseless(const char *in, gsize inlen,
  346. const char *srch, gsize srchlen);
  347. /**
  348. * Search for end-of-headers mark in the input string. Returns position just after
  349. * the last header in message (but before the last newline character).
  350. * Hence, to obtain the real EOH position, it is also required to skip
  351. * space characters
  352. */
  353. goffset rspamd_string_find_eoh(GString *input, goffset *body_start);
  354. #define rspamd_ucl_emit_gstring(o, t, target) /* shorthand for rspamd_ucl_emit_gstring_comments() with comments == NULL */ \
  355. rspamd_ucl_emit_gstring_comments((o), (t), (target), NULL)
  356. /**
  357. * Emit UCL object to gstring
  358. * @param obj object to emit
  359. * @param emit_type emitter type
  360. * @param comments optional comments object
  361. * @param target target string
  362. */
  363. void rspamd_ucl_emit_gstring_comments(const ucl_object_t *obj,
  364. enum ucl_emitter emit_type,
  365. GString *target,
  366. const ucl_object_t *comments);
  367. #define rspamd_ucl_emit_fstring(o, t, target) /* shorthand for rspamd_ucl_emit_fstring_comments() with comments == NULL */ \
  368. rspamd_ucl_emit_fstring_comments((o), (t), (target), NULL)
  369. /**
  370. * Emit UCL object to fstring
  371. * @param obj object to emit
  372. * @param emit_type emitter type
  373. * @param comments optional comments object
  374. * @param target target string
  375. */
  376. void rspamd_ucl_emit_fstring_comments(const ucl_object_t *obj,
  377. enum ucl_emitter emit_type,
  378. rspamd_fstring_t **target,
  379. const ucl_object_t *comments);
  380. extern const unsigned char lc_map[256];
  381. /**
  382. * Search for the last occurrence of character `c` in memory block of size `len`
  383. * @param m
  384. * @param c
  385. * @param len
  386. * @return pointer to the last occurrence or NULL
  387. */
  388. #ifdef HAVE_MEMRCHR /* presumably set by the build configuration when the platform provides memrchr -- confirm */
  389. #define rspamd_memrchr memrchr
  390. #else
  391. void *rspamd_memrchr(const void *m, int c, gsize len); /* project-provided replacement declared when HAVE_MEMRCHR is not defined */
  392. #endif
  393. /**
  394. * Return length of memory segment starting in `s` that contains no chars from `e`
  395. * @param s any input
  396. * @param e zero terminated string of exceptions
  397. * @param len length of `s`
  398. * @return segment size
  399. */
  400. gsize rspamd_memcspn(const char *s, const char *e, gsize len);
  401. /**
  402. * Return length of memory segment starting in `s` that contains only chars from `e`
  403. * @param s any input
  404. * @param e zero terminated string of inclusions
  405. * @param len length of `s`
  406. * @return segment size
  407. */
  408. gsize rspamd_memspn(const char *s, const char *e, gsize len);
  409. /* https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord -- non-zero when some byte of the word x is greater than n; per the linked page n must be within 0..127. The ~0UL constants make the masks unsigned long sized */
  410. #define rspamd_str_hasmore(x, n) ((((x) + ~0UL / 255 * (127 - (n))) | (x)) & ~0UL / 255 * 128)
  411. /*
  412. * Check if a pointer is aligned; n must be power of two
  413. */
  414. #define rspamd_is_aligned(p, n) (((uintptr_t) (p) & ((uintptr_t) (n) -1)) == 0) /* true when all address bits below n are zero */
  415. #define rspamd_is_aligned_as(p, v) rspamd_is_aligned(p, RSPAMD_ALIGNOF(__typeof((v)))) /* same check against the alignment of v's type; RSPAMD_ALIGNOF is not defined in this header */
  416. gboolean rspamd_str_has_8bit(const unsigned char *beg, gsize len);
  417. struct UConverter;
  418. struct UConverter *rspamd_get_utf8_converter(void);
  419. struct UNormalizer2;
  420. const struct UNormalizer2 *rspamd_get_unicode_normalizer(void);
  421. enum rspamd_regexp_escape_flags { /* bit flags taken by rspamd_str_regexp_escape() */
  422. RSPAMD_REGEXP_ESCAPE_ASCII = 0, /* no bits set */
  423. RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0, /* presumably treat the pattern as UTF-8 -- confirm against the implementation */
  424. RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, /* presumably translate glob expressions into pcre -- confirm */
  425. RSPAMD_REGEXP_ESCAPE_RE = 1u << 2, /* NOTE(review): meaning is not evident from this header */
  426. };
  427. /**
  428. * Escapes special characters when reading plain data to be processed in pcre
  429. * @param pattern pattern to process
  430. * @param slen source length
  431. * @param dst_len destination length pointer (can be NULL)
  432. * @param flags escape flags (see enum rspamd_regexp_escape_flags), e.g. to allow glob expressions to be translated into pcre
  433. * @return newly allocated zero terminated escaped pattern
  434. */
  435. char *
  436. rspamd_str_regexp_escape(const char *pattern, gsize slen,
  437. gsize *dst_len, enum rspamd_regexp_escape_flags flags) G_GNUC_WARN_UNUSED_RESULT;
  438. /**
  439. * Returns copy of src (zero terminated) where all unicode is made valid or replaced
  440. * to FFFD characters. Caller must free string after usage
  441. * @param src
  442. * @param slen
  443. * @param dstlen
  444. * @return
  445. */
  446. char *rspamd_str_make_utf_valid(const unsigned char *src, gsize slen, gsize *dstlen,
  447. rspamd_mempool_t *pool) G_GNUC_WARN_UNUSED_RESULT;
  448. /**
  449. * Strips characters in `strip_chars` from start and end of the GString
  450. * @param s
  451. * @param strip_chars
  452. */
  453. gsize rspamd_gstring_strip(GString *s, const char *strip_chars);
  454. /**
  455. * Strips characters in `strip_chars` from start and end of the sized string
  456. * @param in
  457. * @param strip_chars
  458. */
  459. const char *rspamd_string_len_strip(const char *in,
  460. gsize *len, const char *strip_chars) G_GNUC_WARN_UNUSED_RESULT;
  461. /**
  462. * Returns a NULL terminated list of zero terminated strings based on splitting of
  463. * the base string into parts. If pool is not NULL then memory is allocated from
  464. * the pool. Otherwise, it is allocated from the heap using `g_malloc` (so
  465. * g_strfreev could be used to free stuff)
  466. * @param in
  467. * @param len
  468. * @param spill
  469. * @param max_elts
  470. * @return
  471. */
  472. char **rspamd_string_len_split(const char *in, gsize len,
  473. const char *spill, int max_elts, rspamd_mempool_t *pool);
  474. #define IS_ZERO_WIDTH_SPACE(uc) ((uc) == 0x200B || /* U+200B ZERO WIDTH SPACE */ \
  475. (uc) == 0x200C || /* U+200C ZERO WIDTH NON-JOINER */ \
  476. (uc) == 0x200D || /* U+200D ZERO WIDTH JOINER */ \
  477. (uc) == 0xFEFF || /* U+FEFF ZERO WIDTH NO-BREAK SPACE (BOM) */ \
  478. (uc) == 0x00AD) /* U+00AD SOFT HYPHEN */
  479. #define IS_OBSCURED_CHAR(uc) (((uc) >= 0x200B && (uc) <= 0x200F) || /* zero-width characters and the LRM/RLM directional marks */ \
  480. ((uc) >= 0x2028 && (uc) <= 0x202F) || /* line/paragraph separators, bidi embedding/override controls, narrow no-break space */ \
  481. ((uc) >= 0x205F && (uc) <= 0x206F) || /* medium mathematical space through the invisible operator and format controls */ \
  482. (uc) == 0xFEFF) /* U+FEFF ZERO WIDTH NO-BREAK SPACE (BOM) */
  483. #define RSPAMD_LEN_CHECK_STARTS_WITH(s, len, lit) /* ASCII case-insensitive prefix test of the sized string s; lit must be a string literal (or char array) so that sizeof(lit) - 1 is its length */ \
  484. ((len) >= sizeof(lit) - 1 && g_ascii_strncasecmp((s), (lit), sizeof(lit) - 1) == 0)
  485. #ifdef __cplusplus
  486. }
  487. #endif
  488. #endif /* SRC_LIBUTIL_STR_UTIL_H_ */