Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

str_util.h 17KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef SRC_LIBUTIL_STR_UTIL_H_
  17. #define SRC_LIBUTIL_STR_UTIL_H_
  18. #include "config.h"
  19. #include "ucl.h"
  20. #include "fstring.h"
  21. #include <stdalign.h>
  22. #ifdef __cplusplus
  23. extern "C" {
  24. #endif
  25. enum rspamd_newlines_type { /* newline style selector; passed as the `how` argument to the fold/encode functions declared in this header */
  26. RSPAMD_TASK_NEWLINES_CR = 0,
  27. RSPAMD_TASK_NEWLINES_LF,
  28. RSPAMD_TASK_NEWLINES_CRLF,
  29. RSPAMD_TASK_NEWLINES_MAX /* NOTE(review): presumably a count/sentinel rather than a real newline style - confirm */
  30. };
  31. /**
  32. * Compare two memory regions of size `l` using case insensitive matching
  33. */
  34. gint rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l);
  35. /**
  36. * Convert string to lowercase in-place using ASCII conversion
  37. */
  38. guint rspamd_str_lc (gchar *str, guint size);
  39. /**
  40. * Performs ascii copy & lowercase
  41. * @param src
  42. * @param size
  43. * @return
  44. */
  45. gsize rspamd_str_copy_lc (const gchar *src, gchar *dst, gsize size);
  46. /**
  47. * Convert string to lowercase in-place using utf (limited) conversion
  48. */
  49. guint rspamd_str_lc_utf8 (gchar *str, guint size);
  50. /*
  51. * Hash table utility functions for case insensitive hashing
  52. */
  53. guint64 rspamd_icase_hash (const gchar *in, gsize len, guint64 seed);
  54. guint rspamd_strcase_hash (gconstpointer key);
  55. gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2);
  56. /*
  57. * Hash table utility functions for case sensitive hashing
  58. */
  59. guint rspamd_str_hash (gconstpointer key);
  60. gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
  61. /*
  62. * Hash table utility functions for hashing fixed strings
  63. */
  64. guint rspamd_ftok_icase_hash (gconstpointer key);
  65. gboolean rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2);
  66. guint rspamd_ftok_hash (gconstpointer key);
  67. gboolean rspamd_ftok_equal (gconstpointer v, gconstpointer v2);
  68. guint rspamd_gstring_icase_hash (gconstpointer key);
  69. gboolean rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2);
  70. /**
  71. * Copy src to dst, limited to siz bytes. Unlike the standard strlcpy(3), rspamd strlcpy does not
  72. * traverse the whole source string, so it can be used for strings that are not NUL terminated.
  73. * This is more like memccpy(dst, src, '\0', siz)
  74. *
  75. * @param dst destination string
  76. * @param src source string
  77. * @param siz length of destination buffer
  78. * @return bytes copied
  79. */
  80. gsize rspamd_strlcpy_fast (gchar *dst, const gchar *src, gsize siz);
  81. gsize rspamd_strlcpy_safe (gchar *dst, const gchar *src, gsize siz);
  82. #if defined(__has_feature) /* choose rspamd_strlcpy: the _safe variant when AddressSanitizer is detected, the _fast variant otherwise */
  83. # if __has_feature(address_sanitizer)
  84. # define rspamd_strlcpy rspamd_strlcpy_safe
  85. # else
  86. # ifdef __SANITIZE_ADDRESS__
  87. # define rspamd_strlcpy rspamd_strlcpy_safe
  88. # else
  89. # define rspamd_strlcpy rspamd_strlcpy_fast
  90. # endif
  91. # endif
  92. #else /* no __has_feature available: only __SANITIZE_ADDRESS__ is consulted */
  93. # ifdef __SANITIZE_ADDRESS__
  94. # define rspamd_strlcpy rspamd_strlcpy_safe
  95. # else
  96. # define rspamd_strlcpy rspamd_strlcpy_fast
  97. # endif
  98. #endif
  99. /**
  100. * Copies `srclen` characters from `src` to `dest` ignoring \0
  101. * @param src
  102. * @param srclen
  103. * @param dest
  104. * @param destlen
  105. * @return number of bytes copied
  106. */
  107. gsize
  108. rspamd_null_safe_copy (const gchar *src, gsize srclen,
  109. gchar *dest, gsize destlen);
  110. /*
  111. * Try to convert string of length to long
  112. */
  113. gboolean rspamd_strtol (const gchar *s, gsize len, glong *value);
  114. /*
  115. * Try to convert a string of length to unsigned long
  116. */
  117. gboolean rspamd_strtoul (const gchar *s, gsize len, gulong *value);
  118. /*
  119. * Try to convert a hex string of length to unsigned long
  120. */
  121. gboolean rspamd_xstrtoul (const gchar *s, gsize len, gulong *value);
  122. /**
  123. * Utility function to provide mem_pool copy for rspamd_hash_table_copy function
  124. * @param data string to copy
  125. * @param ud memory pool to use
  126. * @return
  127. */
  128. gpointer rspamd_str_pool_copy (gconstpointer data, gpointer ud);
  129. /**
  130. * Encode string using hex encoding
  131. * @param in input
  132. * @param inlen input length
  133. * @return freshly allocated hex encoding of a specified string
  134. */
  135. gchar *rspamd_encode_hex (const guchar *in, gsize inlen);
  136. /**
  137. * Decode string using hex encoding
  138. * @param in input
  139. * @param inlen input length
  140. * @return freshly allocated hex decoded value or NULL if input is invalid
  141. */
  142. guchar *rspamd_decode_hex (const gchar *in, gsize inlen);
  143. enum rspamd_base32_type { /* base32 alphabet selector, passed as `type` to the base32 encode/decode functions below */
  144. RSPAMD_BASE32_DEFAULT = 0,
  145. RSPAMD_BASE32_ZBASE = 0, /* same value as RSPAMD_BASE32_DEFAULT, i.e. ZBASE is the default */
  146. RSPAMD_BASE32_BLEACH,
  147. RSPAMD_BASE32_RFC,
  148. RSPAMD_BASE32_INVALID = -1, /* returned by rspamd_base32_decode_type_from_str() when the string names no known type */
  149. };
  150. /**
  151. * Returns base32 type from a string or RSPAMD_BASE32_INVALID
  152. * @param str
  153. * @return
  154. */
  155. enum rspamd_base32_type rspamd_base32_decode_type_from_str (const gchar *str);
  156. /**
  157. * Encode string using base32 encoding
  158. * @param in input
  159. * @param inlen input length
  160. * @return freshly allocated base32 encoding of a specified string
  161. */
  162. gchar *rspamd_encode_base32 (const guchar *in, gsize inlen,
  163. enum rspamd_base32_type type);
  164. /**
  165. * Decode string using base32 encoding
  166. * @param in input
  167. * @param inlen input length
  168. * @return freshly allocated base32 decoded value or NULL if input is invalid
  169. */
  170. guchar *rspamd_decode_base32 (const gchar *in, gsize inlen, gsize *outlen, enum rspamd_base32_type type);
  171. /**
  172. * Encode string using base32 encoding
  173. * @param in input
  174. * @param inlen input length
  175. * @param out output buf
  176. * @param outlen output buf len
  177. * @return encoded len if `outlen` is enough to encode `inlen`
  178. */
  179. gint rspamd_encode_base32_buf (const guchar *in, gsize inlen, gchar *out,
  180. gsize outlen, enum rspamd_base32_type type);
  181. /**
  182. * Decode string using base32 encoding
  183. * @param in input
  184. * @param inlen input length
  185. * @param out output buf (may overlap with `in`)
  186. * @param outlen output buf len
  187. * @return decoded len if `in` is valid base32 and `outlen` is enough to hold the decoded output
  188. */
  189. gint rspamd_decode_base32_buf (const gchar *in, gsize inlen, guchar *out,
  190. gsize outlen, enum rspamd_base32_type type);
  191. /**
  192. * Encode string using hex encoding
  193. * @param in input
  194. * @param inlen input length
  195. * @param out output buf
  196. * @param outlen output buf len
  197. * @return encoded len if `outlen` is enough to encode `inlen`
  198. */
  199. gint rspamd_encode_hex_buf (const guchar *in, gsize inlen, gchar *out,
  200. gsize outlen);
  201. /**
  202. * Decode string using hex encoding
  203. * @param in input
  204. * @param inlen input length
  205. * @param out output buf (may overlap with `in`)
  206. * @param outlen output buf len
  207. * @return decoded len if `in` is valid hex and `outlen` is enough to hold the decoded output
  208. */
  209. gssize rspamd_decode_hex_buf (const gchar *in, gsize inlen,
  210. guchar *out, gsize outlen);
  211. /**
  212. * Common version of base64 encoder
  213. * @param in
  214. * @param inlen
  215. * @param str_len
  216. * @param outlen
  217. * @param fold
  218. * @param how
  219. * @return
  220. */
  221. gchar *
  222. rspamd_encode_base64_common (const guchar *in,
  223. gsize inlen,
  224. gint str_len,
  225. gsize *outlen,
  226. gboolean fold,
  227. enum rspamd_newlines_type how);
  228. /**
  229. * Encode string using base64 encoding
  230. * @param in input
  231. * @param inlen input length
  232. * @param str_len maximum string length (if <= 0 then no lines are split)
  233. * @return freshly allocated base64 encoded value or NULL if input is invalid
  234. */
  235. gchar *rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
  236. gsize *outlen);
  237. /**
  238. * Encode and fold string using base64 encoding
  239. * @param in input
  240. * @param inlen input length
  241. * @param str_len maximum string length (if <= 0 then no lines are split)
  242. * @return freshly allocated base64 encoded value or NULL if input is invalid
  243. */
  244. gchar *rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
  245. gsize *outlen, enum rspamd_newlines_type how);
  246. /**
  247. * Encode and fold string using quoted printable encoding
  248. * @param in input
  249. * @param inlen input length
  250. * @param str_len maximum string length (if <= 0 then no lines are split)
  251. * @return freshly allocated quoted-printable encoded value or NULL if input is invalid
  252. */
  253. gchar *rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
  254. gsize *outlen, enum rspamd_newlines_type how);
  255. /**
  256. * Decode quoted-printable encoded buffer, input and output must not overlap
  257. * @param in input
  258. * @param inlen length of input
  259. * @param out output
  260. * @param outlen length of output
  261. * @return real size of decoded output or (-1) if outlen is not enough
  262. */
  263. gssize rspamd_decode_qp_buf (const gchar *in, gsize inlen,
  264. gchar *out, gsize outlen);
  265. /**
  266. * Decode uuencode encoded buffer, input and output must not overlap
  267. * @param in input
  268. * @param inlen length of input
  269. * @param out output
  270. * @param outlen length of output
  271. * @return real size of decoded output or (-1) if outlen is not enough
  272. */
  273. gssize rspamd_decode_uue_buf (const gchar *in, gsize inlen,
  274. gchar *out, gsize outlen);
  275. /**
  276. * Decode quoted-printable encoded buffer using rfc2047 format, input and output must not overlap
  277. * @param in input
  278. * @param inlen length of input
  279. * @param out output
  280. * @param outlen length of output
  281. * @return real size of decoded output or (-1) if outlen is not enough
  282. */
  283. gssize rspamd_decode_qp2047_buf (const gchar *in, gsize inlen,
  284. gchar *out, gsize outlen);
  285. /**
  286. * Encode quoted-printable buffer using rfc2047 format, input and output must not overlap
  287. * @param in
  288. * @param inlen
  289. * @param out
  290. * @param outlen
  291. * @return
  292. */
  293. gssize rspamd_encode_qp2047_buf (const gchar *in, gsize inlen,
  294. gchar *out, gsize outlen);
  295. #ifndef g_tolower /* fallback used only when no g_tolower macro is already defined */
  296. # define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x)) /* ASCII-only; evaluates x more than once, so pass no side effects */
  297. #endif
  298. /**
  299. * Return Levenshtein distance between two strings
  300. * @param s1
  301. * @param s1len
  302. * @param s2
  303. * @param s2len
  304. * @return
  305. */
  306. gint rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
  307. const gchar *s2, gsize s2len, guint replace_cost);
  308. /**
  309. * Fold header using rfc822 rules, return new GString from the previous one
  310. * @param name name of header (used just for folding)
  311. * @param value value of header
  312. * @param fold_max
  313. * @param how
  314. * @param fold_on_chars
  315. * @return new GString with the folded value
  316. */
  317. GString *rspamd_header_value_fold (const gchar *name,
  318. const gchar *value,
  319. guint fold_max,
  320. enum rspamd_newlines_type how,
  321. const gchar *fold_on_chars);
  322. /**
  323. * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm
  324. * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
  325. * @param in input
  326. * @param inlen input len
  327. * @param srch search string
  328. * @param srchlen length of the search string
  329. * @return position of the first substring match or (-1) if not found
  330. */
  331. goffset rspamd_substring_search (const gchar *in, gsize inlen,
  332. const gchar *srch, gsize srchlen);
  333. /**
  334. * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in a caseless manner (ASCII only)
  335. * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
  336. * @param in input
  337. * @param inlen input len
  338. * @param srch search string
  339. * @param srchlen length of the search string
  340. * @return position of the first substring match or (-1) if not found
  341. */
  342. goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen,
  343. const gchar *srch, gsize srchlen);
  344. /**
  345. * Search for end-of-headers mark in the input string. Returns position just after
  346. * the last header in message (but before the last newline character).
  347. * Hence, to obtain the real EOH position, it is also required to skip
  348. * space characters
  349. */
  350. goffset rspamd_string_find_eoh (GString *input, goffset *body_start);
  351. #define rspamd_ucl_emit_gstring(o, t, target) \
  352. rspamd_ucl_emit_gstring_comments((o), (t), (target), NULL) /* convenience form: no comments object is passed */
  353. /**
  354. * Emit UCL object to gstring (rspamd_ucl_emit_gstring() calls this with `comments` set to NULL)
  355. * @param obj object to emit
  356. * @param emit_type emitter type
  357. * @param target target string
  358. * @param comments optional comments object
  359. */
  360. void rspamd_ucl_emit_gstring_comments (const ucl_object_t *obj,
  361. enum ucl_emitter emit_type,
  362. GString *target,
  363. const ucl_object_t *comments);
  364. #define rspamd_ucl_emit_fstring(o, t, target) \
  365. rspamd_ucl_emit_fstring_comments((o), (t), (target), NULL) /* convenience form: no comments object is passed */
  366. /**
  367. * Emit UCL object to fstring (rspamd_ucl_emit_fstring() calls this with `comments` set to NULL)
  368. * @param obj object to emit
  369. * @param emit_type emitter type
  370. * @param target target string (passed as a pointer to the fstring pointer)
  371. * @param comments optional comments object
  372. */
  373. void rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
  374. enum ucl_emitter emit_type,
  375. rspamd_fstring_t **target,
  376. const ucl_object_t *comments);
  377. extern const guchar lc_map[256];
  378. /**
  379. * Search for the last occurrence of character `c` in memory block of size `len`
  380. * @param m start of the memory block
  381. * @param c character to search for
  382. * @param len size of the memory block
  383. * @return pointer to the last occurrence or NULL
  384. */
  385. #ifdef HAVE_MEMRCHR
  386. #define rspamd_memrchr memrchr /* alias the system memrchr when HAVE_MEMRCHR is defined */
  387. #else
  388. void *rspamd_memrchr (const void *m, gint c, gsize len); /* otherwise rspamd's own function with this prototype is used */
  389. #endif
  390. /**
  391. * Return length of memory segment starting in `s` that contains no chars from `e`
  392. * @param s any input
  393. * @param e zero terminated string of exceptions
  394. * @param len length of `s`
  395. * @return segment size
  396. */
  397. gsize rspamd_memcspn (const gchar *s, const gchar *e, gsize len);
  398. /**
  399. * Return length of memory segment starting in `s` that contains only chars from `e`
  400. * @param s any input
  401. * @param e zero terminated string of inclusions
  402. * @param len length of `s`
  403. * @return segment size
  404. */
  405. gsize rspamd_memspn (const gchar *s, const gchar *e, gsize len);
  406. /* Nonzero if any byte of the word x is greater than n (requires 0 <= n <= 127), see https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord */
  407. #define rspamd_str_hasmore(x, n) ((((x)+~0UL/255*(127-(n)))|(x))&~0UL/255*128)
  408. /*
  409. * Check if a pointer is aligned to n bytes; n must be power of two (the test masks the address with n - 1)
  410. */
  411. #define rspamd_is_aligned(p, n) (((uintptr_t)(p) & ((uintptr_t)(n) - 1)) == 0)
  412. #define rspamd_is_aligned_as(p, v) rspamd_is_aligned(p, _Alignof(__typeof((v)))) /* aligned as required by the type of v */
  413. gboolean rspamd_str_has_8bit (const guchar *beg, gsize len);
  414. struct UConverter;
  415. struct UConverter *rspamd_get_utf8_converter (void);
  416. struct UNormalizer2;
  417. const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
  418. enum rspamd_regexp_escape_flags { /* passed as `flags` to rspamd_str_regexp_escape() */
  419. RSPAMD_REGEXP_ESCAPE_ASCII = 0, /* zero: no flag bits set */
  420. RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
  421. RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1, /* NOTE(review): presumably enables glob-to-pcre translation - confirm against the implementation */
  422. RSPAMD_REGEXP_ESCAPE_RE = 1u << 2,
  423. };
  424. /**
  425. * Escapes special characters when reading plain data to be processed in pcre
  426. * @param pattern pattern to process
  427. * @param slen source length
  428. * @param dst_len destination length pointer (can be NULL)
  429. * @param flags escaping flags (enum rspamd_regexp_escape_flags), e.g. to allow glob expressions to be translated into pcre
  430. * @return newly allocated zero terminated escaped pattern
  431. */
  432. gchar *
  433. rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
  434. gsize *dst_len, enum rspamd_regexp_escape_flags flags) G_GNUC_WARN_UNUSED_RESULT;
  435. /**
  436. * Returns copy of src (zero terminated) where all unicode is made valid or replaced
  437. * to FFFD characters. Caller must free string after usage
  438. * @param src
  439. * @param slen
  440. * @param dstlen
  441. * @return
  442. */
  443. gchar *rspamd_str_make_utf_valid (const guchar *src, gsize slen, gsize *dstlen,
  444. rspamd_mempool_t *pool) G_GNUC_WARN_UNUSED_RESULT;
  445. /**
  446. * Strips characters in `strip_chars` from start and end of the GString
  447. * @param s
  448. * @param strip_chars
  449. */
  450. gsize rspamd_gstring_strip (GString *s, const gchar *strip_chars);
  451. /**
  452. * Strips characters in `strip_chars` from start and end of the sized string
  453. * @param in
  454. * @param strip_chars
  455. */
  456. const gchar *rspamd_string_len_strip (const gchar *in,
  457. gsize *len, const gchar *strip_chars) G_GNUC_WARN_UNUSED_RESULT;
  458. /**
  459. * Returns a NULL terminated list of zero terminated strings based on splitting of
  460. * the base string into parts. If pool is not NULL then memory is allocated from
  461. * the pool. Otherwise, it is allocated from the heap using `g_malloc` (so
  462. * g_strfreev could be used to free stuff)
  463. * @param in
  464. * @param len
  465. * @param spill
  466. * @param max_elts
  467. * @return
  468. */
  469. gchar ** rspamd_string_len_split (const gchar *in, gsize len,
  470. const gchar *spill, gint max_elts, rspamd_mempool_t *pool);
  471. #define IS_ZERO_WIDTH_SPACE(uc) ((uc) == 0x200B || \
  472. (uc) == 0x200C || \
  473. (uc) == 0x200D || \
  474. (uc) == 0xFEFF || \
  475. (uc) == 0x00AD) /* U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ, U+FEFF ZWNBSP/BOM, U+00AD soft hyphen; `uc` is evaluated several times */
  476. #define IS_OBSCURED_CHAR(uc) (((uc) >= 0x200B && (uc) <= 0x200F) || \
  477. ((uc) >= 0x2028 && (uc) <= 0x202F) || \
  478. ((uc) >= 0x205F && (uc) <= 0x206F) || \
  479. (uc) == 0xFEFF) /* true for U+200B..U+200F, U+2028..U+202F, U+205F..U+206F and U+FEFF; `uc` is evaluated several times */
  480. #define RSPAMD_LEN_CHECK_STARTS_WITH(s, len, lit) \
  481. ((len) >= sizeof(lit) - 1 && g_ascii_strncasecmp ((s), (lit), sizeof(lit) - 1) == 0) /* `lit` must be a string literal (its length comes from sizeof); the prefix compare is ASCII case-insensitive */
  482. #ifdef __cplusplus
  483. }
  484. #endif
  485. #endif /* SRC_LIBUTIL_STR_UTIL_H_ */