You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

str_util.h 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486
  1. /*-
  2. * Copyright 2016 Vsevolod Stakhov
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef SRC_LIBUTIL_STR_UTIL_H_
  17. #define SRC_LIBUTIL_STR_UTIL_H_
#include "config.h"
#include <string.h>
#include "ucl.h"
#include "fstring.h"
  21. enum rspamd_newlines_type {
  22. RSPAMD_TASK_NEWLINES_CR,
  23. RSPAMD_TASK_NEWLINES_LF,
  24. RSPAMD_TASK_NEWLINES_CRLF,
  25. RSPAMD_TASK_NEWLINES_MAX
  26. };
  27. /**
  28. * Compare two memory regions of size `l` using case insensitive matching
  29. */
  30. gint rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l);
  31. /**
  32. * Convert string to lowercase in-place using ASCII conversion
  33. */
  34. void rspamd_str_lc (gchar *str, guint size);
  35. /**
  36. * Convert string to lowercase in-place using utf (limited) conversion
  37. */
  38. void rspamd_str_lc_utf8 (gchar *str, guint size);
  39. /*
  40. * Hash table utility functions for case insensitive hashing
  41. */
  42. guint64 rspamd_icase_hash (const gchar *in, gsize len, guint64 seed);
  43. guint rspamd_strcase_hash (gconstpointer key);
  44. gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2);
  45. /*
  46. * Hash table utility functions for case sensitive hashing
  47. */
  48. guint rspamd_str_hash (gconstpointer key);
  49. gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
  50. /*
  51. * Hash table utility functions for hashing fixed strings
  52. */
  53. guint rspamd_ftok_icase_hash (gconstpointer key);
  54. gboolean rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2);
  55. guint rspamd_ftok_hash (gconstpointer key);
  56. gboolean rspamd_ftok_equal (gconstpointer v, gconstpointer v2);
  57. guint rspamd_gstring_icase_hash (gconstpointer key);
  58. gboolean rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2);
  59. /**
  60. * Copy src to dest limited to len; in contrast to the standard strlcpy(3), rspamd strlcpy does not
  61. * traverse the whole string and it is possible to use it for non NULL terminated strings. This is
  62. * more like memccpy(dst, src, size, '\0')
  63. *
  64. * @param dst destination string
  65. * @param src source string
  66. * @param siz length of destination buffer
  67. * @return bytes copied
  68. */
  69. gsize rspamd_strlcpy_fast (gchar *dst, const gchar *src, gsize siz);
  70. gsize rspamd_strlcpy_safe (gchar *dst, const gchar *src, gsize siz);
  71. #if defined(__has_feature)
  72. # if __has_feature(address_sanitizer)
  73. # define rspamd_strlcpy rspamd_strlcpy_safe
  74. # else
  75. # define rspamd_strlcpy rspamd_strlcpy_fast
  76. # endif
  77. #else
  78. # define rspamd_strlcpy rspamd_strlcpy_fast
  79. #endif
  80. /**
  81. * Copies `srclen` characters from `src` to `dst` ignoring \0
  82. * @param src
  83. * @param srclen
  84. * @param dest
  85. * @param destlen
  86. * @return number of bytes copied
  87. */
  88. gsize
  89. rspamd_null_safe_copy (const gchar *src, gsize srclen,
  90. gchar *dest, gsize destlen);
  91. /*
  92. * Try to convert a string of the given length to long
  93. */
  94. gboolean rspamd_strtol (const gchar *s, gsize len, glong *value);
  95. /*
  96. * Try to convert a string of the given length to unsigned long
  97. */
  98. gboolean rspamd_strtoul (const gchar *s, gsize len, gulong *value);
  99. /**
  100. * Utility function to provide mem_pool copy for rspamd_hash_table_copy function
  101. * @param data string to copy
  102. * @param ud memory pool to use
  103. * @return
  104. */
  105. gpointer rspamd_str_pool_copy (gconstpointer data, gpointer ud);
  106. /**
  107. * Encode string using base32 encoding
  108. * @param in input
  109. * @param inlen input length
  110. * @return freshly allocated base32 encoding of a specified string
  111. */
  112. gchar * rspamd_encode_base32 (const guchar *in, gsize inlen);
  113. /**
  114. * Decode string using base32 encoding
  115. * @param in input
  116. * @param inlen input length
  117. * @return freshly allocated base32 decoded value or NULL if input is invalid
  118. */
  119. guchar* rspamd_decode_base32 (const gchar *in, gsize inlen, gsize *outlen);
  120. /**
  121. * Encode string using hex encoding
  122. * @param in input
  123. * @param inlen input length
  124. * @return freshly allocated hex encoding of a specified string
  125. */
  126. gchar * rspamd_encode_hex (const guchar *in, gsize inlen);
  127. /**
  128. * Decode string using hex encoding
  129. * @param in input
  130. * @param inlen input length
  131. * @return freshly allocated hex decoded value or NULL if input is invalid
  132. */
  133. guchar* rspamd_decode_hex (const gchar *in, gsize inlen);
  134. /**
  135. * Encode string using base32 encoding
  136. * @param in input
  137. * @param inlen input length
  138. * @param out output buf
  139. * @param outlen output buf len
  140. * @return encoded len if `outlen` is enough to encode `inlen`
  141. */
  142. gint rspamd_encode_base32_buf (const guchar *in, gsize inlen, gchar *out,
  143. gsize outlen);
  144. /**
  145. * Decode string using base32 encoding
  146. * @param in input
  147. * @param inlen input length
  148. * @param out output buf (may overlap with `in`)
  149. * @param outlen output buf len
  150. * @return decoded len if in is valid base32 and `outlen` is enough to encode `inlen`
  151. */
  152. gint rspamd_decode_base32_buf (const gchar *in, gsize inlen,
  153. guchar *out, gsize outlen);
  154. /**
  155. * Encode string using hex encoding
  156. * @param in input
  157. * @param inlen input length
  158. * @param out output buf
  159. * @param outlen output buf len
  160. * @return encoded len if `outlen` is enough to encode `inlen`
  161. */
  162. gint rspamd_encode_hex_buf (const guchar *in, gsize inlen, gchar *out,
  163. gsize outlen);
  164. /**
  165. * Decode string using hex encoding
  166. * @param in input
  167. * @param inlen input length
  168. * @param out output buf (may overlap with `in`)
  169. * @param outlen output buf len
  170. * @return decoded len if in is valid hex and `outlen` is enough to encode `inlen`
  171. */
  172. gssize rspamd_decode_hex_buf (const gchar *in, gsize inlen,
  173. guchar *out, gsize outlen);
  174. /**
  175. * Encode string using base64 encoding
  176. * @param in input
  177. * @param inlen input length
  178. * @param str_len maximum string length (if <= 0 then no lines are split)
  179. * @return freshly allocated base64 encoded value or NULL if input is invalid
  180. */
  181. gchar * rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
  182. gsize *outlen);
  183. /**
  184. * Encode and fold string using base64 encoding
  185. * @param in input
  186. * @param inlen input length
  187. * @param str_len maximum string length (if <= 0 then no lines are split)
  188. * @return freshly allocated base64 encoded value or NULL if input is invalid
  189. */
  190. gchar * rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
  191. gsize *outlen, enum rspamd_newlines_type how);
  192. /**
  193. * Encode and fold string using quoted printable encoding
  194. * @param in input
  195. * @param inlen input length
  196. * @param str_len maximum string length (if <= 0 then no lines are split)
  197. * @return freshly allocated quoted-printable encoded value or NULL if input is invalid
  198. */
  199. gchar * rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
  200. gsize *outlen, enum rspamd_newlines_type how);
  201. /**
  202. * Decode quoted-printable encoded buffer, input and output must not overlap
  203. * @param in input
  204. * @param inlen length of input
  205. * @param out output
  206. * @param outlen length of output
  207. * @return real size of decoded output or (-1) if outlen is not enough
  208. */
  209. gssize rspamd_decode_qp_buf (const gchar *in, gsize inlen,
  210. gchar *out, gsize outlen);
  211. /**
  212. * Decode quoted-printable encoded buffer using rfc2047 format, input and output must not overlap
  213. * @param in input
  214. * @param inlen length of input
  215. * @param out output
  216. * @param outlen length of output
  217. * @return real size of decoded output or (-1) if outlen is not enough
  218. */
  219. gssize rspamd_decode_qp2047_buf (const gchar *in, gsize inlen,
  220. gchar *out, gsize outlen);
  221. /**
  222. * Encode quoted-printable buffer using rfc2047 format, input and output must not overlap
  223. * @param in
  224. * @param inlen
  225. * @param out
  226. * @param outlen
  227. * @return
  228. */
  229. gssize rspamd_encode_qp2047_buf (const gchar *in, gsize inlen,
  230. gchar *out, gsize outlen);
  231. #ifndef g_tolower
  232. # define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x))
  233. #endif
  234. /**
  235. * Return Levenshtein distance between two strings
  236. * @param s1
  237. * @param s1len
  238. * @param s2
  239. * @param s2len
  240. * @return
  241. */
  242. gint rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
  243. const gchar *s2, gsize s2len, guint replace_cost);
  244. /**
  245. * Fold header using rfc822 rules, return new GString from the previous one
  246. * @param name name of header (used just for folding)
  247. * @param value value of header
  248. * @param fold_max
  249. * @param how
  250. * @param fold_on_chars
  251. * @return new GString with the folded value
  252. */
  253. GString *rspamd_header_value_fold (const gchar *name,
  254. const gchar *value,
  255. guint fold_max,
  256. enum rspamd_newlines_type how,
  257. const gchar *fold_on_chars);
  258. /**
  259. * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm
  260. * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
  261. * @param in input
  262. * @param inlen input len
  263. * @param srch search string
  264. * @param srchlen length of the search string
  265. * @return position of the first substring match or (-1) if not found
  266. */
  267. goffset rspamd_substring_search (const gchar *in, gsize inlen,
  268. const gchar *srch, gsize srchlen);
  269. /**
  270. * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in a caseless manner (ASCII only)
  271. * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
  272. * @param in input
  273. * @param inlen input len
  274. * @param srch search string
  275. * @param srchlen length of the search string
  276. * @return position of the first substring match or (-1) if not found
  277. */
  278. goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen,
  279. const gchar *srch, gsize srchlen);
  280. /**
  281. * Search for end-of-headers mark in the input string. Returns position just after
  282. * the last header in message (but before the last newline character).
  283. * Hence, to obtain the real EOH position, it is also required to skip
  284. * space characters
  285. */
  286. goffset rspamd_string_find_eoh (GString *input, goffset *body_start);
  287. #define rspamd_ucl_emit_gstring(o, t, target) \
  288. rspamd_ucl_emit_gstring_comments((o), (t), (target), NULL)
  289. /**
  290. * Emit UCL object to gstring
  291. * @param obj object to emit
  292. * @param emit_type emitter type
  293. * @param comments optional comments object
  294. * @param target target string
  295. */
  296. void rspamd_ucl_emit_gstring_comments (const ucl_object_t *obj,
  297. enum ucl_emitter emit_type,
  298. GString *target,
  299. const ucl_object_t *comments);
  300. #define rspamd_ucl_emit_fstring(o, t, target) \
  301. rspamd_ucl_emit_fstring_comments((o), (t), (target), NULL)
  302. /**
  303. * Emit UCL object to fstring
  304. * @param obj object to emit
  305. * @param emit_type emitter type
  306. * @param comments optional comments object
  307. * @param target target string
  308. */
  309. void rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
  310. enum ucl_emitter emit_type,
  311. rspamd_fstring_t **target,
  312. const ucl_object_t *comments);
  313. extern const guchar lc_map[256];
  314. /**
  315. * Search for the last occurrence of character `c` in memory block of size `len`
  316. * @param m
  317. * @param c
  318. * @param len
  319. * @return pointer to the last occurrence or NULL
  320. */
  321. const void *rspamd_memrchr (const void *m, gint c, gsize len);
  322. /**
  323. * Return length of memory segment starting in `s` that contains no chars from `e`
  324. * @param s any input
  325. * @param e zero terminated string of exceptions
  326. * @param len length of `s`
  327. * @return segment size
  328. */
  329. gsize rspamd_memcspn (const gchar *s, const gchar *e, gsize len);
  330. /**
  331. * Return length of memory segment starting in `s` that contains only chars from `e`
  332. * @param s any input
  333. * @param e zero terminated string of inclusions
  334. * @param len length of `s`
  335. * @return segment size
  336. */
  337. gsize rspamd_memspn (const gchar *s, const gchar *e, gsize len);
  338. /* https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord */
  339. #define rspamd_str_hasmore(x,n) ((((x)+~0UL/255*(127-(n)))|(x))&~0UL/255*128)
  340. static inline gboolean
  341. rspamd_str_has_8bit (const guchar *beg, gsize len)
  342. {
  343. unsigned long *w;
  344. gsize i, leftover = len % sizeof (*w);
  345. w = (unsigned long *)beg;
  346. for (i = 0; i < len / sizeof (*w); i ++) {
  347. if (rspamd_str_hasmore (*w, 127)) {
  348. return TRUE;
  349. }
  350. w ++;
  351. }
  352. beg = (const guchar *)w;
  353. for (i = 0; i < leftover; i ++) {
  354. if (beg[i] > 127) {
  355. return TRUE;
  356. }
  357. }
  358. return FALSE;
  359. }
  360. struct UConverter;
  361. struct UConverter *rspamd_get_utf8_converter (void);
  362. struct UNormalizer2;
  363. const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
  364. enum rspamd_normalise_result {
  365. RSPAMD_UNICODE_NORM_NORMAL = 0,
  366. RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
  367. RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
  368. RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
  369. RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
  370. };
  371. /**
  372. * Gets a string in UTF8 and normalises it to NFKC_Casefold form
  373. * @param pool optional memory pool used for logging purposes
  374. * @param start
  375. * @param len
  376. * @return flags from `enum rspamd_normalise_result` describing the outcome of normalisation
  377. */
  378. enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
  379. gchar *start, guint *len);
  380. enum rspamd_regexp_escape_flags {
  381. RSPAMD_REGEXP_ESCAPE_ASCII = 0,
  382. RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
  383. RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
  384. };
  385. /**
  386. * Escapes special characters when reading plain data to be processed in pcre
  387. * @param pattern pattern to process
  388. * @param slen source length
  389. * @param dst_len destination length pointer (can be NULL)
  390. * @param flags escaping flags from `enum rspamd_regexp_escape_flags` (e.g. RSPAMD_REGEXP_ESCAPE_GLOB to allow glob expressions to be translated into pcre)
  391. * @return newly allocated zero terminated escaped pattern
  392. */
  393. gchar *
  394. rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
  395. gsize *dst_len, enum rspamd_regexp_escape_flags flags);
  396. /**
  397. * Returns copy of src (zero terminated) where all unicode is made valid or replaced
  398. * to FFFD characters. Caller must free string after usage
  399. * @param src
  400. * @param slen
  401. * @param dstlen
  402. * @return
  403. */
  404. gchar * rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen);
  405. /**
  406. * Strips characters in `strip_chars` from start and end of the GString
  407. * @param s
  408. * @param strip_chars
  409. */
  410. gsize rspamd_gstring_strip (GString *s, const gchar *strip_chars);
  411. /**
  412. * Strips characters in `strip_chars` from start and end of the sized string
  413. * @param s
  414. * @param strip_chars
  415. */
  416. const gchar* rspamd_string_len_strip (const gchar *in,
  417. gsize *len, const gchar *strip_chars);
  418. #define IS_ZERO_WIDTH_SPACE(uc) ((uc) == 0x200B || \
  419. (uc) == 0x200C || \
  420. (uc) == 0x200D || \
  421. (uc) == 0xFEFF)
  422. #define IS_OBSCURED_CHAR(uc) (((uc) >= 0x200B && (uc) <= 0x200F) || \
  423. ((uc) >= 0x2028 && (uc) <= 0x202F) || \
  424. ((uc) >= 0x205F && (uc) <= 0x206F) || \
  425. (uc) == 0xFEFF)
  426. #endif /* SRC_LIBUTIL_STR_UTIL_H_ */