123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487 |
- /*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
- #ifndef SRC_LIBUTIL_STR_UTIL_H_
- #define SRC_LIBUTIL_STR_UTIL_H_
-
- #include "config.h"
- #include "ucl.h"
- #include "fstring.h"
-
-
- enum rspamd_newlines_type {
- RSPAMD_TASK_NEWLINES_CR,
- RSPAMD_TASK_NEWLINES_LF,
- RSPAMD_TASK_NEWLINES_CRLF,
- RSPAMD_TASK_NEWLINES_MAX
- };
-
- /**
- * Compare two memory regions of size `l` using case insensitive matching
- */
- gint rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l);
-
- /**
- * Convert string to lowercase in-place using ASCII conversion
- */
- void rspamd_str_lc (gchar *str, guint size);
- /**
- * Convert string to lowercase in-place using utf (limited) conversion
- */
- void rspamd_str_lc_utf8 (gchar *str, guint size);
-
- /*
- * Hash table utility functions for case insensitive hashing
- */
- guint64 rspamd_icase_hash (const gchar *in, gsize len, guint64 seed);
- guint rspamd_strcase_hash (gconstpointer key);
- gboolean rspamd_strcase_equal (gconstpointer v, gconstpointer v2);
-
- /*
- * Hash table utility functions for case sensitive hashing
- */
- guint rspamd_str_hash (gconstpointer key);
- gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
-
-
- /*
- * Hash table utility functions for hashing fixed strings
- */
- guint rspamd_ftok_icase_hash (gconstpointer key);
- gboolean rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2);
- guint rspamd_ftok_hash (gconstpointer key);
- gboolean rspamd_ftok_equal (gconstpointer v, gconstpointer v2);
- guint rspamd_gstring_icase_hash (gconstpointer key);
- gboolean rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2);
-
- /**
- * Copy src to dest limited to len, in compare with standard strlcpy(3) rspamd strlcpy does not
- * traverse the whole string and it is possible to use it for non NULL terminated strings. This is
- * more like memccpy(dst, src, size, '\0')
- *
- * @param dst destination string
- * @param src source string
- * @param siz length of destination buffer
- * @return bytes copied
- */
- gsize rspamd_strlcpy_fast (gchar *dst, const gchar *src, gsize siz);
- gsize rspamd_strlcpy_safe (gchar *dst, const gchar *src, gsize siz);
-
- #if defined(__has_feature)
- # if __has_feature(address_sanitizer)
- # define rspamd_strlcpy rspamd_strlcpy_safe
- # else
- # define rspamd_strlcpy rspamd_strlcpy_fast
- # endif
- #else
- # define rspamd_strlcpy rspamd_strlcpy_fast
- #endif
-
- /**
- * Copies `srclen` characters from `src` to `dst` ignoring \0
- * @param src
- * @param srclen
- * @param dest
- * @param destlen
- * @return number of bytes copied
- */
- gsize
- rspamd_null_safe_copy (const gchar *src, gsize srclen,
- gchar *dest, gsize destlen);
-
- /*
- * Try to convert string of length to long
- */
- gboolean rspamd_strtol (const gchar *s, gsize len, glong *value);
-
- /*
- * Try to convert string of length to unsigned long
- */
- gboolean rspamd_strtoul (const gchar *s, gsize len, gulong *value);
-
- /**
- * Utility function to provide mem_pool copy for rspamd_hash_table_copy function
- * @param data string to copy
- * @param ud memory pool to use
- * @return
- */
- gpointer rspamd_str_pool_copy (gconstpointer data, gpointer ud);
-
- /**
- * Encode string using base32 encoding
- * @param in input
- * @param inlen input length
- * @return freshly allocated base32 encoding of a specified string
- */
- gchar * rspamd_encode_base32 (const guchar *in, gsize inlen);
-
- /**
- * Decode string using base32 encoding
- * @param in input
- * @param inlen input length
- * @return freshly allocated base32 decoded value or NULL if input is invalid
- */
- guchar* rspamd_decode_base32 (const gchar *in, gsize inlen, gsize *outlen);
-
- /**
- * Encode string using hex encoding
- * @param in input
- * @param inlen input length
- * @return freshly allocated base32 encoding of a specified string
- */
- gchar * rspamd_encode_hex (const guchar *in, gsize inlen);
-
- /**
- * Decode string using hex encoding
- * @param in input
- * @param inlen input length
- * @return freshly allocated base32 decoded value or NULL if input is invalid
- */
- guchar* rspamd_decode_hex (const gchar *in, gsize inlen);
-
- /**
- * Encode string using base32 encoding
- * @param in input
- * @param inlen input length
- * @param out output buf
- * @param outlen output buf len
- * @return encoded len if `outlen` is enough to encode `inlen`
- */
- gint rspamd_encode_base32_buf (const guchar *in, gsize inlen, gchar *out,
- gsize outlen);
-
- /**
- * Decode string using base32 encoding
- * @param in input
- * @param inlen input length
- * @param out output buf (may overlap with `in`)
- * @param outlen output buf len
- * @return decoded len if in is valid base32 and `outlen` is enough to encode `inlen`
- */
- gint rspamd_decode_base32_buf (const gchar *in, gsize inlen,
- guchar *out, gsize outlen);
-
- /**
- * Encode string using hex encoding
- * @param in input
- * @param inlen input length
- * @param out output buf
- * @param outlen output buf len
- * @return encoded len if `outlen` is enough to encode `inlen`
- */
- gint rspamd_encode_hex_buf (const guchar *in, gsize inlen, gchar *out,
- gsize outlen);
-
- /**
- * Decode string using hex encoding
- * @param in input
- * @param inlen input length
- * @param out output buf (may overlap with `in`)
- * @param outlen output buf len
- * @return decoded len if in is valid hex and `outlen` is enough to encode `inlen`
- */
- gssize rspamd_decode_hex_buf (const gchar *in, gsize inlen,
- guchar *out, gsize outlen);
-
- /**
- * Encode string using base64 encoding
- * @param in input
- * @param inlen input length
- * @param str_len maximum string length (if <= 0 then no lines are split)
- * @return freshly allocated base64 encoded value or NULL if input is invalid
- */
- gchar * rspamd_encode_base64 (const guchar *in, gsize inlen, gint str_len,
- gsize *outlen);
-
- /**
- * Encode and fold string using base64 encoding
- * @param in input
- * @param inlen input length
- * @param str_len maximum string length (if <= 0 then no lines are split)
- * @return freshly allocated base64 encoded value or NULL if input is invalid
- */
- gchar * rspamd_encode_base64_fold (const guchar *in, gsize inlen, gint str_len,
- gsize *outlen, enum rspamd_newlines_type how);
-
- /**
- * Encode and fold string using quoted printable encoding
- * @param in input
- * @param inlen input length
- * @param str_len maximum string length (if <= 0 then no lines are split)
- * @return freshly allocated base64 encoded value or NULL if input is invalid
- */
- gchar * rspamd_encode_qp_fold (const guchar *in, gsize inlen, gint str_len,
- gsize *outlen, enum rspamd_newlines_type how);
-
- /**
- * Decode quoted-printable encoded buffer, input and output must not overlap
- * @param in input
- * @param inlen length of input
- * @param out output
- * @param outlen length of output
- * @return real size of decoded output or (-1) if outlen is not enough
- */
- gssize rspamd_decode_qp_buf (const gchar *in, gsize inlen,
- gchar *out, gsize outlen);
-
- /**
- * Decode quoted-printable encoded buffer using rfc2047 format, input and output must not overlap
- * @param in input
- * @param inlen length of input
- * @param out output
- * @param outlen length of output
- * @return real size of decoded output or (-1) if outlen is not enough
- */
- gssize rspamd_decode_qp2047_buf (const gchar *in, gsize inlen,
- gchar *out, gsize outlen);
-
- /**
- * Encode quoted-printable buffer using rfc2047 format, input and output must not overlap
- * @param in
- * @param inlen
- * @param out
- * @param outlen
- * @return
- */
- gssize rspamd_encode_qp2047_buf (const gchar *in, gsize inlen,
- gchar *out, gsize outlen);
-
- #ifndef g_tolower
- # define g_tolower(x) (((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' + 'a' : (x))
- #endif
-
- /**
- * Return levenstein distance between two strings
- * @param s1
- * @param s1len
- * @param s2
- * @param s2len
- * @return
- */
- gint rspamd_strings_levenshtein_distance (const gchar *s1, gsize s1len,
- const gchar *s2, gsize s2len, guint replace_cost);
-
- /**
- * Fold header using rfc822 rules, return new GString from the previous one
- * @param name name of header (used just for folding)
- * @param value value of header
- * @param fold_max
- * @param how
- * @param fold_on_chars
- * @return new GString with the folded value
- */
- GString *rspamd_header_value_fold (const gchar *name,
- const gchar *value,
- guint fold_max,
- enum rspamd_newlines_type how,
- const gchar *fold_on_chars);
-
- /**
- * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm
- * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
- * @param in input
- * @param inlen input len
- * @param srch search string
- * @param srchlen length of the search string
- * @return position of the first substring match or (-1) if not found
- */
- goffset rspamd_substring_search (const gchar *in, gsize inlen,
- const gchar *srch, gsize srchlen);
-
- /**
- * Search for a substring `srch` in the text `in` using Apostolico-Crochemore algorithm in caseless matter (ASCII only)
- * http://www-igm.univ-mlv.fr/~lecroq/string/node12.html#SECTION00120
- * @param in input
- * @param inlen input len
- * @param srch search string
- * @param srchlen length of the search string
- * @return position of the first substring match or (-1) if not found
- */
- goffset rspamd_substring_search_caseless (const gchar *in, gsize inlen,
- const gchar *srch, gsize srchlen);
-
- /**
- * Search for end-of-headers mark in the input string. Returns position just after
- * the last header in message (but before the last newline character).
- * Hence, to obtain the real EOH position, it is also required to skip
- * space characters
- */
- goffset rspamd_string_find_eoh (GString *input, goffset *body_start);
-
-
- #define rspamd_ucl_emit_gstring(o, t, target) \
- rspamd_ucl_emit_gstring_comments((o), (t), (target), NULL)
- /**
- * Emit UCL object to gstring
- * @param obj object to emit
- * @param emit_type emitter type
- * @param comments optional comments object
- * @param target target string
- */
- void rspamd_ucl_emit_gstring_comments (const ucl_object_t *obj,
- enum ucl_emitter emit_type,
- GString *target,
- const ucl_object_t *comments);
-
- #define rspamd_ucl_emit_fstring(o, t, target) \
- rspamd_ucl_emit_fstring_comments((o), (t), (target), NULL)
- /**
- * Emit UCL object to fstring
- * @param obj object to emit
- * @param emit_type emitter type
- * * @param comments optional comments object
- * @param target target string
- */
- void rspamd_ucl_emit_fstring_comments (const ucl_object_t *obj,
- enum ucl_emitter emit_type,
- rspamd_fstring_t **target,
- const ucl_object_t *comments);
-
- extern const guchar lc_map[256];
-
- /**
- * Search for the last occurrence of character `c` in memory block of size `len`
- * @param m
- * @param c
- * @param len
- * @return pointer to the last occurrence or NULL
- */
- const void *rspamd_memrchr (const void *m, gint c, gsize len);
-
- /**
- * Return length of memory segment starting in `s` that contains no chars from `e`
- * @param s any input
- * @param e zero terminated string of exceptions
- * @param len length of `s`
- * @return segment size
- */
- gsize rspamd_memcspn (const gchar *s, const gchar *e, gsize len);
-
- /**
- * Return length of memory segment starting in `s` that contains only chars from `e`
- * @param s any input
- * @param e zero terminated string of inclusions
- * @param len length of `s`
- * @return segment size
- */
- gsize rspamd_memspn (const gchar *s, const gchar *e, gsize len);
-
- /* https://graphics.stanford.edu/~seander/bithacks.html#HasMoreInWord */
- #define rspamd_str_hasmore(x,n) ((((x)+~0UL/255*(127-(n)))|(x))&~0UL/255*128)
-
- static inline gboolean
- rspamd_str_has_8bit (const guchar *beg, gsize len)
- {
- unsigned long *w;
- gsize i, leftover = len % sizeof (*w);
-
- w = (unsigned long *)beg;
-
- for (i = 0; i < len / sizeof (*w); i ++) {
- if (rspamd_str_hasmore (*w, 127)) {
- return TRUE;
- }
-
- w ++;
- }
-
- beg = (const guchar *)w;
-
- for (i = 0; i < leftover; i ++) {
- if (beg[i] > 127) {
- return TRUE;
- }
- }
-
- return FALSE;
- }
-
- struct UConverter;
- struct UConverter *rspamd_get_utf8_converter (void);
-
- struct UNormalizer2;
- const struct UNormalizer2 *rspamd_get_unicode_normalizer (void);
-
- enum rspamd_normalise_result {
- RSPAMD_UNICODE_NORM_NORMAL = 0,
- RSPAMD_UNICODE_NORM_UNNORMAL = (1 << 0),
- RSPAMD_UNICODE_NORM_ZERO_SPACES = (1 << 1),
- RSPAMD_UNICODE_NORM_ERROR = (1 << 2),
- RSPAMD_UNICODE_NORM_OVERFLOW = (1 << 3)
- };
-
- /**
- * Gets a string in UTF8 and normalises it to NFKC_Casefold form
- * @param pool optional memory pool used for logging purposes
- * @param start
- * @param len
- * @return TRUE if a string has been normalised
- */
- enum rspamd_normalise_result rspamd_normalise_unicode_inplace (rspamd_mempool_t *pool,
- gchar *start, guint *len);
-
- enum rspamd_regexp_escape_flags {
- RSPAMD_REGEXP_ESCAPE_ASCII = 0,
- RSPAMD_REGEXP_ESCAPE_UTF = 1u << 0,
- RSPAMD_REGEXP_ESCAPE_GLOB = 1u << 1,
- RSPAMD_REGEXP_ESCAPE_RE = 1u << 2,
- };
- /**
- * Escapes special characters when reading plain data to be processed in pcre
- * @param pattern pattern to process
- * @param slen source length
- * @param dst_len destination length pointer (can be NULL)
- * @param allow_glob allow glob expressions to be translated into pcre
- * @return newly allocated zero terminated escaped pattern
- */
- gchar *
- rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
- gsize *dst_len, enum rspamd_regexp_escape_flags flags);
-
- /**
- * Returns copy of src (zero terminated) where all unicode is made valid or replaced
- * to FFFD characters. Caller must free string after usage
- * @param src
- * @param slen
- * @param dstelen
- * @return
- */
- gchar * rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen);
-
- /**
- * Strips characters in `strip_chars` from start and end of the GString
- * @param s
- * @param strip_chars
- */
- gsize rspamd_gstring_strip (GString *s, const gchar *strip_chars);
-
- /**
- * Strips characters in `strip_chars` from start and end of the sized string
- * @param s
- * @param strip_chars
- */
- const gchar* rspamd_string_len_strip (const gchar *in,
- gsize *len, const gchar *strip_chars);
-
- #define IS_ZERO_WIDTH_SPACE(uc) ((uc) == 0x200B || \
- (uc) == 0x200C || \
- (uc) == 0x200D || \
- (uc) == 0xFEFF)
- #define IS_OBSCURED_CHAR(uc) (((uc) >= 0x200B && (uc) <= 0x200F) || \
- ((uc) >= 0x2028 && (uc) <= 0x202F) || \
- ((uc) >= 0x205F && (uc) <= 0x206F) || \
- (uc) == 0xFEFF)
-
- #endif /* SRC_LIBUTIL_STR_UTIL_H_ */
|