From b4442b41baab6160e87098f52b94def24b97e066 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 3 Dec 2018 11:30:22 +0000 Subject: [Minor] Add rspamd_str_make_utf_valid routine + unit tests --- src/libutil/str_util.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++- src/libutil/str_util.h | 10 +++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) (limited to 'src') diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index d8b17e3c3..1e43e7726 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2425,7 +2425,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { if (!g_utf8_validate (pattern, slen, NULL)) { - tmp_utf = g_utf8_make_valid (pattern, slen); + tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL); } } @@ -2517,3 +2517,61 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, return res; } + + /* Return a newly allocated copy of src in which every byte sequence rejected by U8_NEXT, and every code point 0, is replaced with U+FFFD. A NULL src gives NULL; slen == 0 means the length is taken from strlen (src); if dstlen is not NULL it receives the length of the result in bytes. */ +gchar * +rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen) +{ + GString *dst; + const gchar *last; /* start of the pending run of accepted bytes not yet copied to dst */ + gchar *dchar; + gsize i, valid, prev; /* i: read offset; valid: byte length of the pending run; prev: value of i before the current step */ + UChar32 uc; + + if (src == NULL) { + return NULL; + } + + if (slen == 0) { /* zero length: measure src as a C string */ + slen = strlen (src); + } + + dst = g_string_sized_new (slen); + i = 0; + last = src; + valid = 0; + prev = 0; + + while (i < slen) { + U8_NEXT (src, i, slen, uc); /* ICU macro: decodes one code point at offset i into uc and advances i */ + + if (uc <= 0) { /* decode failure (negative uc) or code point 0 */ + if (valid > 0) { + g_string_append_len (dst, last, valid); + } + /* U+FFFD REPLACEMENT CHARACTER encoded in UTF-8 (bytes EF BF BD) */ + g_string_append_len (dst, "\357\277\275", 3); + valid = 0; + last = &src[i]; /* the next run starts right after the rejected bytes */ + } + else { + valid += i - prev; /* extend the pending run by the bytes just consumed */ + } + + prev = i; + } + + if (valid > 0) { /* flush the trailing run */ + g_string_append_len (dst, last, valid); + } + + dchar = dst->str; + + if (dstlen) { + *dstlen = dst->len; + } + + g_string_free (dst, FALSE); /* FALSE: release only the GString wrapper and keep the character buffer returned below */ + + return dchar; +} \ No newline at end of file diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 688034ec6..139a85416 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -419,4 +419,14 @@ gchar * rspamd_str_regexp_escape (const 
gchar *pattern, gsize slen, gsize *dst_len, enum rspamd_regexp_escape_flags flags); +/** + * Returns a zero terminated copy of src where valid UTF-8 is kept as is and each invalid + * sequence (or zero code point) is replaced with U+FFFD. Caller must free the string after usage + * @param src input bytes; a NULL src yields a NULL result + * @param slen length of src in bytes; 0 means that strlen (src) is used + * @param dstlen if not NULL, receives the length of the returned string in bytes + * @return newly allocated string, or NULL if src is NULL + */ +gchar * rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen); + #endif /* SRC_LIBUTIL_STR_UTIL_H_ */ -- cgit v1.2.3