aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2018-12-03 11:30:22 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2018-12-03 11:30:22 +0000
commitb4442b41baab6160e87098f52b94def24b97e066 (patch)
tree5361f47875c1399c0c8a0836e9c74475af0ae459 /src
parentfe940c7d3d9d72f0196b9cd847dd0160603dcbe9 (diff)
downloadrspamd-b4442b41baab6160e87098f52b94def24b97e066.tar.gz
rspamd-b4442b41baab6160e87098f52b94def24b97e066.zip
[Minor] Add rspamd_str_make_utf_valid routine + unit tests
Diffstat (limited to 'src')
-rw-r--r--src/libutil/str_util.c60
-rw-r--r--src/libutil/str_util.h10
2 files changed, 69 insertions, 1 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index d8b17e3c3..1e43e7726 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2425,7 +2425,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
if (!g_utf8_validate (pattern, slen, NULL)) {
- tmp_utf = g_utf8_make_valid (pattern, slen);
+ tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
}
}
@@ -2517,3 +2517,61 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
return res;
}
+
+
+/*
+ * Build a newly allocated, zero terminated copy of `src` in which every byte
+ * sequence that U8_NEXT cannot decode to a positive code point (malformed
+ * UTF-8, and also an embedded NUL) is replaced by U+FFFD.
+ *
+ * src    - input bytes; NULL yields NULL
+ * slen   - input length in bytes; 0 means "use strlen (src)"
+ * dstlen - if not NULL, receives the byte length of the result
+ *
+ * The returned buffer is the character data detached from a GString.
+ */
+gchar *
+rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen)
+{
+	/* U+FFFD REPLACEMENT CHARACTER encoded as UTF-8 */
+	static const gchar replacement[] = "\357\277\275";
+	GString *out;
+	gsize pos = 0, run_start = 0, seq_start;
+	UChar32 cp;
+
+	if (src == NULL) {
+		return NULL;
+	}
+
+	if (slen == 0) {
+		slen = strlen (src);
+	}
+
+	out = g_string_sized_new (slen);
+
+	while (pos < slen) {
+		/* Remember where this sequence begins; U8_NEXT advances pos */
+		seq_start = pos;
+		U8_NEXT (src, pos, slen, cp);
+
+		if (cp > 0) {
+			/* Decodable sequence: it simply extends the pending valid run */
+			continue;
+		}
+
+		/* Flush the valid bytes collected before the bad sequence */
+		if (seq_start > run_start) {
+			g_string_append_len (out, src + run_start, seq_start - run_start);
+		}
+
+		g_string_append_len (out, replacement, 3);
+		/* The next valid run starts right after the rejected bytes */
+		run_start = pos;
+	}
+
+	/* Trailing valid run, if any */
+	if (pos > run_start) {
+		g_string_append_len (out, src + run_start, pos - run_start);
+	}
+
+	if (dstlen) {
+		*dstlen = out->len;
+	}
+
+	/* Release only the GString wrapper and hand its buffer to the caller */
+	return g_string_free (out, FALSE);
+}
\ No newline at end of file
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 688034ec6..139a85416 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -419,4 +419,14 @@ gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
gsize *dst_len, enum rspamd_regexp_escape_flags flags);
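+/**
+ * Returns a newly allocated, zero terminated copy of src in which every invalid
+ * UTF-8 sequence (and every embedded NUL byte) is replaced with the U+FFFD
+ * replacement character. The caller must free the returned string after usage.
+ * @param src input string; if NULL, NULL is returned
+ * @param slen length of src in bytes; if 0, strlen (src) is used
+ * @param dstlen if not NULL, receives the length of the returned string in bytes
+ * @return the converted string, or NULL if src is NULL
+ */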
+gchar * rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen);
+
#endif /* SRC_LIBUTIL_STR_UTIL_H_ */