From b4442b41baab6160e87098f52b94def24b97e066 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Mon, 3 Dec 2018 11:30:22 +0000 Subject: [PATCH] [Minor] Add rspamd_str_make_utf_valid routine + unit tests --- src/libutil/str_util.c | 60 ++++++++++++++++++++++++++++++++++- src/libutil/str_util.h | 10 ++++++ test/lua/unit/utf.lua | 72 ++++++++++++++++++++++++++++++------------ 3 files changed, 121 insertions(+), 21 deletions(-) diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index d8b17e3c3..1e43e7726 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -2425,7 +2425,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, if (flags & RSPAMD_REGEXP_ESCAPE_UTF) { if (!g_utf8_validate (pattern, slen, NULL)) { - tmp_utf = g_utf8_make_valid (pattern, slen); + tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL); } } @@ -2517,3 +2517,61 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen, return res; } + +/* Returns a newly allocated copy of src (NULL if src is NULL; slen == 0 means strlen (src) is used): runs that U8_NEXT decodes to a positive code point are copied verbatim, every other step (uc <= 0, so an embedded NUL as well) is emitted as U+FFFD; *dstlen, when non-NULL, receives the result length in bytes */ +gchar * +rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen) +{ + GString *dst; + const gchar *last; + gchar *dchar; + gsize i, valid, prev; + UChar32 uc; + + if (src == NULL) { + return NULL; + } + + if (slen == 0) { /* zero length: measure the input as a NUL-terminated string */ + slen = strlen (src); + } + + dst = g_string_sized_new (slen); + i = 0; + last = src; /* start of the pending run of valid bytes */ + valid = 0; /* length in bytes of the pending run */ + prev = 0; /* value of i before the current U8_NEXT step */ + + while (i < slen) { + U8_NEXT (src, i, slen, uc); + + if (uc <= 0) { /* no positive code point decoded: flush the pending run, then substitute */ + if (valid > 0) { + g_string_append_len (dst, last, valid); + } + /* U+FFFD (replacement character) encoded in UTF-8: EF BF BD */ + g_string_append_len (dst, "\357\277\275", 3); + valid = 0; + last = &src[i]; /* next run starts right after the bytes U8_NEXT consumed */ + } + else { + valid += i - prev; /* extend the pending run by the bytes just consumed */ + } + + prev = i; + } + + if (valid > 0) { /* flush the trailing valid run */ + g_string_append_len (dst, last, valid); + } + + dchar = dst->str; + + if (dstlen) { + *dstlen = dst->len; + } + + g_string_free (dst, FALSE); /* FALSE: presumably keeps the character data alive for the caller - see GLib g_string_free */ + + return dchar; +} \ No newline at end of file diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index 688034ec6..139a85416 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -419,4 +419,14 @@
gchar * rspamd_str_regexp_escape (const gchar *pattern, gsize slen, gsize *dst_len, enum rspamd_regexp_escape_flags flags); +/** + * Returns a zero-terminated copy of src in which every invalid UTF-8 sequence + * is replaced with U+FFFD. The caller must free the returned string after use + * @param src input string; NULL yields NULL + * @param slen length of src in bytes; 0 means strlen (src) is used + * @param dstlen if not NULL, receives the length of the result in bytes + * @return newly allocated string, or NULL if src is NULL + */ +gchar * rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen); + #endif /* SRC_LIBUTIL_STR_UTIL_H_ */ diff --git a/test/lua/unit/utf.lua b/test/lua/unit/utf.lua index e22eb2a2f..277d99e41 100644 --- a/test/lua/unit/utf.lua +++ b/test/lua/unit/utf.lua @@ -5,36 +5,65 @@ context("UTF8 check functions", function() ffi.cdef[[ void rspamd_str_lc_utf8 (char *str, unsigned int size); void rspamd_str_lc (char *str, unsigned int size); + char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen); + void g_free (void *ptr); ]] - test("UTF lowercase", function() - local cases = { - {"АбЫрвАлг", "абырвалг"}, - {"АAБBвc", "аaбbвc"} - } - - for _,c in ipairs(cases) do + local cases = { + {"АбЫрвАлг", "абырвалг"}, + {"АAБBвc", "аaбbвc"}, + --{"STRASSE", "straße"}, XXX: NYI + {"KEÇİ", "keçi"}, + } + + for i,c in ipairs(cases) do + test("UTF lowercase " .. tostring(i), function() local buf = ffi.new("char[?]", #c[1] + 1) ffi.copy(buf, c[1]) ffi.C.rspamd_str_lc_utf8(buf, #c[1]) local s = ffi.string(buf) assert_equal(s, c[2]) - end - end) - test("ASCII lowercase", function() - local cases = { - {"AbCdEf", "abcdef"}, - {"A", "a"}, - {"AaAa", "aaaa"}, - {"AaAaAaAa", "aaaaaaaa"} - } - - for _,c in ipairs(cases) do + end) + end + + cases = { + {"AbCdEf", "abcdef"}, + {"A", "a"}, + {"AaAa", "aaaa"}, + {"AaAaAaAa", "aaaaaaaa"} + } + + for i,c in ipairs(cases) do + test("ASCII lowercase " ..
tostring(i), function() local buf = ffi.new("char[?]", #c[1] + 1) ffi.copy(buf, c[1]) ffi.C.rspamd_str_lc(buf, #c[1]) local s = ffi.string(buf) assert_equal(s, c[2]) - end - end) + end) + end + + cases = { + {'тест', 'тест'}, + {'\200\213\202', '���'}, + {'тест\200\213\202test', 'тест���test'}, + {'\200\213\202test', '���test'}, + {'\200\213\202test\200\213\202', '���test���'}, + {'тест\200\213\202test\200\213\202', 'тест���test���'}, + {'тест\200\213\202test\200\213\202тест', 'тест���test���тест'}, + } + + local NULL = ffi.new 'void*' + for i,c in ipairs(cases) do + test("Unicode make valid " .. tostring(i), function() + local buf = ffi.new("char[?]", #c[1] + 1) + ffi.copy(buf, c[1]) + + local res = ffi.C.rspamd_str_make_utf_valid(buf, #c[1], NULL) + local s = ffi.string(res) + --[[ ffi.string has copied the bytes; the C buffer is owned by the caller, so release it here ]] + ffi.C.g_free(res) + assert_equal(s, c[2]) + end) + end end) \ No newline at end of file -- 2.39.5