aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/libutil/str_util.c60
-rw-r--r--src/libutil/str_util.h10
-rw-r--r--test/lua/unit/utf.lua72
3 files changed, 121 insertions, 21 deletions
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index d8b17e3c3..1e43e7726 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -2425,7 +2425,7 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
if (flags & RSPAMD_REGEXP_ESCAPE_UTF) {
if (!g_utf8_validate (pattern, slen, NULL)) {
- tmp_utf = g_utf8_make_valid (pattern, slen);
+ tmp_utf = rspamd_str_make_utf_valid (pattern, slen, NULL);
}
}
@@ -2517,3 +2517,61 @@ rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
return res;
}
+
+/* Return a new NUL-terminated copy of src where every ill-formed UTF-8 sequence (and every zero code point) is replaced by U+FFFD */
+gchar *
+rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen)
+{
+ GString *dst;
+ const gchar *last; /* start of the pending run of valid bytes not yet copied to dst */
+ gchar *dchar;
+ gsize i, valid, prev; /* i: read offset; valid: byte length of the pending run; prev: offset before the latest decode */
+ UChar32 uc;
+
+ if (src == NULL) {
+ return NULL;
+ }
+
+ if (slen == 0) { /* a zero length means "measure src with strlen" */
+ slen = strlen (src);
+ }
+
+ dst = g_string_sized_new (slen);
+ i = 0;
+ last = src;
+ valid = 0;
+ prev = 0;
+
+ while (i < slen) {
+ U8_NEXT (src, i, slen, uc); /* decodes one code point into uc and advances i past the bytes it consumed */
+
+ if (uc <= 0) { /* ICU reports an ill-formed sequence as a negative value; 0 is a zero code point */
+ if (valid > 0) { /* flush the valid run that precedes the bad bytes */
+ g_string_append_len (dst, last, valid);
+ }
+ /* U+FFFD (replacement character) encoded as UTF-8 */
+ g_string_append_len (dst, "\357\277\275", 3);
+ valid = 0;
+ last = &src[i]; /* the next valid run, if any, starts right after the rejected bytes */
+ }
+ else {
+ valid += i - prev; /* extend the pending run by the bytes of this code point */
+ }
+
+ prev = i;
+ }
+
+ if (valid > 0) { /* flush the trailing valid run */
+ g_string_append_len (dst, last, valid);
+ }
+
+ dchar = dst->str;
+
+ if (dstlen) { /* the output length is optional */
+ *dstlen = dst->len;
+ }
+
+ g_string_free (dst, FALSE); /* FALSE: drop only the GString wrapper; the character data is returned via dchar */
+
+ return dchar;
+} \ No newline at end of file
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 688034ec6..139a85416 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -419,4 +419,14 @@ gchar *
rspamd_str_regexp_escape (const gchar *pattern, gsize slen,
gsize *dst_len, enum rspamd_regexp_escape_flags flags);
+/**
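+ * Returns a zero-terminated copy of src in which every invalid UTF-8 sequence
+ * is replaced with the U+FFFD replacement character. Caller must free the string after use
+ * @param src input string; NULL yields NULL
+ * @param slen length of src in bytes; 0 means src is measured with strlen
+ * @param dstlen if not NULL, receives the length of the returned string
+ * @return newly allocated string, or NULL if src is NULL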
+ */
+gchar * rspamd_str_make_utf_valid (const gchar *src, gsize slen, gsize *dstlen);
+
#endif /* SRC_LIBUTIL_STR_UTIL_H_ */
diff --git a/test/lua/unit/utf.lua b/test/lua/unit/utf.lua
index e22eb2a2f..277d99e41 100644
--- a/test/lua/unit/utf.lua
+++ b/test/lua/unit/utf.lua
@@ -5,36 +5,68 @@ context("UTF8 check functions", function()
ffi.cdef[[
void rspamd_str_lc_utf8 (char *str, unsigned int size);
void rspamd_str_lc (char *str, unsigned int size);
+ char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen);
]]
- test("UTF lowercase", function()
- local cases = {
- {"АбЫрвАлг", "абырвалг"},
- {"АAБBвc", "аaбbвc"}
- }
-
- for _,c in ipairs(cases) do
+ local cases = {
+ {"АбЫрвАлг", "абырвалг"},
+ {"АAБBвc", "аaбbвc"},
+ --{"STRASSE", "straße"}, XXX: NYI
+ {"KEÇİ", "keçi"},
+ }
+
+ for i,c in ipairs(cases) do
+ test("UTF lowercase " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
ffi.C.rspamd_str_lc_utf8(buf, #c[1])
local s = ffi.string(buf)
assert_equal(s, c[2])
- end
- end)
- test("ASCII lowercase", function()
- local cases = {
- {"AbCdEf", "abcdef"},
- {"A", "a"},
- {"AaAa", "aaaa"},
- {"AaAaAaAa", "aaaaaaaa"}
- }
-
- for _,c in ipairs(cases) do
+ end)
+ end
+
+ cases = {
+ {"AbCdEf", "abcdef"},
+ {"A", "a"},
+ {"AaAa", "aaaa"},
+ {"AaAaAaAa", "aaaaaaaa"}
+ }
+
+ for i,c in ipairs(cases) do
+ test("ASCII lowercase " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
ffi.C.rspamd_str_lc(buf, #c[1])
local s = ffi.string(buf)
assert_equal(s, c[2])
- end
- end)
+ end)
+ end
+
+ cases = {
+ {'тест', 'тест'},
+ {'\200\213\202', '���'},
+ {'тест\200\213\202test', 'тест���test'},
+ {'\200\213\202test', '���test'},
+ {'\200\213\202test\200\213\202', '���test���'},
+ {'тест\200\213\202test\200\213\202', 'тест���test���'},
+ {'тест\200\213\202test\200\213\202тест', 'тест���test���тест'},
+ }
+
+ local NULL = ffi.new 'void*'
+ for i,c in ipairs(cases) do
+ test("Unicode make valid " .. tostring(i), function()
+ local buf = ffi.new("char[?]", #c[1] + 1)
+ ffi.copy(buf, c[1])
+
+ local s = ffi.string(ffi.C.rspamd_str_make_utf_valid(buf, #c[1], NULL))
+ local function to_hex(s)
+ return (s:gsub('.', function (c)
+ return string.format('%02X', string.byte(c))
+ end))
+ end
+ print(to_hex(s))
+ print(to_hex(c[2]))
+ assert_equal(s, c[2])
+ end)
+ end
end) \ No newline at end of file