@@ -477,30 +477,29 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len) | |||
p = in; | |||
end = in + len; | |||
while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len) > 0)) { | |||
while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) { | |||
err_offset --; /* As it returns it 1 indexed */ | |||
goffset cur_offset = err_offset; | |||
while (cur_offset < len) { | |||
goffset tmp = cur_offset; | |||
U8_NEXT (in, cur_offset, len, uc); | |||
U8_NEXT (p, cur_offset, len, uc); | |||
if (uc > 0) { | |||
/* Fill string between err_offset and tmp with `?` character */ | |||
memset (in + err_offset, '?', | |||
tmp - err_offset); | |||
memset (p + err_offset - 1, '?', tmp - err_offset); | |||
break; | |||
} | |||
} | |||
if (uc < 0) { | |||
/* Fill till the end */ | |||
memset (p + err_offset, '?', | |||
len - err_offset); | |||
memset (p + err_offset, '?', len - err_offset); | |||
break; | |||
} | |||
p = in + cur_offset; | |||
p += cur_offset; | |||
len = end - p; | |||
} | |||
} |
@@ -925,11 +925,11 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud) | |||
goffset err_offset; | |||
if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen) == 0)) { | |||
if ((err_offset = rspamd_fast_utf8_validate (url->host, url->hostlen)) == 0) { | |||
obj = ucl_object_fromlstring (url->host, url->hostlen); | |||
} | |||
else { | |||
obj = ucl_object_fromlstring (url->host, err_offset); | |||
obj = ucl_object_fromlstring (url->host, err_offset - 1); | |||
} | |||
} | |||
else { |
@@ -3071,30 +3071,31 @@ rspamd_str_make_utf_valid (const guchar *src, gsize slen, | |||
} | |||
p = src; | |||
dlen = slen; | |||
dlen = slen + 1; /* As we add '\0' */ | |||
/* Check space required */ | |||
while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) { | |||
while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain)) > 0) { | |||
gint i = 0; | |||
err_offset --; /* As it returns it 1 indexed */ | |||
p += err_offset; | |||
remain -= err_offset; | |||
dlen += err_offset; | |||
/* Each invalid character of input requires 3 bytes of output */ | |||
/* Each invalid character of input requires 3 bytes of output (+2 bytes) */ | |||
while (i < remain) { | |||
gint old_i = i; | |||
U8_NEXT (p, i, remain, uc); | |||
if (uc < 0) { | |||
dlen += 3; | |||
dlen += 2; | |||
} | |||
else { | |||
p += old_i; | |||
remain -= old_i; | |||
break; | |||
} | |||
} | |||
p += i; | |||
remain -= i; | |||
} | |||
if (pool) { | |||
@@ -3108,8 +3109,9 @@ rspamd_str_make_utf_valid (const guchar *src, gsize slen, | |||
d = dst; | |||
remain = slen; | |||
while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain) > 0)) { | |||
while (remain > 0 && (err_offset = rspamd_fast_utf8_validate (p, remain)) > 0) { | |||
/* Copy valid */ | |||
err_offset --; /* As it returns it 1 indexed */ | |||
memcpy (d, p, err_offset); | |||
d += err_offset; | |||
@@ -3130,8 +3132,7 @@ rspamd_str_make_utf_valid (const guchar *src, gsize slen, | |||
} | |||
else { | |||
/* Adjust p and remaining stuff and go to the outer cycle */ | |||
p += old_i; | |||
remain -= old_i; | |||
i = old_i; | |||
break; | |||
} | |||
} | |||
@@ -3139,6 +3140,8 @@ rspamd_str_make_utf_valid (const guchar *src, gsize slen, | |||
* Now p is the first valid utf8 character and remain is the rest of the string | |||
* so we can continue our loop | |||
*/ | |||
p += i; | |||
remain -= i; | |||
} | |||
if (err_offset == 0 && remain > 0) { |
@@ -5,7 +5,7 @@ context("UTF8 check functions", function() | |||
ffi.cdef[[ | |||
unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size); | |||
unsigned int rspamd_str_lc (char *str, unsigned int size); | |||
char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen); | |||
char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *); | |||
]] | |||
local cases = { | |||
@@ -58,7 +58,7 @@ context("UTF8 check functions", function() | |||
local buf = ffi.new("char[?]", #c[1] + 1) | |||
ffi.copy(buf, c[1]) | |||
local s = ffi.string(ffi.C.rspamd_str_make_utf_valid(buf, #c[1], NULL)) | |||
local s = ffi.string(ffi.C.rspamd_str_make_utf_valid(buf, #c[1], NULL, NULL)) | |||
local function to_hex(s) | |||
return (s:gsub('.', function (c) | |||
return string.format('%02X', string.byte(c)) |