ffi.cdef[[
unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
unsigned int rspamd_str_lc (char *str, unsigned int size);
+ void rspamd_fast_utf8_library_init (unsigned flags);
+ void ottery_rand_bytes(void *buf, size_t n);
+ double rspamd_get_ticks(int allow);
+ size_t rspamd_fast_utf8_validate (const unsigned char *data, size_t len);
+ size_t rspamd_fast_utf8_validate_ref (const unsigned char *data, size_t len);
+ size_t rspamd_fast_utf8_validate_sse41 (const unsigned char *data, size_t len);
+ size_t rspamd_fast_utf8_validate_avx2 (const unsigned char *data, size_t len);
char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen, void *);
]]
assert_equal(s, c[2])
end)
end
+
+ -- Enable sse and avx2
+ ffi.C.rspamd_fast_utf8_library_init(3)
+ local valid_cases = {
+ "a",
+ "\xc3\xb1",
+ "\xe2\x82\xa1",
+ "\xf0\x90\x8c\xbc",
+ "안녕하세요, 세상"
+ }
+ for i,c in ipairs(valid_cases) do
+ test("Unicode validate success: " .. tostring(i), function()
+ local buf = ffi.new("char[?]", #c + 1)
+ ffi.copy(buf, c)
+
+ local ret = ffi.C.rspamd_fast_utf8_validate(buf, #c)
+ assert_equal(ret, 0)
+ end)
+ end
+ local invalid_cases = {
+ "\xc3\x28",
+ "\xa0\xa1",
+ "\xe2\x28\xa1",
+ "\xe2\x82\x28",
+ "\xf0\x28\x8c\xbc",
+ "\xf0\x90\x28\xbc",
+ "\xf0\x28\x8c\x28",
+ "\xc0\x9f",
+ "\xf5\xff\xff\xff",
+ "\xed\xa0\x81",
+ "\xf8\x90\x80\x80\x80",
+ "123456789012345\xed",
+ "123456789012345\xf1",
+ "123456789012345\xc2",
+ "\xC2\x7F"
+ }
+ for i,c in ipairs(invalid_cases) do
+ test("Unicode validate fail: " .. tostring(i), function()
+ local buf = ffi.new("char[?]", #c + 1)
+ ffi.copy(buf, c)
+
+ local ret = ffi.C.rspamd_fast_utf8_validate(buf, #c)
+ assert_not_equal(ret, 0)
+ end)
+ end
+
+ local speed_iters = 10000
+ local function test_size(buflen, is_valid, impl)
+ local logger = require "rspamd_logger"
+ local test_str
+ if is_valid then
+ test_str = table.concat(valid_cases)
+ else
+ test_str = table.concat(valid_cases) .. table.concat(invalid_cases)
+ end
+
+ local buf = ffi.new("char[?]", buflen)
+ if #test_str < buflen then
+ local t = {}
+ local len = #test_str
+ while len < buflen do
+ t[#t + 1] = test_str
+ len = len + #test_str
+ end
+ test_str = table.concat(t)
+ end
+ ffi.copy(buf, test_str:sub(1, buflen))
+
+ local tm = 0
+
+ for _=1,speed_iters do
+ if impl == 'ref' then
+ local t1 = ffi.C.rspamd_get_ticks(1)
+ ffi.C.rspamd_fast_utf8_validate_ref(buf, buflen)
+ local t2 = ffi.C.rspamd_get_ticks(1)
+ tm = tm + (t2 - t1)
+ elseif impl == 'sse' then
+ local t1 = ffi.C.rspamd_get_ticks(1)
+ ffi.C.rspamd_fast_utf8_validate_sse41(buf, buflen)
+ local t2 = ffi.C.rspamd_get_ticks(1)
+ tm = tm + (t2 - t1)
+ else
+ local t1 = ffi.C.rspamd_get_ticks(1)
+ ffi.C.rspamd_fast_utf8_validate_avx2(buf, buflen)
+ local t2 = ffi.C.rspamd_get_ticks(1)
+ tm = tm + (t2 - t1)
+ end
+ end
+
+ logger.messagex("%s utf8 %s check (valid = %s): %s ticks per iter, %s ticks per byte",
+ impl, buflen, is_valid,
+ tm / speed_iters, tm / speed_iters / buflen)
+
+ return 0
+ end
+
+ for _,sz in ipairs({78, 512, 65535}) do
+ test(string.format("Utf8 test %s %d buffer, %s", 'ref', sz, 'valid'), function()
+ local res = test_size(sz, true, 'ref')
+ assert_equal(res, 0)
+ end)
+ test(string.format("Utf8 test %s %d buffer, %s", 'ref', sz, 'invalid'), function()
+ local res = test_size(sz, false, 'ref')
+ assert_equal(res, 0)
+ end)
+ test(string.format("Utf8 test %s %d buffer, %s", 'sse', sz, 'valid'), function()
+ local res = test_size(sz, true, 'sse')
+ assert_equal(res, 0)
+ end)
+ test(string.format("Utf8 test %s %d buffer, %s", 'sse', sz, 'invalid'), function()
+ local res = test_size(sz, false, 'sse')
+ assert_equal(res, 0)
+ end)
+ test(string.format("Utf8 test %s %d buffer, %s", 'avx2', sz, 'valid'), function()
+ local res = test_size(sz, true, 'avx2')
+ assert_equal(res, 0)
+ end)
+ test(string.format("Utf8 test %s %d buffer, %s", 'avx2', sz, 'invalid'), function()
+ local res = test_size(sz, false, 'avx2')
+ assert_equal(res, 0)
+ end)
+ end
+
end)
\ No newline at end of file