tag->name.len = rspamd_html_decode_entitles_inplace (s,
tag->name.len);
tag->name.start = s;
+ tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len);
s[tag->name.len] = '\0';
- rspamd_str_lc_utf8 (s, tag->name.len);
k = kh_get (tag_by_name, html_tag_by_name, s);
}
rspamd_str_lc (uri->string, uri->protocollen);
- rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+ unquoted_len = rspamd_str_lc_utf8 (uri->host, uri->hostlen);
+ rspamd_url_shift (uri, unquoted_len, UF_HOST);
if (uri->protocol == PROTOCOL_UNKNOWN) {
for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) {
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
-void
+guint
rspamd_str_lc (gchar *str, guint size)
{
guint leftover = size % 4;
*dest = lc_map[(guchar)str[i]];
}
+ return size;
}
gint
* string to lower case, so some locale peculiarities are simply ignored
* If the lowercased form of a character needs more bytes than the original
* character, that character is left as is (it is not lowercased)
*/
-void
+guint
rspamd_str_lc_utf8 (gchar *str, guint size)
{
- const gchar *s = str, *p;
- gchar *d = str, tst[6];
- gint remain = size;
- gint r;
- gunichar uc;
+ /*
+ * Lowercase `str` in place, one code point at a time: `i` is the read
+ * offset and `d` is the write cursor. `d` never runs ahead of `i`, but it
+ * falls behind as soon as a lowercased character is shorter than its
+ * source. Returns the number of bytes written to `str`.
+ */
+ guchar *d = (guchar *)str, tst[6];
+ gint32 i = 0, prev = 0;
+ UChar32 uc;
- while (remain > 0) {
- p = g_utf8_next_char (s);
+ while (i < size) {
+ prev = i;
- if (p - s > remain) {
- break;
- }
+ U8_NEXT ((guint8*)str, i, size, uc);
- uc = g_utf8_get_char (s);
- uc = g_unichar_tolower (uc);
+ gint32 olen = 0;
+
+ /* U8_NEXT reports an ill-formed sequence as a negative value: never re-encode it */
+ if (uc >= 0) {
+ uc = u_tolower (uc);
+ U8_APPEND_UNSAFE (tst, olen, uc);
+ }
- if (remain >= 6) {
- r = g_unichar_to_utf8 (uc, d);
+ if (uc >= 0 && olen <= (i - prev)) {
+ memcpy (d, tst, olen);
+ d += olen;
}
else {
- /* We must be cautious here to avoid broken unicode being append */
- r = g_unichar_to_utf8 (uc, tst);
- if (r > remain) {
- break;
- }
- else {
- memcpy (d, tst, r);
- }
+ /*
+ * Either the input bytes are ill-formed or lowercasing has increased
+ * the length: keep the original bytes. They must be moved, not just
+ * skipped, because `d` may already be behind the read position and
+ * would otherwise leave stale bytes in the output. The ranges can
+ * overlap, hence memmove.
+ */
+ memmove (d, (guchar *)str + prev, i - prev);
+ d += i - prev;
}
- remain -= r;
- s = p;
- d += r;
}
+
+ return d - (guchar *)str;
}
gboolean
/**
* Convert string to lowercase in-place using ASCII conversion.
* Returns the length of the string, which is always equal to `size`.
*/
-void rspamd_str_lc (gchar *str, guint size);
+guint rspamd_str_lc (gchar *str, guint size);
/**
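* Convert string to lowercase in-place using UTF-8 (limited) conversion.
* Returns the length in bytes of the converted string, which may be smaller
* than `size`; the result is not NUL-terminated by this function.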
*/
-void rspamd_str_lc_utf8 (gchar *str, guint size);
+guint rspamd_str_lc_utf8 (gchar *str, guint size);
/*
* Hash table utility functions for case insensitive hashing
context("UTF8 check functions", function()
local ffi = require("ffi")
ffi.cdef[[
- void rspamd_str_lc_utf8 (char *str, unsigned int size);
- void rspamd_str_lc (char *str, unsigned int size);
+ unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size);
+ unsigned int rspamd_str_lc (char *str, unsigned int size);
char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen);
]]
test("UTF lowercase " .. tostring(i), function()
local buf = ffi.new("char[?]", #c[1] + 1)
ffi.copy(buf, c[1])
- ffi.C.rspamd_str_lc_utf8(buf, #c[1])
- local s = ffi.string(buf)
+ local nlen = ffi.C.rspamd_str_lc_utf8(buf, #c[1])
+ local s = ffi.string(buf, nlen)
assert_equal(s, c[2])
end)
end