diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-08-13 09:46:18 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-08-13 09:46:18 +0100 |
commit | f1e9625920e4e9add168e30c0441a4312b23c890 (patch) | |
tree | b272a83d3f8c6af8fec2eb139d415d1b8d672b1f /src/libutil/str_util.c | |
parent | e900e3fce155c8ad08c69b3be0668e25262b15d1 (diff) | |
download | rspamd-f1e9625920e4e9add168e30c0441a4312b23c890.tar.gz rspamd-f1e9625920e4e9add168e30c0441a4312b23c890.zip |
[Minor] Rework utf8 lowercasing
Diffstat (limited to 'src/libutil/str_util.c')
-rw-r--r-- | src/libutil/str_util.c | 46 |
1 files changed, 19 insertions, 27 deletions
```diff
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 1f2c4629f..4ce84fa65 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -62,7 +62,7 @@ const guchar lc_map[256] = {
 	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
 };
 
-void
+guint
 rspamd_str_lc (gchar *str, guint size)
 {
 	guint leftover = size % 4;
@@ -93,6 +93,7 @@ rspamd_str_lc (gchar *str, guint size)
 		*dest = lc_map[(guchar)str[i]];
 	}
 
+	return size;
 }
 
 gint
@@ -144,42 +145,33 @@ rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l)
  * string to lower case, so some locale peculiarities are simply ignored
  * If the target string is longer than initial one, then we just trim it
  */
-void
+guint
 rspamd_str_lc_utf8 (gchar *str, guint size)
 {
-	const gchar *s = str, *p;
-	gchar *d = str, tst[6];
-	gint remain = size;
-	gint r;
-	gunichar uc;
+	guchar *d = (guchar *)str, tst[6];
+	gint32 i = 0, prev = 0;
+	UChar32 uc;
 
-	while (remain > 0) {
-		p = g_utf8_next_char (s);
+	while (i < size) {
+		prev = i;
 
-		if (p - s > remain) {
-			break;
-		}
+		U8_NEXT ((guint8*)str, i, size, uc);
+		uc = u_tolower (uc);
 
-		uc = g_utf8_get_char (s);
-		uc = g_unichar_tolower (uc);
+		gint32 olen = 0;
+		U8_APPEND_UNSAFE (tst, olen, uc);
 
-		if (remain >= 6) {
-			r = g_unichar_to_utf8 (uc, d);
+		if (olen <= (i - prev)) {
+			memcpy (d, tst, olen);
+			d += olen;
 		}
 		else {
-			/* We must be cautious here to avoid broken unicode being append */
-			r = g_unichar_to_utf8 (uc, tst);
-			if (r > remain) {
-				break;
-			}
-			else {
-				memcpy (d, tst, r);
-			}
+			/* Lowercasing has increased the length, so we need to ignore it */
+			d += i - prev;
 		}
-		remain -= r;
-		s = p;
-		d += r;
 	}
+
+	return d - (guchar *)str;
 }
 
 gboolean
```