From f1e9625920e4e9add168e30c0441a4312b23c890 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 13 Aug 2019 09:46:18 +0100 Subject: [PATCH] [Minor] Rework utf8 lowercasing --- src/libserver/html.c | 2 +- src/libserver/url.c | 3 ++- src/libutil/str_util.c | 46 +++++++++++++++++------------------------- src/libutil/str_util.h | 4 ++-- test/lua/unit/utf.lua | 8 ++++---- 5 files changed, 28 insertions(+), 35 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index 8f6b3d291..4ff310f1c 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -1018,8 +1018,8 @@ rspamd_html_parse_tag_content (rspamd_mempool_t *pool, tag->name.len = rspamd_html_decode_entitles_inplace (s, tag->name.len); tag->name.start = s; + tag->name.len = rspamd_str_lc_utf8 (s, tag->name.len); s[tag->name.len] = '\0'; - rspamd_str_lc_utf8 (s, tag->name.len); k = kh_get (tag_by_name, html_tag_by_name, s); diff --git a/src/libserver/url.c b/src/libserver/url.c index ef59b6da0..9314ce2bb 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2069,7 +2069,8 @@ rspamd_url_parse (struct rspamd_url *uri, } rspamd_str_lc (uri->string, uri->protocollen); - rspamd_str_lc_utf8 (uri->host, uri->hostlen); + unquoted_len = rspamd_str_lc_utf8 (uri->host, uri->hostlen); + rspamd_url_shift (uri, unquoted_len, UF_HOST); if (uri->protocol == PROTOCOL_UNKNOWN) { for (i = 0; i < G_N_ELEMENTS (rspamd_url_protocols); i++) { diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c index 1f2c4629f..4ce84fa65 100644 --- a/src/libutil/str_util.c +++ b/src/libutil/str_util.c @@ -62,7 +62,7 @@ const guchar lc_map[256] = { 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff }; -void +guint rspamd_str_lc (gchar *str, guint size) { guint leftover = size % 4; @@ -93,6 +93,7 @@ rspamd_str_lc (gchar *str, guint size) *dest = lc_map[(guchar)str[i]]; } + return size; } gint @@ -144,42 +145,33 @@ rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l) * string to lower case, so some locale peculiarities are simply ignored * If the target string is longer than initial one, then we just trim it */ -void +guint rspamd_str_lc_utf8 (gchar *str, guint size) { - const gchar *s = str, *p; - gchar *d = str, tst[6]; - gint remain = size; - gint r; - gunichar uc; + guchar *d = (guchar *)str, tst[6]; + gint32 i = 0, prev = 0; + UChar32 uc; - while (remain > 0) { - p = g_utf8_next_char (s); + while (i < size) { + prev = i; - if (p - s > remain) { - break; - } + U8_NEXT ((guint8*)str, i, size, uc); + uc = u_tolower (uc); - uc = g_utf8_get_char (s); - uc = g_unichar_tolower (uc); + gint32 olen = 0; + U8_APPEND_UNSAFE (tst, olen, uc); - if (remain >= 6) { - r = g_unichar_to_utf8 (uc, d); + if (olen <= (i - prev)) { + memcpy (d, tst, olen); + d += olen; } else { - /* We must be cautious here to avoid broken unicode being append */ - r = g_unichar_to_utf8 (uc, tst); - if (r > remain) { - break; - } - else { - memcpy (d, tst, r); - } + /* Lowercasing has increased the length, so we need to ignore it */ + d += i - prev; } - remain -= r; - s = p; - d += r; } + + return d - (guchar *)str; } gboolean diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h index a1f980526..b255c125b 100644 --- a/src/libutil/str_util.h +++ b/src/libutil/str_util.h @@ -41,12 +41,12 @@ gint rspamd_lc_cmp (const gchar *s, const gchar *d, gsize l); /** * Convert string to lowercase in-place using ASCII conversion */ -void rspamd_str_lc (gchar *str, guint size); +guint rspamd_str_lc (gchar *str, guint size); /** * Convert string to lowercase in-place using utf (limited) conversion */ -void rspamd_str_lc_utf8 (gchar *str, guint size); +guint rspamd_str_lc_utf8 (gchar *str, guint size); /* * Hash table utility functions for case insensitive hashing diff --git a/test/lua/unit/utf.lua b/test/lua/unit/utf.lua index 277d99e41..75dd33977 100644 --- a/test/lua/unit/utf.lua +++ b/test/lua/unit/utf.lua @@ -3,8 +3,8 @@ context("UTF8 check functions", function() local ffi = require("ffi") ffi.cdef[[ - void rspamd_str_lc_utf8 (char *str, unsigned int size); - void rspamd_str_lc (char *str, unsigned int size); + unsigned int rspamd_str_lc_utf8 (char *str, unsigned int size); + unsigned int rspamd_str_lc (char *str, unsigned int size); char * rspamd_str_make_utf_valid (const char *src, size_t slen, size_t *dstlen); ]] @@ -19,8 +19,8 @@ context("UTF8 check functions", function() test("UTF lowercase " .. tostring(i), function() local buf = ffi.new("char[?]", #c[1] + 1) ffi.copy(buf, c[1]) - ffi.C.rspamd_str_lc_utf8(buf, #c[1]) - local s = ffi.string(buf) + local nlen = ffi.C.rspamd_str_lc_utf8(buf, #c[1]) + local s = ffi.string(buf, nlen) assert_equal(s, c[2]) end) end -- 2.39.5