diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-05-14 16:59:30 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2021-05-14 16:59:30 +0100 |
commit | a2eb042dcd36228b9e0a6d1417c54032489d91ff (patch) | |
tree | adb3311934ee41d07522023835c7ba24f61b98cc | |
parent | 05f3458be9ff02b412548cd5693b09e34c6f052b (diff) | |
download | rspamd-a2eb042dcd36228b9e0a6d1417c54032489d91ff.tar.gz rspamd-a2eb042dcd36228b9e0a6d1417c54032489d91ff.zip |
[Minor] Strip visible parts of urls using utf rules
-rw-r--r-- | src/libserver/html.c | 39 |
1 file changed, 37 insertions, 2 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c index 326c8facc..30c2c022b 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -2617,8 +2617,43 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, rspamd_strlcpy (url->visible_part, dest->data + href_offset, dest->len - href_offset + 1); dlen = dest->len - href_offset; - url->visible_part = - (gchar *)rspamd_string_len_strip (url->visible_part, &dlen, " \t\v\r\n"); + + /* Strip unicode spaces from the start and the end */ + gchar *p = url->visible_part, *end = url->visible_part + dlen; + gint i = 0; + + while (i < dlen) { + UChar32 uc; + gint prev_i = i; + + U8_NEXT(p, i, dlen, uc); + + if (!u_isspace (uc)) { + i = prev_i; + break; + } + } + + p += i; + dlen -= i; + url->visible_part = p; + i = end - url->visible_part - 1; + + if (i > 0) { + gint32 dl = dlen; + + while (i > 0) { + UChar32 uc; + + U8_PREV(p, i, dl, uc); + + if (!u_isspace (uc)) { + break; + } + } + + dlen = i; + } rspamd_html_url_is_phished (pool, url, |