From a2eb042dcd36228b9e0a6d1417c54032489d91ff Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 14 May 2021 16:59:30 +0100 Subject: [PATCH] [Minor] Strip visible parts of urls using utf rules --- src/libserver/html.c | 39 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 2 deletions(-) diff --git a/src/libserver/html.c b/src/libserver/html.c index 326c8facc..30c2c022b 100644 --- a/src/libserver/html.c +++ b/src/libserver/html.c @@ -2617,8 +2617,43 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool, rspamd_strlcpy (url->visible_part, dest->data + href_offset, dest->len - href_offset + 1); dlen = dest->len - href_offset; - url->visible_part = - (gchar *)rspamd_string_len_strip (url->visible_part, &dlen, " \t\v\r\n"); + + /* Strip unicode spaces from the start and the end */ + gchar *p = url->visible_part, *end = url->visible_part + dlen; + gint i = 0; + + while (i < dlen) { + UChar32 uc; + gint prev_i = i; + + U8_NEXT(p, i, dlen, uc); + + if (!u_isspace (uc)) { + i = prev_i; + break; + } + } + + p += i; + dlen -= i; + url->visible_part = p; + i = end - url->visible_part - 1; + + if (i > 0) { + gint32 dl = dlen; + + while (i > 0) { + UChar32 uc; + + U8_PREV(p, i, dl, uc); + + if (!u_isspace (uc)) { + break; + } + } + + dlen = i; + } rspamd_html_url_is_phished (pool, url, -- 2.39.5