about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-05-14 16:59:30 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2021-05-14 16:59:30 +0100
commit a2eb042dcd36228b9e0a6d1417c54032489d91ff (patch)
tree adb3311934ee41d07522023835c7ba24f61b98cc
parent 05f3458be9ff02b412548cd5693b09e34c6f052b (diff)
download rspamd-a2eb042dcd36228b9e0a6d1417c54032489d91ff.tar.gz
rspamd-a2eb042dcd36228b9e0a6d1417c54032489d91ff.zip
[Minor] Strip visible parts of urls using utf rules
-rw-r--r-- src/libserver/html.c 39
1 files changed, 37 insertions, 2 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 326c8facc..30c2c022b 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -2617,8 +2617,43 @@ rspamd_html_check_displayed_url (rspamd_mempool_t *pool,
rspamd_strlcpy (url->visible_part, dest->data + href_offset,
dest->len - href_offset + 1);
dlen = dest->len - href_offset;
- url->visible_part =
- (gchar *)rspamd_string_len_strip (url->visible_part, &dlen, " \t\v\r\n");
+
+ /* Strip unicode spaces from the start and the end */
+ gchar *p = url->visible_part, *end = url->visible_part + dlen;
+ gint i = 0;
+
+ while (i < dlen) {
+ UChar32 uc;
+ gint prev_i = i;
+
+ U8_NEXT(p, i, dlen, uc);
+
+ if (!u_isspace (uc)) {
+ i = prev_i;
+ break;
+ }
+ }
+
+ p += i;
+ dlen -= i;
+ url->visible_part = p;
+ i = end - url->visible_part - 1;
+
+ if (i > 0) {
+ gint32 dl = dlen;
+
+ while (i > 0) {
+ UChar32 uc;
+
+ U8_PREV(p, i, dl, uc);
+
+ if (!u_isspace (uc)) {
+ break;
+ }
+ }
+
+ dlen = i;
+ }
rspamd_html_url_is_phished (pool, url,