[Fix] Find suspicious url encodings that could break url extraction

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 17 Nov 2021 21:48:00 +0000 (21:48 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Wed, 17 Nov 2021 21:48:00 +0000 (21:48 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 17 Nov 2021 21:48:00 +0000 (21:48 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 17 Nov 2021 21:48:00 +0000 (21:48 +0000)
diff --git a/src/libserver/html/html_url.cxx b/src/libserver/html/html_url.cxx

index d073a25d99dd1ff2e2229d04cea0de0dd9fb859a..8f721b3e8bb2d68938e63d01957c762a753c7e6f 100644 (file)
--- a/src/libserver/html/html_url.cxx
+++ b/src/libserver/html/html_url.cxx
@@ -376,6 +376,8 @@ html_process_url(rspamd_mempool_t *pool, std::string_view &input)
         /*
          * We also need to remove all internal newlines, spaces
          * and encode unsafe characters
+        * Another obfuscation found in the wild is the percent-encoding of SAFE url
+        * characters, including essential ones (e.g. '@', ':', '/')
          */
         for (auto i = 0; i < sz; i++) {
                 if (G_UNLIKELY (g_ascii_isspace(s[i]))) {
@@ -388,6 +390,43 @@ html_process_url(rspamd_mempool_t *pool, std::string_view &input)
                         *d++ = hexdigests[s[i] & 0xf];
                         has_bad_chars = TRUE;
                 }
+               else if (G_UNLIKELY (s[i] == '%')) {
+                       if (i + 2 < sz) {
+                               auto [c1, c2] = std::tuple(s[i + 1], s[i + 2]);
+
+                               if (g_ascii_isxdigit(c1) && g_ascii_isxdigit(c2)) {
+                                       auto codepoint = 0;
+
+                                       if      (c1 >= '0' && c1 <= '9') codepoint = c1 - '0';
+                                       else if (c1 >= 'A' && c1 <= 'F') codepoint = c1 - 'A' + 10;
+                                       else if (c1 >= 'a' && c1 <= 'f') codepoint = c1 - 'a' + 10;
+
+                                       codepoint <<= 4;
+
+                                       if      (c2 >= '0' && c2 <= '9') codepoint += c2 - '0';
+                                       else if (c2 >= 'A' && c2 <= 'F') codepoint += c2 - 'A' + 10;
+                                       else if (c2 >= 'a' && c2 <= 'f') codepoint += c2 - 'a' + 10;
+
+                                       /* Now check for 'interesting' codepoints */
+                                       if (codepoint == '@' || codepoint == ':' || codepoint == '|' ||
+                                               codepoint == '?' || codepoint == '\\' || codepoint == '/') {
+                                               /* Replace it back */
+                                               *d++ = (char)(codepoint & 0xff);
+                                               i += 2;
+                                               has_bad_chars = TRUE;
+                                       }
+                                       else {
+                                               *d++ = s[i];
+                                       }
+                               }
+                               else {
+                                       *d++ = s[i];
+                               }
+                       }
+                       else {
+                               *d++ = s[i];
+                       }
+               }
                 else {
                         *d++ = s[i];
                 }
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 17 Nov 2021 21:48:00 +0000 (21:48 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Wed, 17 Nov 2021 21:48:00 +0000 (21:48 +0000)