[Fix] Investigate many border cases in URLs parser

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 31 Mar 2016 16:12:43 +0000 (17:12 +0100)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 31 Mar 2016 16:12:43 +0000 (17:12 +0100)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 31 Mar 2016 16:12:43 +0000 (17:12 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 31 Mar 2016 16:12:43 +0000 (17:12 +0100)
diff --git a/src/libserver/url.c b/src/libserver/url.c

index b27c5a1692e0b2bb0a049de6aaa313223846ae80..700ffe34b83626b19531c932ea11b0889b43a536 100644 (file)
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -55,6 +55,7 @@ typedef struct url_match_s {
         const gchar *pattern;
         const gchar *prefix;
         gboolean add_prefix;
+       gchar st;
  } url_match_t;
  
  #define URL_FLAG_NOHTML (1 << 0)
@@ -1665,6 +1666,14 @@ url_file_start (struct url_callback_data *cb,
                 url_match_t *match)
  {
         match->m_begin = pos;
+
+       if (pos > cb->begin - 1) {
+               match->st = *(pos - 1);
+       }
+       else {
+               match->st = '\0';
+       }
+
         return TRUE;
  }
  
@@ -1712,13 +1721,13 @@ url_tld_start (struct url_callback_data *cb,
  
         /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */
         while (p >= cb->begin) {
-               if ((!is_domain (*p) && *p != '.' &&
-                        *p != '/') || g_ascii_isspace (*p)) {
-
+               if (!is_domain (*p) || g_ascii_isspace (*p) || is_url_start (*p)) {
                         if (!is_url_start (*p) && !g_ascii_isspace (*p)) {
                                 return FALSE;
                         }
  
+                       match->st = *p;
+
                         p++;
  
                         if (!g_ascii_isalnum (*p)) {
@@ -1730,7 +1739,9 @@ url_tld_start (struct url_callback_data *cb,
                         return TRUE;
                 }
                 else if (p == cb->begin && p != pos) {
+                       match->st = '\0';
                         match->m_begin = p;
+
                         return TRUE;
                 }
                 else if (*p == '.') {
@@ -1747,6 +1758,7 @@ url_tld_start (struct url_callback_data *cb,
                         /* Urls cannot contain '/' in their body */
                         return FALSE;
                 }
+
                 p--;
         }
  
@@ -1766,7 +1778,7 @@ url_tld_end (struct url_callback_data *cb,
                 match->m_len = p - match->m_begin;
                 return TRUE;
         }
-       else if (*p == '/' || *p == ':') {
+       else if (*p == '/' || *p == ':' || is_url_end (*p)) {
                 /* Parse arguments, ports by normal way by url default function */
                 p = match->m_begin;
                 /* Check common prefix */
@@ -1813,6 +1825,13 @@ url_web_start (struct url_callback_data *cb,
                 return FALSE;
         }
  
+       if (pos > cb->begin) {
+               match->st = *(pos - 1);
+       }
+       else {
+               match->st = '\0';
+       }
+
         match->m_begin = pos;
  
         return TRUE;
@@ -1829,6 +1848,13 @@ url_web_end (struct url_callback_data *cb,
                 return FALSE;
         }
  
+       if (last < cb->end && *last == '>') {
+               /* We need to ensure that url also starts with '>' */
+               if (match->st != '<') {
+                       return FALSE;
+               }
+       }
+
         match->m_len = (last - pos);
  
         return TRUE;
@@ -1855,6 +1881,13 @@ url_email_start (struct url_callback_data *cb,
                 }
         }
  
+       if (pos > cb->begin - 1) {
+               match->st = *(pos - 1);
+       }
+       else {
+               match->st = '\0';
+       }
+
         return TRUE;
  }
  
@@ -2063,13 +2096,13 @@ rspamd_url_trie_callback (int strnum, int textpos, void *context)
                 pos = &cb->begin[textpos];
                 if (pos < cb->end) {
                         if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' &&
-                               *pos != ':') {
+                               *pos != ':' && !is_url_end (*pos)) {
                                 if (*pos == '.') {
                                         /* We allow . at the end of the domain however */
                                         pos++;
                                         if (pos < cb->end) {
                                                 if (!g_ascii_isspace (*pos) && *pos != '/' &&
-                                                       *pos != '?' && *pos != ':') {
+                                                       *pos != '?' && *pos != ':' && !is_url_end (*pos)) {
                                                         return 0;
                                                 }
                                         }
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 31 Mar 2016 16:12:43 +0000 (17:12 +0100)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 31 Mar 2016 16:12:43 +0000 (17:12 +0100)