From: Vsevolod Stakhov Date: Thu, 31 Mar 2016 16:12:43 +0000 (+0100) Subject: [Fix] Investigate many border cases in URLs parser X-Git-Tag: 1.2.2~3 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=7ac8e597ac324257f3ac1793c0e9362a4714b9cc;p=rspamd.git [Fix] Investigate many border cases in URLs parser --- diff --git a/src/libserver/url.c b/src/libserver/url.c index b27c5a169..700ffe34b 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -55,6 +55,7 @@ typedef struct url_match_s { const gchar *pattern; const gchar *prefix; gboolean add_prefix; + gchar st; } url_match_t; #define URL_FLAG_NOHTML (1 << 0) @@ -1665,6 +1666,14 @@ url_file_start (struct url_callback_data *cb, url_match_t *match) { match->m_begin = pos; + + if (pos > cb->begin - 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + return TRUE; } @@ -1712,13 +1721,13 @@ url_tld_start (struct url_callback_data *cb, /* Try to find the start of the url by finding any non-urlsafe character or whitespace/punctuation */ while (p >= cb->begin) { - if ((!is_domain (*p) && *p != '.' 
&& - *p != '/') || g_ascii_isspace (*p)) { - + if (!is_domain (*p) || g_ascii_isspace (*p) || is_url_start (*p)) { if (!is_url_start (*p) && !g_ascii_isspace (*p)) { return FALSE; } + match->st = *p; + p++; if (!g_ascii_isalnum (*p)) { @@ -1730,7 +1739,9 @@ url_tld_start (struct url_callback_data *cb, return TRUE; } else if (p == cb->begin && p != pos) { + match->st = '\0'; match->m_begin = p; + return TRUE; } else if (*p == '.') { @@ -1747,6 +1758,7 @@ url_tld_start (struct url_callback_data *cb, /* Urls cannot contain '/' in their body */ return FALSE; } + p--; } @@ -1766,7 +1778,7 @@ url_tld_end (struct url_callback_data *cb, match->m_len = p - match->m_begin; return TRUE; } - else if (*p == '/' || *p == ':') { + else if (*p == '/' || *p == ':' || is_url_end (*p)) { /* Parse arguments, ports by normal way by url default function */ p = match->m_begin; /* Check common prefix */ @@ -1813,6 +1825,13 @@ url_web_start (struct url_callback_data *cb, return FALSE; } + if (pos > cb->begin) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + match->m_begin = pos; return TRUE; @@ -1829,6 +1848,13 @@ url_web_end (struct url_callback_data *cb, return FALSE; } + if (last < cb->end && *last == '>') { + /* We need to ensure that url also starts with '<' */ + if (match->st != '<') { + return FALSE; + } + } + match->m_len = (last - pos); return TRUE; @@ -1855,6 +1881,13 @@ url_email_start (struct url_callback_data *cb, } } + if (pos > cb->begin - 1) { + match->st = *(pos - 1); + } + else { + match->st = '\0'; + } + return TRUE; } @@ -2063,13 +2096,13 @@ rspamd_url_trie_callback (int strnum, int textpos, void *context) pos = &cb->begin[textpos]; if (pos < cb->end) { if (!g_ascii_isspace (*pos) && *pos != '/' && *pos != '?' && - *pos != ':') { + *pos != ':' && !is_url_end (*pos)) { if (*pos == '.') { /* We allow . at the end of the domain however */ pos++; if (pos < cb->end) { if (!g_ascii_isspace (*pos) && *pos != '/' && - *pos != '?' 
&& *pos != ':') { + *pos != '?' && *pos != ':' && !is_url_end (*pos)) { return 0; } }