source.dussan.org Git - rspamd.git/commitdiff
Fix phishing check for special cases like http://host.com and http://www.host.com
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 14 Mar 2011 15:19:17 +0000 (18:19 +0300)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 14 Mar 2011 15:19:17 +0000 (18:19 +0300)
src/html.c
src/url.c

index 29d535564d9cd1cfd5e8334a9e67ceba00a642a8..74bdc29ac488636b41704e88cef52b96b306189b 100644 (file)
@@ -683,12 +683,12 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url
 {
        struct uri                     *new;
        gchar                          *url_str;
-       const gchar                    *p;
+       const gchar                    *p, *c;
        gsize                           len = 0;
        gint                            off, rc;
 
        p = url_text;
-       while (len < remain) {
+       while (len < remain - 1) {
                if (*p == '<' || *p == '>') {
                        break;
                }
@@ -704,8 +704,39 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url
                        if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
                                if (g_ascii_strncasecmp (href_url->host, new->host,
                                                MAX (href_url->hostlen, new->hostlen)) != 0) {
-                                       href_url->is_phished = TRUE;
-                                       href_url->phished_url = new;
+                                       /* Special check for urls beginning with 'www' */
+                                       if (new->hostlen > 4 && href_url->hostlen > 4) {
+                                               p = new->host;
+                                               c = NULL;
+                                               if ((p[0] == 'w' || p[0] == 'W') &&
+                                                       (p[1] == 'w' || p[1] == 'W') &&
+                                                       (p[2] == 'w' || p[2] == 'W') &&
+                                                       (p[3] == '.')) {
+                                                       p += 4;
+                                                       c = href_url->host;
+                                                       len = MAX (href_url->hostlen, new->hostlen - 4);
+                                               }
+                                               else {
+                                                       p = href_url->host;
+                                                       if ((p[0] == 'w' || p[0] == 'W') &&
+                                                               (p[1] == 'w' || p[1] == 'W') &&
+                                                               (p[2] == 'w' || p[2] == 'W') &&
+                                                               (p[3] == '.')) {
+                                                               p += 4;
+                                                               c = new->host;
+                                                               len = MAX (href_url->hostlen - 4, new->hostlen);
+                                                       }
+                                               }
+                                               /* Compare parts and check for phished hostname */
+                                               if (c != NULL && g_ascii_strncasecmp (p, c, len) != 0) {
+                                                       href_url->is_phished = TRUE;
+                                                       href_url->phished_url = new;
+                                               }
+                                       }
+                                       else {
+                                               href_url->is_phished = TRUE;
+                                               href_url->phished_url = new;
+                                       }
                                }
                        }
                        else {
@@ -805,7 +836,8 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
                         * Check for phishing
                         */
                        if ((p = strchr (c, '>')) != NULL ) {
-                               check_phishing (task, url, p + 1, remain - (p - tag_text));
+                               p ++;
+                               check_phishing (task, url, p, remain - (p - tag_text));
                        }
                        if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
                                g_tree_insert (part->html_urls, url_text, url);
index 83eb30d4de41d2f39a8e9b74d116321bafd29c35..8282f92172596c1b8c0f4ea25a5ffd05ae03ff7b 100644 (file)
--- a/src/url.c
+++ b/src/url.c
@@ -930,7 +930,7 @@ url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_
                }
        }
 
-       while (p < end - 1 && *p != stop && is_urlsafe (*p)) {
+       while (p < end && *p != stop && is_urlsafe (*p)) {
                p ++;
        }
 
@@ -979,14 +979,14 @@ url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t
        if (is_atom (*p)) {
                /* might be a domain or user@domain */
                c = p;
-               while (p < end - 1) {
+               while (p < end) {
                        if (!is_atom (*p)) {
                                break;
                        }
 
                        p++;
 
-                       while (p < end - 1 && is_atom (*p)) {
+                       while (p < end && is_atom (*p)) {
                                p++;
                        }
 
@@ -1006,18 +1006,18 @@ url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t
        }
        else if (is_domain (*p)) {
 domain:
-               while (p < end - 1) {
+               while (p < end) {
                        if (!is_domain (*p)) {
                                break;
                        }
 
                        p++;
 
-                       while (p < end - 1 && is_domain (*p)) {
+                       while (p < end && is_domain (*p)) {
                                p++;
                        }
 
-                       if ((p + 1) < end - 1 && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/')) {
+                       if ((p + 1) < end && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/')) {
                                p++;
                        }
                }
@@ -1034,7 +1034,7 @@ domain:
                        if (is_digit (*p) || passwd) {
                                port = (*p++ - '0');
 
-                               while (p < end - 1 && is_digit (*p) && port < 65536) {
+                               while (p < end && is_digit (*p) && port < 65536) {
                                        port = (port * 10) + (*p++ - '0');
                                }
 
@@ -1052,7 +1052,7 @@ domain:
                                passwd = TRUE;
                                c = p;
 
-                               while (p < end - 1 && is_atom (*p)) {
+                               while (p < end && is_atom (*p)) {
                                        p++;
                                }
 
@@ -1076,7 +1076,7 @@ domain:
                case '/': /* we've detected a path component to our url */
                        p++;
                case '?':
-                       while (p < end - 1 && is_urlsafe (*p)) {
+                       while (p < end && is_urlsafe (*p)) {
                                if (*p == open_brace) {
                                        brace_stack++;
                                }