Fix phishing check for hosts that differ only by a leading "www." prefix (e.g. http://host.com and http://www.host.com are no longer reported as phishing)

author: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-03-14 18:19:17 +0300
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru> 2011-03-14 18:19:17 +0300
commit: 9dcd8232a2326b951f9d7fc8d7063a89a4312468 (patch)
tree: 6bc92ab0536fa5dbf667ebe696d38e7a1ca30995
parent: eb746e9a050fb8032ac213751ba55f9229213c91 (diff)
download: rspamd-9dcd8232a2326b951f9d7fc8d7063a89a4312468.tar.gz rspamd-9dcd8232a2326b951f9d7fc8d7063a89a4312468.zip
2 files changed, 46 insertions, 14 deletions
diff --git a/src/html.c b/src/html.c
index 29d535564..74bdc29ac 100644
--- a/src/html.c
+++ b/src/html.c
@@ -683,12 +683,12 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url
 {
 	struct uri                     *new;
 	gchar                          *url_str;
-	const gchar                    *p;
+	const gchar                    *p, *c;
 	gsize                           len = 0;
 	gint                            off, rc;
 
 	p = url_text;
-	while (len < remain) {
+	while (len < remain - 1) {
 		if (*p == '<' || *p == '>') {
 			break;
 		}
@@ -704,8 +704,39 @@ check_phishing (struct worker_task *task, struct uri *href_url, const gchar *url
 			if (rc == URI_ERRNO_OK || rc == URI_ERRNO_NO_SLASHES || rc == URI_ERRNO_NO_HOST_SLASH) {
 				if (g_ascii_strncasecmp (href_url->host, new->host,
 						MAX (href_url->hostlen, new->hostlen)) != 0) {
-					href_url->is_phished = TRUE;
-					href_url->phished_url = new;
+					/* Special check for urls beginning with 'www' */
+					if (new->hostlen > 4 && href_url->hostlen > 4) {
+						p = new->host;
+						c = NULL;
+						if ((p[0] == 'w' || p[0] == 'W') &&
+							(p[1] == 'w' || p[1] == 'W') &&
+							(p[2] == 'w' || p[2] == 'W') &&
+							(p[3] == '.')) {
+							p += 4;
+							c = href_url->host;
+							len = MAX (href_url->hostlen, new->hostlen - 4);
+						}
+						else {
+							p = href_url->host;
+							if ((p[0] == 'w' || p[0] == 'W') &&
+								(p[1] == 'w' || p[1] == 'W') &&
+								(p[2] == 'w' || p[2] == 'W') &&
+								(p[3] == '.')) {
+								p += 4;
+								c = new->host;
+								len = MAX (href_url->hostlen - 4, new->hostlen);
+							}
+						}
+						/* Compare parts and check for phished hostname */
+						if (c != NULL && g_ascii_strncasecmp (p, c, len) != 0) {
+							href_url->is_phished = TRUE;
+							href_url->phished_url = new;
+						}
+					}
+					else {
+						href_url->is_phished = TRUE;
+						href_url->phished_url = new;
+					}
 				}
 			}
 			else {
@@ -805,7 +836,8 @@ parse_tag_url (struct worker_task *task, struct mime_text_part *part, tag_id_t i
 			 * Check for phishing
 			 */
 			if ((p = strchr (c, '>')) != NULL ) {
-				check_phishing (task, url, p + 1, remain - (p - tag_text));
+				p ++;
+				check_phishing (task, url, p, remain - (p - tag_text));
 			}
 			if (part->html_urls && g_tree_lookup (part->html_urls, url_text) == NULL) {
 				g_tree_insert (part->html_urls, url_text, url);
diff --git a/src/url.c b/src/url.c
index 83eb30d4d..8282f9217 100644
--- a/src/url.c
+++ b/src/url.c
@@ -930,7 +930,7 @@ url_file_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_
 		}
 	}
 
-	while (p < end - 1 && *p != stop && is_urlsafe (*p)) {
+	while (p < end && *p != stop && is_urlsafe (*p)) {
 		p ++;
 	}
 
@@ -979,14 +979,14 @@ url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t
 	if (is_atom (*p)) {
 		/* might be a domain or user@domain */
 		c = p;
-		while (p < end - 1) {
+		while (p < end) {
 			if (!is_atom (*p)) {
 				break;
 			}
 
 			p++;
 
-			while (p < end - 1 && is_atom (*p)) {
+			while (p < end && is_atom (*p)) {
 				p++;
 			}
 
@@ -1006,18 +1006,18 @@ url_web_end (const gchar *begin, const gchar *end, const gchar *pos, url_match_t
 	}
 	else if (is_domain (*p)) {
 domain:
-		while (p < end - 1) {
+		while (p < end) {
 			if (!is_domain (*p)) {
 				break;
 			}
 
 			p++;
 
-			while (p < end - 1 && is_domain (*p)) {
+			while (p < end && is_domain (*p)) {
 				p++;
 			}
 
-			if ((p + 1) < end - 1 && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/')) {
+			if ((p + 1) < end && *p == '.' && (is_domain (*(p + 1)) || *(p + 1) == '/')) {
 				p++;
 			}
 		}
@@ -1034,7 +1034,7 @@ domain:
 			if (is_digit (*p) || passwd) {
 				port = (*p++ - '0');
 
-				while (p < end - 1 && is_digit (*p) && port < 65536) {
+				while (p < end && is_digit (*p) && port < 65536) {
 					port = (port * 10) + (*p++ - '0');
 				}
 
@@ -1052,7 +1052,7 @@ domain:
 				passwd = TRUE;
 				c = p;
 
-				while (p < end - 1 && is_atom (*p)) {
+				while (p < end && is_atom (*p)) {
 					p++;
 				}
 
@@ -1076,7 +1076,7 @@ domain:
 		case '/': /* we've detected a path component to our url */
 			p++;
 		case '?':
-			while (p < end - 1 && is_urlsafe (*p)) {
+			while (p < end && is_urlsafe (*p)) {
 				if (*p == open_brace) {
 					brace_stack++;
 				}
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2011-03-14 18:19:17 +0300
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>	2011-03-14 18:19:17 +0300
commit	9dcd8232a2326b951f9d7fc8d7063a89a4312468 (patch)
tree	6bc92ab0536fa5dbf667ebe696d38e7a1ca30995
parent	eb746e9a050fb8032ac213751ba55f9229213c91 (diff)
download	rspamd-9dcd8232a2326b951f9d7fc8d7063a89a4312468.tar.gz rspamd-9dcd8232a2326b951f9d7fc8d7063a89a4312468.zip