Rework phishing detection to reduce the false positive rate.

We now use the TLD to match phished URLs. We may occasionally miss true positives because subdomains are no longer checked strictly; however, this change should significantly reduce the false positive rate of this rule.
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-04-23 13:08:00 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-04-23 13:08:00 +0100
commit: d14400e3ae7c9ccfd0d010a95f2d57add99a6868 (patch)
tree: b17bda696139a61d2b92e6cbc3871b283e84f495 /src
parent: 31adf2042911d755510a1992018cd3bf479b9bc8 (diff)
download: rspamd-d14400e3ae7c9ccfd0d010a95f2d57add99a6868.tar.gz
rspamd-d14400e3ae7c9ccfd0d010a95f2d57add99a6868.zip
1 files changed, 9 insertions, 43 deletions
diff --git a/src/libserver/html.c b/src/libserver/html.c
index 563ac0825..f978ff1c7 100644
--- a/src/libserver/html.c
+++ b/src/libserver/html.c
@@ -676,7 +676,7 @@ check_phishing (struct rspamd_task *task,
 	gsize remain,
 	tag_id_t id)
 {
-	struct rspamd_url *new;
+	struct rspamd_url *text_url;
 	gchar *url_str;
 	const gchar *p, *c;
 	gchar tagbuf[128];
@@ -731,51 +731,17 @@ check_phishing (struct rspamd_task *task,
 
 	if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
 		TRUE, &state) && url_str != NULL) {
-		new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
-		g_strstrip (url_str);
-		rc = rspamd_url_parse (new, url_str, strlen (url_str), task->task_pool);
+		text_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
+		rc = rspamd_url_parse (text_url, url_str, strlen (url_str), task->task_pool);
 
 		if (rc == URI_ERRNO_OK) {
-			if (g_ascii_strncasecmp (href_url->host, new->host,
-					MAX (href_url->hostlen, new->hostlen)) != 0) {
-				/* Special check for urls beginning with 'www' */
-				if (new->hostlen > 4 && href_url->hostlen > 4) {
-					p = new->host;
-					c = NULL;
-					if ((p[0] == 'w' || p[0] == 'W') &&
-							(p[1] == 'w' || p[1] == 'W') &&
-							(p[2] == 'w' || p[2] == 'W') &&
-							(p[3] == '.')) {
-						p += 4;
-						c = href_url->host;
-						len = MAX (href_url->hostlen, new->hostlen - 4);
-					}
-					else {
-						p = href_url->host;
-						if ((p[0] == 'w' || p[0] == 'W') &&
-								(p[1] == 'w' || p[1] == 'W') &&
-								(p[2] == 'w' || p[2] == 'W') &&
-								(p[3] == '.')) {
-							p += 4;
-							c = new->host;
-							len = MAX (href_url->hostlen - 4, new->hostlen);
-						}
-					}
-					/* Compare parts and check for phished hostname */
-					if (c != NULL) {
-						if (g_ascii_strncasecmp (p, c, len) != 0) {
-							href_url->is_phished = TRUE;
-							href_url->phished_url = new;
-						}
-					}
-					else {
-						href_url->is_phished = TRUE;
-						href_url->phished_url = new;
-					}
-				}
-				else {
+			if (href_url->hostlen != text_url->hostlen || memcmp (href_url->host,
+					text_url->host, href_url->hostlen) != 0) {
+
+				if (href_url->tldlen != text_url->tldlen || memcmp (href_url->tld,
+						text_url->tld, href_url->tldlen) != 0) {
 					href_url->is_phished = TRUE;
-					href_url->phished_url = new;
+					href_url->phished_url = text_url;
 				}
 			}
 		}
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-04-23 13:08:00 +0100
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2015-04-23 13:08:00 +0100
commit	d14400e3ae7c9ccfd0d010a95f2d57add99a6868 (patch)
tree	b17bda696139a61d2b92e6cbc3871b283e84f495 /src
parent	31adf2042911d755510a1992018cd3bf479b9bc8 (diff)
download	rspamd-d14400e3ae7c9ccfd0d010a95f2d57add99a6868.tar.gz rspamd-d14400e3ae7c9ccfd0d010a95f2d57add99a6868.zip