]> source.dussan.org Git - rspamd.git/commitdiff
Rework phishing detection to reduce the false-positive rate.
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Apr 2015 12:08:00 +0000 (13:08 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Apr 2015 12:08:00 +0000 (13:08 +0100)
Now we use the TLD to match phished URLs. We may sometimes miss real
positives by not checking subdomains strictly; however, this change
should significantly reduce the false-positive rate of this rule.

src/libserver/html.c

index 563ac0825bd6ada3e4805b6a70935dfe07935d72..f978ff1c7fc7153d2588fb388b1d9b431ec2416c 100644 (file)
@@ -676,7 +676,7 @@ check_phishing (struct rspamd_task *task,
        gsize remain,
        tag_id_t id)
 {
-       struct rspamd_url *new;
+       struct rspamd_url *text_url;
        gchar *url_str;
        const gchar *p, *c;
        gchar tagbuf[128];
@@ -731,51 +731,17 @@ check_phishing (struct rspamd_task *task,
 
        if (rspamd_url_find (task->task_pool, url_text, len, NULL, NULL, &url_str,
                TRUE, &state) && url_str != NULL) {
-               new = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
-               g_strstrip (url_str);
-               rc = rspamd_url_parse (new, url_str, strlen (url_str), task->task_pool);
+               text_url = rspamd_mempool_alloc0 (task->task_pool, sizeof (struct rspamd_url));
+               rc = rspamd_url_parse (text_url, url_str, strlen (url_str), task->task_pool);
 
                if (rc == URI_ERRNO_OK) {
-                       if (g_ascii_strncasecmp (href_url->host, new->host,
-                                       MAX (href_url->hostlen, new->hostlen)) != 0) {
-                               /* Special check for urls beginning with 'www' */
-                               if (new->hostlen > 4 && href_url->hostlen > 4) {
-                                       p = new->host;
-                                       c = NULL;
-                                       if ((p[0] == 'w' || p[0] == 'W') &&
-                                                       (p[1] == 'w' || p[1] == 'W') &&
-                                                       (p[2] == 'w' || p[2] == 'W') &&
-                                                       (p[3] == '.')) {
-                                               p += 4;
-                                               c = href_url->host;
-                                               len = MAX (href_url->hostlen, new->hostlen - 4);
-                                       }
-                                       else {
-                                               p = href_url->host;
-                                               if ((p[0] == 'w' || p[0] == 'W') &&
-                                                               (p[1] == 'w' || p[1] == 'W') &&
-                                                               (p[2] == 'w' || p[2] == 'W') &&
-                                                               (p[3] == '.')) {
-                                                       p += 4;
-                                                       c = new->host;
-                                                       len = MAX (href_url->hostlen - 4, new->hostlen);
-                                               }
-                                       }
-                                       /* Compare parts and check for phished hostname */
-                                       if (c != NULL) {
-                                               if (g_ascii_strncasecmp (p, c, len) != 0) {
-                                                       href_url->is_phished = TRUE;
-                                                       href_url->phished_url = new;
-                                               }
-                                       }
-                                       else {
-                                               href_url->is_phished = TRUE;
-                                               href_url->phished_url = new;
-                                       }
-                               }
-                               else {
+                       if (href_url->hostlen != text_url->hostlen || memcmp (href_url->host,
+                                       text_url->host, href_url->hostlen) != 0) {
+
+                               if (href_url->tldlen != text_url->tldlen || memcmp (href_url->tld,
+                                               text_url->tld, href_url->tldlen) != 0) {
                                        href_url->is_phished = TRUE;
-                                       href_url->phished_url = new;
+                                       href_url->phished_url = text_url;
                                }
                        }
                }