]> source.dussan.org Git - rspamd.git/commitdiff
Add TLD detection for urls.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 15 Apr 2015 15:27:33 +0000 (16:27 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 15 Apr 2015 15:27:33 +0000 (16:27 +0100)
src/libserver/url.c
src/libserver/url.h

index 345bd62a3c4079c4ee8bfc21d7b83f79b02156f1..174884e71cd48f0a1b5141a64e98f612a3ddd3c7 100644 (file)
@@ -877,6 +877,50 @@ out:
 
 #undef SET_U
 
+/*
+ * Trie callback used to locate the TLD part of a url host.
+ *
+ * strnum  - index of the matched pattern; used to fetch the matcher and the
+ *           pattern from url_scanner->matchers / url_scanner->patterns
+ * textpos - offset inside url->host reported by the trie; acism passes the
+ *           offset of the END of the matched text
+ * context - the struct rspamd_url whose host is being scanned
+ *
+ * On an acceptable match url->tld / url->tldlen are set to the tail of the
+ * host that consists of the matched suffix plus one preceding label (two
+ * labels when the matcher has URL_FLAG_STAR_MATCH). If the host has fewer
+ * labels than that, the whole host is used.
+ *
+ * Returns 0 when the match is ignored and 1 when the tld has been stored.
+ * NOTE(review): a non-zero return presumably stops the scan after the first
+ * accepted suffix, so a longer overlapping suffix may be missed - confirm.
+ */
+static gint
+rspamd_tld_trie_callback (int strnum, int textpos, void *context)
+{
+       struct url_matcher *matcher;
+       const gchar *start, *end, *p, *tld;
+       struct rspamd_url *url = context;
+       ac_trie_pat_t *pat;
+       gint ndots = 1;
+
+       matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum);
+       pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
+
+       if (matcher->flags & URL_FLAG_STAR_MATCH) {
+               /* Skip one more tld component */
+               ndots = 2;
+       }
+
+       start = url->host;
+       end = url->host + url->hostlen;
+
+       /*
+        * textpos is the end of the match, so a usable suffix must finish
+        * exactly at the end of the host and must fit inside the host.
+        */
+       if (textpos < 0 || (guint)textpos != url->hostlen ||
+                       pat->len == 0 || (gsize)pat->len > (gsize)url->hostlen) {
+               return 0;
+       }
+
+       /* First character of the matched suffix */
+       p = end - pat->len;
+
+       if (*p != '.') {
+               /* Something weird has been found: match is not on a label boundary */
+               return 0;
+       }
+
+       /*
+        * Walk backwards from the suffix looking for `ndots` more dots; the
+        * tld starts right after the last of them. The pointer never moves
+        * below `start`.
+        */
+       tld = start;
+       while (p > start && ndots > 0) {
+               p --;
+
+               if (*p == '.') {
+                       ndots --;
+                       tld = p + 1;
+               }
+       }
+
+       if (ndots > 0) {
+               /* Host is shorter than the requested number of labels: use it all */
+               tld = start;
+       }
+
+       url->tld = (gchar *)tld;
+       url->tldlen = end - tld;
+
+       return 1;
+}
+
 enum uri_errno
 rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                rspamd_mempool_t *pool)
@@ -885,6 +929,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
        gchar *p, *comp;
        const gchar *end;
        guint i, complen, ret;
+       gint state = 0;
 
        const struct {
                enum rspamd_url_protocol proto;
@@ -1012,6 +1057,10 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
                }
        }
 
+       /* Find TLD part */
+       acism_lookup (url_scanner->search_trie, uri->host, uri->hostlen,
+                       rspamd_tld_trie_callback, uri, &state, true);
+
        if (uri->protocol == PROTOCOL_UNKNOWN) {
                return URI_ERRNO_INVALID_PROTOCOL;
        }
index 664f1a945240bdcbd96ba465ced5a85f94197c2b..ce81542116b57b5e7960227f83ff7875cbc9b308 100644 (file)
@@ -20,6 +20,7 @@ struct rspamd_url {
        gchar *query;
        gchar *fragment;
        gchar *surbl;
+       gchar *tld;
 
        struct rspamd_url *phished_url;
 
@@ -31,7 +32,7 @@ struct rspamd_url {
        guint querylen;
        guint fragmentlen;
        guint surbllen;
-
+       guint tldlen;
        guint urllen;
 
        gboolean is_phished; /* URI maybe phishing */