From: Vsevolod Stakhov
Date: Wed, 15 Apr 2015 15:27:33 +0000 (+0100)
Subject: Add TLD detection for urls.
X-Git-Tag: 0.9.0~239
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=ac1c58789c0c7a57dfd9019850ae4cbff6e485f6;p=rspamd.git

Add TLD detection for urls.
---

diff --git a/src/libserver/url.c b/src/libserver/url.c
index 345bd62a3..174884e71 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -877,6 +877,50 @@ out:
 
 #undef SET_U
 
+static gint
+rspamd_tld_trie_callback (int strnum, int textpos, void *context)
+{
+	struct url_matcher *matcher;
+	const gchar *start, *pos, *p;
+	struct rspamd_url *url = context;
+	ac_trie_pat_t *pat;
+	gint ndots = 1;
+
+	matcher = &g_array_index (url_scanner->matchers, struct url_matcher, strnum);
+	pat = &g_array_index (url_scanner->patterns, ac_trie_pat_t, strnum);
+
+	if (matcher->flags & URL_FLAG_STAR_MATCH) {
+		/* Skip one more tld component */
+		ndots = 2;
+	}
+
+	pos = url->host + textpos;
+	start = url->host;
+
+	if (*pos != '.' || pos + pat->len != url->host + url->hostlen) {
+		/* Something weird has been found */
+		return 0;
+	}
+
+	/* Now we need to find top level domain */
+	p = pos - 1;
+	while (p >= start && ndots > 0) {
+		if (*p == '.') {
+			ndots --;
+			pos = p + 1;
+		}
+
+		p --;
+	}
+
+	if (ndots == 0) {
+		url->tld = (gchar *)pos;
+		url->tldlen = url->host + url->hostlen - pos;
+	}
+
+	return 1;
+}
+
 enum uri_errno
 rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 	rspamd_mempool_t *pool)
@@ -885,6 +929,7 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 	gchar *p, *comp;
 	const gchar *end;
 	guint i, complen, ret;
+	gint state = 0;
 
 	const struct {
 		enum rspamd_url_protocol proto;
@@ -1012,6 +1057,10 @@ rspamd_url_parse (struct rspamd_url *uri, gchar *uristring, gsize len,
 		}
 	}
 
+	/* Find TLD part */
+	acism_lookup (url_scanner->search_trie, uri->host, uri->hostlen,
+		rspamd_tld_trie_callback, uri, &state, true);
+
 	if (uri->protocol == PROTOCOL_UNKNOWN) {
 		return URI_ERRNO_INVALID_PROTOCOL;
 	}
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 664f1a945..ce8154211 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -20,6 +20,7 @@ struct rspamd_url {
 	gchar *query;
 	gchar *fragment;
 	gchar *surbl;
+	gchar *tld;
 
 	struct rspamd_url *phished_url;
 
@@ -31,7 +32,7 @@ struct rspamd_url {
 	guint querylen;
 	guint fragmentlen;
 	guint surbllen;
-
+	guint tldlen;
 	guint urllen;
 
 	gboolean is_phished; /* URI maybe phishing */