From: Vsevolod Stakhov Date: Sun, 5 Apr 2015 18:03:48 +0000 (+0100) Subject: Rework URL parser to load tld file. X-Git-Tag: 0.9.0~331 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=a9cca486d560a4d2741a86e2ea53298658af1aac;p=rspamd.git Rework URL parser to load tld file. --- diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 16ecb2800..bed89516d 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -247,13 +247,15 @@ struct rspamd_config { gchar *cache_filename; /**< filename of cache file */ struct metric *default_metric; /**< default metric */ - gchar * checksum; /**< real checksum of config file */ - gchar * dump_checksum; /**< dump checksum of config file */ + gchar * checksum; /**< real checksum of config file */ + gchar * dump_checksum; /**< dump checksum of config file */ gpointer lua_state; /**< pointer to lua state */ - gchar * rrd_file; /**< rrd file to store statistics */ + gchar * rrd_file; /**< rrd file to store statistics */ - gchar * history_file; /**< file to save rolling history */ + gchar * history_file; /**< file to save rolling history */ + + gchar * tld_file; /**< file to load effective tld list from */ gdouble dns_timeout; /**< timeout in milliseconds for waiting for dns reply */ guint32 dns_retransmits; /**< maximum retransmits count */ diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index 6eaaeda63..07982d018 100644 --- a/src/libserver/cfg_rcl.c +++ b/src/libserver/cfg_rcl.c @@ -1309,6 +1309,11 @@ rspamd_rcl_config_init (void) rspamd_rcl_parse_struct_integer, G_STRUCT_OFFSET (struct rspamd_config, min_word_len), RSPAMD_CL_FLAG_INT_32); + rspamd_rcl_add_default_handler (sub, + "url_tld", + rspamd_rcl_parse_struct_string, + G_STRUCT_OFFSET (struct rspamd_config, tld_file), + RSPAMD_CL_FLAG_STRING_PATH); /** * Metric section diff --git a/src/libserver/url.c b/src/libserver/url.c index a3dbf92f4..629d9baa1 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -41,8 +41,9 @@ typedef struct url_match_s { gboolean add_prefix; } url_match_t; -#define URL_FLAG_NOHTML 0x1 -#define URL_FLAG_STRICT_MATCH 0x2 +#define URL_FLAG_NOHTML (1 << 0) +#define URL_FLAG_STRICT_MATCH (1 << 1) +#define URL_FLAG_STAR_MATCH (1 << 2) struct url_matcher { const gchar *pattern; @@ -90,7 +91,7 @@ static gboolean url_email_end (const gchar *begin, const gchar *pos, url_match_t *match); -struct url_matcher matchers[] = { +struct url_matcher static_matchers[] = { /* Common prefixes */ { "file://", "", url_file_start, url_file_end, 0 }, @@ -671,9 +672,9 @@ struct url_matcher matchers[] = { }; struct url_match_scanner { - struct url_matcher *matchers; - gsize matchers_count; - rspamd_trie_t *patterns; + GArray *matchers; + rspamd_trie_t *search_trie; + rspamd_trie_t *tld_trie; }; struct url_match_scanner *url_scanner = NULL; @@ -821,49 +822,130 @@ rspamd_url_strerror (enum uri_errno err) return NULL; } -static gint -url_init (void) +static void +rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner) +{ + FILE *f; + struct url_matcher m; + gchar *linebuf = NULL, *p; + gsize buflen = 0; + gssize r; + gint flags; + + f = fopen (fname, "r"); + + if (f == NULL) { + msg_err ("cannot open TLD file %s: %s", fname, strerror (errno)); + return; + } + + m.end = url_tld_end; + m.start = url_tld_start; + m.prefix = "http://"; + + while ((r = getline (&linebuf, &buflen, f)) > 0) { + if (linebuf[0] == '/' || g_ascii_isspace (linebuf[0])) { + /* Skip comment or empty line */ + continue; + } + + g_strchomp (linebuf); + + /* TODO: add support for ! patterns */ + if (linebuf[0] == '!') { + msg_debug ("skip '!' patterns from parsing for now: %s", linebuf); + continue; + } + + flags = URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH; + + if (linebuf[0] == '*') { + flags |= URL_FLAG_STAR_MATCH; + p = strchr (linebuf, '.'); + + if (p == NULL) { + msg_err ("got bad star line, skip it: %s", linebuf); + continue; + } + p ++; + } + else { + p = linebuf; + } + + m.pattern = g_strdup (p); + g_array_append_val (url_scanner->matchers, m); + } + + free (linebuf); + fclose (f); +} + +static void +rspamd_url_add_static_matchers (GArray *matchers) +{ + gint n = G_N_ELEMENTS (static_matchers); + + g_array_append_vals (matchers, static_matchers, n); +} + +void +rspamd_url_init (struct rspamd_config *cfg) { guint i; gchar patbuf[128]; + struct url_matcher *m; if (url_scanner == NULL) { url_scanner = g_malloc (sizeof (struct url_match_scanner)); - url_scanner->matchers = matchers; - url_scanner->matchers_count = G_N_ELEMENTS (matchers); - url_scanner->patterns = rspamd_trie_create (TRUE); - for (i = 0; i < url_scanner->matchers_count; i++) { - if (matchers[i].flags & URL_FLAG_STRICT_MATCH) { + url_scanner->matchers = g_array_new (FALSE, TRUE, + sizeof (struct url_matcher)); + url_scanner->search_trie = rspamd_trie_create (TRUE); + url_scanner->tld_trie = rspamd_trie_create (TRUE); + rspamd_url_add_static_matchers (url_scanner->matchers); + + if (cfg->tld_file) { + rspamd_url_parse_tld_file (cfg->tld_file, url_scanner); + } + else { + msg_warn ("tld extension file is not specified, url matching is limited"); + } + + for (i = 0; i < url_scanner->matchers->len; i++) { + m = &g_array_index (url_scanner->matchers, struct url_matcher, i); + + if (m->flags & URL_FLAG_STRICT_MATCH) { /* Insert more specific patterns */ /* some.tld/ */ rspamd_snprintf (patbuf, sizeof (patbuf), "%s/", - matchers[i].pattern); - rspamd_trie_insert (url_scanner->patterns, patbuf, i); + m->pattern); + rspamd_trie_insert (url_scanner->search_trie, patbuf, i); /* some.tld */ rspamd_snprintf (patbuf, sizeof (patbuf), "%s ", - matchers[i].pattern); - rspamd_trie_insert (url_scanner->patterns, patbuf, i); + m->pattern); + rspamd_trie_insert (url_scanner->search_trie, patbuf, i); /* some.tld: */ rspamd_snprintf (patbuf, sizeof (patbuf), "%s:", - matchers[i].pattern); - rspamd_trie_insert (url_scanner->patterns, patbuf, i); + m->pattern); + rspamd_trie_insert (url_scanner->search_trie, patbuf, i); } else { - rspamd_trie_insert (url_scanner->patterns, - matchers[i].pattern, - i); + rspamd_trie_insert (url_scanner->search_trie, m->pattern, i); + } + + /* Also use it for TLD lookups */ + if (strcmp (m->prefix, "http://") == 0) { + rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i); } } } - - return 0; } #define SET_U(u, field) do { \ @@ -1773,54 +1855,52 @@ rspamd_url_text_extract (rspamd_mempool_t * pool, return; } - if (url_init () == 0) { - begin = part->content->data; - end = begin + part->content->len; - p = begin; - while (p < end) { - if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, + begin = part->content->data; + end = begin + part->content->len; + p = begin; + while (p < end) { + if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str, is_html)) { - if (url_str != NULL) { - new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); - ex = + if (url_str != NULL) { + new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url)); + ex = rspamd_mempool_alloc0 (pool, - sizeof (struct process_exception)); - if (new != NULL) { - g_strstrip (url_str); - rc = rspamd_url_parse (new, url_str, strlen (url_str), pool); - if (rc == URI_ERRNO_OK && + sizeof (struct process_exception)); + if (new != NULL) { + g_strstrip (url_str); + rc = rspamd_url_parse (new, url_str, strlen (url_str), pool); + if (rc == URI_ERRNO_OK && new->hostlen > 0) { - ex->pos = url_start - begin; - ex->len = url_end - url_start; - if (new->protocol == PROTOCOL_MAILTO) { - if (new->userlen > 0) { - if (!g_tree_lookup (task->emails, new)) { - g_tree_insert (task->emails, new, new); - } + ex->pos = url_start - begin; + ex->len = url_end - url_start; + if (new->protocol == PROTOCOL_MAILTO) { + if (new->userlen > 0) { + if (!g_tree_lookup (task->emails, new)) { + g_tree_insert (task->emails, new, new); } } - else { - if (!g_tree_lookup (task->urls, new)) { - g_tree_insert (task->urls, new, new); - } + } + else { + if (!g_tree_lookup (task->urls, new)) { + g_tree_insert (task->urls, new, new); } - part->urls_offset = g_list_prepend ( + } + part->urls_offset = g_list_prepend ( part->urls_offset, ex); - } - else if (rc != URI_ERRNO_OK) { - msg_info ("extract of url '%s' failed: %s", + } + else if (rc != URI_ERRNO_OK) { + msg_info ("extract of url '%s' failed: %s", url_str, rspamd_url_strerror (rc)); - } } } } - else { - break; - } - p = url_end + 1; } + else { + break; + } + p = url_end + 1; } /* Handle offsets of this part */ if (part->urls_offset != NULL) { @@ -1845,57 +1925,55 @@ rspamd_url_find (rspamd_mempool_t *pool, url_match_t m; end = begin + len; - if (url_init () == 0) { - if ((pos = - rspamd_trie_lookup (url_scanner->patterns, begin, len, - &idx)) == NULL) { + if ((pos = + rspamd_trie_lookup (url_scanner->search_trie, begin, len, + &idx)) == NULL) { + return FALSE; + } + else { + matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx); + if ((matcher->flags & URL_FLAG_NOHTML) && is_html) { + /* Do not try to match non-html like urls in html texts */ return FALSE; } - else { - matcher = &matchers[idx]; - if ((matcher->flags & URL_FLAG_NOHTML) && is_html) { - /* Do not try to match non-html like urls in html texts */ - return FALSE; - } - m.pattern = matcher->pattern; - m.prefix = matcher->prefix; - m.add_prefix = FALSE; - if (matcher->start (begin, end, pos, + m.pattern = matcher->pattern; + m.prefix = matcher->prefix; + m.add_prefix = FALSE; + if (matcher->start (begin, end, pos, &m) && matcher->end (begin, end, pos, &m)) { - if (m.add_prefix || matcher->prefix[0] != '\0') { - l = m.m_len + 1 + strlen (m.prefix); - *url_str = rspamd_mempool_alloc (pool, l); - rspamd_snprintf (*url_str, + if (m.add_prefix || matcher->prefix[0] != '\0') { + l = m.m_len + 1 + strlen (m.prefix); + *url_str = rspamd_mempool_alloc (pool, l); + rspamd_snprintf (*url_str, l, "%s%*s", m.prefix, m.m_len, m.m_begin); - } - else { - *url_str = rspamd_mempool_alloc (pool, m.m_len + 1); - memcpy (*url_str, m.m_begin, m.m_len); - (*url_str)[m.m_len] = '\0'; - } - if (start != NULL) { - *start = (gchar *)m.m_begin; - } - if (fin != NULL) { - *fin = (gchar *)m.m_begin + m.m_len; - } } else { - *url_str = NULL; - if (start != NULL) { - *start = (gchar *)pos; - } - if (fin != NULL) { - *fin = (gchar *)pos + strlen (m.prefix); - } + *url_str = rspamd_mempool_alloc (pool, m.m_len + 1); + memcpy (*url_str, m.m_begin, m.m_len); + (*url_str)[m.m_len] = '\0'; + } + if (start != NULL) { + *start = (gchar *)m.m_begin; + } + if (fin != NULL) { + *fin = (gchar *)m.m_begin + m.m_len; + } + } + else { + *url_str = NULL; + if (start != NULL) { + *start = (gchar *)pos; + } + if (fin != NULL) { + *fin = (gchar *)pos + strlen (m.prefix); } - - return TRUE; } + + return TRUE; } return FALSE; diff --git a/src/libserver/url.h b/src/libserver/url.h index 889458c44..a066d214d 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -7,6 +7,7 @@ struct rspamd_task; struct mime_text_part; +struct rspamd_config; struct rspamd_url { gchar *string; @@ -62,6 +63,12 @@ enum rspamd_url_protocol { #define struri(uri) ((uri)->string) +/** + * Initialize url library + * @param cfg + */ +void rspamd_url_init (struct rspamd_config *cfg); + /* * Parse urls inside text * @param pool memory pool diff --git a/src/main.c b/src/main.c index b77b4d63d..6cfbf114b 100644 --- a/src/main.c +++ b/src/main.c @@ -1358,6 +1358,7 @@ main (gint argc, gchar **argv, gchar **env) } rspamd_stat_init (rspamd_main->cfg); + rspamd_url_init (rspamd_main->cfg); /* Insert classifiers symbols */ (void)rspamd_config_insert_classify_symbols (rspamd_main->cfg);