summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-04-05 19:03:48 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2015-04-05 19:03:48 +0100
commit: a9cca486d560a4d2741a86e2ea53298658af1aac (patch)
tree: 7d195cc4f62ba7cdef6026c5987b93833868450f /src
parent: 650678a36be70fbce595b168ca318be1a95ba013 (diff)
download: rspamd-a9cca486d560a4d2741a86e2ea53298658af1aac.tar.gz
rspamd-a9cca486d560a4d2741a86e2ea53298658af1aac.zip
Rework URL parser to load tld file.
Diffstat (limited to 'src')
-rw-r--r-- src/libserver/cfg_file.h 10
-rw-r--r-- src/libserver/cfg_rcl.c 5
-rw-r--r-- src/libserver/url.c 272
-rw-r--r-- src/libserver/url.h 7
-rw-r--r-- src/main.c 1
5 files changed, 194 insertions, 101 deletions
diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h
index 16ecb2800..bed89516d 100644
--- a/src/libserver/cfg_file.h
+++ b/src/libserver/cfg_file.h
@@ -247,13 +247,15 @@ struct rspamd_config {
gchar *cache_filename; /**< filename of cache file */
struct metric *default_metric; /**< default metric */
- gchar * checksum; /**< real checksum of config file */
- gchar * dump_checksum; /**< dump checksum of config file */
+ gchar * checksum; /**< real checksum of config file */
+ gchar * dump_checksum; /**< dump checksum of config file */
gpointer lua_state; /**< pointer to lua state */
- gchar * rrd_file; /**< rrd file to store statistics */
+ gchar * rrd_file; /**< rrd file to store statistics */
- gchar * history_file; /**< file to save rolling history */
+ gchar * history_file; /**< file to save rolling history */
+
+ gchar * tld_file; /**< file to load effective tld list from */
gdouble dns_timeout; /**< timeout in milliseconds for waiting for dns reply */
guint32 dns_retransmits; /**< maximum retransmits count */
diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c
index 6eaaeda63..07982d018 100644
--- a/src/libserver/cfg_rcl.c
+++ b/src/libserver/cfg_rcl.c
@@ -1309,6 +1309,11 @@ rspamd_rcl_config_init (void)
rspamd_rcl_parse_struct_integer,
G_STRUCT_OFFSET (struct rspamd_config, min_word_len),
RSPAMD_CL_FLAG_INT_32);
+ rspamd_rcl_add_default_handler (sub,
+ "url_tld",
+ rspamd_rcl_parse_struct_string,
+ G_STRUCT_OFFSET (struct rspamd_config, tld_file),
+ RSPAMD_CL_FLAG_STRING_PATH);
/**
* Metric section
diff --git a/src/libserver/url.c b/src/libserver/url.c
index a3dbf92f4..629d9baa1 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -41,8 +41,9 @@ typedef struct url_match_s {
gboolean add_prefix;
} url_match_t;
-#define URL_FLAG_NOHTML 0x1
-#define URL_FLAG_STRICT_MATCH 0x2
+#define URL_FLAG_NOHTML (1 << 0) /* matcher is not applied when the text is HTML */
+#define URL_FLAG_STRICT_MATCH (1 << 1) /* pattern is indexed only with a trailing '/', ' ' or ':' */
+#define URL_FLAG_STAR_MATCH (1 << 2) /* TLD list entry was written with a leading '*' */
struct url_matcher {
const gchar *pattern;
@@ -90,7 +91,7 @@ static gboolean url_email_end (const gchar *begin,
const gchar *pos,
url_match_t *match);
-struct url_matcher matchers[] = {
+struct url_matcher static_matchers[] = {
/* Common prefixes */
{ "file://", "", url_file_start, url_file_end,
0 },
@@ -671,9 +672,9 @@ struct url_matcher matchers[] = {
};
struct url_match_scanner {
- struct url_matcher *matchers;
- gsize matchers_count;
- rspamd_trie_t *patterns;
+ GArray *matchers; /* array of struct url_matcher; trie values are indices into it */
+ rspamd_trie_t *search_trie; /* patterns of all matchers, queried by rspamd_url_find */
+ rspamd_trie_t *tld_trie; /* patterns of the matchers whose prefix is "http://" */
};
struct url_match_scanner *url_scanner = NULL;
@@ -821,49 +822,130 @@ rspamd_url_strerror (enum uri_errno err)
return NULL;
}
-static gint
-url_init (void)
+/**
+ * Read an effective TLD list and append one matcher per entry to
+ * scanner->matchers.
+ *
+ * Lines starting with '/' or whitespace are skipped as comments or blanks;
+ * '!' exception entries are not supported yet and are skipped as well.
+ * An entry of the form "*.tld" is stored as "tld" with URL_FLAG_STAR_MATCH
+ * set. Every stored pattern is a g_strdup'ed copy that this function never
+ * frees.
+ *
+ * @param fname path of the TLD list; a failure to open it is logged and
+ * leaves the scanner unchanged
+ * @param scanner scanner whose matchers array receives the new entries
+ */
+static void
+rspamd_url_parse_tld_file (const gchar *fname, struct url_match_scanner *scanner)
+{
+	FILE *f;
+	struct url_matcher m;
+	gchar *linebuf = NULL, *p;
+	gsize buflen = 0;
+	gint flags;
+
+	f = fopen (fname, "r");
+
+	if (f == NULL) {
+		msg_err ("cannot open TLD file %s: %s", fname, strerror (errno));
+		return;
+	}
+
+	/* Zero the template so that no field is appended uninitialised */
+	memset (&m, 0, sizeof (m));
+	m.end = url_tld_end;
+	m.start = url_tld_start;
+	m.prefix = "http://";
+
+	while (getline (&linebuf, &buflen, f) > 0) {
+		if (linebuf[0] == '/' || g_ascii_isspace (linebuf[0])) {
+			/* Skip comment or empty line */
+			continue;
+		}
+
+		g_strchomp (linebuf);
+
+		/* TODO: add support for ! patterns */
+		if (linebuf[0] == '!') {
+			msg_debug ("skip '!' patterns from parsing for now: %s", linebuf);
+			continue;
+		}
+
+		flags = URL_FLAG_NOHTML | URL_FLAG_STRICT_MATCH;
+
+		if (linebuf[0] == '*') {
+			flags |= URL_FLAG_STAR_MATCH;
+			p = strchr (linebuf, '.');
+
+			if (p == NULL) {
+				msg_err ("got bad star line, skip it: %s", linebuf);
+				continue;
+			}
+			/* Store the part after "*." as the pattern */
+			p ++;
+		}
+		else {
+			p = linebuf;
+		}
+
+		/* The flags must travel with the matcher: they are read back when
+		 * the tries are built and when a match is evaluated */
+		m.flags = flags;
+		m.pattern = g_strdup (p);
+		/* Append to the scanner passed in, not to the global one */
+		g_array_append_val (scanner->matchers, m);
+	}
+
+	free (linebuf);
+	fclose (f);
+}
+
+/* Append every entry of the built-in static_matchers table to `matchers`. */
+static void
+rspamd_url_add_static_matchers (GArray *matchers)
+{
+	g_array_append_vals (matchers,
+		static_matchers,
+		G_N_ELEMENTS (static_matchers));
+}
+/**
+ * Build the global url_scanner. Only a call made while url_scanner is NULL
+ * does any work; otherwise the function returns without changes.
+ *
+ * The matcher array is filled with the static matchers first and then, when
+ * cfg->tld_file is set, with the entries of that TLD file. Each matcher is
+ * inserted into the search trie under its index in the array; matchers
+ * flagged URL_FLAG_STRICT_MATCH are inserted three times, with '/', ' ' and
+ * ':' appended to the pattern.
+ *
+ * @param cfg configuration; only cfg->tld_file is read here
+ */
+void
+rspamd_url_init (struct rspamd_config *cfg)
{
guint i;
gchar patbuf[128];
+ struct url_matcher *m;
if (url_scanner == NULL) {
url_scanner = g_malloc (sizeof (struct url_match_scanner));
- url_scanner->matchers = matchers;
- url_scanner->matchers_count = G_N_ELEMENTS (matchers);
- url_scanner->patterns = rspamd_trie_create (TRUE);
- for (i = 0; i < url_scanner->matchers_count; i++) {
- if (matchers[i].flags & URL_FLAG_STRICT_MATCH) {
+ url_scanner->matchers = g_array_new (FALSE, TRUE,
+ sizeof (struct url_matcher));
+ url_scanner->search_trie = rspamd_trie_create (TRUE);
+ url_scanner->tld_trie = rspamd_trie_create (TRUE);
+ rspamd_url_add_static_matchers (url_scanner->matchers);
+
+ if (cfg->tld_file) {
+ rspamd_url_parse_tld_file (cfg->tld_file, url_scanner);
+ }
+ else {
+ msg_warn ("tld extension file is not specified, url matching is limited");
+ }
+
+ for (i = 0; i < url_scanner->matchers->len; i++) {
+ m = &g_array_index (url_scanner->matchers, struct url_matcher, i);
+
+ if (m->flags & URL_FLAG_STRICT_MATCH) {
/* Insert more specific patterns */
/* some.tld/ */
rspamd_snprintf (patbuf,
sizeof (patbuf),
"%s/",
- matchers[i].pattern);
- rspamd_trie_insert (url_scanner->patterns, patbuf, i);
+ m->pattern);
+ rspamd_trie_insert (url_scanner->search_trie, patbuf, i);
/* some.tld */
rspamd_snprintf (patbuf,
sizeof (patbuf),
"%s ",
- matchers[i].pattern);
- rspamd_trie_insert (url_scanner->patterns, patbuf, i);
+ m->pattern);
+ rspamd_trie_insert (url_scanner->search_trie, patbuf, i);
/* some.tld: */
rspamd_snprintf (patbuf,
sizeof (patbuf),
"%s:",
- matchers[i].pattern);
- rspamd_trie_insert (url_scanner->patterns, patbuf, i);
+ m->pattern);
+ rspamd_trie_insert (url_scanner->search_trie, patbuf, i);
}
else {
- rspamd_trie_insert (url_scanner->patterns,
- matchers[i].pattern,
- i);
+ rspamd_trie_insert (url_scanner->search_trie, m->pattern, i);
+ }
+
+ /* Matchers whose prefix is "http://" are also stored, unsuffixed and
+  * under the same index, in the trie kept for TLD lookups */
+ if (strcmp (m->prefix, "http://") == 0) {
+ rspamd_trie_insert (url_scanner->tld_trie, m->pattern, i);
}
}
}
-
- return 0;
}
#define SET_U(u, field) do { \
@@ -1773,54 +1855,52 @@ rspamd_url_text_extract (rspamd_mempool_t * pool,
return;
}
- if (url_init () == 0) {
- begin = part->content->data;
- end = begin + part->content->len;
- p = begin;
- while (p < end) {
- if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
+ begin = part->content->data;
+ end = begin + part->content->len;
+ p = begin;
+ while (p < end) {
+ if (rspamd_url_find (pool, p, end - p, &url_start, &url_end, &url_str,
is_html)) {
- if (url_str != NULL) {
- new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
- ex =
+ if (url_str != NULL) {
+ new = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
+ ex =
rspamd_mempool_alloc0 (pool,
- sizeof (struct process_exception));
- if (new != NULL) {
- g_strstrip (url_str);
- rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
- if (rc == URI_ERRNO_OK &&
+ sizeof (struct process_exception));
+ if (new != NULL) {
+ g_strstrip (url_str);
+ rc = rspamd_url_parse (new, url_str, strlen (url_str), pool);
+ if (rc == URI_ERRNO_OK &&
new->hostlen > 0) {
- ex->pos = url_start - begin;
- ex->len = url_end - url_start;
- if (new->protocol == PROTOCOL_MAILTO) {
- if (new->userlen > 0) {
- if (!g_tree_lookup (task->emails, new)) {
- g_tree_insert (task->emails, new, new);
- }
+ ex->pos = url_start - begin;
+ ex->len = url_end - url_start;
+ if (new->protocol == PROTOCOL_MAILTO) {
+ if (new->userlen > 0) {
+ if (!g_tree_lookup (task->emails, new)) {
+ g_tree_insert (task->emails, new, new);
}
}
- else {
- if (!g_tree_lookup (task->urls, new)) {
- g_tree_insert (task->urls, new, new);
- }
+ }
+ else {
+ if (!g_tree_lookup (task->urls, new)) {
+ g_tree_insert (task->urls, new, new);
}
- part->urls_offset = g_list_prepend (
+ }
+ part->urls_offset = g_list_prepend (
part->urls_offset,
ex);
- }
- else if (rc != URI_ERRNO_OK) {
- msg_info ("extract of url '%s' failed: %s",
+ }
+ else if (rc != URI_ERRNO_OK) {
+ msg_info ("extract of url '%s' failed: %s",
url_str,
rspamd_url_strerror (rc));
- }
}
}
}
- else {
- break;
- }
- p = url_end + 1;
}
+ else {
+ break;
+ }
+ p = url_end + 1;
}
/* Handle offsets of this part */
if (part->urls_offset != NULL) {
@@ -1845,57 +1925,55 @@ rspamd_url_find (rspamd_mempool_t *pool,
url_match_t m;
end = begin + len;
- if (url_init () == 0) {
- if ((pos =
- rspamd_trie_lookup (url_scanner->patterns, begin, len,
- &idx)) == NULL) {
+ if ((pos =
+ rspamd_trie_lookup (url_scanner->search_trie, begin, len,
+ &idx)) == NULL) {
+ return FALSE;
+ }
+ else {
+ matcher = &g_array_index (url_scanner->matchers, struct url_matcher, idx);
+ if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
+ /* Do not try to match non-html like urls in html texts */
return FALSE;
}
- else {
- matcher = &matchers[idx];
- if ((matcher->flags & URL_FLAG_NOHTML) && is_html) {
- /* Do not try to match non-html like urls in html texts */
- return FALSE;
- }
- m.pattern = matcher->pattern;
- m.prefix = matcher->prefix;
- m.add_prefix = FALSE;
- if (matcher->start (begin, end, pos,
+ m.pattern = matcher->pattern;
+ m.prefix = matcher->prefix;
+ m.add_prefix = FALSE;
+ if (matcher->start (begin, end, pos,
&m) && matcher->end (begin, end, pos, &m)) {
- if (m.add_prefix || matcher->prefix[0] != '\0') {
- l = m.m_len + 1 + strlen (m.prefix);
- *url_str = rspamd_mempool_alloc (pool, l);
- rspamd_snprintf (*url_str,
+ if (m.add_prefix || matcher->prefix[0] != '\0') {
+ l = m.m_len + 1 + strlen (m.prefix);
+ *url_str = rspamd_mempool_alloc (pool, l);
+ rspamd_snprintf (*url_str,
l,
"%s%*s",
m.prefix,
m.m_len,
m.m_begin);
- }
- else {
- *url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
- memcpy (*url_str, m.m_begin, m.m_len);
- (*url_str)[m.m_len] = '\0';
- }
- if (start != NULL) {
- *start = (gchar *)m.m_begin;
- }
- if (fin != NULL) {
- *fin = (gchar *)m.m_begin + m.m_len;
- }
}
else {
- *url_str = NULL;
- if (start != NULL) {
- *start = (gchar *)pos;
- }
- if (fin != NULL) {
- *fin = (gchar *)pos + strlen (m.prefix);
- }
+ *url_str = rspamd_mempool_alloc (pool, m.m_len + 1);
+ memcpy (*url_str, m.m_begin, m.m_len);
+ (*url_str)[m.m_len] = '\0';
+ }
+ if (start != NULL) {
+ *start = (gchar *)m.m_begin;
+ }
+ if (fin != NULL) {
+ *fin = (gchar *)m.m_begin + m.m_len;
+ }
+ }
+ else {
+ *url_str = NULL;
+ if (start != NULL) {
+ *start = (gchar *)pos;
+ }
+ if (fin != NULL) {
+ *fin = (gchar *)pos + strlen (m.prefix);
}
-
- return TRUE;
}
+
+ return TRUE;
}
return FALSE;
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 889458c44..a066d214d 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -7,6 +7,7 @@
struct rspamd_task;
struct mime_text_part;
+struct rspamd_config;
struct rspamd_url {
gchar *string;
@@ -62,6 +63,12 @@ enum rspamd_url_protocol {
#define struri(uri) ((uri)->string)
+/**
+ * Initialize the url library by building the global URL matcher that
+ * rspamd_url_find looks patterns up in. A call made after the matcher
+ * already exists changes nothing.
+ * @param cfg configuration; its tld_file field, when set, is the path of the
+ * TLD list to load
+ */
+void rspamd_url_init (struct rspamd_config *cfg);
+
/*
* Parse urls inside text
* @param pool memory pool
diff --git a/src/main.c b/src/main.c
index b77b4d63d..6cfbf114b 100644
--- a/src/main.c
+++ b/src/main.c
@@ -1358,6 +1358,7 @@ main (gint argc, gchar **argv, gchar **env)
}
rspamd_stat_init (rspamd_main->cfg);
+ rspamd_url_init (rspamd_main->cfg);
/* Insert classifiers symbols */
(void)rspamd_config_insert_classify_symbols (rspamd_main->cfg);