diff options
-rw-r--r-- | src/libmime/mime_headers.h | 5 | ||||
-rw-r--r-- | src/libserver/re_cache.c | 246 | ||||
-rw-r--r-- | src/libserver/url.c | 24 |
3 files changed, 137 insertions, 138 deletions
diff --git a/src/libmime/mime_headers.h b/src/libmime/mime_headers.h index 81948d11b..9b85e758f 100644 --- a/src/libmime/mime_headers.h +++ b/src/libmime/mime_headers.h @@ -51,12 +51,13 @@ enum rspamd_mime_header_flags { }; struct rspamd_mime_header { - gchar *name; /* Also used for key */ - gchar *value; const gchar *raw_value; /* As it is in the message (unfolded and unparsed) */ gsize raw_len; guint order; int flags; /* see enum rspamd_mime_header_flags */ + /* These are zero terminated (historically) */ + gchar *name; /* Also used for key */ + gchar *value; gchar *separator; gchar *decoded; struct rspamd_mime_header *prev, *next; /* Headers with the same name */ diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index 5517ad875..592cc31d4 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -24,6 +24,7 @@ #include "libutil/regexp.h" #include "lua/lua_common.h" #include "libstat/stat_api.h" +#include "contrib/uthash/utlist.h" #include "khash.h" @@ -950,6 +951,72 @@ rspamd_process_words_vector (GArray *words, return cnt; } +static guint +rspamd_re_cache_process_headers_list (struct rspamd_task *task, + struct rspamd_re_runtime *rt, + rspamd_regexp_t *re, + struct rspamd_re_class *re_class, + struct rspamd_mime_header *rh, + gboolean is_strong) +{ + const guchar **scvec, *in; + gboolean raw = FALSE; + guint *lenvec; + struct rspamd_mime_header *cur; + guint cnt = 0, i = 0, ret = 0; + + DL_COUNT (rh, cur, cnt); + + scvec = g_malloc (sizeof (*scvec) * cnt); + lenvec = g_malloc (sizeof (*lenvec) * cnt); + + DL_FOREACH (rh, cur) { + + if (is_strong && strcmp (cur->name, re_class->type_data) != 0) { + /* Skip a different case */ + continue; + } + + if (re_class->type == RSPAMD_RE_RAWHEADER) { + in = (const guchar *)cur->value; + lenvec[i] = strlen (cur->value); + + if (!g_utf8_validate (in, lenvec[i], NULL)) { + raw = TRUE; + } + } + else { + in = (const guchar *)cur->decoded; + /* Validate input^W^WNo need to validate as it is already valid */ + if (!in) { + lenvec[i] = 0; + scvec[i] = (guchar *)""; + continue; + } + + lenvec[i] = strlen (in); + } + + scvec[i] = in; + + i ++; + } + + if (i > 0) { + ret = rspamd_re_cache_process_regexp_data (rt, re, + task, scvec, lenvec, i, raw); + msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d", + re_class->type_data, + rspamd_regexp_get_pattern (re), + (int) lenvec[0], scvec[0], ret); + } + + g_free (scvec); + g_free (lenvec); + + return ret; +} + /* * Calculates the specified regexp for the specified class if it's not calculated */ @@ -961,14 +1028,14 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, gboolean is_strong) { guint ret = 0, i, re_id; - GPtrArray *headerlist; GHashTableIter it; struct rspamd_mime_header *rh; - const gchar *in, *end; + const gchar *in; const guchar **scvec; guint *lenvec; gboolean raw = FALSE; - struct rspamd_mime_text_part *part; + struct rspamd_mime_text_part *text_part; + struct rspamd_mime_part *mime_part; struct rspamd_url *url; gpointer k, v; guint len, cnt; @@ -982,140 +1049,70 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, case RSPAMD_RE_HEADER: case RSPAMD_RE_RAWHEADER: /* Get list of specified headers */ - headerlist = rspamd_message_get_header_array (task, - re_class->type_data, - is_strong); + rh = rspamd_message_get_header_array (task, + re_class->type_data); - if (headerlist && headerlist->len > 0) { - scvec = g_malloc (sizeof (*scvec) * headerlist->len); - lenvec = g_malloc (sizeof (*lenvec) * headerlist->len); - - for (i = 0; i < headerlist->len; i ++) { - rh = g_ptr_array_index (headerlist, i); - - if (re_class->type == RSPAMD_RE_RAWHEADER) { - in = rh->value; - lenvec[i] = strlen (rh->value); - - if (!g_utf8_validate (in, lenvec[i], NULL)) { - raw = TRUE; - } - } - else { - in = rh->decoded; - /* Validate input */ - if (!in || !g_utf8_validate (in, -1, &end)) { - lenvec[i] = 0; - scvec[i] = (guchar *)""; - continue; - } - lenvec[i] = end - in; - } - - scvec[i] = (guchar *)in; - } - - ret = rspamd_re_cache_process_regexp_data (rt, re, - task, scvec, lenvec, headerlist->len, raw); - msg_debug_re_task ("checking header %s regexp: %s=%*s -> %d", - re_class->type_data, - rspamd_regexp_get_pattern (re), - (int)lenvec[0], scvec[0], ret); - g_free (scvec); - g_free (lenvec); + if (rh) { + ret = rspamd_re_cache_process_headers_list (task, rt, re, + re_class, rh, is_strong); } break; case RSPAMD_RE_ALLHEADER: raw = TRUE; - in = task->raw_headers_content.begin; - len = task->raw_headers_content.len; + in = MESSAGE_FIELD (task, raw_headers_content).begin; + len = MESSAGE_FIELD (task, raw_headers_content).len; ret = rspamd_re_cache_process_regexp_data (rt, re, task, (const guchar **)&in, &len, 1, raw); msg_debug_re_task ("checking allheader regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); break; case RSPAMD_RE_MIMEHEADER: - headerlist = rspamd_message_get_mime_header_array (task, - re_class->type_data, - is_strong); - - if (headerlist && headerlist->len > 0) { - scvec = g_malloc (sizeof (*scvec) * headerlist->len); - lenvec = g_malloc (sizeof (*lenvec) * headerlist->len); - - for (i = 0; i < headerlist->len; i ++) { - rh = g_ptr_array_index (headerlist, i); - - if (re_class->type == RSPAMD_RE_RAWHEADER) { - in = rh->value; - lenvec[i] = strlen (rh->value); + PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, parts), i, mime_part) { + rh = rspamd_message_get_header_from_hash (mime_part->raw_headers, + re_class->type_data); - if (!g_utf8_validate (in, lenvec[i], NULL)) { - raw = TRUE; - } - } - else { - in = rh->decoded; - /* Validate input */ - if (!in || !g_utf8_validate (in, -1, &end)) { - lenvec[i] = 0; - scvec[i] = (guchar *)""; - continue; - } - - lenvec[i] = end - in; - } - - scvec[i] = (guchar *)in; + if (rh) { + ret += rspamd_re_cache_process_headers_list (task, rt, re, + re_class, rh, is_strong); } - - ret = rspamd_re_cache_process_regexp_data (rt, re, - task, scvec, lenvec, headerlist->len, raw); - msg_debug_re_task ("checking mime header %s regexp: %s -> %d", - re_class->type_data, - rspamd_regexp_get_pattern (re), ret); - g_free (scvec); - g_free (lenvec); } break; case RSPAMD_RE_MIME: case RSPAMD_RE_RAWMIME: /* Iterate through text parts */ - if (task->text_parts->len > 0) { - cnt = task->text_parts->len; + if (MESSAGE_FIELD (task, text_parts)->len > 0) { + cnt = MESSAGE_FIELD (task, text_parts)->len; scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); - for (i = 0; i < task->text_parts->len; i++) { - part = g_ptr_array_index (task->text_parts, i); - + PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) { /* Select data for regexp */ if (re_class->type == RSPAMD_RE_RAWMIME) { - if (part->raw.len == 0) { + if (text_part->raw.len == 0) { len = 0; in = ""; } else { - in = part->raw.begin; - len = part->raw.len; + in = text_part->raw.begin; + len = text_part->raw.len; } raw = TRUE; } else { /* Skip empty parts */ - if (IS_PART_EMPTY (part)) { + if (IS_PART_EMPTY (text_part)) { len = 0; in = ""; } else { /* Check raw flags */ - if (!IS_PART_UTF (part)) { + if (!IS_PART_UTF (text_part)) { raw = TRUE; } - in = part->utf_content->data; - len = part->utf_content->len; + in = text_part->utf_content->data; + len = text_part->utf_content->len; } } @@ -1132,12 +1129,13 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, } break; case RSPAMD_RE_URL: - cnt = g_hash_table_size (task->urls) + g_hash_table_size (task->emails); + cnt = g_hash_table_size (MESSAGE_FIELD (task, urls)) + + g_hash_table_size (MESSAGE_FIELD (task, emails)); if (cnt > 0) { scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); - g_hash_table_iter_init (&it, task->urls); + g_hash_table_iter_init (&it, MESSAGE_FIELD (task, urls)); i = 0; while (g_hash_table_iter_next (&it, &k, &v)) { @@ -1150,7 +1148,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, lenvec[i++] = len; } - g_hash_table_iter_init (&it, task->emails); + g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails)); while (g_hash_table_iter_next (&it, &k, &v)) { url = v; @@ -1191,7 +1189,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, * paragraph when running the rules. All HTML tags and line breaks will * be removed before matching. */ - cnt = task->text_parts->len + 1; + cnt = MESSAGE_FIELD (task, text_parts)->len + 1; scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); @@ -1200,11 +1198,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, * of the body content. */ - headerlist = rspamd_message_get_header_array (task, "Subject", FALSE); - - if (headerlist && headerlist->len > 0) { - rh = g_ptr_array_index (headerlist, 0); + rh = rspamd_message_get_header_array (task, "Subject"); + if (rh) { scvec[0] = (guchar *)rh->decoded; lenvec[0] = strlen (rh->decoded); } @@ -1212,14 +1208,13 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, scvec[0] = (guchar *)""; lenvec[0] = 0; } - for (i = 0; i < task->text_parts->len; i++) { - part = g_ptr_array_index (task->text_parts, i); - if (part->utf_stripped_content) { - scvec[i + 1] = (guchar *)part->utf_stripped_content->data; - lenvec[i + 1] = part->utf_stripped_content->len; + PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) { + if (text_part->utf_stripped_content) { + scvec[i + 1] = (guchar *)text_part->utf_stripped_content->data; + lenvec[i + 1] = text_part->utf_stripped_content->len; - if (!IS_PART_UTF (part)) { + if (!IS_PART_UTF (text_part)) { raw = TRUE; } } @@ -1244,19 +1239,19 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, * Multiline expressions will need to be used to match strings that are * broken by line breaks. */ - if (task->text_parts->len > 0) { - cnt = task->text_parts->len; + if (MESSAGE_FIELD (task, text_parts)->len > 0) { + cnt = MESSAGE_FIELD (task, text_parts)->len; scvec = g_malloc (sizeof (*scvec) * cnt); lenvec = g_malloc (sizeof (*lenvec) * cnt); - for (i = 0; i < task->text_parts->len; i++) { - part = g_ptr_array_index (task->text_parts, i); + for (i = 0; i < cnt; i++) { + text_part = g_ptr_array_index (MESSAGE_FIELD (task, text_parts), i); - if (part->parsed.len > 0) { - scvec[i] = (guchar *)part->parsed.begin; - lenvec[i] = part->parsed.len; + if (text_part->parsed.len > 0) { + scvec[i] = (guchar *)text_part->parsed.begin; + lenvec[i] = text_part->parsed.len; - if (!IS_PART_UTF (part)) { + if (!IS_PART_UTF (text_part)) { raw = TRUE; } } @@ -1277,13 +1272,13 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, case RSPAMD_RE_WORDS: case RSPAMD_RE_STEMWORDS: case RSPAMD_RE_RAWWORDS: - if (task->text_parts->len > 0) { + if (MESSAGE_FIELD (task, text_parts)->len > 0) { cnt = 0; raw = FALSE; - PTR_ARRAY_FOREACH (task->text_parts, i, part) { - if (part->utf_words) { - cnt += part->utf_words->len; + PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) { + if (text_part->utf_words) { + cnt += text_part->utf_words->len; } } @@ -1297,9 +1292,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, cnt = 0; - PTR_ARRAY_FOREACH (task->text_parts, i, part) { - if (part->utf_words) { - cnt = rspamd_process_words_vector (part->utf_words, + PTR_ARRAY_FOREACH (MESSAGE_FIELD (task, text_parts), i, text_part) { + if (text_part->utf_words) { + cnt = rspamd_process_words_vector (text_part->utf_words, scvec, lenvec, re_class, cnt, &raw); } } @@ -1522,6 +1517,7 @@ rspamd_re_cache_type_to_string (enum rspamd_re_type type) ret = "stem_words"; break; case RSPAMD_RE_MAX: + default: ret = "invalid class"; break; } diff --git a/src/libserver/url.c b/src/libserver/url.c index 0b31007bb..26e328a6d 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2949,11 +2949,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, if (url->protocol == PROTOCOL_MAILTO) { if (url->userlen > 0) { - target_tbl = task->emails; + target_tbl = MESSAGE_FIELD (task, emails); } } else { - target_tbl = task->urls; + target_tbl = MESSAGE_FIELD (task, urls); } if (target_tbl) { @@ -2996,11 +2996,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, if (query_url->protocol == PROTOCOL_MAILTO) { if (query_url->userlen > 0) { - target_tbl = task->emails; + target_tbl = MESSAGE_FIELD (task, emails); } } else { - target_tbl = task->urls; + target_tbl = MESSAGE_FIELD (task, urls); } if (target_tbl) { @@ -3115,9 +3115,10 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED|RSPAMD_URL_FLAG_SUBJECT; if (url->protocol == PROTOCOL_MAILTO) { - if (url->userlen > 0) { - if ((existing = g_hash_table_lookup (task->emails, url)) == NULL) { - g_hash_table_insert (task->emails, url, + if (url->userlen > 0 && url->hostlen > 0) { + if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, emails), + url)) == NULL) { + g_hash_table_insert (MESSAGE_FIELD (task, emails), url, url); } else { @@ -3126,8 +3127,9 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, } } else { - if ((existing = g_hash_table_lookup (task->urls, url)) == NULL) { - g_hash_table_insert (task->urls, url, url); + if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls), + url)) == NULL) { + g_hash_table_insert (MESSAGE_FIELD (task, urls), url, url); } else { existing->count ++; @@ -3156,9 +3158,9 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset, query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS; } - if ((existing = g_hash_table_lookup (task->urls, + if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls), query_url)) == NULL) { - g_hash_table_insert (task->urls, + g_hash_table_insert (MESSAGE_FIELD (task, urls), query_url, query_url); } |