Browse Source

[Rework] Urls: more rework of the urls sets

tags/2.5
Vsevolod Stakhov 4 years ago
parent
commit
50a043a7cb
7 changed files with 137 additions and 163 deletions
  1. 2
    6
      src/libmime/message.c
  2. 2
    2
      src/libmime/message.h
  3. 2
    1
      src/libserver/html.h
  4. 26
    20
      src/libserver/protocol.c
  5. 12
    12
      src/libserver/re_cache.c
  6. 84
    107
      src/libserver/url.c
  7. 9
    15
      src/libserver/url.h

+ 2
- 6
src/libmime/message.c View File

@@ -1048,8 +1048,7 @@ rspamd_message_dtor (struct rspamd_message *msg)
g_ptr_array_unref (msg->text_parts);
g_ptr_array_unref (msg->parts);

g_hash_table_unref (msg->urls);
g_hash_table_unref (msg->emails);
kh_destroy (rspamd_url_hash, msg->urls);
}

struct rspamd_message*
@@ -1060,10 +1059,7 @@ rspamd_message_new (struct rspamd_task *task)
msg = rspamd_mempool_alloc0 (task->task_pool, sizeof (*msg));

msg->raw_headers = rspamd_message_headers_new ();

msg->emails = g_hash_table_new (rspamd_email_hash, rspamd_emails_cmp);
msg->urls = g_hash_table_new (rspamd_url_hash, rspamd_urls_cmp);

msg->urls = kh_init (rspamd_url_hash);
msg->parts = g_ptr_array_sized_new (4);
msg->text_parts = g_ptr_array_sized_new (2);
msg->task = task;

+ 2
- 2
src/libmime/message.h View File

@@ -13,6 +13,7 @@
#include "libcryptobox/cryptobox.h"
#include "libmime/mime_headers.h"
#include "libmime/content_type.h"
#include "libserver/url.h"
#include "libutil/ref.h"
#include "libutil/str_util.h"

@@ -175,8 +176,7 @@ struct rspamd_message {
GPtrArray *text_parts; /**< list of text parts */
struct rspamd_message_raw_headers_content raw_headers_content;
struct rspamd_received_header *received; /**< list of received headers */
GHashTable *urls; /**< list of parsed urls */
GHashTable *emails; /**< list of parsed emails */
khash_t (rspamd_url_hash) *urls;
struct rspamd_mime_headers_table *raw_headers; /**< list of raw headers */
struct rspamd_mime_header *headers_order; /**< order of raw headers */
struct rspamd_task *task;

+ 2
- 1
src/libserver/html.h View File

@@ -141,7 +141,8 @@ GByteArray *rspamd_html_process_part (rspamd_mempool_t *pool,

GByteArray *rspamd_html_process_part_full (rspamd_mempool_t *pool,
struct html_content *hc,
GByteArray *in, GList **exceptions, GHashTable *urls, GHashTable *emails);
GByteArray *in, GList **exceptions,
GHashTable *urls, GHashTable *emails);

/*
* Returns true if a specified tag has been seen in a part

+ 26
- 20
src/libserver/protocol.c View File

@@ -861,7 +861,7 @@ rspamd_protocol_handle_request (struct rspamd_task *task,
/* Structure for writing tree data */
struct tree_cb_data {
ucl_object_t *top;
GHashTable *seen;
khash_t (rspamd_url_host_hash) *seen;
struct rspamd_task *task;
};

@@ -908,10 +908,8 @@ rspamd_protocol_extended_url (struct rspamd_task *task,
* Callback for writing urls
*/
static void
urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
urls_protocol_cb (struct rspamd_url *url, struct tree_cb_data *cb)
{
struct tree_cb_data *cb = ud;
struct rspamd_url *url = value;
ucl_object_t *obj;
struct rspamd_task *task = cb->task;
const gchar *user_field = "unknown", *encoded = NULL;
@@ -921,7 +919,7 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)

if (!(task->protocol_flags & RSPAMD_TASK_PROTOCOL_FLAG_EXT_URLS)) {
if (url->hostlen > 0) {
if (g_hash_table_lookup (cb->seen, url)) {
if (rspamd_url_host_set_has (cb->seen, url)) {
return;
}

@@ -941,7 +939,7 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
return;
}

g_hash_table_insert (cb->seen, url, url);
rspamd_url_host_set_add (cb->seen, url);
}
else {
encoded = rspamd_url_encode (url, &enclen, task->task_pool);
@@ -975,28 +973,32 @@ urls_protocol_cb (gpointer key, gpointer value, gpointer ud)
}

static ucl_object_t *
rspamd_urls_tree_ucl (GHashTable *input, struct rspamd_task *task)
rspamd_urls_tree_ucl (khash_t (rspamd_url_hash) *set,
struct rspamd_task *task)
{
struct tree_cb_data cb;
ucl_object_t *obj;
struct rspamd_url *u;

obj = ucl_object_typed_new (UCL_ARRAY);
cb.top = obj;
cb.task = task;
cb.seen = g_hash_table_new (rspamd_url_host_hash, rspamd_urls_host_cmp);
cb.seen = kh_init (rspamd_url_host_hash);

g_hash_table_foreach (input, urls_protocol_cb, &cb);
kh_foreach_key (set, u, {
if (!(u->protocol & PROTOCOL_MAILTO)) {
urls_protocol_cb (u, &cb);
}
});

g_hash_table_unref (cb.seen);
kh_destroy (rspamd_url_host_hash, cb.seen);

return obj;
}

static void
emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
emails_protocol_cb (struct rspamd_url *url, struct tree_cb_data *cb)
{
struct tree_cb_data *cb = ud;
struct rspamd_url *url = value;
ucl_object_t *obj;

if (url->userlen > 0 && url->hostlen > 0) {
@@ -1007,16 +1009,23 @@ emails_protocol_cb (gpointer key, gpointer value, gpointer ud)
}

static ucl_object_t *
rspamd_emails_tree_ucl (GHashTable *input, struct rspamd_task *task)
rspamd_emails_tree_ucl (khash_t (rspamd_url_hash) *set,
struct rspamd_task *task)
{
struct tree_cb_data cb;
ucl_object_t *obj;
struct rspamd_url *u;

obj = ucl_object_typed_new (UCL_ARRAY);
cb.top = obj;
cb.task = task;

g_hash_table_foreach (input, emails_protocol_cb, &cb);
kh_foreach_key (set, u, {
if ((u->protocol & PROTOCOL_MAILTO)) {
emails_protocol_cb (u, &cb);
}
});


return obj;
}
@@ -1446,15 +1455,12 @@ rspamd_protocol_write_ucl (struct rspamd_task *task,
}

if (flags & RSPAMD_PROTOCOL_URLS && task->message) {
if (g_hash_table_size (MESSAGE_FIELD (task, urls)) > 0) {
if (kh_size (MESSAGE_FIELD (task, urls)) > 0) {
ucl_object_insert_key (top,
rspamd_urls_tree_ucl (MESSAGE_FIELD (task, urls), task),
"urls", 0, false);
}

if (g_hash_table_size (MESSAGE_FIELD (task, emails)) > 0) {
ucl_object_insert_key (top,
rspamd_emails_tree_ucl (MESSAGE_FIELD (task, emails), task),
rspamd_emails_tree_ucl (MESSAGE_FIELD (task, urls), task),
"emails", 0, false);
}
}

+ 12
- 12
src/libserver/re_cache.c View File

@@ -1053,7 +1053,6 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
gboolean is_strong)
{
guint ret = 0, i, re_id;
GHashTableIter it;
struct rspamd_mime_header *rh;
const gchar *in;
const guchar **scvec;
@@ -1062,7 +1061,6 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
struct rspamd_mime_text_part *text_part;
struct rspamd_mime_part *mime_part;
struct rspamd_url *url;
gpointer k, v;
guint len, cnt;
const gchar *class_name;

@@ -1164,17 +1162,18 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_URL:
cnt = g_hash_table_size (MESSAGE_FIELD (task, urls));
cnt = kh_size (MESSAGE_FIELD (task, urls));

if (cnt > 0) {
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
g_hash_table_iter_init (&it, MESSAGE_FIELD (task, urls));
i = 0;
raw = FALSE;

while (g_hash_table_iter_next (&it, &k, &v)) {
url = v;
kh_foreach_key (MESSAGE_FIELD (task, urls), url, {
if ((url->protocol & PROTOCOL_MAILTO)) {
continue;
}
in = url->string;
len = url->urllen;

@@ -1182,7 +1181,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
scvec[i] = (guchar *) in;
lenvec[i++] = len;
}
}
});

#if 0
g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
@@ -1207,18 +1206,19 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_EMAIL:
cnt = g_hash_table_size (MESSAGE_FIELD (task, emails));
cnt = kh_size (MESSAGE_FIELD (task, urls));

if (cnt > 0) {
scvec = g_malloc (sizeof (*scvec) * cnt);
lenvec = g_malloc (sizeof (*lenvec) * cnt);
g_hash_table_iter_init (&it, MESSAGE_FIELD (task, emails));
i = 0;
raw = FALSE;

while (g_hash_table_iter_next (&it, &k, &v)) {
url = v;
kh_foreach_key (MESSAGE_FIELD (task, urls), url, {

if (!(url->protocol & PROTOCOL_MAILTO)) {
continue;
}
if (url->userlen == 0 || url->hostlen == 0) {
continue;
}
@@ -1227,7 +1227,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
len = url->userlen + 1 + url->hostlen;
scvec[i] = (guchar *) in;
lenvec[i++] = len;
}
});

ret = rspamd_re_cache_process_regexp_data (rt, re,
task, scvec, lenvec, i, raw, &processed_hyperscan);

+ 84
- 107
src/libserver/url.c View File

@@ -214,6 +214,13 @@ struct url_matcher static_matchers[] = {
URL_FLAG_NOHTML}
};


static inline khint_t rspamd_url_hash (struct rspamd_url *u);

static inline khint_t rspamd_url_host_hash (struct rspamd_url * u);
static inline bool rspamd_urls_cmp (struct rspamd_url *a, struct rspamd_url *b);
static inline bool rspamd_urls_host_cmp (struct rspamd_url *a, struct rspamd_url *b);

/* Hash table implementation */
__KHASH_IMPL (rspamd_url_hash, kh_inline,struct rspamd_url *, char, false,
rspamd_url_hash, rspamd_urls_cmp);
@@ -3116,7 +3123,6 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
struct rspamd_task *task;
gchar *url_str = NULL;
struct rspamd_url *query_url, *existing;
GHashTable *target_tbl = NULL;
gint rc;
gboolean prefix_added;

@@ -3141,36 +3147,23 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
}

if (url->protocol == PROTOCOL_MAILTO) {
if (url->userlen > 0) {
target_tbl = MESSAGE_FIELD (task, emails);
if (url->userlen == 0) {
return FALSE;
}
}
else {
target_tbl = MESSAGE_FIELD (task, urls);
}

if (target_tbl) {
/* Also check max urls */
if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) {
if (g_hash_table_size (target_tbl) > cbd->task->cfg->max_urls) {
msg_err_task ("part has too many URLs, we cannot process more: "
"%d urls extracted ",
(guint)g_hash_table_size (target_tbl));

return FALSE;
}
}
/* Also check max urls */
if (cbd->task->cfg && cbd->task->cfg->max_urls > 0) {
if (kh_size (MESSAGE_FIELD (task, urls)) > cbd->task->cfg->max_urls) {
msg_err_task ("part has too many URLs, we cannot process more: "
"%d urls extracted ",
(guint)kh_size (MESSAGE_FIELD (task, urls)));

if ((existing = g_hash_table_lookup (target_tbl, url)) == NULL) {
url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
g_hash_table_insert (target_tbl, url, url);
}
else {
existing->count++;
return FALSE;
}
}

target_tbl = NULL;
url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);

cbd->part->exceptions = g_list_prepend (
cbd->part->exceptions,
@@ -3178,7 +3171,8 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,

/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen,
if (rspamd_url_find (task->task_pool,
rspamd_url_query_unsafe (url), url->querylen,
&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
@@ -3198,23 +3192,13 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
}

if (query_url->protocol == PROTOCOL_MAILTO) {
if (query_url->userlen > 0) {
target_tbl = MESSAGE_FIELD (task, emails);
if (query_url->userlen == 0) {
return TRUE;
}
}
else {
target_tbl = MESSAGE_FIELD (task, urls);
}

if (target_tbl) {
if ((existing = g_hash_table_lookup (target_tbl, query_url)) == NULL) {
url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
g_hash_table_insert (target_tbl, query_url, query_url);
}
else {
existing->count++;
}
}
query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), query_url);
}
}
}
@@ -3321,27 +3305,13 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED|RSPAMD_URL_FLAG_SUBJECT;

if (url->protocol == PROTOCOL_MAILTO) {
if (url->userlen > 0 && url->hostlen > 0) {
if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, emails),
url)) == NULL) {
g_hash_table_insert (MESSAGE_FIELD (task, emails), url,
url);
}
else {
existing->count ++;
}
}
}
else {
if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls),
url)) == NULL) {
g_hash_table_insert (MESSAGE_FIELD (task, urls), url, url);
}
else {
existing->count ++;
if (url->userlen == 0) {
return FALSE;
}
}

rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls), url);

/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, rspamd_url_query_unsafe (url), url->querylen,
@@ -3364,15 +3334,14 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
query_url->flags |= RSPAMD_URL_FLAG_SCHEMALESS;
}

if ((existing = g_hash_table_lookup (MESSAGE_FIELD (task, urls),
query_url)) == NULL) {
g_hash_table_insert (MESSAGE_FIELD (task, urls),
query_url,
query_url);
}
else {
existing->count ++;
if (query_url->protocol == PROTOCOL_MAILTO) {
if (query_url->userlen == 0) {
return TRUE;
}
}

rspamd_url_set_add_or_increase (MESSAGE_FIELD (task, urls),
query_url);
}
}
}
@@ -3380,26 +3349,22 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
return TRUE;
}

inline guint
rspamd_url_hash (gconstpointer u)
static inline khint_t
rspamd_url_hash (struct rspamd_url *url)
{
const struct rspamd_url *url = u;

if (url->urllen > 0) {
return (guint)rspamd_cryptobox_fast_hash (url->string, url->urllen,
return (khint_t)rspamd_cryptobox_fast_hash (url->string, url->urllen,
rspamd_hash_seed ());
}

return 0;
}

inline guint
rspamd_url_host_hash (gconstpointer u)
static inline khint_t
rspamd_url_host_hash (struct rspamd_url *url)
{
const struct rspamd_url *url = u;

if (url->hostlen > 0) {
return (guint)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
return (khint_t)rspamd_cryptobox_fast_hash (rspamd_url_host_unsafe (url),
url->hostlen,
rspamd_hash_seed ());
}
@@ -3407,30 +3372,10 @@ rspamd_url_host_hash (gconstpointer u)
return 0;
}

inline guint
rspamd_email_hash (gconstpointer u)
{
const struct rspamd_url *url = u;
rspamd_cryptobox_fast_hash_state_t st;

rspamd_cryptobox_fast_hash_init (&st, rspamd_hash_seed ());

if (url->hostlen > 0) {
rspamd_cryptobox_fast_hash_update (&st, rspamd_url_host_unsafe (url), url->hostlen);
}

if (url->userlen > 0) {
rspamd_cryptobox_fast_hash_update (&st, rspamd_url_user_unsafe(url), url->userlen);
}

return (guint)rspamd_cryptobox_fast_hash_final (&st);
}

/* Compare two emails for building emails tree */
inline gboolean
rspamd_emails_cmp (gconstpointer a, gconstpointer b)
static inline bool
rspamd_emails_cmp (struct rspamd_url *u1, struct rspamd_url *u2)
{
const struct rspamd_url *u1 = a, *u2 = b;
gint r;

if (u1->hostlen != u2->hostlen || u1->hostlen == 0) {
@@ -3456,30 +3401,32 @@ rspamd_emails_cmp (gconstpointer a, gconstpointer b)
return FALSE;
}

inline gboolean
rspamd_urls_cmp (gconstpointer a, gconstpointer b)
static inline bool
rspamd_urls_cmp (struct rspamd_url *u1, struct rspamd_url *u2)
{
const struct rspamd_url *u1 = a, *u2 = b;
int r = 0;

if (u1->urllen != u2->urllen) {
return FALSE;
if (u1->protocol != u2->protocol || u1->urllen != u2->urllen) {
return false;
}
else {
if (u1->protocol & PROTOCOL_MAILTO) {
return rspamd_emails_cmp (u1, u2);
}

r = memcmp (u1->string, u2->string, u1->urllen);
}

return r == 0;
}

inline gboolean
rspamd_urls_host_cmp (gconstpointer a, gconstpointer b)
static inline bool
rspamd_urls_host_cmp (struct rspamd_url *u1, struct rspamd_url *u2)
{
const struct rspamd_url *u1 = a, *u2 = b;
int r = 0;

if (u1->hostlen != u2->hostlen) {
return FALSE;
return false;
}
else {
r = memcmp (rspamd_url_host_unsafe (u1), rspamd_url_host_unsafe (u2),
@@ -3834,6 +3781,22 @@ rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
return true;
}

/**
 * Puts a url into a set keyed by rspamd_url_host_hash.
 * @param set host set to insert into
 * @param u url to insert
 * @return false when kh_put reports status 0, true for any other status
 */
bool
rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
		struct rspamd_url *u)
{
	gint put_status;

	/* The returned iterator is not needed, only the insertion status */
	(void) kh_put (rspamd_url_host_hash, set, u, &put_status);

	return put_status != 0;
}

bool
rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u)
{
@@ -3845,5 +3808,19 @@ rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u)
return false;
}

return true;
}

/**
 * Checks if a url is present in a set keyed by rspamd_url_host_hash.
 * @param set host set to look in
 * @param u url to look up
 * @return false when the lookup yields kh_end (set), true otherwise
 */
bool
rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u)
{
	khiter_t k;

	/*
	 * The set is a rspamd_url_host_hash table, so the lookup must name the
	 * same hash; using rspamd_url_hash here would select the hash/equality
	 * pair of the full-url table and miss entries added by
	 * rspamd_url_host_set_add.
	 */
	k = kh_get (rspamd_url_host_hash, set, u);

	if (k == kh_end (set)) {
		return false;
	}

	return true;
}

+ 9
- 15
src/libserver/url.h View File

@@ -225,21 +225,6 @@ gboolean rspamd_url_task_subject_callback (struct rspamd_url *url,
gsize start_offset,
gsize end_offset, gpointer ud);

guint rspamd_url_hash (gconstpointer u);

guint rspamd_email_hash (gconstpointer u);

guint rspamd_url_host_hash (gconstpointer u);


/* Compare two emails for building emails hash */
gboolean rspamd_emails_cmp (gconstpointer a, gconstpointer b);

/* Compare two urls for building emails hash */
gboolean rspamd_urls_cmp (gconstpointer a, gconstpointer b);

gboolean rspamd_urls_host_cmp (gconstpointer a, gconstpointer b);

/**
* Decode URL encoded string in-place and return new length of a string, src and dst are NULL terminated
* @param dst
@@ -295,6 +280,14 @@ KHASH_DECLARE (rspamd_url_host_hash, struct rspamd_url *, char);
*/
bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
struct rspamd_url *u);
/**
 * Helper for url host set: inserts a url into a set keyed by
 * rspamd_url_host_hash
 * @param set host set to insert into
 * @param u url to insert
 * @return false if the underlying kh_put reports status 0 (khash documents
 * this as "key already present"), true otherwise
 */
bool rspamd_url_host_set_add (khash_t (rspamd_url_host_hash) *set,
struct rspamd_url *u);
/**
* Checks if a url is in set
* @param set
@@ -302,6 +295,7 @@ bool rspamd_url_set_add_or_increase (khash_t (rspamd_url_hash) *set,
* @return
*/
bool rspamd_url_set_has (khash_t (rspamd_url_hash) *set, struct rspamd_url *u);
/**
 * Checks if a url is in a url host set
 * @param set host set to look in
 * @param u url to look up
 * @return false if the lookup ends at kh_end (set), true otherwise
 */
bool rspamd_url_host_set_has (khash_t (rspamd_url_host_hash) *set, struct rspamd_url *u);

#ifdef __cplusplus
}

Loading…
Cancel
Save