Sfoglia il codice sorgente

[Rework] Rework HTML content urls extraction

tags/1.9.1
Vsevolod Stakhov 5 anni fa
parent
commit
40e894b9df
5 ha cambiato i file con 96 aggiunte e 64 eliminazioni
  1. 8
    2
      src/libmime/message.c
  2. 4
    2
      src/libserver/html.c
  3. 45
    35
      src/libserver/url.c
  4. 36
    23
      src/libserver/url.h
  5. 3
    2
      src/lua/lua_url.c

+ 8
- 2
src/libmime/message.c Vedi File

@@ -912,7 +912,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
rspamd_normalize_text_part (task, text_part);

if (!IS_PART_HTML (text_part)) {
rspamd_url_text_extract (task->task_pool, task, text_part, FALSE);
rspamd_url_text_extract (task->task_pool, task, text_part,
RSPAMD_URL_FIND_ALL);
}
else {
rspamd_url_text_extract (task->task_pool, task, text_part,
RSPAMD_URL_FIND_STRICT);
}

if (text_part->exceptions) {
@@ -1231,7 +1236,8 @@ rspamd_message_parse (struct rspamd_task *task)
p = task->subject;
len = strlen (p);
rspamd_cryptobox_hash_update (&st, p, len);
rspamd_url_find_multiple (task->task_pool, p, len, FALSE, NULL,
rspamd_url_find_multiple (task->task_pool, p, len,
RSPAMD_URL_FIND_STRICT, NULL,
rspamd_url_task_subject_callback, task);
}


+ 4
- 2
src/libserver/html.c Vedi File

@@ -598,7 +598,8 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
}

if (end > url_text + 4 &&
rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE,
rspamd_url_find (pool, url_text, end - url_text, &url_str,
RSPAMD_URL_FIND_ALL,
&url_pos, NULL) &&
url_str != NULL) {
if (url_pos > 0) {
@@ -1569,7 +1570,8 @@ rspamd_process_html_url (rspamd_mempool_t *pool, struct rspamd_url *url,

if (url->querylen > 0) {

if (rspamd_url_find (pool, url->query, url->querylen, &url_str, FALSE,
if (rspamd_url_find (pool, url->query, url->querylen, &url_str,
RSPAMD_URL_FIND_ALL,
NULL, &prefix_added)) {
query_url = rspamd_mempool_alloc0 (pool,
sizeof (struct rspamd_url));

+ 45
- 35
src/libserver/url.c Vedi File

@@ -205,7 +205,7 @@ struct url_matcher static_matchers[] = {
{"sip:", "", url_web_start, url_web_end,
0, 0},
{"www.", "http://", url_web_start, url_web_end,
0, 0},
URL_FLAG_NOHTML, 0},
{"ftp.", "ftp://", url_web_start, url_web_end,
URL_FLAG_NOHTML, 0},
/* Likely emails */
@@ -218,7 +218,7 @@ struct url_callback_data {
gchar *url_str;
rspamd_mempool_t *pool;
gint len;
gboolean is_html;
enum rspamd_url_find_type how;
gboolean prefix_added;
guint newline_idx;
GPtrArray *newlines;
@@ -2584,12 +2584,12 @@ rspamd_url_trie_is_match (struct url_matcher *matcher, const gchar *pos,

static gint
rspamd_url_trie_callback (struct rspamd_multipattern *mp,
guint strnum,
gint match_start,
gint match_pos,
const gchar *text,
gsize len,
void *context)
guint strnum,
gint match_start,
gint match_pos,
const gchar *text,
gsize len,
void *context)
{
struct url_matcher *matcher;
url_match_t m;
@@ -2599,7 +2599,7 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
matcher = &g_array_index (url_scanner->matchers, struct url_matcher,
strnum);

if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
/* Do not try to match non-html like urls in html texts */
return 0;
}
@@ -2669,9 +2669,12 @@ rspamd_url_trie_callback (struct rspamd_multipattern *mp,
}

gboolean
rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
gchar **url_str, gboolean is_html, goffset *url_pos,
gboolean *prefix_added)
rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin, gsize len,
gchar **url_str,
enum rspamd_url_find_type how,
goffset *url_pos,
gboolean *prefix_added)
{
struct url_callback_data cb;
gint ret;
@@ -2679,7 +2682,7 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
memset (&cb, 0, sizeof (cb));
cb.begin = begin;
cb.end = begin + len;
cb.is_html = is_html;
cb.how = how;
cb.pool = pool;

ret = rspamd_multipattern_lookup (url_scanner->search_trie, begin, len,
@@ -2706,13 +2709,13 @@ rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,

static gint
rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
guint strnum,
gint match_start,
gint match_pos,
const gchar *text,
gsize len,
void *context,
gboolean multiple)
guint strnum,
gint match_start,
gint match_pos,
const gchar *text,
gsize len,
void *context,
gboolean multiple)
{
struct rspamd_url *url;
struct url_matcher *matcher;
@@ -2726,7 +2729,7 @@ rspamd_url_trie_generic_callback_common (struct rspamd_multipattern *mp,
strnum);
pool = cb->pool;

if ((matcher->flags & URL_FLAG_NOHTML) && cb->is_html) {
if ((matcher->flags & URL_FLAG_NOHTML) && cb->how == RSPAMD_URL_FIND_STRICT) {
/* Do not try to match non-html like urls in html texts */
return 0;
}
@@ -2894,7 +2897,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, url->query, url->querylen,
&url_str, IS_PART_HTML (cbd->part), NULL, &prefix_added)) {
&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {

query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));
@@ -2938,9 +2941,9 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,

void
rspamd_url_text_extract (rspamd_mempool_t *pool,
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
gboolean is_html)
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
enum rspamd_url_find_type how)
{
struct rspamd_url_mimepart_cbdata mcbd;

@@ -2953,14 +2956,18 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
mcbd.part = part;

rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
part->utf_stripped_content->len, is_html, part->newlines,
part->utf_stripped_content->len, how, part->newlines,
rspamd_url_text_part_callback, &mcbd);
}

void
rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
gsize inlen, gboolean is_html, GPtrArray *nlines,
url_insert_function func, gpointer ud)
rspamd_url_find_multiple (rspamd_mempool_t *pool,
const gchar *in,
gsize inlen,
enum rspamd_url_find_type how,
GPtrArray *nlines,
url_insert_function func,
gpointer ud)
{
struct url_callback_data cb;

@@ -2973,7 +2980,7 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
memset (&cb, 0, sizeof (cb));
cb.begin = in;
cb.end = in + inlen;
cb.is_html = is_html;
cb.how = how;
cb.pool = pool;

cb.funcd = ud;
@@ -2986,9 +2993,12 @@ rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
}

void
rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
gsize inlen, gboolean is_html,
url_insert_function func, gpointer ud)
rspamd_url_find_single (rspamd_mempool_t *pool,
const gchar *in,
gsize inlen,
enum rspamd_url_find_type how,
url_insert_function func,
gpointer ud)
{
struct url_callback_data cb;

@@ -3001,7 +3011,7 @@ rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
memset (&cb, 0, sizeof (cb));
cb.begin = in;
cb.end = in + inlen;
cb.is_html = is_html;
cb.how = how;
cb.pool = pool;

cb.funcd = ud;
@@ -3049,7 +3059,7 @@ rspamd_url_task_subject_callback (struct rspamd_url *url, gsize start_offset,
/* We also search the query for additional url inside */
if (url->querylen > 0) {
if (rspamd_url_find (task->task_pool, url->query, url->querylen,
&url_str, FALSE, NULL, &prefix_added)) {
&url_str, RSPAMD_URL_FIND_ALL, NULL, &prefix_added)) {

query_url = rspamd_mempool_alloc0 (task->task_pool,
sizeof (struct rspamd_url));

+ 36
- 23
src/libserver/url.h Vedi File

@@ -90,6 +90,17 @@ enum rspamd_url_protocol {
PROTOCOL_UNKNOWN = 1u << 31,
};

/**
 * Bit flags for URL parsing.
 * NOTE(review): the names suggest plain-text input vs. an href attribute
 * value, plus a check-only mode -- confirm against the parser that
 * consumes these flags.
 */
enum rspamd_url_parse_flags {
	RSPAMD_URL_PARSE_TEXT = 0u,         /* no flag bits set */
	RSPAMD_URL_PARSE_HREF = (1u << 0),  /* bit 0 */
	RSPAMD_URL_PARSE_CHECK = (1u << 1), /* bit 1 */
};

/**
 * Search mode passed to the URL finder functions.
 */
enum rspamd_url_find_type {
	/* Zero value, so a zero-initialised callback context selects it */
	RSPAMD_URL_FIND_ALL = 0,
	/* Matchers flagged URL_FLAG_NOHTML are skipped in this mode */
	RSPAMD_URL_FIND_STRICT = 1,
};

/**
* Initialize url library
* @param cfg
@@ -104,15 +115,9 @@ void rspamd_url_deinit (void);
* @param how url search mode (RSPAMD_URL_FIND_ALL or RSPAMD_URL_FIND_STRICT); replaces the former is_html heuristic flag
*/
void rspamd_url_text_extract (rspamd_mempool_t *pool,
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
gboolean is_html);

enum rspamd_url_parse_flags {
RSPAMD_URL_PARSE_TEXT = 0,
RSPAMD_URL_PARSE_HREF = (1u << 0),
RSPAMD_URL_PARSE_CHECK = (1 << 1),
};
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
enum rspamd_url_find_type how);

/*
* Parse a single url into an uri structure
@@ -136,9 +141,12 @@ enum uri_errno rspamd_url_parse (struct rspamd_url *uri,
* @param url_str storage for url string(or NULL)
* @return TRUE if url is found in specified text
*/
gboolean rspamd_url_find (rspamd_mempool_t *pool, const gchar *begin, gsize len,
gchar **url_str, gboolean is_html, goffset *url_pos,
gboolean *prefix_added);
gboolean rspamd_url_find (rspamd_mempool_t *pool,
const gchar *begin, gsize len,
gchar **url_str,
enum rspamd_url_find_type how,
goffset *url_pos,
gboolean *prefix_added);
/*
* Return text representation of url parsing error
*/
@@ -166,9 +174,12 @@ typedef void (*url_insert_function) (struct rspamd_url *url,
* @param func
* @param ud
*/
void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
gsize inlen, gboolean is_html, GPtrArray *nlines,
url_insert_function func, gpointer ud);
void rspamd_url_find_multiple (rspamd_mempool_t *pool,
const gchar *in, gsize inlen,
enum rspamd_url_find_type how,
GPtrArray *nlines,
url_insert_function func,
gpointer ud);
/**
* Search for a single url in text and call `func` for each url found
* @param pool
@@ -178,9 +189,11 @@ void rspamd_url_find_multiple (rspamd_mempool_t *pool, const gchar *in,
* @param func
* @param ud
*/
void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
gsize inlen, gboolean is_html,
url_insert_function func, gpointer ud);
void rspamd_url_find_single (rspamd_mempool_t *pool,
const gchar *in, gsize inlen,
enum rspamd_url_find_type how,
url_insert_function func,
gpointer ud);

/**
* Generic callback to insert URLs into rspamd_task
@@ -190,8 +203,8 @@ void rspamd_url_find_single (rspamd_mempool_t *pool, const gchar *in,
* @param ud
*/
void rspamd_url_task_subject_callback (struct rspamd_url *url,
gsize start_offset,
gsize end_offset, gpointer ud);
gsize start_offset,
gsize end_offset, gpointer ud);

/**
* Adds a tag for url
@@ -200,8 +213,8 @@ void rspamd_url_task_subject_callback (struct rspamd_url *url,
* @param pool
*/
void rspamd_url_add_tag (struct rspamd_url *url, const gchar *tag,
const gchar *value,
rspamd_mempool_t *pool);
const gchar *value,
rspamd_mempool_t *pool);

guint rspamd_url_hash (gconstpointer u);
guint rspamd_email_hash (gconstpointer u);
@@ -232,7 +245,7 @@ gsize rspamd_url_decode (gchar *dst, const gchar *src, gsize size);
* @return
*/
const gchar * rspamd_url_encode (struct rspamd_url *url, gsize *dlen,
rspamd_mempool_t *pool);
rspamd_mempool_t *pool);


/**

+ 3
- 2
src/lua/lua_url.c Vedi File

@@ -799,7 +799,7 @@ lua_url_create (lua_State *L)
return luaL_error (L, "invalid arguments");
}
else {
rspamd_url_find_single (pool, text, length, FALSE,
rspamd_url_find_single (pool, text, length, RSPAMD_URL_FIND_ALL,
lua_url_single_inserter, L);

if (lua_type (L, -1) != LUA_TUSERDATA) {
@@ -867,7 +867,8 @@ lua_url_all (lua_State *L)

if (text != NULL) {
lua_newtable (L);
rspamd_url_find_multiple (pool, text, length, FALSE, NULL,
rspamd_url_find_multiple (pool, text, length,
RSPAMD_URL_FIND_ALL, NULL,
lua_url_table_inserter, L);

}

Loading…
Annulla
Salva