Browse Source

[Feature] Add heuristic to find displayed URLs

tags/1.5.0
Vsevolod Stakhov 7 years ago
parent
commit
5244d25f3b
3 changed files with 46 additions and 3 deletions
  1. 41
    3
      src/libserver/html.c
  2. 3
    0
      src/libserver/url.c
  3. 2
    0
      src/libserver/url.h

+ 41
- 3
src/libserver/html.c View File

@@ -765,16 +765,23 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
struct rspamd_url *href_url,
const guchar *url_text,
gsize len,
gboolean *url_found)
gboolean *url_found,
struct rspamd_url **ptext_url)
{
struct rspamd_url *text_url;
rspamd_ftok_t phished_tld;
gint rc;
gchar *url_str = NULL;
const guchar *end = url_text + len;

*url_found = FALSE;

if (rspamd_url_find (pool, url_text, len, &url_str, TRUE) && url_str != NULL) {
while (url_text < end && g_ascii_isspace (*url_text)) {
url_text ++;
}

if (rspamd_url_find (pool, url_text, end - url_text, &url_str, FALSE) &&
url_str != NULL) {
text_url = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_url));
rc = rspamd_url_parse (text_url, url_str, strlen (url_str), pool);

@@ -791,9 +798,11 @@ rspamd_html_url_is_phished (rspamd_mempool_t *pool,
rspamd_url_add_tag (text_url, "phishing",
rspamd_mempool_ftokdup (pool, &phished_tld),
pool);
text_url->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
}
}

*ptext_url = text_url;
*url_found = TRUE;
}
else {
@@ -2124,10 +2133,13 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,
(cur_tag->flags & FL_CLOSING)) {
/* Insert exception */
if (url != NULL && (gint)dest->len > href_offset) {
struct rspamd_url *displayed_url = NULL;

rspamd_html_url_is_phished (pool, url,
dest->data + href_offset,
dest->len - href_offset,
&url_text);
&url_text, &displayed_url);

if (exceptions && url_text) {
ex = rspamd_mempool_alloc (pool, sizeof (*ex));
ex->pos = href_offset;
@@ -2136,6 +2148,32 @@ rspamd_html_process_part_full (rspamd_mempool_t *pool, struct html_content *hc,

*exceptions = g_list_prepend (*exceptions, ex);
}

if (displayed_url) {
if (url->protocol == PROTOCOL_MAILTO) {
target_tbl = emails;
}
else {
target_tbl = urls;
}

if (target_tbl != NULL) {
turl = g_hash_table_lookup (target_tbl, url);

if (turl != NULL) {
/*
 * Assumption: if a URL found in the text part is the
 * same as the URL displayed in the HTML part, it is
 * only a hint as well, so it is re-flagged as a
 * displayed URL rather than a URL from the text.
 */
if (turl->flags & RSPAMD_URL_FLAG_FROM_TEXT) {
turl->flags |= RSPAMD_URL_FLAG_HTML_DISPLAYED;
turl->flags &= ~RSPAMD_URL_FLAG_FROM_TEXT;
}
}
}
}
}

href_offset = -1;

+ 3
- 0
src/libserver/url.c View File

@@ -2364,6 +2364,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
if (url->protocol == PROTOCOL_MAILTO) {
if (url->userlen > 0) {
if (!g_hash_table_lookup (task->emails, url)) {
url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
g_hash_table_insert (task->emails, url,
url);
}
@@ -2371,6 +2372,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
}
else {
if (!g_hash_table_lookup (task->urls, url)) {
url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
g_hash_table_insert (task->urls, url, url);
}
}
@@ -2401,6 +2403,7 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,

if (!g_hash_table_lookup (task->urls,
query_url)) {
query_url->flags |= RSPAMD_URL_FLAG_FROM_TEXT;
g_hash_table_insert (task->urls,
query_url,
query_url);

+ 2
- 0
src/libserver/url.h View File

@@ -14,6 +14,8 @@ enum rspamd_url_flags {
RSPAMD_URL_FLAG_NUMERIC = 1 << 1,
RSPAMD_URL_FLAG_OBSCURED = 1 << 2,
RSPAMD_URL_FLAG_REDIRECTED = 1 << 3,
RSPAMD_URL_FLAG_HTML_DISPLAYED = 1 << 4,
RSPAMD_URL_FLAG_FROM_TEXT = 1 << 5,
};

struct rspamd_url_tag {

Loading…
Cancel
Save