From: Vsevolod Stakhov Date: Mon, 23 Mar 2020 14:50:24 +0000 (+0000) Subject: [Rework] URL: Another update for urls extraction logic X-Git-Tag: 2.5~35 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=f605d670505baad46b8ef4cdfa3dc32f48d4150e;p=rspamd.git [Rework] URL: Another update for urls extraction logic URL extraction from HTML parts should look like this: 1. Extract href links 2. Convert HTML to plain text and extract: a) (http|https|ftp)://foo.bar and www.foo b) email-like strings \bfoo@bar.baz\b. For every extracted string, check whether its host has a domain from the public suffix list. --- diff --git a/src/libmime/message.c b/src/libmime/message.c index f167730d4..49d879090 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -887,21 +887,8 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, } } else { - if (mime_part->parent_part) { - struct rspamd_mime_part *parent = mime_part->parent_part; - - if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) { - /* Do not extract urls from HTML at all */ - } - else { - rspamd_url_text_extract (task->task_pool, task, text_part, - RSPAMD_URL_FIND_STRICT); - } - } - else { - rspamd_url_text_extract (task->task_pool, task, text_part, - RSPAMD_URL_FIND_STRICT); - } + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_STRICT); } if (text_part->exceptions) { diff --git a/src/libserver/url.c b/src/libserver/url.c index 0669d932f..6aceb8fa6 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -63,10 +63,10 @@ typedef struct url_match_s { gchar st; } url_match_t; -#define URL_FLAG_NOHTML (1 << 0) -#define URL_FLAG_TLD_MATCH (1 << 1) -#define URL_FLAG_STAR_MATCH (1 << 2) -#define URL_FLAG_REGEXP (1 << 3) +#define URL_FLAG_NOHTML (1u << 0u) +#define URL_FLAG_TLD_MATCH (1u << 1u) +#define URL_FLAG_STAR_MATCH (1u << 2u) +#define URL_FLAG_REGEXP (1u << 3u) struct url_callback_data; @@ -206,12 +206,12 @@ struct url_matcher static_matchers[] = 
{ {"sip:", "", url_web_start, url_web_end, 0}, {"www.", "http://", url_web_start, url_web_end, - URL_FLAG_NOHTML}, + 0}, {"ftp.", "ftp://", url_web_start, url_web_end, - URL_FLAG_NOHTML}, + 0}, /* Likely emails */ {"@", "mailto://", url_email_start, url_email_end, - URL_FLAG_NOHTML} + 0} };