diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-03-19 16:06:42 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-03-19 16:06:42 +0000 |
commit | 9e7bf606f92f56911d6f31a9b8b1e7d030ca27a7 (patch) | |
tree | bdda8b34084c072f73184c739dac0691520244c4 | |
parent | 193879d7466b42f8b20fccc9b0cf403f41f82672 (diff) | |
download | rspamd-9e7bf606f92f56911d6f31a9b8b1e7d030ca27a7.tar.gz rspamd-9e7bf606f92f56911d6f31a9b8b1e7d030ca27a7.zip |
[Rework] Change the way to extract URLs when dealing with alternative parts
-rw-r--r-- | src/libmime/message.c | 45 |
1 file changed, 41 insertions, 4 deletions
```diff
diff --git a/src/libmime/message.c b/src/libmime/message.c
index c45550e6d..f167730d4 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -859,12 +859,49 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
 	rspamd_normalize_text_part (task, text_part);
 
 	if (!IS_PART_HTML (text_part)) {
-		rspamd_url_text_extract (task->task_pool, task, text_part,
-				RSPAMD_URL_FIND_ALL);
+		if (mime_part->parent_part) {
+			struct rspamd_mime_part *parent = mime_part->parent_part;
+
+			if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) {
+				/*
+				 * Use strict extraction mode: we will extract missing urls from
+				 * an html part if needed
+				 */
+				rspamd_url_text_extract (task->task_pool, task, text_part,
+						RSPAMD_URL_FIND_STRICT);
+			}
+			else {
+				/*
+				 * Fall back to full text extraction using TLD patterns
+				 */
+				rspamd_url_text_extract (task->task_pool, task, text_part,
+						RSPAMD_URL_FIND_ALL);
+			}
+		}
+		else {
+			/*
+			 * Fall back to full text extraction using TLD patterns
+			 */
+			rspamd_url_text_extract (task->task_pool, task, text_part,
+					RSPAMD_URL_FIND_ALL);
+		}
 	}
 	else {
-		rspamd_url_text_extract (task->task_pool, task, text_part,
-				RSPAMD_URL_FIND_STRICT);
+		if (mime_part->parent_part) {
+			struct rspamd_mime_part *parent = mime_part->parent_part;
+
+			if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) {
+				/* Do not extract urls from HTML at all */
+			}
+			else {
+				rspamd_url_text_extract (task->task_pool, task, text_part,
+						RSPAMD_URL_FIND_STRICT);
+			}
+		}
+		else {
+			rspamd_url_text_extract (task->task_pool, task, text_part,
+					RSPAMD_URL_FIND_STRICT);
+		}
 	}
 
 	if (text_part->exceptions) {
```