From: Vsevolod Stakhov Date: Thu, 19 Mar 2020 16:06:42 +0000 (+0000) Subject: [Rework] Change the way to extract URLs when dealing with alternative parts X-Git-Tag: 2.5~43 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=9e7bf606f92f56911d6f31a9b8b1e7d030ca27a7;p=rspamd.git [Rework] Change the way to extract URLs when dealing with alternative parts --- diff --git a/src/libmime/message.c b/src/libmime/message.c index c45550e6d..f167730d4 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -859,12 +859,49 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, rspamd_normalize_text_part (task, text_part); if (!IS_PART_HTML (text_part)) { - rspamd_url_text_extract (task->task_pool, task, text_part, - RSPAMD_URL_FIND_ALL); + if (mime_part->parent_part) { + struct rspamd_mime_part *parent = mime_part->parent_part; + + if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) { + /* + * Use strict extraction mode: we will extract missing urls from + * an html part if needed + */ + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_STRICT); + } + else { + /* + * Fall back to full text extraction using TLD patterns + */ + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_ALL); + } + } + else { + /* + * Fall back to full text extraction using TLD patterns + */ + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_ALL); + } } else { - rspamd_url_text_extract (task->task_pool, task, text_part, - RSPAMD_URL_FIND_STRICT); + if (mime_part->parent_part) { + struct rspamd_mime_part *parent = mime_part->parent_part; + + if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) { + /* Do not extract urls from HTML at all */ + } + else { + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_STRICT); + } + } + else { + rspamd_url_text_extract (task->task_pool, task, text_part, + RSPAMD_URL_FIND_STRICT); + } } if (text_part->exceptions) {