[Rework] Change the way to extract URLs when dealing with alternative parts

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 19 Mar 2020 16:06:42 +0000 (16:06 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Thu, 19 Mar 2020 16:06:42 +0000 (16:06 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 19 Mar 2020 16:06:42 +0000 (16:06 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 19 Mar 2020 16:06:42 +0000 (16:06 +0000)
diff --git a/src/libmime/message.c b/src/libmime/message.c

index c45550e6d0e8d6cca5db31963d8573abd39b2c1d..f167730d428f78f58a944b81ede444742fc1a126 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -859,12 +859,49 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
         rspamd_normalize_text_part (task, text_part);
  
         if (!IS_PART_HTML (text_part)) {
-               rspamd_url_text_extract (task->task_pool, task, text_part,
-                               RSPAMD_URL_FIND_ALL);
+               if (mime_part->parent_part) {
+                       struct rspamd_mime_part *parent = mime_part->parent_part;
+
+                       if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) {
+                               /*
+                                * Use strict extraction mode: we will extract missing urls from
+                                * an html part if needed
+                                */
+                               rspamd_url_text_extract (task->task_pool, task, text_part,
+                                               RSPAMD_URL_FIND_STRICT);
+                       }
+                       else {
+                               /*
+                                * Fall back to full text extraction using TLD patterns
+                                */
+                               rspamd_url_text_extract (task->task_pool, task, text_part,
+                                               RSPAMD_URL_FIND_ALL);
+                       }
+               }
+               else {
+                       /*
+                        * Fall back to full text extraction using TLD patterns
+                        */
+                       rspamd_url_text_extract (task->task_pool, task, text_part,
+                                       RSPAMD_URL_FIND_ALL);
+               }
         }
         else {
-               rspamd_url_text_extract (task->task_pool, task, text_part,
-                               RSPAMD_URL_FIND_STRICT);
+               if (mime_part->parent_part) {
+                       struct rspamd_mime_part *parent = mime_part->parent_part;
+
+                       if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) {
+                               /* Skip text-based URL extraction for an HTML part whose multipart parent has exactly two children */
+                       }
+                       else {
+                               rspamd_url_text_extract (task->task_pool, task, text_part,
+                                               RSPAMD_URL_FIND_STRICT);
+                       }
+               }
+               else {
+                       rspamd_url_text_extract (task->task_pool, task, text_part,
+                                       RSPAMD_URL_FIND_STRICT);
+               }
         }
  
         if (text_part->exceptions) {
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 19 Mar 2020 16:06:42 +0000 (16:06 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Thu, 19 Mar 2020 16:06:42 +0000 (16:06 +0000)