aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-03-23 14:50:24 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2020-03-23 14:50:24 +0000
commit: f605d670505baad46b8ef4cdfa3dc32f48d4150e (patch)
tree: 04f959e423960adb60c99dc0c35cb3a10e5d8595
parent: 7299efb5eeddde80511be8a6285d94c333fc8ea3 (diff)
download: rspamd-f605d670505baad46b8ef4cdfa3dc32f48d4150e.tar.gz
          rspamd-f605d670505baad46b8ef4cdfa3dc32f48d4150e.zip
[Rework] URL: Another update for urls extraction logic
URL extraction from HTML parts should work as follows:
1. Extract href links.
2. Convert the HTML to plain text and extract:
   a) (http|https|ftp)://foo.bar and www.foo;
   b) email-like strings: \bfoo@bar.baz\b.
For all extracted strings, check whether the host has a domain from the public suffix list.
-rw-r--r-- src/libmime/message.c 17
-rw-r--r-- src/libserver/url.c 14
2 files changed, 9 insertions, 22 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index f167730d4..49d879090 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -887,21 +887,8 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
}
}
else {
- if (mime_part->parent_part) {
- struct rspamd_mime_part *parent = mime_part->parent_part;
-
- if (IS_PART_MULTIPART (parent) && parent->specific.mp->children->len == 2) {
- /* Do not extract urls from HTML at all */
- }
- else {
- rspamd_url_text_extract (task->task_pool, task, text_part,
- RSPAMD_URL_FIND_STRICT);
- }
- }
- else {
- rspamd_url_text_extract (task->task_pool, task, text_part,
- RSPAMD_URL_FIND_STRICT);
- }
+ rspamd_url_text_extract (task->task_pool, task, text_part,
+ RSPAMD_URL_FIND_STRICT);
}
if (text_part->exceptions) {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 0669d932f..6aceb8fa6 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -63,10 +63,10 @@ typedef struct url_match_s {
gchar st;
} url_match_t;
-#define URL_FLAG_NOHTML (1 << 0)
-#define URL_FLAG_TLD_MATCH (1 << 1)
-#define URL_FLAG_STAR_MATCH (1 << 2)
-#define URL_FLAG_REGEXP (1 << 3)
+#define URL_FLAG_NOHTML (1u << 0u)
+#define URL_FLAG_TLD_MATCH (1u << 1u)
+#define URL_FLAG_STAR_MATCH (1u << 2u)
+#define URL_FLAG_REGEXP (1u << 3u)
struct url_callback_data;
@@ -206,12 +206,12 @@ struct url_matcher static_matchers[] = {
{"sip:", "", url_web_start, url_web_end,
0},
{"www.", "http://", url_web_start, url_web_end,
- URL_FLAG_NOHTML},
+ 0},
{"ftp.", "ftp://", url_web_start, url_web_end,
- URL_FLAG_NOHTML},
+ 0},
/* Likely emails */
{"@", "mailto://", url_email_start, url_email_end,
- URL_FLAG_NOHTML}
+ 0}
};