summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@rspamd.com>, 2023-07-25 15:56:43 +0100
committer: Vsevolod Stakhov <vsevolod@rspamd.com>, 2023-07-25 16:05:06 +0100
commit: 5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365 (patch)
tree: da29b9e1576aa6c453adf31d76334c3f9c15525c
parent: c82c2ccc54d5f99d99782d3282e6f2a8671eef2e (diff)
download: rspamd-5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365.tar.gz
download: rspamd-5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365.zip
[Feature] Add order to urls structure
-rw-r--r-- src/libmime/message.c 23
-rw-r--r-- src/libserver/html/html.cxx 16
-rw-r--r-- src/libserver/html/html.h 3
-rw-r--r-- src/libserver/html/html.hxx 3
-rw-r--r-- src/libserver/html/html_tests.cxx 6
-rw-r--r-- src/libserver/url.c 19
-rw-r--r-- src/libserver/url.h 6
7 files changed, 57 insertions, 19 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 5ab712283..508ea27ea 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -763,7 +763,8 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task,
static gboolean
rspamd_message_process_html_text_part (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
+ struct rspamd_mime_text_part *text_part,
+ uint16_t *cur_url_order)
{
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML;
@@ -786,7 +787,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
&text_part->exceptions,
MESSAGE_FIELD (task, urls),
text_part->mime_part->urls,
- task->cfg ? task->cfg->enable_css_parser : true);
+ task->cfg ? task->cfg->enable_css_parser : true,
+ cur_url_order);
rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content);
if (text_part->utf_content.len == 0) {
@@ -842,7 +844,8 @@ rspamd_message_part_can_be_parsed_as_text (struct rspamd_task *task,
static gboolean
rspamd_message_process_text_part_maybe (struct rspamd_task *task,
struct rspamd_mime_part *mime_part,
- enum rspamd_message_part_is_text_result is_text)
+ enum rspamd_message_part_is_text_result is_text,
+ uint16_t *cur_url_order)
{
struct rspamd_mime_text_part *text_part;
guint flags = 0;
@@ -864,7 +867,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
text_part->flags |= flags;
if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) {
- if (!rspamd_message_process_html_text_part (task, text_part)) {
+ if (!rspamd_message_process_html_text_part (task, text_part, cur_url_order)) {
return FALSE;
}
}
@@ -911,14 +914,14 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
* Use strict extraction mode: we will extract missing urls from
* an html part if needed
*/
- rspamd_url_text_extract (task->task_pool, task, text_part,
+ rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_STRICT);
}
else {
/*
* Fall back to full text extraction using TLD patterns
*/
- rspamd_url_text_extract (task->task_pool, task, text_part,
+ rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_ALL);
}
}
@@ -926,12 +929,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task,
/*
* Fall back to full text extraction using TLD patterns
*/
- rspamd_url_text_extract (task->task_pool, task, text_part,
+ rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_ALL);
}
}
else {
- rspamd_url_text_extract (task->task_pool, task, text_part,
+ rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order,
RSPAMD_URL_FIND_STRICT);
}
@@ -1487,13 +1490,14 @@ rspamd_message_process (struct rspamd_task *task)
}
}
+ uint16_t cur_url_order = 0;
g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func);
/* One more iteration to process text parts in a more specific order */
for (i = 0; i < detected_text_parts->len; i ++) {
part = g_ptr_array_index (MESSAGE_FIELD (task, parts),
g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos);
rspamd_message_process_text_part_maybe(task, part,
- g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res);
+ g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order);
}
g_array_free (detected_text_parts, TRUE);
@@ -1640,7 +1644,6 @@ rspamd_message_process (struct rspamd_task *task)
}
rspamd_images_link (task);
-
rspamd_tokenize_meta_words (task);
}
diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx
index a848a25d3..ed034b928 100644
--- a/src/libserver/html/html.cxx
+++ b/src/libserver/html/html.cxx
@@ -1328,7 +1328,8 @@ html_process_input(struct rspamd_task *task,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
- bool allow_css) -> html_content *
+ bool allow_css,
+ std::uint16_t *cur_url_order) -> html_content *
{
const gchar *p, *c, *end, *start;
guchar t;
@@ -1372,6 +1373,7 @@ html_process_input(struct rspamd_task *task,
g_assert (task != NULL);
auto *pool = task->task_pool;
+ auto cur_url_part_order = 0u;
auto *hc = new html_content;
rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc);
@@ -1472,6 +1474,10 @@ html_process_input(struct rspamd_task *task,
struct rspamd_url *maybe_existing =
rspamd_url_set_add_or_return(url_set, maybe_url.value());
if (maybe_existing == maybe_url.value()) {
+ if (cur_url_order) {
+ url->order = (*cur_url_order)++; /* bump the shared counter itself; `*(p)++` would advance the pointer instead */
+ }
+ url->part_order = cur_url_part_order++;
html_process_query_url(pool, url, url_set,
part_urls);
}
@@ -2273,10 +2279,11 @@ rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
- bool allow_css)
+ bool allow_css,
+ uint16_t *cur_url_order)
{
return rspamd::html::html_process_input(task, in, exceptions, url_set,
- part_urls, allow_css);
+ part_urls, allow_css, cur_url_order);
}
void *
@@ -2286,9 +2293,10 @@ rspamd_html_process_part(rspamd_mempool_t *pool,
struct rspamd_task fake_task;
memset(&fake_task, 0, sizeof(fake_task));
fake_task.task_pool = pool;
+ uint16_t order = 0;
return rspamd_html_process_part_full (&fake_task, in, NULL,
- NULL, NULL, FALSE);
+ NULL, NULL, FALSE, &order);
}
guint
diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h
index 2a43223f9..17067b3b1 100644
--- a/src/libserver/html/html.h
+++ b/src/libserver/html/html.h
@@ -74,7 +74,8 @@ void *rspamd_html_process_part_full(struct rspamd_task *task,
GByteArray *in, GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
- bool allow_css);
+ bool allow_css,
+ uint16_t *cur_url_order);
/*
* Returns true if a specified tag has been seen in a part
diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx
index 34008aaf7..c119adc3f 100644
--- a/src/libserver/html/html.hxx
+++ b/src/libserver/html/html.hxx
@@ -132,7 +132,8 @@ auto html_process_input(struct rspamd_task *task,
GList **exceptions,
khash_t (rspamd_url_hash) *url_set,
GPtrArray *part_urls,
- bool allow_css) -> html_content *;
+ bool allow_css,
+ std::uint16_t *cur_url_order) -> html_content *;
auto html_debug_structure(const html_content &hc) -> std::string;
}
diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx
index 9ab13ee78..2492337bf 100644
--- a/src/libserver/html/html_tests.cxx
+++ b/src/libserver/html/html_tests.cxx
@@ -58,7 +58,7 @@ TEST_CASE("html parsing")
SUBCASE((std::string("extract tags from: ") + c.first).c_str()) {
GByteArray *tmp = g_byte_array_sized_new(c.first.size());
g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
- auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true);
+ auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
CHECK(hc != nullptr);
auto dump = html_debug_structure(*hc);
CHECK(c.second == dump);
@@ -215,7 +215,7 @@ TEST_CASE("html text extraction")
SUBCASE((fmt::format("html extraction case {}", i)).c_str()) {
GByteArray *tmp = g_byte_array_sized_new(c.first.size());
g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size());
- auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true);
+ auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr);
CHECK(hc != nullptr);
replace_newlines(hc->parsed);
auto expected = c.second;
@@ -259,7 +259,7 @@ TEST_CASE("html urls extraction")
auto input = std::get<0>(c);
GByteArray *tmp = g_byte_array_sized_new(input.size());
g_byte_array_append(tmp, (const guint8 *)input.data(), input.size());
- auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true);
+ auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr);
CHECK(hc != nullptr);
auto &expected_text = std::get<2>(c);
if (expected_text.has_value()) {
diff --git a/src/libserver/url.c b/src/libserver/url.c
index 33198b861..d5dafeaea 100644
--- a/src/libserver/url.c
+++ b/src/libserver/url.c
@@ -2244,6 +2244,9 @@ rspamd_url_parse (struct rspamd_url *uri,
memset (uri, 0, sizeof (*uri));
memset (&u, 0, sizeof (u));
uri->count = 1;
+ /* Undefine order */
+ uri->order = -1;
+ uri->part_order = -1;
if (*uristring == '\0') {
return URI_ERRNO_EMPTY;
@@ -3453,6 +3456,8 @@ struct rspamd_url_mimepart_cbdata {
struct rspamd_task *task;
struct rspamd_mime_text_part *part;
gsize url_len;
+ uint16_t *cur_url_order; /* Global ordering */
+ uint16_t cur_part_order; /* Per part ordering */
};
static gboolean
@@ -3488,6 +3493,12 @@ rspamd_url_query_callback (struct rspamd_url *url, gsize start_offset,
if (cbd->part && cbd->part->mime_part->urls) {
g_ptr_array_add (cbd->part->mime_part->urls, url);
}
+
+ url->part_order = cbd->cur_part_order ++;
+
+ if (cbd->cur_url_order) {
+ url->order = (*cbd->cur_url_order)++; /* bump the shared counter itself; `*(p)++` would advance the pointer instead */
+ }
}
return TRUE;
@@ -3542,6 +3553,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset,
if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false) &&
cbd->part->mime_part->urls) {
+ url->part_order = cbd->cur_part_order ++;
+
+ if (cbd->cur_url_order) {
+ url->order = (*cbd->cur_url_order)++; /* bump the shared counter itself; `*(p)++` would advance the pointer instead */
+ }
g_ptr_array_add (cbd->part->mime_part->urls, url);
}
@@ -3564,6 +3580,7 @@ void
rspamd_url_text_extract (rspamd_mempool_t *pool,
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
+ uint16_t *cur_url_order,
enum rspamd_url_find_type how)
{
struct rspamd_url_mimepart_cbdata mcbd;
@@ -3576,6 +3593,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool,
mcbd.task = task;
mcbd.part = part;
mcbd.url_len = 0;
+ mcbd.cur_url_order = cur_url_order;
+ mcbd.cur_part_order = 0;
rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data,
part->utf_stripped_content->len, how, part->newlines,
diff --git a/src/libserver/url.h b/src/libserver/url.h
index 7a005efd8..f3d561736 100644
--- a/src/libserver/url.h
+++ b/src/libserver/url.h
@@ -82,6 +82,11 @@ struct rspamd_url {
uint16_t count;
uint16_t urllen;
uint16_t rawlen;
+
+ /* Absolute order of the URL in a message */
+ uint16_t order;
+ /* Order of the URL in a specific part of message */
+ uint16_t part_order;
};
/**
@@ -156,6 +161,7 @@ void rspamd_url_deinit(void);
void rspamd_url_text_extract(rspamd_mempool_t *pool,
struct rspamd_task *task,
struct rspamd_mime_text_part *part,
+ uint16_t *cur_url_order, /* message-wide URL order counter; NULL skips global ordering */
enum rspamd_url_find_type how);
/*