From 5fd7a90fdaa33f52c59bdb0ca84451e5c1e22365 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 25 Jul 2023 15:56:43 +0100 Subject: [PATCH] [Feature] Add order to urls structure --- src/libmime/message.c | 23 +++++++++++++---------- src/libserver/html/html.cxx | 16 ++++++++++++---- src/libserver/html/html.h | 3 ++- src/libserver/html/html.hxx | 3 ++- src/libserver/html/html_tests.cxx | 6 +++--- src/libserver/url.c | 19 +++++++++++++++++++ src/libserver/url.h | 6 ++++++ 7 files changed, 57 insertions(+), 19 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index 5ab712283..508ea27ea 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -763,7 +763,8 @@ rspamd_message_process_plain_text_part (struct rspamd_task *task, static gboolean rspamd_message_process_html_text_part (struct rspamd_task *task, - struct rspamd_mime_text_part *text_part) + struct rspamd_mime_text_part *text_part, + uint16_t *cur_url_order) { text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; @@ -786,7 +787,8 @@ rspamd_message_process_html_text_part (struct rspamd_task *task, &text_part->exceptions, MESSAGE_FIELD (task, urls), text_part->mime_part->urls, - task->cfg ? task->cfg->enable_css_parser : true); + task->cfg ? 
task->cfg->enable_css_parser : true, + cur_url_order); rspamd_html_get_parsed_content(text_part->html, &text_part->utf_content); if (text_part->utf_content.len == 0) { @@ -842,7 +844,8 @@ rspamd_message_part_can_be_parsed_as_text (struct rspamd_task *task, static gboolean rspamd_message_process_text_part_maybe (struct rspamd_task *task, struct rspamd_mime_part *mime_part, - enum rspamd_message_part_is_text_result is_text) + enum rspamd_message_part_is_text_result is_text, + uint16_t *cur_url_order) { struct rspamd_mime_text_part *text_part; guint flags = 0; @@ -864,7 +867,7 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, text_part->flags |= flags; if (is_text == RSPAMD_MESSAGE_PART_IS_TEXT_HTML) { - if (!rspamd_message_process_html_text_part (task, text_part)) { + if (!rspamd_message_process_html_text_part (task, text_part, cur_url_order)) { return FALSE; } } @@ -911,14 +914,14 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, * Use strict extraction mode: we will extract missing urls from * an html part if needed */ - rspamd_url_text_extract (task->task_pool, task, text_part, + rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order, RSPAMD_URL_FIND_STRICT); } else { /* * Fall back to full text extraction using TLD patterns */ - rspamd_url_text_extract (task->task_pool, task, text_part, + rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order, RSPAMD_URL_FIND_ALL); } } @@ -926,12 +929,12 @@ rspamd_message_process_text_part_maybe (struct rspamd_task *task, /* * Fall back to full text extraction using TLD patterns */ - rspamd_url_text_extract (task->task_pool, task, text_part, + rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order, RSPAMD_URL_FIND_ALL); } } else { - rspamd_url_text_extract (task->task_pool, task, text_part, + rspamd_url_text_extract (task->task_pool, task, text_part, cur_url_order, RSPAMD_URL_FIND_STRICT); } @@ -1487,13 +1490,14 @@ rspamd_message_process 
(struct rspamd_task *task) } } + uint16_t cur_url_order = 0; g_array_sort(detected_text_parts, rspamd_mime_text_part_position_compare_func); /* One more iteration to process text parts in a more specific order */ for (i = 0; i < detected_text_parts->len; i ++) { part = g_ptr_array_index (MESSAGE_FIELD (task, parts), g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).pos); rspamd_message_process_text_part_maybe(task, part, - g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res); + g_array_index(detected_text_parts, struct rspamd_mime_part_text_position, i).res, &cur_url_order); } g_array_free (detected_text_parts, TRUE); @@ -1640,7 +1644,6 @@ rspamd_message_process (struct rspamd_task *task) } rspamd_images_link (task); - rspamd_tokenize_meta_words (task); } diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index a848a25d3..ed034b928 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -1328,7 +1328,8 @@ html_process_input(struct rspamd_task *task, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls, - bool allow_css) -> html_content * + bool allow_css, + std::uint16_t *cur_url_order) -> html_content * { const gchar *p, *c, *end, *start; guchar t; @@ -1372,6 +1373,7 @@ html_process_input(struct rspamd_task *task, g_assert (task != NULL); auto *pool = task->task_pool; + auto cur_url_part_order = 0u; auto *hc = new html_content; rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc); @@ -1472,6 +1474,10 @@ html_process_input(struct rspamd_task *task, struct rspamd_url *maybe_existing = rspamd_url_set_add_or_return(url_set, maybe_url.value()); if (maybe_existing == maybe_url.value()) { + if (cur_url_order) { + url->order = (*cur_url_order)++; /* bump the shared counter value, not the pointer */ + } + url->part_order = cur_url_part_order++; html_process_query_url(pool, url, url_set, part_urls); } @@ -2273,10 +2279,11 @@ rspamd_html_process_part_full(struct rspamd_task 
*task, GByteArray *in, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls, - bool allow_css) + bool allow_css, + uint16_t *cur_url_order) { return rspamd::html::html_process_input(task, in, exceptions, url_set, - part_urls, allow_css); + part_urls, allow_css, cur_url_order); } void * @@ -2286,9 +2293,10 @@ rspamd_html_process_part(rspamd_mempool_t *pool, struct rspamd_task fake_task; memset(&fake_task, 0, sizeof(fake_task)); fake_task.task_pool = pool; + uint16_t order = 0; return rspamd_html_process_part_full (&fake_task, in, NULL, - NULL, NULL, FALSE); + NULL, NULL, FALSE, &order); } guint diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h index 2a43223f9..17067b3b1 100644 --- a/src/libserver/html/html.h +++ b/src/libserver/html/html.h @@ -74,7 +74,8 @@ void *rspamd_html_process_part_full(struct rspamd_task *task, GByteArray *in, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls, - bool allow_css); + bool allow_css, + uint16_t *cur_url_order); /* * Returns true if a specified tag has been seen in a part diff --git a/src/libserver/html/html.hxx b/src/libserver/html/html.hxx index 34008aaf7..c119adc3f 100644 --- a/src/libserver/html/html.hxx +++ b/src/libserver/html/html.hxx @@ -132,7 +132,8 @@ auto html_process_input(struct rspamd_task *task, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls, - bool allow_css) -> html_content *; + bool allow_css, + std::uint16_t *cur_url_order) -> html_content *; auto html_debug_structure(const html_content &hc) -> std::string; } diff --git a/src/libserver/html/html_tests.cxx b/src/libserver/html/html_tests.cxx index 9ab13ee78..2492337bf 100644 --- a/src/libserver/html/html_tests.cxx +++ b/src/libserver/html/html_tests.cxx @@ -58,7 +58,7 @@ TEST_CASE("html parsing") SUBCASE((std::string("extract tags from: ") + c.first).c_str()) { GByteArray *tmp = g_byte_array_sized_new(c.first.size()); g_byte_array_append(tmp, (const guint8 *) 
c.first.data(), c.first.size()); - auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr); CHECK(hc != nullptr); auto dump = html_debug_structure(*hc); CHECK(c.second == dump); @@ -215,7 +215,7 @@ TEST_CASE("html text extraction") SUBCASE((fmt::format("html extraction case {}", i)).c_str()) { GByteArray *tmp = g_byte_array_sized_new(c.first.size()); g_byte_array_append(tmp, (const guint8 *) c.first.data(), c.first.size()); - auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, nullptr, true, nullptr); CHECK(hc != nullptr); replace_newlines(hc->parsed); auto expected = c.second; @@ -259,7 +259,7 @@ TEST_CASE("html urls extraction") auto input = std::get<0>(c); GByteArray *tmp = g_byte_array_sized_new(input.size()); g_byte_array_append(tmp, (const guint8 *)input.data(), input.size()); - auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true); + auto *hc = html_process_input(&fake_task, tmp, nullptr, nullptr, purls, true, nullptr); CHECK(hc != nullptr); auto &expected_text = std::get<2>(c); if (expected_text.has_value()) { diff --git a/src/libserver/url.c b/src/libserver/url.c index 33198b861..d5dafeaea 100644 --- a/src/libserver/url.c +++ b/src/libserver/url.c @@ -2244,6 +2244,9 @@ rspamd_url_parse (struct rspamd_url *uri, memset (uri, 0, sizeof (*uri)); memset (&u, 0, sizeof (u)); uri->count = 1; + /* Undefine order */ + uri->order = -1; + uri->part_order = -1; if (*uristring == '\0') { return URI_ERRNO_EMPTY; @@ -3453,6 +3456,8 @@ struct rspamd_url_mimepart_cbdata { struct rspamd_task *task; struct rspamd_mime_text_part *part; gsize url_len; + uint16_t *cur_url_order; /* Global ordering */ + uint16_t cur_part_order; /* Per part ordering */ }; static gboolean @@ -3488,6 +3493,12 @@ rspamd_url_query_callback (struct rspamd_url *url, 
gsize start_offset, if (cbd->part && cbd->part->mime_part->urls) { g_ptr_array_add (cbd->part->mime_part->urls, url); } + + url->part_order = cbd->cur_part_order ++; + + if (cbd->cur_url_order) { + url->order = (*cbd->cur_url_order)++; /* bump the shared counter value, not the pointer */ + } } return TRUE; @@ -3542,6 +3553,11 @@ rspamd_url_text_part_callback (struct rspamd_url *url, gsize start_offset, if (rspamd_url_set_add_or_increase(MESSAGE_FIELD (task, urls), url, false) && cbd->part->mime_part->urls) { + url->part_order = cbd->cur_part_order ++; + + if (cbd->cur_url_order) { + url->order = (*cbd->cur_url_order)++; /* bump the shared counter value, not the pointer */ + } g_ptr_array_add (cbd->part->mime_part->urls, url); } @@ -3564,6 +3580,7 @@ void rspamd_url_text_extract (rspamd_mempool_t *pool, struct rspamd_task *task, struct rspamd_mime_text_part *part, + uint16_t *cur_url_order, enum rspamd_url_find_type how) { struct rspamd_url_mimepart_cbdata mcbd; @@ -3576,6 +3593,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, mcbd.task = task; mcbd.part = part; mcbd.url_len = 0; + mcbd.cur_url_order = cur_url_order; + mcbd.cur_part_order = 0; rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data, part->utf_stripped_content->len, how, part->newlines, diff --git a/src/libserver/url.h b/src/libserver/url.h index 7a005efd8..f3d561736 100644 --- a/src/libserver/url.h +++ b/src/libserver/url.h @@ -82,6 +82,11 @@ struct rspamd_url { uint16_t count; uint16_t urllen; uint16_t rawlen; + + /* Absolute order of the URL in a message */ + uint16_t order; + /* Order of the URL in a specific part of message */ + uint16_t part_order; }; /** @@ -156,6 +161,7 @@ void rspamd_url_deinit(void); void rspamd_url_text_extract(rspamd_mempool_t *pool, struct rspamd_task *task, struct rspamd_mime_text_part *part, + uint16_t *cur_order, enum rspamd_url_find_type how); /* -- 2.39.5