diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-26 22:54:24 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-26 22:54:24 +0100 |
commit | 1a7b7d7076f41651444832f693606f5eca39a624 (patch) | |
tree | 9cf0d1d08e5a46feeb4c7a009b2741d38130c03a /src | |
parent | 017040605f9d14a94d86b9b676dd512ce65d2d1b (diff) | |
download | rspamd-1a7b7d7076f41651444832f693606f5eca39a624.tar.gz rspamd-1a7b7d7076f41651444832f693606f5eca39a624.zip |
[Feature] Add html parsing limit
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/message.c | 2 | ||||
-rw-r--r-- | src/libserver/cfg_file.h | 1 | ||||
-rw-r--r-- | src/libserver/cfg_rcl.c | 6 | ||||
-rw-r--r-- | src/libserver/cfg_utils.c | 2 | ||||
-rw-r--r-- | src/libserver/html/html.cxx | 46 | ||||
-rw-r--r-- | src/libserver/html/html.h | 2 |
6 files changed, 48 insertions, 11 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index ec49b3b5e..ad2cccf92 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -766,7 +766,7 @@ rspamd_message_process_html_text_part (struct rspamd_task *task, text_part->html = rspamd_html_process_part_full ( - task->task_pool, + task, text_part->utf_raw_content, &text_part->exceptions, MESSAGE_FIELD (task, urls), diff --git a/src/libserver/cfg_file.h b/src/libserver/cfg_file.h index 44502ebb7..d7c3789e7 100644 --- a/src/libserver/cfg_file.h +++ b/src/libserver/cfg_file.h @@ -478,6 +478,7 @@ struct rspamd_config { gint max_recipients; /**< maximum number of recipients to be processed */ guint max_blas_threads; /**< maximum threads for openblas when learning ANN */ guint max_opts_len; /**< maximum length for all options for a symbol */ + gsize max_html_len; /**< maximum length of HTML document */ struct module_s **compiled_modules; /**< list of compiled C modules */ struct worker_s **compiled_workers; /**< list of compiled C modules */ diff --git a/src/libserver/cfg_rcl.c b/src/libserver/cfg_rcl.c index 01c2a6ad1..08d534eb3 100644 --- a/src/libserver/cfg_rcl.c +++ b/src/libserver/cfg_rcl.c @@ -1920,6 +1920,12 @@ rspamd_rcl_config_init (struct rspamd_config *cfg, GHashTable *skip_sections) RSPAMD_CL_FLAG_UINT, "Maximum length of the word to be considered in statistics/fuzzy"); rspamd_rcl_add_default_handler (sub, + "max_html_len", + rspamd_rcl_parse_struct_integer, + G_STRUCT_OFFSET (struct rspamd_config, max_word_len), + RSPAMD_CL_FLAG_INT_SIZE, + "Maximum length of the html part to be parsed"); + rspamd_rcl_add_default_handler (sub, "words_decay", rspamd_rcl_parse_struct_integer, G_STRUCT_OFFSET (struct rspamd_config, words_decay), diff --git a/src/libserver/cfg_utils.c b/src/libserver/cfg_utils.c index 09e2ab158..67bc97070 100644 --- a/src/libserver/cfg_utils.c +++ b/src/libserver/cfg_utils.c @@ -75,6 +75,7 @@ #define DEFAULT_MAX_SHOTS 100 #define DEFAULT_MAX_SESSIONS 100 #define 
DEFAULT_MAX_WORKERS 4 +#define DEFAULT_MAX_HTML_SIZE DEFAULT_MAX_MESSAGE / 5 /* 10 Mb */ /* Timeout for task processing */ #define DEFAULT_TASK_TIMEOUT 8.0 #define DEFAULT_LUA_GC_STEP 200 @@ -243,6 +244,7 @@ rspamd_config_new (enum rspamd_config_init_flags flags) cfg->words_decay = DEFAULT_WORDS_DECAY; cfg->min_word_len = DEFAULT_MIN_WORD; cfg->max_word_len = DEFAULT_MAX_WORD; + cfg->max_html_len = DEFAULT_MAX_HTML_SIZE; /* GC limits */ cfg->lua_gc_pause = DEFAULT_LUA_GC_PAUSE; diff --git a/src/libserver/html/html.cxx b/src/libserver/html/html.cxx index e2f484804..91a59c8d0 100644 --- a/src/libserver/html/html.cxx +++ b/src/libserver/html/html.cxx @@ -22,6 +22,8 @@ #include "html.hxx" #include "libserver/css/css_value.hxx" #include "libserver/css/css.hxx" +#include "libserver/task.h" +#include "libserver/cfg_file.h" #include "url.h" #include "contrib/libucl/khash.h" @@ -1321,7 +1323,7 @@ html_append_tag_content(rspamd_mempool_t *pool, } auto -html_process_input(rspamd_mempool_t *pool, +html_process_input(struct rspamd_task *task, GByteArray *in, GList **exceptions, khash_t (rspamd_url_hash) *url_set, @@ -1334,8 +1336,11 @@ html_process_input(rspamd_mempool_t *pool, guint obrace = 0, ebrace = 0; struct rspamd_url *url = nullptr; gint href_offset = -1; + auto overflow_input = false; struct html_tag *cur_tag = nullptr, *parent_tag = nullptr, cur_closing_tag; struct tag_content_parser_state content_parser_env; + auto process_size = in->len; + enum { parse_start = 0, @@ -1364,10 +1369,20 @@ html_process_input(rspamd_mempool_t *pool, } html_document_state = html_document_state::doctype; g_assert (in != NULL); - g_assert (pool != NULL); + g_assert (task != NULL); + + auto *pool = task->task_pool; - struct html_content *hc = new html_content; - rspamd_mempool_add_destructor(pool, html_content::html_content_dtor, hc); + auto *hc = new html_content; + rspamd_mempool_add_destructor(task->task_pool, html_content::html_content_dtor, hc); + + if (task->cfg && in->len > 
task->cfg->max_html_len) { + msg_notice_task("html input is too big: %z, limit is %z", + in->len, + task->cfg->max_html_len); + process_size = task->cfg->max_html_len; + overflow_input = true; + } auto new_tag = [&](int flags = 0) -> struct html_tag * { @@ -1525,7 +1540,7 @@ html_process_input(rspamd_mempool_t *pool, p = (const char *) in->data; c = p; - end = p + in->len; + end = p + process_size; start = c; while (p < end) { @@ -2140,8 +2155,17 @@ html_process_input(rspamd_mempool_t *pool, break; } + if (overflow_input) { + /* + * Append the rest of the input as raw html, this might work as + * further algorithms can skip words when there are too many. + * It is still unclear about urls though... + */ + hc->parsed.append(end, in->len - process_size); + } + if (!hc->parsed.empty()) { - /* Trim extra spaces at the at the end if needed */ + /* Trim extra spaces at the end if needed */ if (g_ascii_isspace(hc->parsed.back())) { auto last_it = std::end(hc->parsed); @@ -2244,13 +2268,13 @@ html_tag::get_content(const struct html_content *hc) const -> std::string_view } void * -rspamd_html_process_part_full(rspamd_mempool_t *pool, +rspamd_html_process_part_full(struct rspamd_task *task, GByteArray *in, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls, bool allow_css) { - return rspamd::html::html_process_input(pool, in, exceptions, url_set, + return rspamd::html::html_process_input(task, in, exceptions, url_set, part_urls, allow_css); } @@ -2258,7 +2282,11 @@ void * rspamd_html_process_part(rspamd_mempool_t *pool, GByteArray *in) { - return rspamd_html_process_part_full (pool, in, NULL, + struct rspamd_task fake_task; + memset(&fake_task, 0, sizeof(fake_task)); + fake_task.task_pool = pool; + + return rspamd_html_process_part_full (&fake_task, in, NULL, NULL, NULL, FALSE); } diff --git a/src/libserver/html/html.h b/src/libserver/html/html.h index 8b690499e..2a43223f9 100644 --- a/src/libserver/html/html.h +++ 
b/src/libserver/html/html.h @@ -70,7 +70,7 @@ guint rspamd_html_decode_entitles_inplace(gchar *s, gsize len); void* rspamd_html_process_part(rspamd_mempool_t *pool, GByteArray *in); -void *rspamd_html_process_part_full(rspamd_mempool_t *pool, +void *rspamd_html_process_part_full(struct rspamd_task *task, GByteArray *in, GList **exceptions, khash_t (rspamd_url_hash) *url_set, GPtrArray *part_urls, |