diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-07-25 14:08:45 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-07-25 14:08:45 +0100 |
commit | 6d646284c42427dde70440c9caebdce45bf9fd23 (patch) | |
tree | 481a59c962b2dc99a7a3696ba7c42450aa03b171 | |
parent | 747afaaa805f90d58b4330eb32119480fe40d3db (diff) | |
download | rspamd-6d646284c42427dde70440c9caebdce45bf9fd23.tar.gz rspamd-6d646284c42427dde70440c9caebdce45bf9fd23.zip |
[Minor] Add long texts sanity checks
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 55 |
1 file changed, 54 insertions, 1 deletion
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 550ed2097..12b860b67 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -298,14 +298,25 @@ rspamd_tokenize_text (const gchar *text, gsize len, GList *cur = exceptions; guint min_len = 0, max_len = 0, word_decay = 0, initial_size = 128; guint64 hv = 0; - gboolean decay = FALSE; + gboolean decay = FALSE, long_text_mode = FALSE; guint64 prob = 0; static UBreakIterator* bi = NULL; + static const gsize long_text_limit = 1 * 1024 * 1024; + static const ev_tstamp max_exec_time = 0.2; /* 200 ms */ + ev_tstamp start; if (text == NULL) { return cur_words; } + if (len > long_text_limit) { + /* + * In this mode we do additional checks to avoid performance issues + */ + long_text_mode = TRUE; + start = ev_time (); + } + buf.original.begin = text; buf.original.len = len; buf.flags = 0; @@ -347,7 +358,31 @@ rspamd_tokenize_text (const gchar *text, gsize len, } } + if (long_text_mode) { + if ((res->len + 1) % 16 == 0) { + ev_tstamp now = ev_time (); + + if (now - start > max_exec_time) { + msg_warn ("too long time has been spent on tokenization:" + " %.1f ms, limit is %.1f ms; %d words added so far", + (now - start) * 1e3, max_exec_time * 1e3, + res->len); + + goto end; + } + } + } + g_array_append_val (res, token); + + if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) { + /* Due to bug in glib ! */ + msg_err ("too many words found: %d, stop tokenization to avoid DoS", + res->len); + + goto end; + } + token.original.begin = pos; } } @@ -482,6 +517,7 @@ start_over: } if (token.original.len > 0) { + /* Additional check for number of words */ if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) { /* Due to bug in glib ! 
*/ msg_err ("too many words found: %d, stop tokenization to avoid DoS", @@ -489,9 +525,26 @@ start_over: goto end; } + g_array_append_val (res, token); } + /* Also check for long text mode */ + if (long_text_mode) { + if ((res->len + 1) % 16 == 0) { + ev_tstamp now = ev_time (); + + if (now - start > max_exec_time) { + msg_warn ("too long time has been spent on tokenization:" + " %.1f ms, limit is %.1f ms; %d words added so far", + (now - start) * 1e3, max_exec_time * 1e3, + res->len); + + goto end; + } + } + } + last = p; p = ubrk_next (bi); } |