diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-10-24 13:08:29 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-10-24 13:08:56 +0100 |
commit | 26eff813e8e951bc1470ed4667ecaabd0aa0588f (patch) | |
tree | 792e25374e931a08ea7c06991cae39248326b95f /src/libstat/tokenizers/tokenizers.c | |
parent | c52e22c7ce4a371076fe0f403144484a3be59ef2 (diff) | |
download | rspamd-26eff813e8e951bc1470ed4667ecaabd0aa0588f.tar.gz rspamd-26eff813e8e951bc1470ed4667ecaabd0aa0588f.zip |
[Minor] Add safety check when using icu ubrk iterators
Diffstat (limited to 'src/libstat/tokenizers/tokenizers.c')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 46 |
1 files changed, 40 insertions, 6 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index acd3c5739..c533534ed 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -285,7 +285,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 		struct rspamd_config *cfg,
 		GList *exceptions,
 		guint64 *hash,
-		GArray *cur_words)
+		GArray *cur_words,
+		rspamd_mempool_t *pool)
 {
 	rspamd_stat_token_t token, buf;
 	const gchar *pos = NULL;
@@ -359,7 +360,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 				ev_tstamp now = ev_time ();
 
 				if (now - start > max_exec_time) {
-					msg_warn ("too long time has been spent on tokenization:"
+					msg_warn_pool_check (
+							"too long time has been spent on tokenization:"
 							" %.1f ms, limit is %.1f ms; %d words added so far",
 							(now - start) * 1e3, max_exec_time * 1e3,
 							res->len);
@@ -373,7 +375,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 
 			if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
 				/* Due to bug in glib ! */
-				msg_err ("too many words found: %d, stop tokenization to avoid DoS",
+				msg_err_pool_check (
+						"too many words found: %d, stop tokenization to avoid DoS",
 						res->len);
 
 				goto end;
@@ -420,7 +423,17 @@ start_over:
 						if (last > p) {
 							/* Exception spread over the boundaries */
 							while (last > p && p != UBRK_DONE) {
+								gint32 old_p = p;
 								p = ubrk_next (bi);
+
+								if (p <= old_p) {
+									msg_warn_pool_check (
+											"tokenization reversed back on position %d,"
+											"%d new position (%d backward), likely libicu bug!",
+											(gint)(p), (gint)(old_p), old_p - p);
+
+									goto end;
+								}
 							}
 
 							/* We need to reset our scan with new p and last */
@@ -450,7 +463,16 @@ start_over:
 						if (last > p) {
 							/* Exception spread over the boundaries */
 							while (last > p && p != UBRK_DONE) {
+								gint32 old_p = p;
 								p = ubrk_next (bi);
+								if (p <= old_p) {
+									msg_warn_pool_check (
+											"tokenization reversed back on position %d,"
+											"%d new position (%d backward), likely libicu bug!",
+											(gint)(p), (gint)(old_p), old_p - p);
+
+									goto end;
+								}
 							}
 							/* We need to reset our scan with new p and last */
 							SHIFT_EX;
@@ -531,7 +553,8 @@ start_over:
 				ev_tstamp now = ev_time ();
 
 				if (now - start > max_exec_time) {
-					msg_warn ("too long time has been spent on tokenization:"
+					msg_warn_pool_check (
+							"too long time has been spent on tokenization:"
 							" %.1f ms, limit is %.1f ms; %d words added so far",
 							(now - start) * 1e3, max_exec_time * 1e3,
 							res->len);
@@ -543,6 +566,14 @@ start_over:
 
 			last = p;
 			p = ubrk_next (bi);
+
+			if (p <= last) {
+				msg_warn_pool_check ("tokenization reversed back on position %d,"
+						"%d new position (%d backward), likely libicu bug!",
+						(gint)(p), (gint)(last), last - p);
+
+				goto end;
+			}
 		}
 	}
 
@@ -599,14 +630,17 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len,
 
 		task->meta_words = rspamd_tokenize_text (beg, len,
 				&utxt, RSPAMD_TOKENIZE_UTF,
-				task->cfg, NULL, NULL, task->meta_words);
+				task->cfg, NULL, NULL,
+				task->meta_words,
+				task->task_pool);
 
 		utext_close (&utxt);
 	}
 	else {
 		task->meta_words = rspamd_tokenize_text (beg, len,
 				NULL, RSPAMD_TOKENIZE_RAW,
-				task->cfg, NULL, NULL, task->meta_words);
+				task->cfg, NULL, NULL, task->meta_words,
+				task->task_pool);
 	}
 }
 