[Minor] Add safety check when using icu ubrk iterators
Author:     Vsevolod Stakhov <vsevolod@highsecure.ru>
AuthorDate: Thu, 24 Oct 2019 12:08:29 +0000 (13:08 +0100)
Commit:     Vsevolod Stakhov <vsevolod@highsecure.ru>
CommitDate: Thu, 24 Oct 2019 12:08:56 +0000 (13:08 +0100)
src/libmime/message.c
src/libstat/tokenizers/tokenizers.c
src/libstat/tokenizers/tokenizers.h
src/lua/lua_util.c

src/libmime/message.c
index cfa8cf97de4734291564e3eb1ce486288be0392c..648fa82c517e9c1a66d5ea2a8143bc5078a31c2a 100644
@@ -187,7 +187,9 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
                        &part->utf_stripped_text,
                        tok_type, task->cfg,
                        part->exceptions,
-                       NULL, NULL);
+                       NULL,
+                       NULL,
+                       task->task_pool);
 
 
        if (part->utf_words) {
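The only change in message.c is the new trailing argument: rspamd_tokenize_text () now also receives task->task_pool. Judging by the logging changes in tokenizers.c below (msg_warn becomes msg_warn_pool_check, msg_err becomes msg_err_pool_check), the pool is threaded through so the tokenizer's warnings can carry the owning task's context when a pool is available, and degrade to plain global logging when it is not (the Lua binding at the end of this commit passes NULL). A minimal sketch of that idea, using a hypothetical warn_pool_check () shim that is NOT the actual rspamd macro:

#include <stdio.h>
#include <stdarg.h>

/* Stand-in for rspamd_mempool_t; illustration only */
struct fake_pool {
	const char *tag;
};

/* Hypothetical shim, not the real msg_warn_pool_check: prefix the
 * message with the pool's tag when a pool is supplied, otherwise
 * fall back to an untagged global warning */
static void
warn_pool_check (const struct fake_pool *pool, const char *fmt, ...)
{
	va_list ap;

	fprintf (stderr, "warning [%s]: ", pool ? pool->tag : "global");
	va_start (ap, fmt);
	vfprintf (stderr, fmt, ap);
	va_end (ap);
	fputc ('\n', stderr);
}

int
main (void)
{
	struct fake_pool task_pool = { "task" };

	warn_pool_check (&task_pool, "too many words found: %d", 42);
	warn_pool_check (NULL, "tokenization warning with no pool");

	return 0;
}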
src/libstat/tokenizers/tokenizers.c
index acd3c57392ad39d68b2dbf24d24aebbd5ff66955..c533534edbc64fdda8ce0bee08dd794f93dbb75f 100644
@@ -285,7 +285,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
                                          struct rspamd_config *cfg,
                                          GList *exceptions,
                                          guint64 *hash,
-                                         GArray *cur_words)
+                                         GArray *cur_words,
+                                         rspamd_mempool_t *pool)
 {
        rspamd_stat_token_t token, buf;
        const gchar *pos = NULL;
@@ -359,7 +360,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
                                        ev_tstamp now = ev_time ();
 
                                        if (now - start > max_exec_time) {
-                                               msg_warn ("too long time has been spent on tokenization:"
+                                               msg_warn_pool_check (
+                                                               "too long time has been spent on tokenization:"
                                                                  " %.1f ms, limit is %.1f ms; %d words added so far",
                                                                (now - start) * 1e3, max_exec_time * 1e3,
                                                                res->len);
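This warning (and its twin later in the same function) sits inside an existing time budget: the tokenizer samples ev_time () as it runs and gives up once max_exec_time is exceeded, now reporting through the pool-aware macro. A stripped-down sketch of the budget pattern, assuming libev's ev_time (); the loop body, the limit, and the check cadence are placeholders, not taken from the commit:

#include <ev.h>
#include <stdio.h>

int
main (void)
{
	const ev_tstamp max_exec_time = 0.2;	/* placeholder limit, seconds */
	ev_tstamp start = ev_time ();
	unsigned long words = 0;

	for (;;) {
		/* ... produce one token here ... */
		words ++;

		/* sample the clock only occasionally to keep its
		 * cost off the hot path */
		if ((words & 0x3FF) == 0) {
			ev_tstamp now = ev_time ();

			if (now - start > max_exec_time) {
				fprintf (stderr, "too long time has been spent on tokenization:"
						" %.1f ms, limit is %.1f ms; %lu words added so far\n",
						(now - start) * 1e3, max_exec_time * 1e3, words);
				break;
			}
		}
	}

	return 0;
}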
@@ -373,7 +375,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
 
                        if (((gsize)res->len) * sizeof (token) > (0x1ull << 30u)) {
                                /* Due to bug in glib ! */
-                               msg_err ("too many words found: %d, stop tokenization to avoid DoS",
+                               msg_err_pool_check (
+                                               "too many words found: %d, stop tokenization to avoid DoS",
                                                res->len);
 
                                goto end;
@@ -420,7 +423,17 @@ start_over:
                                                        if (last > p) {
                                                                /* Exception spread over the boundaries */
                                                                while (last > p && p != UBRK_DONE) {
+                                                                       gint32 old_p = p;
                                                                        p = ubrk_next (bi);
+
+                                                                       if (p <= old_p) {
+                                                                               msg_warn_pool_check (
+                                                                                               "tokenization reversed back on position %d,"
+                                                                                               "%d new position (%d backward), likely libicu bug!",
+                                                                                               (gint)(p), (gint)(old_p), old_p - p);
+
+                                                                               goto end;
+                                                                       }
                                                                }
 
                                                                /* We need to reset our scan with new p and last */
@@ -450,7 +463,16 @@ start_over:
                                                        if (last > p) {
                                                                /* Exception spread over the boundaries */
                                                                while (last > p && p != UBRK_DONE) {
+                                                                       gint32 old_p = p;
                                                                        p = ubrk_next (bi);
+                                                                       if (p <= old_p) {
+                                                                               msg_warn_pool_check (
+                                                                                               "tokenization reversed back on position %d,"
+                                                                                               "%d new position (%d backward), likely libicu bug!",
+                                                                                               (gint)(p), (gint)(old_p), old_p - p);
+
+                                                                               goto end;
+                                                                       }
                                                                }
                                                                /* We need to reset our scan with new p and last */
                                                                SHIFT_EX;
@@ -531,7 +553,8 @@ start_over:
                                        ev_tstamp now = ev_time ();
 
                                        if (now - start > max_exec_time) {
-                                               msg_warn ("too long time has been spent on tokenization:"
+                                               msg_warn_pool_check (
+                                                               "too long time has been spent on tokenization:"
                                                                  " %.1f ms, limit is %.1f ms; %d words added so far",
                                                                (now - start) * 1e3, max_exec_time * 1e3,
                                                                res->len);
@@ -543,6 +566,14 @@ start_over:
 
                        last = p;
                        p = ubrk_next (bi);
+
+                       if (p <= last) {
+                               msg_warn_pool_check ("tokenization reversed back on position %d,"
+                                                "%d new position (%d backward), likely libicu bug!",
+                                               (gint)(p), (gint)(last), last - p);
+
+                               goto end;
+                       }
                }
        }
 
@@ -599,14 +630,17 @@ rspamd_add_metawords_from_str (const gchar *beg, gsize len,
 
                task->meta_words = rspamd_tokenize_text (beg, len,
                                &utxt, RSPAMD_TOKENIZE_UTF,
-                               task->cfg, NULL, NULL, task->meta_words);
+                               task->cfg, NULL, NULL,
+                               task->meta_words,
+                               task->task_pool);
 
                utext_close (&utxt);
        }
        else {
                task->meta_words = rspamd_tokenize_text (beg, len,
                                NULL, RSPAMD_TOKENIZE_RAW,
-                               task->cfg, NULL, NULL, task->meta_words);
+                               task->cfg, NULL, NULL, task->meta_words,
+                               task->task_pool);
        }
 }
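The substantive fix in tokenizers.c is the guard after every ubrk_next () call: an ICU break iterator is supposed to advance strictly forward, so a returned position at or before the previous one (the "likely libicu bug" in the warning text) would otherwise leave these loops spinning forever on the same boundary. Here is the same defensive pattern against the plain ICU C API, as a self-contained sketch; the sample text and locale are assumptions:

#include <unicode/ubrk.h>
#include <unicode/ustring.h>
#include <stdio.h>

int
main (void)
{
	UErrorCode uc_err = U_ZERO_ERROR;
	UChar text[64];
	UBreakIterator *bi;
	int32_t p;

	u_uastrcpy (text, "some sample text to break");
	bi = ubrk_open (UBRK_WORD, "en", text, u_strlen (text), &uc_err);

	if (U_FAILURE (uc_err)) {
		return 1;
	}

	p = ubrk_first (bi);

	while (p != UBRK_DONE) {
		int32_t old_p = p;

		p = ubrk_next (bi);

		/* the commit's safety check: ubrk_next () must move
		 * strictly forward; treat anything else as an iterator
		 * bug and stop instead of risking an endless loop */
		if (p != UBRK_DONE && p <= old_p) {
			fprintf (stderr, "break iterator went backwards: %d -> %d\n",
					old_p, p);
			break;
		}

		if (p != UBRK_DONE) {
			printf ("boundary at %d\n", p);
		}
	}

	ubrk_close (bi);

	return 0;
}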
 
src/libstat/tokenizers/tokenizers.h
index bf4987c7a994ac33b1b14781de5f0a7364305219..ca72618021efe61ede7d424ed8003ee75d664bdf 100644
@@ -50,7 +50,8 @@ GArray *rspamd_tokenize_text (const gchar *text, gsize len,
                                                          struct rspamd_config *cfg,
                                                          GList *exceptions,
                                                          guint64 *hash,
-                                                         GArray *cur_words);
+                                                         GArray *cur_words,
+                                                         rspamd_mempool_t *pool);
 
 /* OSB tokenize function */
 gint rspamd_tokenizer_osb (struct rspamd_stat_ctx *ctx,
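With the prototype extended, every caller has to supply the extra pool argument. A hedged caller sketch against the new signature; the wrapper name and its parameters are placeholders rather than names from the commit, and it needs rspamd's own headers to build:

#include "libstat/tokenizers/tokenizers.h"

/* hypothetical helper, for illustration only */
static GArray *
tokenize_with_pool (const gchar *text, gsize len,
		UText *utxt,
		struct rspamd_config *cfg,
		rspamd_mempool_t *pool)
{
	return rspamd_tokenize_text (text, len,
			utxt, RSPAMD_TOKENIZE_UTF,
			cfg,
			NULL,	/* exceptions */
			NULL,	/* hash */
			NULL,	/* cur_words */
			pool);	/* new argument; may be NULL */
}

Passing NULL for the pool is evidently tolerated: the Lua binding below does exactly that, presumably just losing the pool context for any diagnostics.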
src/lua/lua_util.c
index 461130157f93d41f2d4ddd7e4bab1edb80e09fef..1ea8d380cee750cb2bbe9fac1695f5a8ba65609f 100644
@@ -1322,7 +1322,7 @@ lua_util_tokenize_text (lua_State *L)
                        &utxt,
                        RSPAMD_TOKENIZE_UTF, NULL,
                        exceptions,
-                       NULL, NULL);
+                       NULL, NULL, NULL);
 
        if (res == NULL) {
                lua_pushnil (L);