diff options
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 48 |
1 files changed, 28 insertions, 20 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 9babfc8a1..0902ceb05 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -225,6 +225,15 @@ rspamd_utf_word_valid (const gchar *text, const gchar *end, return FALSE; } +#define SHIFT_EX do { \ + cur = g_list_next (cur); \ + if (cur) { \ + ex = (struct rspamd_process_exception *) cur->data; \ + } \ + else { \ + ex = NULL; \ + } \ +} while(0) GArray * rspamd_tokenize_text (const gchar *text, gsize len, @@ -278,7 +287,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; - } else { + } + else { token.begin = pos; continue; } @@ -322,16 +332,6 @@ start_over: /* We have an exception at the beginning, skip those */ last += ex->len; - if (last > p) { - /* Exception spread over the boundaries */ - while (last > p && p != UBRK_DONE) { - p = ubrk_next (bi); - } - - /* We need to reset our scan with new p and last */ - goto start_over; - } - if (ex->type == RSPAMD_EXCEPTION_URL) { token.begin = "!!EX!!"; token.len = sizeof ("!!EX!!") - 1; @@ -341,11 +341,18 @@ start_over: token.flags = 0; } - cur = g_list_next (cur); + if (last > p) { + /* Exception spread over the boundaries */ + while (last > p && p != UBRK_DONE) { + p = ubrk_next (bi); + } - if (cur) { - ex = (struct rspamd_process_exception *) cur->data; + /* We need to reset our scan with new p and last */ + SHIFT_EX; + goto start_over; } + + SHIFT_EX; } /* Now, we can have an exception within boundary again */ @@ -360,7 +367,7 @@ start_over: } /* Process the current exception */ - last += ex->len + token.len; + last += ex->len + (ex->pos - last); if (ex->type == RSPAMD_EXCEPTION_URL) { token.begin = "!!EX!!"; @@ -376,8 +383,11 @@ start_over: p = ubrk_next (bi); } /* We need to reset our scan with new p and last */ + SHIFT_EX; goto start_over; } + + SHIFT_EX; } else if (p > last) { if (rspamd_utf_word_valid (text, text + len, last, p)) { @@ -391,11 +401,7 @@ start_over: /* Forward exceptions list */ while (cur && ex->pos <= last) { /* We have an exception at the beginning, skip those */ - cur = g_list_next (cur); - - if (cur) { - ex = (struct rspamd_process_exception *) cur->data; - } + SHIFT_EX; } if (rspamd_utf_word_valid (text, text + len, last, p)) { @@ -450,6 +456,8 @@ start_over: return res; } +#undef SHIFT_EX + /* * vi:ts=4 */ |