diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-07 09:26:27 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-07 09:26:27 +0100 |
commit | 2b9a86ba167d3d3508c7a6ee76d24245332386b4 (patch) | |
tree | 7893426967d7066fe05a89972fadf378552232e6 /src/libstat/tokenizers | |
parent | c31f8bf12bff61c9422de9eeff0292c6ac339c5e (diff) | |
download | rspamd-2b9a86ba167d3d3508c7a6ee76d24245332386b4.tar.gz rspamd-2b9a86ba167d3d3508c7a6ee76d24245332386b4.zip |
[Minor] Further fixes in tokenization algorithm
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 48 |
1 file changed, 28 insertions, 20 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 9babfc8a1..0902ceb05 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -225,6 +225,15 @@ rspamd_utf_word_valid (const gchar *text, const gchar *end, return FALSE; } +#define SHIFT_EX do { \ + cur = g_list_next (cur); \ + if (cur) { \ + ex = (struct rspamd_process_exception *) cur->data; \ + } \ + else { \ + ex = NULL; \ + } \ +} while(0) GArray * rspamd_tokenize_text (const gchar *text, gsize len, @@ -278,7 +287,8 @@ rspamd_tokenize_text (const gchar *text, gsize len, &hv, &prob, &token, pos - text, len)) { if (!decay) { decay = TRUE; - } else { + } + else { token.begin = pos; continue; } @@ -322,16 +332,6 @@ start_over: /* We have an exception at the beginning, skip those */ last += ex->len; - if (last > p) { - /* Exception spread over the boundaries */ - while (last > p && p != UBRK_DONE) { - p = ubrk_next (bi); - } - - /* We need to reset our scan with new p and last */ - goto start_over; - } - if (ex->type == RSPAMD_EXCEPTION_URL) { token.begin = "!!EX!!"; token.len = sizeof ("!!EX!!") - 1; @@ -341,11 +341,18 @@ start_over: token.flags = 0; } - cur = g_list_next (cur); + if (last > p) { + /* Exception spread over the boundaries */ + while (last > p && p != UBRK_DONE) { + p = ubrk_next (bi); + } - if (cur) { - ex = (struct rspamd_process_exception *) cur->data; + /* We need to reset our scan with new p and last */ + SHIFT_EX; + goto start_over; } + + SHIFT_EX; } /* Now, we can have an exception within boundary again */ @@ -360,7 +367,7 @@ start_over: } /* Process the current exception */ - last += ex->len + token.len; + last += ex->len + (ex->pos - last); if (ex->type == RSPAMD_EXCEPTION_URL) { token.begin = "!!EX!!"; @@ -376,8 +383,11 @@ start_over: p = ubrk_next (bi); } /* We need to reset our scan with new p and last */ + SHIFT_EX; goto start_over; } + + SHIFT_EX; } else if (p > last) { if (rspamd_utf_word_valid (text, text 
+ len, last, p)) { @@ -391,11 +401,7 @@ start_over: /* Forward exceptions list */ while (cur && ex->pos <= last) { /* We have an exception at the beginning, skip those */ - cur = g_list_next (cur); - - if (cur) { - ex = (struct rspamd_process_exception *) cur->data; - } + SHIFT_EX; } if (rspamd_utf_word_valid (text, text + len, last, p)) { @@ -450,6 +456,8 @@ start_over: return res; } +#undef SHIFT_EX + /* * vi:ts=4 */ |