about | summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-07 09:26:27 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-07 09:26:27 +0100
commit 2b9a86ba167d3d3508c7a6ee76d24245332386b4 (patch)
tree 7893426967d7066fe05a89972fadf378552232e6 /src/libstat/tokenizers
parent c31f8bf12bff61c9422de9eeff0292c6ac339c5e (diff)
download rspamd-2b9a86ba167d3d3508c7a6ee76d24245332386b4.tar.gz
rspamd-2b9a86ba167d3d3508c7a6ee76d24245332386b4.zip
[Minor] Further fixes in tokenization algorithm
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 48
1 files changed, 28 insertions, 20 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index 9babfc8a1..0902ceb05 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -225,6 +225,15 @@ rspamd_utf_word_valid (const gchar *text, const gchar *end,
return FALSE;
}
+#define SHIFT_EX do { \
+ cur = g_list_next (cur); \
+ if (cur) { \
+ ex = (struct rspamd_process_exception *) cur->data; \
+ } \
+ else { \
+ ex = NULL; \
+ } \
+} while(0)
GArray *
rspamd_tokenize_text (const gchar *text, gsize len,
@@ -278,7 +287,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
&hv, &prob, &token, pos - text, len)) {
if (!decay) {
decay = TRUE;
- } else {
+ }
+ else {
token.begin = pos;
continue;
}
@@ -322,16 +332,6 @@ start_over:
/* We have an exception at the beginning, skip those */
last += ex->len;
- if (last > p) {
- /* Exception spread over the boundaries */
- while (last > p && p != UBRK_DONE) {
- p = ubrk_next (bi);
- }
-
- /* We need to reset our scan with new p and last */
- goto start_over;
- }
-
if (ex->type == RSPAMD_EXCEPTION_URL) {
token.begin = "!!EX!!";
token.len = sizeof ("!!EX!!") - 1;
@@ -341,11 +341,18 @@ start_over:
token.flags = 0;
}
- cur = g_list_next (cur);
+ if (last > p) {
+ /* Exception spread over the boundaries */
+ while (last > p && p != UBRK_DONE) {
+ p = ubrk_next (bi);
+ }
- if (cur) {
- ex = (struct rspamd_process_exception *) cur->data;
+ /* We need to reset our scan with new p and last */
+ SHIFT_EX;
+ goto start_over;
}
+
+ SHIFT_EX;
}
/* Now, we can have an exception within boundary again */
@@ -360,7 +367,7 @@ start_over:
}
/* Process the current exception */
- last += ex->len + token.len;
+ last += ex->len + (ex->pos - last);
if (ex->type == RSPAMD_EXCEPTION_URL) {
token.begin = "!!EX!!";
@@ -376,8 +383,11 @@ start_over:
p = ubrk_next (bi);
}
/* We need to reset our scan with new p and last */
+ SHIFT_EX;
goto start_over;
}
+
+ SHIFT_EX;
}
else if (p > last) {
if (rspamd_utf_word_valid (text, text + len, last, p)) {
@@ -391,11 +401,7 @@ start_over:
/* Forward exceptions list */
while (cur && ex->pos <= last) {
/* We have an exception at the beginning, skip those */
- cur = g_list_next (cur);
-
- if (cur) {
- ex = (struct rspamd_process_exception *) cur->data;
- }
+ SHIFT_EX;
}
if (rspamd_utf_word_valid (text, text + len, last, p)) {
@@ -450,6 +456,8 @@ start_over:
return res;
}
+#undef SHIFT_EX
+
/*
* vi:ts=4
*/