Browse Source

[Minor] Further fixes in tokenization algorithm

tags/1.8.0
Vsevolod Stakhov 5 years ago
parent
commit
2b9a86ba16
1 changed files with 28 additions and 20 deletions
  1. 28
    20
      src/libstat/tokenizers/tokenizers.c

+ 28
- 20
src/libstat/tokenizers/tokenizers.c View File

@@ -225,6 +225,15 @@ rspamd_utf_word_valid (const gchar *text, const gchar *end,

return FALSE;
}
#define SHIFT_EX do { \
cur = g_list_next (cur); \
if (cur) { \
ex = (struct rspamd_process_exception *) cur->data; \
} \
else { \
ex = NULL; \
} \
} while(0)

GArray *
rspamd_tokenize_text (const gchar *text, gsize len,
@@ -278,7 +287,8 @@ rspamd_tokenize_text (const gchar *text, gsize len,
&hv, &prob, &token, pos - text, len)) {
if (!decay) {
decay = TRUE;
} else {
}
else {
token.begin = pos;
continue;
}
@@ -322,16 +332,6 @@ start_over:
/* We have an exception at the beginning, skip those */
last += ex->len;

if (last > p) {
/* Exception spread over the boundaries */
while (last > p && p != UBRK_DONE) {
p = ubrk_next (bi);
}

/* We need to reset our scan with new p and last */
goto start_over;
}

if (ex->type == RSPAMD_EXCEPTION_URL) {
token.begin = "!!EX!!";
token.len = sizeof ("!!EX!!") - 1;
@@ -341,11 +341,18 @@ start_over:
token.flags = 0;
}

cur = g_list_next (cur);
if (last > p) {
/* Exception spread over the boundaries */
while (last > p && p != UBRK_DONE) {
p = ubrk_next (bi);
}

if (cur) {
ex = (struct rspamd_process_exception *) cur->data;
/* We need to reset our scan with new p and last */
SHIFT_EX;
goto start_over;
}

SHIFT_EX;
}

/* Now, we can have an exception within boundary again */
@@ -360,7 +367,7 @@ start_over:
}

/* Process the current exception */
last += ex->len + token.len;
last += ex->len + (ex->pos - last);

if (ex->type == RSPAMD_EXCEPTION_URL) {
token.begin = "!!EX!!";
@@ -376,8 +383,11 @@ start_over:
p = ubrk_next (bi);
}
/* We need to reset our scan with new p and last */
SHIFT_EX;
goto start_over;
}

SHIFT_EX;
}
else if (p > last) {
if (rspamd_utf_word_valid (text, text + len, last, p)) {
@@ -391,11 +401,7 @@ start_over:
/* Forward exceptions list */
while (cur && ex->pos <= last) {
/* We have an exception at the beginning, skip those */
cur = g_list_next (cur);

if (cur) {
ex = (struct rspamd_process_exception *) cur->data;
}
SHIFT_EX;
}

if (rspamd_utf_word_valid (text, text + len, last, p)) {
@@ -450,6 +456,8 @@ start_over:
return res;
}

#undef SHIFT_EX

/*
* vi:ts=4
*/

Loading…
Cancel
Save