summary | refs | log | tree | commit | diff | stats
path: root/src/libstat/tokenizers
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-10-18 08:18:25 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-10-18 08:18:25 +0100
commit: f53e901f3469cab9e2ec6f5983e66e25c87f5731 (patch)
tree: bd58a4d313a37a9f51cedec7b2805e0353edb2bc /src/libstat/tokenizers
parent: 1336182634fe880411c081b3002272575c239435 (diff)
download: rspamd-f53e901f3469cab9e2ec6f5983e66e25c87f5731.tar.gz
download: rspamd-f53e901f3469cab9e2ec6f5983e66e25c87f5731.zip
[CritFix] Another portion of tokenization fixes
MFH: rspamd-1.6
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- src/libstat/tokenizers/tokenizers.c 35
1 files changed, 19 insertions, 16 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index e9a9ce9b9..74c4f5460 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -168,7 +168,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
enum {
skip_delimiters = 0,
feed_token,
- skip_exception,
process_signature
} state = skip_delimiters;
@@ -215,17 +214,10 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
switch (state) {
case skip_delimiters:
if (ex != NULL && p - buf->begin == ex->pos) {
- if (ex->type == RSPAMD_EXCEPTION_URL) {
- token->begin = "!!EX!!";
- token->len = sizeof ("!!EX!!") - 1;
- token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
- processed = token->len;
- }
- state = skip_exception;
- continue;
+ goto process_exception;
}
else if (u_isgraph (uc)) {
- if (!u_ispunct (uc)) {
+ if (u_isalnum (uc)) {
state = feed_token;
token->begin = p;
continue;
@@ -241,7 +233,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
case feed_token:
if (ex != NULL && p - buf->begin == (gint)ex->pos) {
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
- goto set_token;
+ goto process_exception;
}
else if (!u_isgraph (uc) || u_ispunct (uc)) {
token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -249,11 +241,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
}
processed ++;
break;
- case skip_exception:
- *cur = p + ex->len;
- *exceptions = g_list_next (*exceptions);
- goto set_token;
- break;
case process_signature:
if (*p == '\r' || *p == '\n') {
msg_debug ("signature found: %*s", (gint)siglen, sig);
@@ -280,6 +267,22 @@ set_token:
*cur = &s[i];
return TRUE;
+
+process_exception:
+ if (ex->type == RSPAMD_EXCEPTION_URL) {
+ token->begin = "!!EX!!";
+ token->len = sizeof ("!!EX!!") - 1;
+ token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+ processed = token->len;
+ }
+ *cur = p + ex->len;
+ *exceptions = g_list_next (*exceptions);
+
+ if (rl) {
+ *rl = processed;
+ }
+
+ return TRUE;
}
GArray *