diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-10-18 08:18:25 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2017-10-18 08:18:25 +0100 |
commit | f53e901f3469cab9e2ec6f5983e66e25c87f5731 (patch) | |
tree | bd58a4d313a37a9f51cedec7b2805e0353edb2bc /src/libstat/tokenizers | |
parent | 1336182634fe880411c081b3002272575c239435 (diff) | |
download | rspamd-f53e901f3469cab9e2ec6f5983e66e25c87f5731.tar.gz rspamd-f53e901f3469cab9e2ec6f5983e66e25c87f5731.zip |
[CritFix] Another portion of tokenization fixes
MFH: rspamd-1.6
Diffstat (limited to 'src/libstat/tokenizers')
-rw-r--r-- | src/libstat/tokenizers/tokenizers.c | 35 |
1 file changed, 19 insertions, 16 deletions
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index e9a9ce9b9..74c4f5460 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -168,7 +168,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 	enum {
 		skip_delimiters = 0,
 		feed_token,
-		skip_exception,
 		process_signature
 	} state = skip_delimiters;
 
@@ -215,17 +214,10 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		switch (state) {
 		case skip_delimiters:
 			if (ex != NULL && p - buf->begin == ex->pos) {
-				if (ex->type == RSPAMD_EXCEPTION_URL) {
-					token->begin = "!!EX!!";
-					token->len = sizeof ("!!EX!!") - 1;
-					token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-					processed = token->len;
-				}
-				state = skip_exception;
-				continue;
+				goto process_exception;
 			}
 			else if (u_isgraph (uc)) {
-				if (!u_ispunct (uc)) {
+				if (u_isalnum (uc)) {
 					state = feed_token;
 					token->begin = p;
 					continue;
@@ -241,7 +233,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		case feed_token:
 			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
 				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-				goto set_token;
+				goto process_exception;
 			}
 			else if (!u_isgraph (uc) || u_ispunct (uc)) {
 				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -249,11 +241,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 			}
 			processed ++;
 			break;
-		case skip_exception:
-			*cur = p + ex->len;
-			*exceptions = g_list_next (*exceptions);
-			goto set_token;
-			break;
 		case process_signature:
 			if (*p == '\r' || *p == '\n') {
 				msg_debug ("signature found: %*s", (gint)siglen, sig);
@@ -280,6 +267,22 @@ set_token:
 	*cur = &s[i];
 
 	return TRUE;
+
+process_exception:
+	if (ex->type == RSPAMD_EXCEPTION_URL) {
+		token->begin = "!!EX!!";
+		token->len = sizeof ("!!EX!!") - 1;
+		token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+		processed = token->len;
+	}
+	*cur = p + ex->len;
+	*exceptions = g_list_next (*exceptions);
+
+	if (rl) {
+		*rl = processed;
+	}
+
+	return TRUE;
 }
 
 GArray *