From: Vsevolod Stakhov
Date: Wed, 18 Oct 2017 07:18:25 +0000 (+0100)
Subject: [CritFix] Another portion of tokenization fixes
X-Git-Tag: 1.6.5~3^2~1
X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=eac7a97ead7f7e256238db62b5d006a73cf17452;p=rspamd.git

[CritFix] Another portion of tokenization fixes

MFH: rspamd-1.6
---

diff --git a/src/libmime/message.c b/src/libmime/message.c
index 085982064..36dbee945 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -231,10 +231,20 @@ rspamd_extract_words (struct rspamd_task *task,
 	}
 #endif
 	/* Ugly workaround */
-	part->normalized_words = rspamd_tokenize_text (part->content->data,
-			part->content->len, IS_PART_UTF (part), task->cfg,
-			part->exceptions, FALSE,
-			NULL);
+	if (IS_PART_HTML (part)) {
+		part->normalized_words = rspamd_tokenize_text (
+				part->content->data,
+				part->content->len, IS_PART_UTF (part), task->cfg,
+				part->exceptions, FALSE,
+				NULL);
+	}
+	else {
+		part->normalized_words = rspamd_tokenize_text (
+				part->stripped_content->data,
+				part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+				part->exceptions, FALSE,
+				NULL);
+	}
 
 	if (part->normalized_words) {
 		part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c
index e9a9ce9b9..74c4f5460 100644
--- a/src/libstat/tokenizers/tokenizers.c
+++ b/src/libstat/tokenizers/tokenizers.c
@@ -168,7 +168,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 	enum {
 		skip_delimiters = 0,
 		feed_token,
-		skip_exception,
 		process_signature
 	} state = skip_delimiters;
 
@@ -215,17 +214,10 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		switch (state) {
 		case skip_delimiters:
 			if (ex != NULL && p - buf->begin == ex->pos) {
-				if (ex->type == RSPAMD_EXCEPTION_URL) {
-					token->begin = "!!EX!!";
-					token->len = sizeof ("!!EX!!") - 1;
-					token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-					processed = token->len;
-				}
-				state = skip_exception;
-				continue;
+				goto process_exception;
 			}
 			else if (u_isgraph (uc)) {
-				if (!u_ispunct (uc)) {
+				if (u_isalnum (uc)) {
 					state = feed_token;
 					token->begin = p;
 					continue;
@@ -241,7 +233,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 		case feed_token:
 			if (ex != NULL && p - buf->begin == (gint)ex->pos) {
 				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-				goto set_token;
+				goto process_exception;
 			}
 			else if (!u_isgraph (uc) || u_ispunct (uc)) {
 				token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -249,11 +241,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
 			}
 			processed ++;
 			break;
-		case skip_exception:
-			*cur = p + ex->len;
-			*exceptions = g_list_next (*exceptions);
-			goto set_token;
-			break;
 		case process_signature:
 			if (*p == '\r' || *p == '\n') {
 				msg_debug ("signature found: %*s", (gint)siglen, sig);
@@ -279,6 +266,22 @@ set_token:
 
 	*cur = &s[i];
 
+	return TRUE;
+
+process_exception:
+	if (ex->type == RSPAMD_EXCEPTION_URL) {
+		token->begin = "!!EX!!";
+		token->len = sizeof ("!!EX!!") - 1;
+		token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+		processed = token->len;
+	}
+	*cur = p + ex->len;
+	*exceptions = g_list_next (*exceptions);
+
+	if (rl) {
+		*rl = processed;
+	}
+
 	return TRUE;
 }
 
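
The behavioural core of the tokenizers.c change is the start-of-word test: a token may now only begin at an alphanumeric codepoint (u_isalnum) instead of at any graphic, non-punctuation codepoint, and both tokenizer states now funnel exceptions through the single process_exception path. The following standalone sketch is not rspamd code; print_words, the sample input and the build command are made up for illustration. It reproduces only the new start-of-word rule over a UTF-8 buffer using ICU, so that an input such as "$$hello" no longer opens a token at '$'.

/*
 * Standalone sketch (not rspamd code): a reduced tokenizer loop showing
 * the new start-of-word rule.  A word may only begin at an alphanumeric
 * codepoint (u_isalnum); the previous rule accepted any graphic,
 * non-punctuation codepoint, so symbols such as '$' or '#' could open a
 * token.  Exception and signature handling are omitted.
 * Build against ICU, e.g.: cc sketch.c $(pkg-config --cflags --libs icu-uc)
 */
#include <stdio.h>
#include <unicode/uchar.h>
#include <unicode/utf8.h>

static void
print_words (const char *text, int32_t len)
{
	int32_t i = 0, start = -1;
	UChar32 uc;

	while (i < len) {
		int32_t prev = i;

		U8_NEXT (text, i, len, uc);

		if (uc < 0) {
			continue; /* skip malformed UTF-8 sequences */
		}

		if (start < 0) {
			/* skip_delimiters: only alphanumerics may start a token */
			if (u_isgraph (uc) && u_isalnum (uc)) {
				start = prev;
			}
		}
		else if (!u_isgraph (uc) || u_ispunct (uc)) {
			/* feed_token: the token ends at non-graphic or punctuation */
			printf ("%.*s\n", (int) (prev - start), text + start);
			start = -1;
		}
	}

	if (start >= 0) {
		printf ("%.*s\n", (int) (len - start), text + start);
	}
}

int
main (void)
{
	static const char in[] = "$$hello, world!! #tag";

	print_words (in, (int32_t) (sizeof (in) - 1));

	return 0;
}

With the new rule this prints "hello", "world" and "tag"; with the old test (u_isgraph without u_ispunct) the same input would have yielded "$$hello" as the first word, since '$' is a graphic, non-punctuation codepoint.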