source.dussan.org Git - rspamd.git/commitdiff
[CritFix] Another portion of tokenization fixes
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 18 Oct 2017 07:18:25 +0000 (08:18 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 18 Oct 2017 07:18:25 +0000 (08:18 +0100)
MFH: rspamd-1.6

src/libmime/message.c
src/libstat/tokenizers/tokenizers.c

index f426c821daf67bf36166d1ed10ee965544a95f5a..cae61643c8267deda4ffb0e7572e933a8ea5c638 100644 (file)
@@ -232,10 +232,20 @@ rspamd_extract_words (struct rspamd_task *task,
        }
 #endif
        /* Ugly workaround */
-       part->normalized_words = rspamd_tokenize_text (part->content->data,
-                       part->content->len, IS_PART_UTF (part), task->cfg,
-                       part->exceptions, FALSE,
-                       NULL);
+       if (IS_PART_HTML (part)) {
+               part->normalized_words = rspamd_tokenize_text (
+                               part->content->data,
+                               part->content->len, IS_PART_UTF (part), task->cfg,
+                               part->exceptions, FALSE,
+                               NULL);
+       }
+       else {
+               part->normalized_words = rspamd_tokenize_text (
+                               part->stripped_content->data,
+                               part->stripped_content->len, IS_PART_UTF (part), task->cfg,
+                               part->exceptions, FALSE,
+                               NULL);
+       }
 
        if (part->normalized_words) {
                part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
index e9a9ce9b914285d698bd25a770823f38164ed0db..74c4f5460f9dd845fa40f85b5e745bb50140d873 100644 (file)
@@ -168,7 +168,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
        enum {
                skip_delimiters = 0,
                feed_token,
-               skip_exception,
                process_signature
        } state = skip_delimiters;
 
@@ -215,17 +214,10 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                switch (state) {
                case skip_delimiters:
                        if (ex != NULL && p - buf->begin == ex->pos) {
-                               if (ex->type == RSPAMD_EXCEPTION_URL) {
-                                       token->begin = "!!EX!!";
-                                       token->len = sizeof ("!!EX!!") - 1;
-                                       token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
-                                       processed = token->len;
-                               }
-                               state = skip_exception;
-                               continue;
+                               goto process_exception;
                        }
                        else if (u_isgraph (uc)) {
-                               if (!u_ispunct (uc)) {
+                               if (u_isalnum (uc)) {
                                        state = feed_token;
                                        token->begin = p;
                                        continue;
@@ -241,7 +233,7 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                case feed_token:
                        if (ex != NULL && p - buf->begin == (gint)ex->pos) {
                                token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
-                               goto set_token;
+                               goto process_exception;
                        }
                        else if (!u_isgraph (uc) || u_ispunct (uc)) {
                                token->flags = RSPAMD_STAT_TOKEN_FLAG_TEXT;
@@ -249,11 +241,6 @@ rspamd_tokenizer_get_word (rspamd_stat_token_t * buf,
                        }
                        processed ++;
                        break;
-               case skip_exception:
-                       *cur = p + ex->len;
-                       *exceptions = g_list_next (*exceptions);
-                       goto set_token;
-                       break;
                case process_signature:
                        if (*p == '\r' || *p == '\n') {
                                msg_debug ("signature found: %*s", (gint)siglen, sig);
@@ -279,6 +266,22 @@ set_token:
 
        *cur = &s[i];
 
+       return TRUE;
+
+process_exception:
+       if (ex->type == RSPAMD_EXCEPTION_URL) {
+               token->begin = "!!EX!!";
+               token->len = sizeof ("!!EX!!") - 1;
+               token->flags = RSPAMD_STAT_TOKEN_FLAG_EXCEPTION;
+               processed = token->len;
+       }
+       *cur = p + ex->len;
+       *exceptions = g_list_next (*exceptions);
+
+       if (rl) {
+               *rl = processed;
+       }
+
        return TRUE;
 }