diff options
Diffstat (limited to 'src/libserver/re_cache.c')
-rw-r--r-- | src/libserver/re_cache.c | 41 |
1 files changed, 36 insertions, 5 deletions
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index c2c7464fc..b323ffa0e 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1222,8 +1222,11 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, } break; case RSPAMD_RE_WORDS: + case RSPAMD_RE_STEMWORDS: + case RSPAMD_RE_RAWWORDS: if (task->text_parts->len > 0) { cnt = 0; + raw = FALSE; PTR_ARRAY_FOREACH (task->text_parts, i, part) { if (part->utf_words) { @@ -1241,22 +1244,50 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, guint j; rspamd_stat_token_t *tok; - if (part->utf_words) { for (j = 0; j < part->utf_words->len; j ++) { tok = &g_array_index (part->utf_words, rspamd_stat_token_t, j); - if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - scvec[cnt] = tok->normalized.begin; - lenvec[cnt++] = tok->normalized.len; + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { + if (!re_class->has_utf8) { + raw = TRUE; + } + else { + continue; /* Skip */ + } + } + } + else { + continue; /* Skip non text */ + } + + if (re_class->type == RSPAMD_RE_RAWWORDS) { + if (tok->original.len > 0) { + scvec[cnt] = tok->original.begin; + lenvec[cnt++] = tok->original.len; + } + } + else if (re_class->type == RSPAMD_RE_WORDS) { + if (tok->normalized.len > 0) { + scvec[cnt] = tok->normalized.begin; + lenvec[cnt++] = tok->normalized.len; + } + } + else { + /* Stemmed words */ + if (tok->stemmed.len > 0) { + scvec[cnt] = tok->stemmed.begin; + lenvec[cnt++] = tok->stemmed.len; + } } } } } ret = rspamd_re_cache_process_regexp_data (rt, re, - task, scvec, lenvec, cnt, TRUE); + task, scvec, lenvec, cnt, raw); msg_debug_re_task ("checking sa words regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); |