about summary refs log tree commit diff stats
path: root/src/libserver/re_cache.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libserver/re_cache.c')
-rw-r--r-- src/libserver/re_cache.c 41
1 file changed, 36 insertions, 5 deletions
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index c2c7464fc..b323ffa0e 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -1222,8 +1222,11 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_WORDS:
+ case RSPAMD_RE_STEMWORDS:
+ case RSPAMD_RE_RAWWORDS:
if (task->text_parts->len > 0) {
cnt = 0;
+ raw = FALSE;
PTR_ARRAY_FOREACH (task->text_parts, i, part) {
if (part->utf_words) {
@@ -1241,22 +1244,50 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
guint j;
rspamd_stat_token_t *tok;
-
if (part->utf_words) {
for (j = 0; j < part->utf_words->len; j ++) {
tok = &g_array_index (part->utf_words,
rspamd_stat_token_t, j);
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
- scvec[cnt] = tok->normalized.begin;
- lenvec[cnt++] = tok->normalized.len;
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+ if (!re_class->has_utf8) {
+ raw = TRUE;
+ }
+ else {
+ continue; /* Skip */
+ }
+ }
+ }
+ else {
+ continue; /* Skip non text */
+ }
+
+ if (re_class->type == RSPAMD_RE_RAWWORDS) {
+ if (tok->original.len > 0) {
+ scvec[cnt] = tok->original.begin;
+ lenvec[cnt++] = tok->original.len;
+ }
+ }
+ else if (re_class->type == RSPAMD_RE_WORDS) {
+ if (tok->normalized.len > 0) {
+ scvec[cnt] = tok->normalized.begin;
+ lenvec[cnt++] = tok->normalized.len;
+ }
+ }
+ else {
+ /* Stemmed words */
+ if (tok->stemmed.len > 0) {
+ scvec[cnt] = tok->stemmed.begin;
+ lenvec[cnt++] = tok->stemmed.len;
+ }
}
}
}
}
ret = rspamd_re_cache_process_regexp_data (rt, re,
- task, scvec, lenvec, cnt, TRUE);
+ task, scvec, lenvec, cnt, raw);
msg_debug_re_task ("checking sa words regexp: %s -> %d",
rspamd_regexp_get_pattern (re), ret);