diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-29 15:07:26 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-11-29 15:07:26 +0000 |
commit | 36c874383c4c56fb10c737a3f5932abc173080e2 (patch) | |
tree | 2c1114ffa8a658d91a1635df92f704e0e121f9c4 /src | |
parent | b88b112953e52fef8008216c498bbc2e3bfd0c72 (diff) | |
download | rspamd-36c874383c4c56fb10c737a3f5932abc173080e2.tar.gz rspamd-36c874383c4c56fb10c737a3f5932abc173080e2.zip |
[Feature] Add more words regexp classes
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/mime_expressions.c | 8 |
-rw-r--r-- | src/libserver/re_cache.c | 41 |
-rw-r--r-- | src/libserver/re_cache.h | 2 |
3 files changed, 46 insertions, 5 deletions
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c index c6d258c49..535b8a124 100644 --- a/src/libmime/mime_expressions.c +++ b/src/libmime/mime_expressions.c @@ -243,6 +243,14 @@ rspamd_parse_long_option (const gchar *start, gsize len, ret = TRUE; a->type = RSPAMD_RE_WORDS; } + else if (TYPE_CHECK (start, "raw_words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_RAWWORDS; + } + else if (TYPE_CHECK (start, "stem_words", len)) { + ret = TRUE; + a->type = RSPAMD_RE_STEMWORDS; + } else if (TYPE_CHECK (start, "selector", len)) { ret = TRUE; a->type = RSPAMD_RE_SELECTOR; diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c index c2c7464fc..b323ffa0e 100644 --- a/src/libserver/re_cache.c +++ b/src/libserver/re_cache.c @@ -1222,8 +1222,11 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, } break; case RSPAMD_RE_WORDS: + case RSPAMD_RE_STEMWORDS: + case RSPAMD_RE_RAWWORDS: if (task->text_parts->len > 0) { cnt = 0; + raw = FALSE; PTR_ARRAY_FOREACH (task->text_parts, i, part) { if (part->utf_words) { @@ -1241,22 +1244,50 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, guint j; rspamd_stat_token_t *tok; - if (part->utf_words) { for (j = 0; j < part->utf_words->len; j ++) { tok = &g_array_index (part->utf_words, rspamd_stat_token_t, j); - if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { - scvec[cnt] = tok->normalized.begin; - lenvec[cnt++] = tok->normalized.len; + if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) { + if (!re_class->has_utf8) { + raw = TRUE; + } + else { + continue; /* Skip */ + } + } + } + else { + continue; /* Skip non text */ + } + + if (re_class->type == RSPAMD_RE_RAWWORDS) { + if (tok->original.len > 0) { + scvec[cnt] = tok->original.begin; + lenvec[cnt++] = tok->original.len; + } + } + else if (re_class->type == RSPAMD_RE_WORDS) { + if (tok->normalized.len > 0) { + scvec[cnt] = tok->normalized.begin; + lenvec[cnt++] = tok->normalized.len; + } + } + else { + /* 
Stemmed words */ + if (tok->stemmed.len > 0) { + scvec[cnt] = tok->stemmed.begin; + lenvec[cnt++] = tok->stemmed.len; + } } } } } ret = rspamd_re_cache_process_regexp_data (rt, re, - task, scvec, lenvec, cnt, TRUE); + task, scvec, lenvec, cnt, raw); msg_debug_re_task ("checking sa words regexp: %s -> %d", rspamd_regexp_get_pattern (re), ret); diff --git a/src/libserver/re_cache.h b/src/libserver/re_cache.h index 596ea08c2..15146c5dd 100644 --- a/src/libserver/re_cache.h +++ b/src/libserver/re_cache.h @@ -36,6 +36,8 @@ enum rspamd_re_type { RSPAMD_RE_SABODY, /* body in SA */ RSPAMD_RE_SARAWBODY, /* rawbody in SA */ RSPAMD_RE_WORDS, /* normalized words */ + RSPAMD_RE_RAWWORDS, /* raw words */ + RSPAMD_RE_STEMWORDS, /* stemmed words */ RSPAMD_RE_SELECTOR, /* use lua selector to process regexp */ RSPAMD_RE_MAX }; |