aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-29 15:07:26 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-29 15:07:26 +0000
commit 36c874383c4c56fb10c737a3f5932abc173080e2 (patch)
tree 2c1114ffa8a658d91a1635df92f704e0e121f9c4 /src
parent b88b112953e52fef8008216c498bbc2e3bfd0c72 (diff)
download rspamd-36c874383c4c56fb10c737a3f5932abc173080e2.tar.gz
rspamd-36c874383c4c56fb10c737a3f5932abc173080e2.zip
[Feature] Add more words regexp classes
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/mime_expressions.c 8
-rw-r--r-- src/libserver/re_cache.c 41
-rw-r--r-- src/libserver/re_cache.h 2
3 files changed, 46 insertions, 5 deletions
diff --git a/src/libmime/mime_expressions.c b/src/libmime/mime_expressions.c
index c6d258c49..535b8a124 100644
--- a/src/libmime/mime_expressions.c
+++ b/src/libmime/mime_expressions.c
@@ -243,6 +243,14 @@ rspamd_parse_long_option (const gchar *start, gsize len,
ret = TRUE;
a->type = RSPAMD_RE_WORDS;
}
+ else if (TYPE_CHECK (start, "raw_words", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_RAWWORDS;
+ }
+ else if (TYPE_CHECK (start, "stem_words", len)) {
+ ret = TRUE;
+ a->type = RSPAMD_RE_STEMWORDS;
+ }
else if (TYPE_CHECK (start, "selector", len)) {
ret = TRUE;
a->type = RSPAMD_RE_SELECTOR;
diff --git a/src/libserver/re_cache.c b/src/libserver/re_cache.c
index c2c7464fc..b323ffa0e 100644
--- a/src/libserver/re_cache.c
+++ b/src/libserver/re_cache.c
@@ -1222,8 +1222,11 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
}
break;
case RSPAMD_RE_WORDS:
+ case RSPAMD_RE_STEMWORDS:
+ case RSPAMD_RE_RAWWORDS:
if (task->text_parts->len > 0) {
cnt = 0;
+ raw = FALSE;
PTR_ARRAY_FOREACH (task->text_parts, i, part) {
if (part->utf_words) {
@@ -1241,22 +1244,50 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
guint j;
rspamd_stat_token_t *tok;
-
if (part->utf_words) {
for (j = 0; j < part->utf_words->len; j ++) {
tok = &g_array_index (part->utf_words,
rspamd_stat_token_t, j);
- if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
- scvec[cnt] = tok->normalized.begin;
- lenvec[cnt++] = tok->normalized.len;
+ if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+ if (!re_class->has_utf8) {
+ raw = TRUE;
+ }
+ else {
+ continue; /* Skip */
+ }
+ }
+ }
+ else {
+ continue; /* Skip non text */
+ }
+
+ if (re_class->type == RSPAMD_RE_RAWWORDS) {
+ if (tok->original.len > 0) {
+ scvec[cnt] = tok->original.begin;
+ lenvec[cnt++] = tok->original.len;
+ }
+ }
+ else if (re_class->type == RSPAMD_RE_WORDS) {
+ if (tok->normalized.len > 0) {
+ scvec[cnt] = tok->normalized.begin;
+ lenvec[cnt++] = tok->normalized.len;
+ }
+ }
+ else {
+ /* Stemmed words */
+ if (tok->stemmed.len > 0) {
+ scvec[cnt] = tok->stemmed.begin;
+ lenvec[cnt++] = tok->stemmed.len;
+ }
}
}
}
}
ret = rspamd_re_cache_process_regexp_data (rt, re,
- task, scvec, lenvec, cnt, TRUE);
+ task, scvec, lenvec, cnt, raw);
msg_debug_re_task ("checking sa words regexp: %s -> %d",
rspamd_regexp_get_pattern (re), ret);
diff --git a/src/libserver/re_cache.h b/src/libserver/re_cache.h
index 596ea08c2..15146c5dd 100644
--- a/src/libserver/re_cache.h
+++ b/src/libserver/re_cache.h
@@ -36,6 +36,8 @@ enum rspamd_re_type {
RSPAMD_RE_SABODY, /* body in SA */
RSPAMD_RE_SARAWBODY, /* rawbody in SA */
RSPAMD_RE_WORDS, /* normalized words */
+ RSPAMD_RE_RAWWORDS, /* raw words */
+ RSPAMD_RE_STEMWORDS, /* stemmed words */
RSPAMD_RE_SELECTOR, /* use lua selector to process regexp */
RSPAMD_RE_MAX
};