]> source.dussan.org Git - rspamd.git/commitdiff
[Minor] Support metawords in words regexps
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 29 Nov 2018 17:23:10 +0000 (17:23 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 29 Nov 2018 17:23:10 +0000 (17:23 +0000)
src/libserver/re_cache.c

index b323ffa0e390ee0c6aabb873c9130978722b8de2..f4f190ed56db87742d63132dbee326ee2b805772 100644 (file)
@@ -898,6 +898,60 @@ rspamd_re_cache_process_selector (struct rspamd_task *task,
        return result;
 }
 
+static inline guint
+rspamd_process_words_vector (GArray *words,
+                                                        const guchar **scvec,
+                                                        guint *lenvec,
+                                                        struct rspamd_re_class *re_class,
+                                                        guint cnt,
+                                                        gboolean *raw)
+{
+       guint j;
+       rspamd_stat_token_t *tok;
+
+       if (words) {
+               for (j = 0; j < words->len; j ++) {
+                       tok = &g_array_index (words, rspamd_stat_token_t, j);
+
+                       if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                               if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
+                                       if (!re_class->has_utf8) {
+                                               *raw = TRUE;
+                                       }
+                                       else {
+                                               continue; /* Skip */
+                                       }
+                               }
+                       }
+                       else {
+                               continue; /* Skip non text */
+                       }
+
+                       if (re_class->type == RSPAMD_RE_RAWWORDS) {
+                               if (tok->original.len > 0) {
+                                       scvec[cnt] = tok->original.begin;
+                                       lenvec[cnt++] = tok->original.len;
+                               }
+                       }
+                       else if (re_class->type == RSPAMD_RE_WORDS) {
+                               if (tok->normalized.len > 0) {
+                                       scvec[cnt] = tok->normalized.begin;
+                                       lenvec[cnt++] = tok->normalized.len;
+                               }
+                       }
+                       else {
+                               /* Stemmed words */
+                               if (tok->stemmed.len > 0) {
+                                       scvec[cnt] = tok->stemmed.begin;
+                                       lenvec[cnt++] = tok->stemmed.len;
+                               }
+                       }
+               }
+       }
+
+       return cnt;
+}
+
 /*
  * Calculates the specified regexp for the specified class if it's not calculated
  */
@@ -1010,6 +1064,7 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                                                scvec[i] = (guchar *)"";
                                                continue;
                                        }
+
                                        lenvec[i] = end - in;
                                }
 
@@ -1234,6 +1289,10 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                                }
                        }
 
+                       if (task->meta_words && task->meta_words->len > 0) {
+                               cnt += task->meta_words->len;
+                       }
+
                        if (cnt > 0) {
                                scvec = g_malloc (sizeof (*scvec) * cnt);
                                lenvec = g_malloc (sizeof (*lenvec) * cnt);
@@ -1241,51 +1300,17 @@ rspamd_re_cache_exec_re (struct rspamd_task *task,
                                cnt = 0;
 
                                PTR_ARRAY_FOREACH (task->text_parts, i, part) {
-                                       guint j;
-                                       rspamd_stat_token_t *tok;
-
                                        if (part->utf_words) {
-                                               for (j = 0; j < part->utf_words->len; j ++) {
-                                                       tok = &g_array_index (part->utf_words,
-                                                                       rspamd_stat_token_t, j);
-
-                                                       if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
-                                                               if (!(tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF)) {
-                                                                       if (!re_class->has_utf8) {
-                                                                               raw = TRUE;
-                                                                       }
-                                                                       else {
-                                                                               continue; /* Skip */
-                                                                       }
-                                                               }
-                                                       }
-                                                       else {
-                                                               continue; /* Skip non text */
-                                                       }
-
-                                                       if (re_class->type == RSPAMD_RE_RAWWORDS) {
-                                                               if (tok->original.len > 0) {
-                                                                       scvec[cnt] = tok->original.begin;
-                                                                       lenvec[cnt++] = tok->original.len;
-                                                               }
-                                                       }
-                                                       else if (re_class->type == RSPAMD_RE_WORDS) {
-                                                               if (tok->normalized.len > 0) {
-                                                                       scvec[cnt] = tok->normalized.begin;
-                                                                       lenvec[cnt++] = tok->normalized.len;
-                                                               }
-                                                       }
-                                                       else {
-                                                               /* Stemmed words */
-                                                               if (tok->stemmed.len > 0) {
-                                                                       scvec[cnt] = tok->stemmed.begin;
-                                                                       lenvec[cnt++] = tok->stemmed.len;
-                                                               }
-                                                       }
-                                               }
+                                               cnt = rspamd_process_words_vector (part->utf_words,
+                                                               scvec, lenvec, re_class, cnt, &raw);
                                        }
                                }
 
+                               if (task->meta_words) {
+                                       cnt = rspamd_process_words_vector (task->meta_words,
+                                                       scvec, lenvec, re_class, cnt, &raw);
+                               }
+
                                ret = rspamd_re_cache_process_regexp_data (rt, re,
                                                task, scvec, lenvec, cnt, raw);
 
@@ -1492,6 +1517,12 @@ rspamd_re_cache_type_to_string (enum rspamd_re_type type)
        case RSPAMD_RE_WORDS:
                ret = "words";
                break;
+       case RSPAMD_RE_RAWWORDS:
+               ret = "raw_words";
+               break;
+       case RSPAMD_RE_STEMWORDS:
+               ret = "stem_words";
+               break;
        case RSPAMD_RE_MAX:
                ret = "invalid class";
                break;