]> source.dussan.org Git - rspamd.git/commitdiff
[Rework] Composites: Start rework of the composites framework
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 15 Jul 2021 09:54:08 +0000 (10:54 +0100)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 15 Jul 2021 09:54:08 +0000 (10:54 +0100)
src/libserver/CMakeLists.txt
src/libserver/cfg_rcl.c
src/libserver/composites.c [deleted file]
src/libserver/composites.h [deleted file]
src/libserver/composites/composites.cxx [new file with mode: 0644]
src/libserver/composites/composites.h [new file with mode: 0644]
src/libserver/task.c
src/lua/lua_cfg_file.c
src/lua/lua_config.c

index 189e9fe03c961fd16eb0f7a083556444452db0f5..75fad36ac424d090cfba246500bba3a94f8898ec 100644 (file)
@@ -3,7 +3,7 @@ ADD_SUBDIRECTORY(css)
 SET(LIBRSPAMDSERVERSRC
                                ${CMAKE_CURRENT_SOURCE_DIR}/cfg_utils.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/cfg_rcl.c
-                               ${CMAKE_CURRENT_SOURCE_DIR}/composites.c
+                               ${CMAKE_CURRENT_SOURCE_DIR}/composites/composites.cxx
                                ${CMAKE_CURRENT_SOURCE_DIR}/dkim.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/dns.c
                                ${CMAKE_CURRENT_SOURCE_DIR}/dynamic_cfg.c
index 73b9a3b1d11de50b77c2c4d97ebdec5d86689b23..68b94abfe9b914d16fdd35b0123ebc7f1b773a4b 100644 (file)
@@ -20,7 +20,7 @@
 #include "cfg_file.h"
 #include "lua/lua_common.h"
 #include "expression.h"
-#include "composites.h"
+#include "src/libserver/composites/composites.h"
 #include "libserver/worker_util.h"
 #include "unix-std.h"
 #include "cryptobox.h"
diff --git a/src/libserver/composites.c b/src/libserver/composites.c
deleted file mode 100644 (file)
index 6f3e8a7..0000000
+++ /dev/null
@@ -1,989 +0,0 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "config.h"
-#include "logger.h"
-#include "expression.h"
-#include "task.h"
-#include "utlist.h"
-#include "scan_result.h"
-#include "composites.h"
-
-#include <math.h>
-
-#define msg_err_composites(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
-        "composites", task->task_pool->tag.uid, \
-        G_STRFUNC, \
-        __VA_ARGS__)
-#define msg_warn_composites(...)   rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
-        "composites", task->task_pool->tag.uid, \
-        G_STRFUNC, \
-        __VA_ARGS__)
-#define msg_info_composites(...)   rspamd_default_log_function (G_LOG_LEVEL_INFO, \
-        "composites", task->task_pool->tag.uid, \
-        G_STRFUNC, \
-        __VA_ARGS__)
-
-#define msg_debug_composites(...)  rspamd_conditional_debug_fast (NULL, task->from_addr, \
-        rspamd_composites_log_id, "composites", task->task_pool->tag.uid, \
-        G_STRFUNC, \
-        __VA_ARGS__)
-
-INIT_LOG_MODULE(composites)
-
-struct composites_data {
-       struct rspamd_task *task;
-       struct rspamd_composite *composite;
-       struct rspamd_scan_result *metric_res;
-       GHashTable *symbols_to_remove;
-       guint8 *checked;
-       struct composites_data *next;
-};
-
-struct rspamd_composite_option_match {
-       enum {
-               RSPAMD_COMPOSITE_OPTION_PLAIN,
-               RSPAMD_COMPOSITE_OPTION_RE
-       } type;
-
-       union {
-               rspamd_regexp_t *re;
-               gchar *match;
-       } data;
-       struct rspamd_composite_option_match *prev, *next;
-};
-
-struct rspamd_composite_atom {
-       gchar *symbol;
-       enum {
-               ATOM_UNKNOWN,
-               ATOM_COMPOSITE,
-               ATOM_PLAIN
-       } comp_type;
-
-       struct rspamd_composite *ncomp; /* underlying composite */
-       struct rspamd_composite_option_match *opts;
-};
-
-enum rspamd_composite_action {
-       RSPAMD_COMPOSITE_UNTOUCH = 0,
-       RSPAMD_COMPOSITE_REMOVE_SYMBOL = (1 << 0),
-       RSPAMD_COMPOSITE_REMOVE_WEIGHT = (1 << 1),
-       RSPAMD_COMPOSITE_REMOVE_FORCED = (1 << 2)
-};
-
-struct symbol_remove_data {
-       const gchar *sym;
-       struct rspamd_composite *comp;
-       GNode *parent;
-       guint action;
-       struct symbol_remove_data *prev, *next;
-};
-
-static rspamd_expression_atom_t * rspamd_composite_expr_parse (const gchar *line, gsize len,
-               rspamd_mempool_t *pool, gpointer ud, GError **err);
-static gdouble rspamd_composite_expr_process (void *ud, rspamd_expression_atom_t *atom);
-static gint rspamd_composite_expr_priority (rspamd_expression_atom_t *atom);
-static void rspamd_composite_expr_destroy (rspamd_expression_atom_t *atom);
-static void composites_foreach_callback (gpointer key, gpointer value, void *data);
-
-const struct rspamd_atom_subr composite_expr_subr = {
-       .parse = rspamd_composite_expr_parse,
-       .process = rspamd_composite_expr_process,
-       .priority = rspamd_composite_expr_priority,
-       .destroy = rspamd_composite_expr_destroy
-};
-
-static GQuark
-rspamd_composites_quark (void)
-{
-       return g_quark_from_static_string ("composites");
-}
-
-static rspamd_expression_atom_t *
-rspamd_composite_expr_parse (const gchar *line, gsize len,
-               rspamd_mempool_t *pool, gpointer ud, GError **err)
-{
-       gsize clen = 0;
-       rspamd_expression_atom_t *res;
-       struct rspamd_composite_atom *atom;
-       const gchar *p, *end;
-       enum composite_expr_state {
-               comp_state_read_symbol = 0,
-               comp_state_read_obrace,
-               comp_state_read_option,
-               comp_state_read_regexp,
-               comp_state_read_regexp_end,
-               comp_state_read_comma,
-               comp_state_read_ebrace,
-               comp_state_read_end
-       } state = comp_state_read_symbol;
-
-       end = line + len;
-       p = line;
-
-       /* Find length of the atom using a reduced state machine */
-       while (p < end) {
-               if (state == comp_state_read_end) {
-                       break;
-               }
-
-               switch (state) {
-               case comp_state_read_symbol:
-                       clen = rspamd_memcspn (p, "[; \t()><!|&\n", len);
-                       p += clen;
-
-                       if (*p == '[') {
-                               state = comp_state_read_obrace;
-                       }
-                       else {
-                               state = comp_state_read_end;
-                       }
-                       break;
-               case comp_state_read_obrace:
-                       p ++;
-
-                       if (*p == '/') {
-                               p ++;
-                               state = comp_state_read_regexp;
-                       }
-                       else {
-                               state = comp_state_read_option;
-                       }
-                       break;
-               case comp_state_read_regexp:
-                       if (*p == '\\' && p + 1 < end) {
-                               /* Escaping */
-                               p ++;
-                       }
-                       else if (*p == '/') {
-                               /* End of regexp, possible flags */
-                               state = comp_state_read_regexp_end;
-                       }
-                       p ++;
-                       break;
-               case comp_state_read_option:
-               case comp_state_read_regexp_end:
-                       if (*p == ',') {
-                               p ++;
-                               state = comp_state_read_comma;
-                       }
-                       else if (*p == ']') {
-                               state = comp_state_read_ebrace;
-                       }
-                       else {
-                               p ++;
-                       }
-                       break;
-               case comp_state_read_comma:
-                       if (!g_ascii_isspace (*p)) {
-                               if (*p == '/') {
-                                       state = comp_state_read_regexp;
-                               }
-                               else if (*p == ']') {
-                                       state = comp_state_read_ebrace;
-                               }
-                               else {
-                                       state = comp_state_read_option;
-                               }
-                       }
-                       else {
-                               /* Skip spaces after comma */
-                               p ++;
-                       }
-                       break;
-               case comp_state_read_ebrace:
-                       p ++;
-                       state = comp_state_read_end;
-                       break;
-               case comp_state_read_end:
-                       g_assert_not_reached ();
-               }
-       }
-
-       if (state != comp_state_read_end) {
-               g_set_error (err, rspamd_composites_quark (), 100, "invalid composite: %s;"
-                                                                                                                  "parser stopped in state %d",
-                               line, state);
-               return NULL;
-       }
-
-       clen = p - line;
-       p = line;
-       state = comp_state_read_symbol;
-
-       atom = rspamd_mempool_alloc0 (pool, sizeof (*atom));
-       atom->comp_type = ATOM_UNKNOWN;
-       res = rspamd_mempool_alloc0 (pool, sizeof (*res));
-       res->len = clen;
-       res->str = line;
-
-       /* Full state machine to fill a composite atom */
-       const gchar *opt_start = NULL;
-
-       while (p < end) {
-               struct rspamd_composite_option_match *opt_match;
-
-               if (state == comp_state_read_end) {
-                       break;
-               }
-
-               switch (state) {
-               case comp_state_read_symbol:
-                       clen = rspamd_memcspn (p, "[; \t()><!|&\n", len);
-                       p += clen;
-
-                       if (*p == '[') {
-                               state = comp_state_read_obrace;
-                       }
-                       else {
-                               state = comp_state_read_end;
-                       }
-
-                       atom->symbol = rspamd_mempool_alloc (pool, clen + 1);
-                       rspamd_strlcpy (atom->symbol, line, clen + 1);
-
-                       break;
-               case comp_state_read_obrace:
-                       p ++;
-
-                       if (*p == '/') {
-                               opt_start = p;
-                               p ++; /* Starting slash */
-                               state = comp_state_read_regexp;
-                       }
-                       else {
-                               state = comp_state_read_option;
-                               opt_start = p;
-                       }
-
-                       break;
-               case comp_state_read_regexp:
-                       if (*p == '\\' && p + 1 < end) {
-                               /* Escaping */
-                               p ++;
-                       }
-                       else if (*p == '/') {
-                               /* End of regexp, possible flags */
-                               state = comp_state_read_regexp_end;
-                       }
-                       p ++;
-                       break;
-               case comp_state_read_option:
-                       if (*p == ',' || *p == ']') {
-                               opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match));
-                               /* Plain match */
-                               gchar *opt_buf;
-                               gint opt_len = p - opt_start;
-
-                               opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
-                               rspamd_strlcpy (opt_buf, opt_start, opt_len + 1);
-
-                               opt_match->data.match = opt_buf;
-                               opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN;
-
-                               DL_APPEND (atom->opts, opt_match);
-
-                               if (*p == ',') {
-                                       p++;
-                                       state = comp_state_read_comma;
-                               }
-                               else {
-                                       state = comp_state_read_ebrace;
-                               }
-                       }
-                       else {
-                               p ++;
-                       }
-                       break;
-               case comp_state_read_regexp_end:
-                       if (*p == ',' || *p == ']') {
-                               opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match));
-                               /* Plain match */
-                               gchar *opt_buf;
-                               gint opt_len = p - opt_start;
-
-                               opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
-                               rspamd_strlcpy (opt_buf, opt_start, opt_len + 1);
-
-                               rspamd_regexp_t *re;
-                               GError *re_err = NULL;
-
-                               re = rspamd_regexp_new (opt_buf, NULL, &re_err);
-
-                               if (re == NULL) {
-                                       msg_err_pool ("cannot create regexp from string %s: %e",
-                                                       opt_buf, re_err);
-
-                                       g_error_free (re_err);
-                               }
-                               else {
-                                       rspamd_mempool_add_destructor (pool,
-                                                       (rspamd_mempool_destruct_t)rspamd_regexp_unref,
-                                                       re);
-                                       opt_match->data.re = re;
-                                       opt_match->type = RSPAMD_COMPOSITE_OPTION_RE;
-
-                                       DL_APPEND (atom->opts, opt_match);
-                               }
-
-                               if (*p == ',') {
-                                       p++;
-                                       state = comp_state_read_comma;
-                               }
-                               else {
-                                       state = comp_state_read_ebrace;
-                               }
-                       }
-                       else {
-                               p ++;
-                       }
-                       break;
-               case comp_state_read_comma:
-                       if (!g_ascii_isspace (*p)) {
-                               if (*p == '/') {
-                                       state = comp_state_read_regexp;
-                                       opt_start = p;
-                               }
-                               else if (*p == ']') {
-                                       state = comp_state_read_ebrace;
-                               }
-                               else {
-                                       opt_start = p;
-                                       state = comp_state_read_option;
-                               }
-                       }
-                       else {
-                               /* Skip spaces after comma */
-                               p ++;
-                       }
-                       break;
-               case comp_state_read_ebrace:
-                       p ++;
-                       state = comp_state_read_end;
-                       break;
-               case comp_state_read_end:
-                       g_assert_not_reached ();
-               }
-       }
-
-       res->data = atom;
-
-       return res;
-}
-
-static gdouble
-rspamd_composite_process_single_symbol (struct composites_data *cd,
-                                                                               const gchar *sym,
-                                                                               struct rspamd_symbol_result **pms,
-                                                                               struct rspamd_composite_atom *atom)
-{
-       struct rspamd_symbol_result *ms = NULL;
-       gdouble rc = 0;
-       struct rspamd_task *task = cd->task;
-
-       if ((ms = rspamd_task_find_symbol_result (cd->task, sym, cd->metric_res)) == NULL) {
-               msg_debug_composites ("not found symbol %s in composite %s", sym,
-                               cd->composite->sym);
-
-               if (atom->comp_type == ATOM_UNKNOWN) {
-                       struct rspamd_composite *ncomp;
-
-                       if ((ncomp =
-                                                g_hash_table_lookup (cd->task->cfg->composite_symbols,
-                                                                sym)) != NULL) {
-                               atom->comp_type = ATOM_COMPOSITE;
-                               atom->ncomp = ncomp;
-                       }
-                       else {
-                               atom->comp_type = ATOM_PLAIN;
-                       }
-               }
-
-               if (atom->comp_type == ATOM_COMPOSITE) {
-                       msg_debug_composites ("symbol %s for composite %s is another composite",
-                                       sym, cd->composite->sym);
-
-                       if (isclr (cd->checked, atom->ncomp->id * 2)) {
-                               struct rspamd_composite *saved;
-
-                               msg_debug_composites ("composite dependency %s for %s is not checked",
-                                               sym, cd->composite->sym);
-                               /* Set checked for this symbol to avoid cyclic references */
-                               setbit (cd->checked, cd->composite->id * 2);
-                               saved = cd->composite; /* Save the current composite */
-                               composites_foreach_callback ((gpointer)atom->ncomp->sym, atom->ncomp, cd);
-
-                               /* Restore state */
-                               cd->composite = saved;
-                               clrbit (cd->checked, cd->composite->id * 2);
-
-                               ms = rspamd_task_find_symbol_result (cd->task, sym,
-                                               cd->metric_res);
-                       }
-                       else {
-                               /*
-                                * XXX: in case of cyclic references this would return 0
-                                */
-                               if (isset (cd->checked, atom->ncomp->id * 2 + 1)) {
-                                       ms = rspamd_task_find_symbol_result (cd->task, sym,
-                                                       cd->metric_res);
-                               }
-                       }
-               }
-       }
-
-       if (ms) {
-               msg_debug_composites ("found symbol %s in composite %s, weight: %.3f",
-                               sym, cd->composite->sym, ms->score);
-
-               /* Now check options */
-               struct rspamd_composite_option_match *cur_opt;
-
-               DL_FOREACH (atom->opts, cur_opt) {
-                       struct rspamd_symbol_option *opt;
-                       bool found = false;
-
-                       DL_FOREACH (ms->opts_head, opt) {
-                               if (cur_opt->type == RSPAMD_COMPOSITE_OPTION_PLAIN) {
-                                       gsize mlen = strlen (cur_opt->data.match);
-
-                                       if (opt->optlen == mlen &&
-                                               memcmp (opt->option, cur_opt->data.match, mlen) == 0) {
-
-                                               found = true;
-
-                                               break;
-                                       }
-                               }
-                               else {
-                                       if (rspamd_regexp_search (cur_opt->data.re,
-                                                       opt->option, opt->optlen, NULL, NULL, FALSE, NULL)) {
-                                               found = true;
-
-                                               break;
-                                       }
-                               }
-                       }
-
-
-                       if (!found) {
-                               if (cur_opt->type == RSPAMD_COMPOSITE_OPTION_PLAIN) {
-                                       msg_debug_composites ("symbol %s in composite %s misses required option %s",
-                                                       sym,
-                                                       cd->composite->sym,
-                                                       cur_opt->data.match);
-                               }
-                               else {
-                                       msg_debug_composites ("symbol %s in composite %s failed to match regexp %s",
-                                                       sym,
-                                                       cd->composite->sym,
-                                                       rspamd_regexp_get_pattern (cur_opt->data.re));
-                               }
-
-                               ms = NULL;
-
-                               break;
-                       }
-               }
-
-               if (ms) {
-                       if (ms->score == 0) {
-                               rc = 0.001; /* Distinguish from 0 */
-                       }
-                       else {
-                               rc = ms->score;
-                       }
-               }
-       }
-
-       *pms = ms;
-       return rc;
-}
-
-static void
-rspamd_composite_process_symbol_removal (rspamd_expression_atom_t *atom,
-                                                                                struct composites_data *cd,
-                                                                                struct rspamd_symbol_result *ms,
-                                                                                const gchar *beg)
-{
-       gchar t;
-       struct symbol_remove_data *rd, *nrd;
-       struct rspamd_task *task = cd->task;
-
-       if (ms == NULL) {
-               return;
-       }
-
-       /*
-        * At this point we know that we need to do something about this symbol,
-        * however, we don't know whether we need to delete it unfortunately,
-        * that depends on the later decisions when the complete expression is
-        * evaluated.
-        */
-       rd = g_hash_table_lookup (cd->symbols_to_remove, ms->name);
-
-       nrd = rspamd_mempool_alloc (cd->task->task_pool, sizeof (*nrd));
-       nrd->sym = ms->name;
-
-       /* By default remove symbols */
-       switch (cd->composite->policy) {
-       case RSPAMD_COMPOSITE_POLICY_REMOVE_ALL:
-       default:
-               nrd->action = (RSPAMD_COMPOSITE_REMOVE_SYMBOL|RSPAMD_COMPOSITE_REMOVE_WEIGHT);
-               break;
-       case RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL:
-               nrd->action = RSPAMD_COMPOSITE_REMOVE_SYMBOL;
-               break;
-       case RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT:
-               nrd->action = RSPAMD_COMPOSITE_REMOVE_WEIGHT;
-               break;
-       case RSPAMD_COMPOSITE_POLICY_LEAVE:
-               nrd->action = 0;
-               break;
-       }
-
-       for (;;) {
-               t = *beg;
-
-               if (t == '~') {
-                       nrd->action &= ~RSPAMD_COMPOSITE_REMOVE_SYMBOL;
-               }
-               else if (t == '-') {
-                       nrd->action &= ~(RSPAMD_COMPOSITE_REMOVE_WEIGHT|
-                                                        RSPAMD_COMPOSITE_REMOVE_SYMBOL);
-               }
-               else if (t == '^') {
-                       nrd->action |= RSPAMD_COMPOSITE_REMOVE_FORCED;
-               }
-               else {
-                       break;
-               }
-
-               beg ++;
-       }
-
-       nrd->comp = cd->composite;
-       nrd->parent = atom->parent;
-
-       if (rd == NULL) {
-               DL_APPEND (rd, nrd);
-               g_hash_table_insert (cd->symbols_to_remove, (gpointer)ms->name, rd);
-               msg_debug_composites ("%s: added symbol %s to removal: %d policy, from composite %s",
-                               cd->metric_res->name,
-                               ms->name, nrd->action,
-                               cd->composite->sym);
-       }
-       else {
-               DL_APPEND (rd, nrd);
-               msg_debug_composites ("%s: append symbol %s to removal: %d policy, from composite %s",
-                               cd->metric_res->name,
-                               ms->name, nrd->action,
-                               cd->composite->sym);
-       }
-}
-
-static gdouble
-rspamd_composite_expr_process (void *ud,
-               rspamd_expression_atom_t *atom)
-{
-       static const double epsilon = 0.00001;
-       struct composites_data *cd = (struct composites_data *)ud;
-       const gchar *sym = NULL;
-       struct rspamd_composite_atom *comp_atom = (struct rspamd_composite_atom *)atom->data;
-
-       struct rspamd_symbol_result *ms = NULL;
-       struct rspamd_symbols_group *gr;
-       struct rspamd_symbol *sdef;
-       struct rspamd_task *task = cd->task;
-       GHashTableIter it;
-       gpointer k, v;
-       gdouble rc = 0, max = 0;
-
-       if (isset (cd->checked, cd->composite->id * 2)) {
-               /* We have already checked this composite, so just return its value */
-               if (isset (cd->checked, cd->composite->id * 2 + 1)) {
-                       ms = rspamd_task_find_symbol_result (cd->task, sym, cd->metric_res);
-               }
-
-               if (ms) {
-                       if (ms->score == 0) {
-                               rc = epsilon; /* Distinguish from 0 */
-                       }
-                       else {
-                               /* Treat negative and positive scores equally... */
-                               rc = fabs (ms->score);
-                       }
-               }
-
-               msg_debug_composites ("composite %s is already checked, result: %.2f",
-                               cd->composite->sym, rc);
-
-               return rc;
-       }
-
-       sym = comp_atom->symbol;
-       guint slen = strlen (sym);
-
-       while (*sym != '\0' && !g_ascii_isalnum (*sym)) {
-               sym ++;
-               slen --;
-       }
-
-       if (slen > 2) {
-               if (G_UNLIKELY (memcmp (sym, "g:", 2) == 0)) {
-                       gr = g_hash_table_lookup (cd->task->cfg->groups, sym + 2);
-
-                       if (gr != NULL) {
-                               g_hash_table_iter_init (&it, gr->symbols);
-
-                               while (g_hash_table_iter_next (&it, &k, &v)) {
-                                       sdef = v;
-                                       rc = rspamd_composite_process_single_symbol (cd, sdef->name, &ms,
-                                                       comp_atom);
-
-                                       if (rc) {
-                                               rspamd_composite_process_symbol_removal (atom,
-                                                               cd,
-                                                               ms,
-                                                               comp_atom->symbol);
-
-                                               if (fabs (rc) > max) {
-                                                       max = fabs (rc);
-                                               }
-                                       }
-                               }
-                       }
-
-                       rc = max;
-               }
-               else if (G_UNLIKELY (memcmp (sym, "g+:", 3) == 0)) {
-                       /* Group, positive symbols only */
-                       gr = g_hash_table_lookup (cd->task->cfg->groups, sym + 3);
-
-                       if (gr != NULL) {
-                               g_hash_table_iter_init (&it, gr->symbols);
-
-                               while (g_hash_table_iter_next (&it, &k, &v)) {
-                                       sdef = v;
-
-                                       if (sdef->score > 0) {
-                                               rc = rspamd_composite_process_single_symbol (cd,
-                                                               sdef->name,
-                                                               &ms,
-                                                               comp_atom);
-
-                                               if (rc) {
-                                                       rspamd_composite_process_symbol_removal (atom,
-                                                                       cd,
-                                                                       ms,
-                                                                       comp_atom->symbol);
-
-                                                       if (fabs (rc) > max) {
-                                                               max = fabs (rc);
-                                                       }
-                                               }
-                                       }
-                               }
-
-                               rc = max;
-                       }
-               }
-               else if (G_UNLIKELY (memcmp (sym, "g-:", 3) == 0)) {
-                       /* Group, negative symbols only */
-                       gr = g_hash_table_lookup (cd->task->cfg->groups, sym + 3);
-
-                       if (gr != NULL) {
-                               g_hash_table_iter_init (&it, gr->symbols);
-
-                               while (g_hash_table_iter_next (&it, &k, &v)) {
-                                       sdef = v;
-
-                                       if (sdef->score < 0) {
-                                               rc = rspamd_composite_process_single_symbol (cd,
-                                                               sdef->name,
-                                                               &ms,
-                                                               comp_atom);
-
-                                               if (rc) {
-                                                       rspamd_composite_process_symbol_removal (atom,
-                                                                       cd,
-                                                                       ms,
-                                                                       comp_atom->symbol);
-
-                                                       if (fabs (rc) > max) {
-                                                               max = fabs (rc);
-                                                       }
-                                               }
-                                       }
-                               }
-
-                               rc = max;
-                       }
-               }
-               else {
-                       rc = rspamd_composite_process_single_symbol (cd, sym, &ms, comp_atom);
-
-                       if (rc) {
-                               rspamd_composite_process_symbol_removal (atom,
-                                               cd,
-                                               ms,
-                                               comp_atom->symbol);
-                       }
-               }
-       }
-       else {
-               rc = rspamd_composite_process_single_symbol (cd, sym, &ms, comp_atom);
-
-               if (rc) {
-                       rspamd_composite_process_symbol_removal (atom,
-                                       cd,
-                                       ms,
-                                       comp_atom->symbol);
-               }
-       }
-
-       msg_debug_composites ("%s: final result for composite %s is %.2f",
-                       cd->metric_res->name,
-                       cd->composite->sym, rc);
-
-       return rc;
-}
-
-/*
- * We don't have preferences for composites
- */
-static gint
-rspamd_composite_expr_priority (rspamd_expression_atom_t *atom)
-{
-       return 0;
-}
-
-static void
-rspamd_composite_expr_destroy (rspamd_expression_atom_t *atom)
-{
-       /* Composite atoms are destroyed just with the pool */
-}
-
-
-static void
-composites_foreach_callback (gpointer key, gpointer value, void *data)
-{
-       struct composites_data *cd = data;
-       struct rspamd_composite *comp = value;
-       struct rspamd_task *task;
-       gdouble rc;
-
-       cd->composite = comp;
-       task = cd->task;
-
-       if (!isset (cd->checked, cd->composite->id * 2)) {
-               if (rspamd_symcache_is_checked (cd->task, cd->task->cfg->cache,
-                               key)) {
-                       msg_debug_composites ("composite %s is checked in symcache but not "
-                                       "in composites bitfield", cd->composite->sym);
-                       setbit (cd->checked, comp->id * 2);
-                       clrbit (cd->checked, comp->id * 2 + 1);
-               }
-               else {
-                       if (rspamd_task_find_symbol_result (cd->task, key,
-                                       cd->metric_res) != NULL) {
-                               /* Already set, no need to check */
-                               msg_debug_composites ("composite %s is already in metric "
-                                               "in composites bitfield", cd->composite->sym);
-                               setbit (cd->checked, comp->id * 2);
-                               clrbit (cd->checked, comp->id * 2 + 1);
-
-                               return;
-                       }
-
-                       rc = rspamd_process_expression (comp->expr, RSPAMD_EXPRESSION_FLAG_NOOPT,
-                                       cd);
-
-                       /* Checked bit */
-                       setbit (cd->checked, comp->id * 2);
-
-                       /* Result bit */
-                       if (rc != 0) {
-                               setbit (cd->checked, comp->id * 2 + 1);
-                               rspamd_task_insert_result_full (cd->task, key, 1.0, NULL,
-                                               RSPAMD_SYMBOL_INSERT_SINGLE, cd->metric_res);
-                       }
-                       else {
-                               clrbit (cd->checked, comp->id * 2 + 1);
-                       }
-               }
-       }
-}
-
-
-static void
-composites_remove_symbols (gpointer key, gpointer value, gpointer data)
-{
-       struct composites_data *cd = data;
-       struct rspamd_task *task;
-       struct symbol_remove_data *rd = value, *cur;
-       struct rspamd_symbol_result *ms;
-       gboolean skip = FALSE,
-                       has_valid_op = FALSE,
-                       want_remove_score = TRUE,
-                       want_remove_symbol = TRUE,
-                       want_forced = FALSE;
-       const gchar *disable_score_reason = "no policy",
-               *disable_symbol_reason = "no policy";
-       GNode *par;
-
-       task = cd->task;
-
-       DL_FOREACH (rd, cur) {
-               if (!isset (cd->checked, cur->comp->id * 2 + 1)) {
-                       continue;
-               }
-               /*
-                * First of all exclude all elements with any parent that is negation:
-                * !A || B -> here we can have both !A and B matched, but we do *NOT*
-                * want to remove symbol in that case
-                */
-               par = cur->parent;
-               skip = FALSE;
-
-               while (par) {
-                       if (rspamd_expression_node_is_op (par, OP_NOT)) {
-                               skip = TRUE;
-                               break;
-                       }
-
-                       par = par->parent;
-               }
-
-               if (skip) {
-                       continue;
-               }
-
-               has_valid_op = TRUE;
-               /*
-                * Now we can try to remove symbols/scores
-                *
-                * We apply the following logic here:
-                * - if no composites would like to save score then we remove score
-                * - if no composites would like to save symbol then we remove symbol
-                */
-               if (!want_forced) {
-                       if (!(cur->action & RSPAMD_COMPOSITE_REMOVE_SYMBOL)) {
-                               want_remove_symbol = FALSE;
-                               disable_symbol_reason = cur->comp->sym;
-                       }
-
-                       if (!(cur->action & RSPAMD_COMPOSITE_REMOVE_WEIGHT)) {
-                               want_remove_score = FALSE;
-                               disable_score_reason = cur->comp->sym;
-                       }
-
-                       if (cur->action & RSPAMD_COMPOSITE_REMOVE_FORCED) {
-                               want_forced = TRUE;
-                               disable_symbol_reason = cur->comp->sym;
-                               disable_score_reason = cur->comp->sym;
-                       }
-               }
-       }
-
-       ms = rspamd_task_find_symbol_result (task, rd->sym, cd->metric_res);
-
-       if (has_valid_op && ms && !(ms->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) {
-
-               if (want_remove_score || want_forced) {
-                       msg_debug_composites ("%s: %s remove symbol weight for %s (was %.2f), "
-                                                "score removal affected by %s, symbol removal affected by %s",
-                                       cd->metric_res->name,
-                                       (want_forced ? "forced" : "normal"), key, ms->score,
-                                       disable_score_reason, disable_symbol_reason);
-                       cd->metric_res->score -= ms->score;
-                       ms->score = 0.0;
-               }
-
-               if (want_remove_symbol || want_forced) {
-                       ms->flags |= RSPAMD_SYMBOL_RESULT_IGNORED;
-                       msg_debug_composites ("%s: %s remove symbol %s (score %.2f), "
-                                                                 "score removal affected by %s, symbol removal affected by %s",
-                                       cd->metric_res->name,
-                                       (want_forced ? "forced" : "normal"), key, ms->score,
-                                       disable_score_reason, disable_symbol_reason);
-               }
-       }
-}
-
-static void
-composites_metric_callback (struct rspamd_task *task)
-{
-       struct composites_data *cd, *first_cd = NULL;
-       struct rspamd_scan_result *mres;
-
-       DL_FOREACH (task->result, mres) {
-               cd = rspamd_mempool_alloc (task->task_pool, sizeof (struct composites_data));
-               cd->task = task;
-               cd->metric_res = mres;
-               cd->symbols_to_remove = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
-               cd->checked =
-                               rspamd_mempool_alloc0 (task->task_pool,
-                                               NBYTES (g_hash_table_size (task->cfg->composite_symbols) * 2));
-
-               /* Process hash table */
-               rspamd_symcache_composites_foreach (task,
-                               task->cfg->cache,
-                               composites_foreach_callback,
-                               cd);
-               LL_PREPEND (first_cd, cd);
-       }
-
-       LL_REVERSE (first_cd);
-
-       LL_FOREACH (first_cd, cd) {
-               /* Remove symbols that are in composites */
-               g_hash_table_foreach (cd->symbols_to_remove, composites_remove_symbols, cd);
-               /* Free list */
-               g_hash_table_unref (cd->symbols_to_remove);
-       }
-}
-
-void
-rspamd_composites_process_task (struct rspamd_task *task)
-{
-       if (task->result && !RSPAMD_TASK_IS_SKIPPED (task)) {
-               composites_metric_callback (task);
-       }
-}
-
-
-enum rspamd_composite_policy
-rspamd_composite_policy_from_str (const gchar *string)
-{
-       enum rspamd_composite_policy ret = RSPAMD_COMPOSITE_POLICY_UNKNOWN;
-
-       if (strcmp (string, "remove") == 0 || strcmp (string, "remove_all") == 0 ||
-                       strcmp (string, "default") == 0) {
-               ret = RSPAMD_COMPOSITE_POLICY_REMOVE_ALL;
-       }
-       else if (strcmp (string, "remove_symbol") == 0) {
-               ret = RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL;
-       }
-       else if (strcmp (string, "remove_weight") == 0) {
-               ret = RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT;
-       }
-       else if (strcmp (string, "leave") == 0 || strcmp (string, "remove_none") == 0) {
-               ret = RSPAMD_COMPOSITE_POLICY_LEAVE;
-       }
-
-       return ret;
-}
diff --git a/src/libserver/composites.h b/src/libserver/composites.h
deleted file mode 100644 (file)
index bb7eb89..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*-
- * Copyright 2016 Vsevolod Stakhov
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef SRC_LIBSERVER_COMPOSITES_H_
-#define SRC_LIBSERVER_COMPOSITES_H_
-
-#include "config.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-struct rspamd_task;
-
-/**
- * Subr for composite expressions
- */
-extern const struct rspamd_atom_subr composite_expr_subr;
-
-enum rspamd_composite_policy {
-       RSPAMD_COMPOSITE_POLICY_REMOVE_ALL = 0,
-       RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL,
-       RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT,
-       RSPAMD_COMPOSITE_POLICY_LEAVE,
-       RSPAMD_COMPOSITE_POLICY_UNKNOWN
-};
-
-/**
- * Composite structure
- */
-struct rspamd_composite {
-       const gchar *str_expr;
-       const gchar *sym;
-       struct rspamd_expression *expr;
-       gint id;
-       enum rspamd_composite_policy policy;
-};
-
-/**
- * Process all results and form composite metrics from existent metrics as it is defined in config
- * @param task worker's task that present message from user
- */
-void rspamd_composites_process_task (struct rspamd_task *task);
-
-enum rspamd_composite_policy rspamd_composite_policy_from_str (const gchar *string);
-
-#ifdef  __cplusplus
-}
-#endif
-
-#endif /* SRC_LIBSERVER_COMPOSITES_H_ */
diff --git a/src/libserver/composites/composites.cxx b/src/libserver/composites/composites.cxx
new file mode 100644 (file)
index 0000000..0d95346
--- /dev/null
@@ -0,0 +1,1019 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "config.h"
+#include "logger.h"
+#include "expression.h"
+#include "task.h"
+#include "utlist.h"
+#include "scan_result.h"
+#include "composites.h"
+
+#include <cmath>
+#include <string>
+#include <utility>
+#include <variant>
+#include <vector>
+#include "contrib/robin-hood/robin_hood.h"
+
+/*
+ * Logging helpers for the "composites" module.
+ * Every macro below expands to an expression that reads
+ * `task->task_pool->tag.uid`, so a variable named `task` must be in scope
+ * at each place where one of them is used.
+ */
+#define msg_err_composites(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
+        "composites", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+#define msg_warn_composites(...)   rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
+        "composites", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+#define msg_info_composites(...)   rspamd_default_log_function (G_LOG_LEVEL_INFO, \
+        "composites", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+
+/* Debug variant: additionally passes task->from_addr and rspamd_composites_log_id */
+#define msg_debug_composites(...)  rspamd_conditional_debug_fast (NULL, task->from_addr, \
+        rspamd_composites_log_id, "composites", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+
+/* NOTE(review): presumably provides rspamd_composites_log_id used above - confirm against logger.h */
+INIT_LOG_MODULE(composites)
+
+
+/*
+ * Forward declarations of the expression atom callbacks, so that the
+ * composite_expr_subr table can reference them before they are defined.
+ */
+namespace rspamd::composites {
+static rspamd_expression_atom_t *rspamd_composite_expr_parse(const gchar *line, gsize len,
+                                                                                                                        rspamd_mempool_t *pool,
+                                                                                                                        gpointer ud, GError **err);
+static gdouble rspamd_composite_expr_process(void *ud, rspamd_expression_atom_t *atom);
+static gint rspamd_composite_expr_priority(rspamd_expression_atom_t *atom);
+static void rspamd_composite_expr_destroy(rspamd_expression_atom_t *atom);
+}
+
+/*
+ * Callback table for composite expressions: binds the parse, process,
+ * priority and destroy callbacks from rspamd::composites to the
+ * corresponding fields of struct rspamd_atom_subr.
+ */
+const struct rspamd_atom_subr composite_expr_subr = {
+               .parse = rspamd::composites::rspamd_composite_expr_parse,
+               .process = rspamd::composites::rspamd_composite_expr_process,
+               .priority = rspamd::composites::rspamd_composite_expr_priority,
+               .destroy = rspamd::composites::rspamd_composite_expr_destroy
+};
+
+namespace rspamd::composites {
+
+/**
+ * Policy stored in rspamd_composite::policy.
+ * RSPAMD_COMPOSITE_POLICY_REMOVE_ALL is the zero value.
+ * NOTE(review): the enumerator names suggest which part of a matched symbol
+ * (symbol, weight, both or nothing) is removed - confirm against the code
+ * that applies the policy.
+ */
+enum class rspamd_composite_policy {
+       RSPAMD_COMPOSITE_POLICY_REMOVE_ALL = 0,
+       RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL,
+       RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT,
+       RSPAMD_COMPOSITE_POLICY_LEAVE,
+       RSPAMD_COMPOSITE_POLICY_UNKNOWN
+};
+
+/**
+ * Static description of a single composite rule
+ */
+struct rspamd_composite {
+       std::string str_expr; /* NOTE(review): presumably the textual form of `expr` - confirm */
+       std::string sym; /* name of the composite, as printed in log messages */
+       struct rspamd_expression *expr;
+       gint id; /* index for the composites_data::checked bits (id * 2 and id * 2 + 1) */
+       rspamd_composite_policy policy;
+};
+
+/**
+ * State passed to the composite processing callbacks
+ */
+struct composites_data {
+       struct rspamd_task *task;
+       struct rspamd_composite *composite; /* composite currently being processed */
+       struct rspamd_scan_result *metric_res; /* result set that symbols are looked up in */
+       GHashTable *symbols_to_remove;
+       guint8 *checked; /* bit array indexed by composite id: bits id * 2 and id * 2 + 1 */
+       struct composites_data *next;
+};
+
+/**
+ * One required option of a composite atom: either a compiled regexp or a
+ * plain string.
+ *
+ * The destructor drops the reference held on the regexp, so an object must
+ * own that reference exclusively: copying is disabled and moving transfers
+ * the reference, leaving the source with a null regexp pointer. Without this
+ * a std::vector of these objects would copy the raw pointer on reallocation
+ * and unref the same regexp more than once.
+ */
+struct rspamd_composite_option_match {
+       std::variant<rspamd_regexp_t *, std::string> match;
+
+       /* Empty match: holds a null regexp pointer */
+       rspamd_composite_option_match() = default;
+
+       /* Takes over one reference on `re`; it is released by the destructor */
+       explicit rspamd_composite_option_match(rspamd_regexp_t *re) noexcept
+                       : match(re) {}
+
+       /* Plain string match */
+       explicit rspamd_composite_option_match(std::string str)
+                       : match(std::move(str)) {}
+
+       rspamd_composite_option_match(const rspamd_composite_option_match &) = delete;
+       rspamd_composite_option_match &operator=(const rspamd_composite_option_match &) = delete;
+
+       rspamd_composite_option_match(rspamd_composite_option_match &&other) noexcept
+                       : match(std::move(other.match)) {
+               /* Moving a raw pointer copies it, so detach the source explicitly */
+               other.release();
+       }
+
+       rspamd_composite_option_match &operator=(rspamd_composite_option_match &&other) noexcept {
+               if (this != &other) {
+                       reset();
+                       match = std::move(other.match);
+                       other.release();
+               }
+
+               return *this;
+       }
+
+       ~rspamd_composite_option_match() {
+               reset();
+       }
+
+private:
+       /* Forget the regexp pointer without dropping its reference */
+       void release() noexcept {
+               auto *re_slot = std::get_if<rspamd_regexp_t *>(&match);
+
+               if (re_slot != nullptr) {
+                       *re_slot = nullptr;
+               }
+       }
+
+       /* Drop the regexp reference, if one is held */
+       void reset() noexcept {
+               auto *re_slot = std::get_if<rspamd_regexp_t *>(&match);
+
+               if (re_slot != nullptr && *re_slot != nullptr) {
+                       rspamd_regexp_unref(*re_slot);
+                       *re_slot = nullptr;
+               }
+       }
+};
+
+/**
+ * Kind of symbol that an atom refers to. Atoms start as ATOM_UNKNOWN and
+ * are resolved later by rspamd_composite_process_single_symbol: to
+ * ATOM_COMPOSITE when the name is found among the configured composites,
+ * to ATOM_PLAIN otherwise.
+ */
+enum class rspamd_composite_atom_type {
+       ATOM_UNKNOWN,
+       ATOM_COMPOSITE,
+       ATOM_PLAIN
+};
+/**
+ * Parsed atom of a composite expression: a symbol name plus the options
+ * that the matching symbol result is required to have
+ */
+struct rspamd_composite_atom {
+       std::string symbol;
+       rspamd_composite_atom_type comp_type;
+       struct rspamd_composite *ncomp; /* underlying composite */
+       std::vector<rspamd_composite_option_match> opts; /* every entry must match some option of the symbol */
+};
+
+/**
+ * Removal actions. Apart from the zero value each enumerator is a distinct
+ * bit, so values can be OR-ed together; symbol_remove_data::action uses the
+ * same underlying type (std::uint8_t).
+ */
+enum rspamd_composite_action : std::uint8_t {
+       RSPAMD_COMPOSITE_UNTOUCH = 0,
+       RSPAMD_COMPOSITE_REMOVE_SYMBOL = (1u << 0),
+       RSPAMD_COMPOSITE_REMOVE_WEIGHT = (1u << 1),
+       RSPAMD_COMPOSITE_REMOVE_FORCED = (1u << 2)
+};
+
+/**
+ * Removal request for one symbol
+ */
+struct symbol_remove_data {
+       const char *sym;
+       struct rspamd_composite *comp;
+       GNode *parent; /* NOTE(review): presumably the expression tree node above the atom - confirm */
+       std::uint8_t action; /* NOTE(review): presumably a mask of rspamd_composite_action bits - confirm */
+       struct symbol_remove_data *prev, *next; /* list links */
+};
+
+/* Error domain used for GError values produced by the composites parser */
+static GQuark
+rspamd_composites_quark (void)
+{
+       static const gchar domain_name[] = "composites";
+
+       return g_quark_from_static_string (domain_name);
+}
+
+/**
+ * Parses one composite atom, written either as `SYMBOL` or as
+ * `SYMBOL[opt1, /regexp/flags, ...]`, from the beginning of `line`.
+ *
+ * The first pass only measures the atom and validates the bracket syntax;
+ * the second pass fills the atom (symbol name and option matchers).
+ *
+ * @param line start of the atom text
+ * @param len number of bytes available in `line`
+ * @param pool memory pool used for the atom, the result and option buffers
+ * @param ud not used by this function
+ * @param err set (domain rspamd_composites_quark(), code 100) when the first
+ *            pass does not finish in the final state
+ * @return atom with `len`, `str` and `data` filled, or NULL on a syntax error
+ *
+ * NOTE(review): the second pass still uses the C-style layout
+ * (`opt_match->data`, `opt_match->type`, DL_APPEND on `atom->opts`, a raw
+ * buffer assigned to `atom->symbol`, unscoped ATOM_UNKNOWN, pool allocations
+ * without a cast), which does not match the std::string / std::vector /
+ * std::variant members that this file declares for rspamd_composite_atom and
+ * rspamd_composite_option_match. It has to be ported before it can build.
+ */
+static rspamd_expression_atom_t *
+rspamd_composite_expr_parse(const gchar *line, gsize len,
+                                                       rspamd_mempool_t *pool,
+                                                       gpointer ud, GError **err)
+{
+       gsize clen = 0;
+       rspamd_expression_atom_t *res;
+       struct rspamd_composite_atom *atom;
+       const gchar *p, *end;
+       enum composite_expr_state {
+               comp_state_read_symbol = 0,
+               comp_state_read_obrace,
+               comp_state_read_option,
+               comp_state_read_regexp,
+               comp_state_read_regexp_end,
+               comp_state_read_comma,
+               comp_state_read_ebrace,
+               comp_state_read_end
+       } state = comp_state_read_symbol;
+
+       end = line + len;
+       p = line;
+
+       /* Find length of the atom using a reduced state machine */
+       while (p < end) {
+               if (state == comp_state_read_end) {
+                       break;
+               }
+
+               switch (state) {
+               case comp_state_read_symbol:
+                       /* This state is only entered at the start, where p == line, so `len` bounds the scan */
+                       clen = rspamd_memcspn(p, "[; \t()><!|&\n", len);
+                       p += clen;
+
+                       /* NOTE(review): p may equal end here - confirm that `line` is readable at line[len] */
+                       if (*p == '[') {
+                               state = comp_state_read_obrace;
+                       }
+                       else {
+                               state = comp_state_read_end;
+                       }
+                       break;
+               case comp_state_read_obrace:
+                       p++;
+
+                       if (*p == '/') {
+                               p++;
+                               state = comp_state_read_regexp;
+                       }
+                       else {
+                               state = comp_state_read_option;
+                       }
+                       break;
+               case comp_state_read_regexp:
+                       if (*p == '\\' && p + 1 < end) {
+                               /* Escaping */
+                               p++;
+                       }
+                       else if (*p == '/') {
+                               /* End of regexp, possible flags */
+                               state = comp_state_read_regexp_end;
+                       }
+                       p++;
+                       break;
+               case comp_state_read_option:
+               case comp_state_read_regexp_end:
+                       if (*p == ',') {
+                               p++;
+                               state = comp_state_read_comma;
+                       }
+                       else if (*p == ']') {
+                               state = comp_state_read_ebrace;
+                       }
+                       else {
+                               p++;
+                       }
+                       break;
+               case comp_state_read_comma:
+                       if (!g_ascii_isspace (*p)) {
+                               if (*p == '/') {
+                                       state = comp_state_read_regexp;
+                               }
+                               else if (*p == ']') {
+                                       state = comp_state_read_ebrace;
+                               }
+                               else {
+                                       state = comp_state_read_option;
+                               }
+                       }
+                       else {
+                               /* Skip spaces after comma */
+                               p++;
+                       }
+                       break;
+               case comp_state_read_ebrace:
+                       p++;
+                       state = comp_state_read_end;
+                       break;
+               case comp_state_read_end:
+                       g_assert_not_reached ();
+               }
+       }
+
+       /* Input ended inside the brackets (or was empty): report a syntax error */
+       if (state != comp_state_read_end) {
+               g_set_error(err, rspamd_composites_quark(), 100, "invalid composite: %s;"
+                                                                                                                "parser stopped in state %d",
+                               line, state);
+               return NULL;
+       }
+
+       /* Total atom length as measured by the first pass */
+       clen = p - line;
+       p = line;
+       state = comp_state_read_symbol;
+
+       atom = rspamd_mempool_alloc0 (pool, sizeof(*atom));
+       atom->comp_type = ATOM_UNKNOWN;
+       res = rspamd_mempool_alloc0 (pool, sizeof(*res));
+       res->len = clen;
+       res->str = line;
+
+       /* Full state machine to fill a composite atom */
+       const gchar *opt_start = NULL;
+
+       while (p < end) {
+               struct rspamd_composite_option_match *opt_match;
+
+               if (state == comp_state_read_end) {
+                       break;
+               }
+
+               switch (state) {
+               case comp_state_read_symbol:
+                       clen = rspamd_memcspn(p, "[; \t()><!|&\n", len);
+                       p += clen;
+
+                       if (*p == '[') {
+                               state = comp_state_read_obrace;
+                       }
+                       else {
+                               state = comp_state_read_end;
+                       }
+
+                       /* Copy the symbol name (clen bytes from the start of the line) */
+                       atom->symbol = rspamd_mempool_alloc (pool, clen + 1);
+                       rspamd_strlcpy(atom->symbol, line, clen + 1);
+
+                       break;
+               case comp_state_read_obrace:
+                       p++;
+
+                       if (*p == '/') {
+                               /* Keep the leading slash: the whole /re/flags text is compiled later */
+                               opt_start = p;
+                               p++; /* Starting slash */
+                               state = comp_state_read_regexp;
+                       }
+                       else {
+                               state = comp_state_read_option;
+                               opt_start = p;
+                       }
+
+                       break;
+               case comp_state_read_regexp:
+                       if (*p == '\\' && p + 1 < end) {
+                               /* Escaping */
+                               p++;
+                       }
+                       else if (*p == '/') {
+                               /* End of regexp, possible flags */
+                               state = comp_state_read_regexp_end;
+                       }
+                       p++;
+                       break;
+               case comp_state_read_option:
+                       if (*p == ',' || *p == ']') {
+                               opt_match = rspamd_mempool_alloc (pool, sizeof(*opt_match));
+                               /* Plain match */
+                               gchar *opt_buf;
+                               gint opt_len = p - opt_start;
+
+                               opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
+                               rspamd_strlcpy(opt_buf, opt_start, opt_len + 1);
+
+                               opt_match->data.match = opt_buf;
+                               opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN;
+
+                               DL_APPEND (atom->opts, opt_match);
+
+                               if (*p == ',') {
+                                       p++;
+                                       state = comp_state_read_comma;
+                               }
+                               else {
+                                       state = comp_state_read_ebrace;
+                               }
+                       }
+                       else {
+                               p++;
+                       }
+                       break;
+               case comp_state_read_regexp_end:
+                       if (*p == ',' || *p == ']') {
+                               opt_match = rspamd_mempool_alloc (pool, sizeof(*opt_match));
+                               /* Regexp match */
+                               gchar *opt_buf;
+                               gint opt_len = p - opt_start;
+
+                               opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
+                               rspamd_strlcpy(opt_buf, opt_start, opt_len + 1);
+
+                               rspamd_regexp_t *re;
+                               GError *re_err = NULL;
+
+                               re = rspamd_regexp_new(opt_buf, NULL, &re_err);
+
+                               if (re == NULL) {
+                                       /* A regexp that fails to compile is logged and dropped; parsing goes on */
+                                       msg_err_pool ("cannot create regexp from string %s: %e",
+                                                       opt_buf, re_err);
+
+                                       g_error_free(re_err);
+                               }
+                               else {
+                                       /* The pool unrefs the regexp when it is destroyed */
+                                       rspamd_mempool_add_destructor (pool,
+                                                       (rspamd_mempool_destruct_t) rspamd_regexp_unref,
+                                                       re);
+                                       opt_match->data.re = re;
+                                       opt_match->type = RSPAMD_COMPOSITE_OPTION_RE;
+
+                                       DL_APPEND (atom->opts, opt_match);
+                               }
+
+                               if (*p == ',') {
+                                       p++;
+                                       state = comp_state_read_comma;
+                               }
+                               else {
+                                       state = comp_state_read_ebrace;
+                               }
+                       }
+                       else {
+                               p++;
+                       }
+                       break;
+               case comp_state_read_comma:
+                       if (!g_ascii_isspace (*p)) {
+                               if (*p == '/') {
+                                       state = comp_state_read_regexp;
+                                       opt_start = p;
+                               }
+                               else if (*p == ']') {
+                                       state = comp_state_read_ebrace;
+                               }
+                               else {
+                                       opt_start = p;
+                                       state = comp_state_read_option;
+                               }
+                       }
+                       else {
+                               /* Skip spaces after comma */
+                               p++;
+                       }
+                       break;
+               case comp_state_read_ebrace:
+                       p++;
+                       state = comp_state_read_end;
+                       break;
+               case comp_state_read_end:
+                       g_assert_not_reached ();
+               }
+       }
+
+       res->data = atom;
+
+       return res;
+}
+
+}
+
+/*
+ * Forward declaration: rspamd_composite_process_single_symbol calls this
+ * callback to evaluate a composite that another composite depends on.
+ * NOTE(review): it is declared after the closing brace of
+ * namespace rspamd::composites, whereas composites_data and
+ * rspamd_composite_atom are declared inside that namespace - confirm the
+ * intended scope.
+ */
+static void composites_foreach_callback (gpointer key, gpointer value, void *data);
+
+
+static gdouble
+rspamd_composite_process_single_symbol (struct composites_data *cd,
+                                                                               const gchar *sym,
+                                                                               struct rspamd_symbol_result **pms,
+                                                                               struct rspamd_composite_atom *atom)
+{
+       struct rspamd_symbol_result *ms = NULL;
+       gdouble rc = 0;
+       struct rspamd_task *task = cd->task;
+
+       if ((ms = rspamd_task_find_symbol_result (cd->task, sym, cd->metric_res)) == NULL) {
+               msg_debug_composites ("not found symbol %s in composite %s", sym,
+                               cd->composite->sym);
+
+               if (atom->comp_type == ATOM_UNKNOWN) {
+                       struct rspamd_composite *ncomp;
+
+                       if ((ncomp =
+                                                g_hash_table_lookup (cd->task->cfg->composite_symbols,
+                                                                sym)) != NULL) {
+                               atom->comp_type = ATOM_COMPOSITE;
+                               atom->ncomp = ncomp;
+                       }
+                       else {
+                               atom->comp_type = ATOM_PLAIN;
+                       }
+               }
+
+               if (atom->comp_type == ATOM_COMPOSITE) {
+                       msg_debug_composites ("symbol %s for composite %s is another composite",
+                                       sym, cd->composite->sym);
+
+                       if (isclr (cd->checked, atom->ncomp->id * 2)) {
+                               struct rspamd_composite *saved;
+
+                               msg_debug_composites ("composite dependency %s for %s is not checked",
+                                               sym, cd->composite->sym);
+                               /* Set checked for this symbol to avoid cyclic references */
+                               setbit (cd->checked, cd->composite->id * 2);
+                               saved = cd->composite; /* Save the current composite */
+                               composites_foreach_callback ((gpointer)atom->ncomp->sym, atom->ncomp, cd);
+
+                               /* Restore state */
+                               cd->composite = saved;
+                               clrbit (cd->checked, cd->composite->id * 2);
+
+                               ms = rspamd_task_find_symbol_result (cd->task, sym,
+                                               cd->metric_res);
+                       }
+                       else {
+                               /*
+                                * XXX: in case of cyclic references this would return 0
+                                */
+                               if (isset (cd->checked, atom->ncomp->id * 2 + 1)) {
+                                       ms = rspamd_task_find_symbol_result (cd->task, sym,
+                                                       cd->metric_res);
+                               }
+                       }
+               }
+       }
+
+       if (ms) {
+               msg_debug_composites ("found symbol %s in composite %s, weight: %.3f",
+                               sym, cd->composite->sym, ms->score);
+
+               /* Now check options */
+               struct rspamd_composite_option_match *cur_opt;
+
+               DL_FOREACH (atom->opts, cur_opt) {
+                       struct rspamd_symbol_option *opt;
+                       bool found = false;
+
+                       DL_FOREACH (ms->opts_head, opt) {
+                               if (cur_opt->type == RSPAMD_COMPOSITE_OPTION_PLAIN) {
+                                       gsize mlen = strlen (cur_opt->data.match);
+
+                                       if (opt->optlen == mlen &&
+                                               memcmp (opt->option, cur_opt->data.match, mlen) == 0) {
+
+                                               found = true;
+
+                                               break;
+                                       }
+                               }
+                               else {
+                                       if (rspamd_regexp_search (cur_opt->data.re,
+                                                       opt->option, opt->optlen, NULL, NULL, FALSE, NULL)) {
+                                               found = true;
+
+                                               break;
+                                       }
+                               }
+                       }
+
+
+                       if (!found) {
+                               if (cur_opt->type == RSPAMD_COMPOSITE_OPTION_PLAIN) {
+                                       msg_debug_composites ("symbol %s in composite %s misses required option %s",
+                                                       sym,
+                                                       cd->composite->sym,
+                                                       cur_opt->data.match);
+                               }
+                               else {
+                                       msg_debug_composites ("symbol %s in composite %s failed to match regexp %s",
+                                                       sym,
+                                                       cd->composite->sym,
+                                                       rspamd_regexp_get_pattern (cur_opt->data.re));
+                               }
+
+                               ms = NULL;
+
+                               break;
+                       }
+               }
+
+               if (ms) {
+                       if (ms->score == 0) {
+                               rc = 0.001; /* Distinguish from 0 */
+                       }
+                       else {
+                               rc = ms->score;
+                       }
+               }
+       }
+
+       *pms = ms;
+       return rc;
+}
+
+/*
+ * Record that the symbol `ms`, matched by `atom` of the composite that is
+ * currently being evaluated, is a candidate for removal. Nothing is removed
+ * here: entries are only collected in cd->symbols_to_remove (keyed by symbol
+ * name), because the outcome of the complete expression is not known yet.
+ *
+ * `beg` points to the raw atom text; its leading modifier characters adjust
+ * the action derived from the composite policy:
+ *   '~' - clear the "remove symbol" bit
+ *   '-' - clear both the "remove symbol" and the "remove weight" bits
+ *   '^' - set the "forced" bit
+ * Scanning stops at the first character that is not one of these modifiers.
+ */
+static void
+rspamd_composite_process_symbol_removal (rspamd_expression_atom_t *atom,
+               struct composites_data *cd,
+               struct rspamd_symbol_result *ms,
+               const gchar *beg)
+{
+       struct symbol_remove_data *chain, *entry;
+       struct rspamd_task *task = cd->task;
+       const gchar *p;
+       gboolean first_entry;
+
+       if (ms == NULL) {
+               return;
+       }
+
+       entry = (struct symbol_remove_data *)rspamd_mempool_alloc (task->task_pool,
+                       sizeof (*entry));
+       entry->sym = ms->name;
+       entry->comp = cd->composite;
+       entry->parent = atom->parent;
+
+       /* Initial action comes from the composite policy; unknown values remove all */
+       if (cd->composite->policy == RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL) {
+               entry->action = RSPAMD_COMPOSITE_REMOVE_SYMBOL;
+       }
+       else if (cd->composite->policy == RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT) {
+               entry->action = RSPAMD_COMPOSITE_REMOVE_WEIGHT;
+       }
+       else if (cd->composite->policy == RSPAMD_COMPOSITE_POLICY_LEAVE) {
+               entry->action = 0;
+       }
+       else {
+               entry->action = (RSPAMD_COMPOSITE_REMOVE_SYMBOL|RSPAMD_COMPOSITE_REMOVE_WEIGHT);
+       }
+
+       /* Per-atom modifiers override the policy */
+       for (p = beg; *p == '~' || *p == '-' || *p == '^'; p ++) {
+               switch (*p) {
+               case '~':
+                       entry->action &= ~RSPAMD_COMPOSITE_REMOVE_SYMBOL;
+                       break;
+               case '-':
+                       entry->action &= ~(RSPAMD_COMPOSITE_REMOVE_WEIGHT|
+                                       RSPAMD_COMPOSITE_REMOVE_SYMBOL);
+                       break;
+               default:
+                       /* Only '^' can reach this point */
+                       entry->action |= RSPAMD_COMPOSITE_REMOVE_FORCED;
+                       break;
+               }
+       }
+
+       /*
+        * All requests for the same symbol share one list; the hash table stores
+        * the list head, so it is inserted only when the list is created.
+        */
+       chain = (struct symbol_remove_data *)g_hash_table_lookup (cd->symbols_to_remove,
+                       ms->name);
+       first_entry = (chain == NULL);
+       DL_APPEND (chain, entry);
+
+       if (first_entry) {
+               g_hash_table_insert (cd->symbols_to_remove, (gpointer)ms->name, chain);
+               msg_debug_composites ("%s: added symbol %s to removal: %d policy, from composite %s",
+                               cd->metric_res->name,
+                               ms->name, entry->action,
+                               cd->composite->sym);
+       }
+       else {
+               msg_debug_composites ("%s: append symbol %s to removal: %d policy, from composite %s",
+                               cd->metric_res->name,
+                               ms->name, entry->action,
+                               cd->composite->sym);
+       }
+}
+
+/*
+ * Evaluate all symbols of the group `group_name` for the given atom.
+ *
+ * `sign` selects which group members are considered, based on the score
+ * configured for the symbol: > 0 - only symbols with a positive score,
+ * < 0 - only symbols with a negative score, 0 - every symbol.
+ *
+ * Each member that evaluates to non-zero is registered for removal.
+ * Returns the largest absolute value among the members' results, or 0 when
+ * the group does not exist or nothing matched.
+ */
+static gdouble
+rspamd_composite_process_group (struct composites_data *cd,
+               rspamd_expression_atom_t *atom,
+               struct rspamd_composite_atom *comp_atom,
+               const gchar *group_name,
+               gint sign)
+{
+       struct rspamd_symbols_group *gr;
+       struct rspamd_symbol_result *ms = NULL;
+       GHashTableIter it;
+       gpointer k, v;
+       gdouble max = 0;
+
+       gr = (struct rspamd_symbols_group *)g_hash_table_lookup (cd->task->cfg->groups,
+                       group_name);
+
+       if (gr == NULL) {
+               return 0;
+       }
+
+       g_hash_table_iter_init (&it, gr->symbols);
+
+       while (g_hash_table_iter_next (&it, &k, &v)) {
+               struct rspamd_symbol *sdef = (struct rspamd_symbol *)v;
+               gdouble rc;
+
+               if ((sign > 0 && !(sdef->score > 0)) ||
+                               (sign < 0 && !(sdef->score < 0))) {
+                       continue;
+               }
+
+               rc = rspamd_composite_process_single_symbol (cd, sdef->name, &ms,
+                               comp_atom);
+
+               if (rc) {
+                       rspamd_composite_process_symbol_removal (atom,
+                                       cd,
+                                       ms,
+                                       comp_atom->symbol);
+
+                       if (fabs (rc) > max) {
+                               max = fabs (rc);
+                       }
+               }
+       }
+
+       return max;
+}
+
+/*
+ * Expression callback: evaluate a single atom of a composite expression.
+ *
+ * @param ud   struct composites_data describing the current evaluation
+ * @param atom expression atom; atom->data is a struct rspamd_composite_atom
+ * @return 0 when the atom did not match, a non-zero value otherwise
+ *
+ * The atom symbol is first stripped of its leading non-alphanumeric
+ * characters (the removal modifiers). An atom of the form "g:<name>",
+ * "g+:<name>" or "g-:<name>" refers to a whole symbols group (all members,
+ * positive-score members or negative-score members respectively); anything
+ * else is treated as a single symbol.
+ */
+static gdouble
+rspamd_composite_expr_process (void *ud,
+               rspamd_expression_atom_t *atom)
+{
+       static const double epsilon = 0.00001;
+       struct composites_data *cd = (struct composites_data *)ud;
+       struct rspamd_composite_atom *comp_atom = (struct rspamd_composite_atom *)atom->data;
+       struct rspamd_symbol_result *ms = NULL;
+       /* Not referenced directly; presumably required by msg_debug_composites */
+       struct rspamd_task *task = cd->task;
+       const gchar *sym = comp_atom->symbol;
+       guint slen = strlen (sym);
+       gdouble rc = 0;
+
+       /*
+        * Normalise the symbol name before anything else: previously the
+        * "already checked" branch below performed its lookup while `sym`
+        * was still NULL.
+        */
+       while (*sym != '\0' && !g_ascii_isalnum (*sym)) {
+               sym ++;
+               slen --;
+       }
+
+       if (isset (cd->checked, cd->composite->id * 2)) {
+               /* We have already checked this composite, so just return its value */
+               if (isset (cd->checked, cd->composite->id * 2 + 1)) {
+                       ms = rspamd_task_find_symbol_result (cd->task, sym, cd->metric_res);
+               }
+
+               if (ms) {
+                       if (ms->score == 0) {
+                               rc = epsilon; /* Distinguish from 0 */
+                       }
+                       else {
+                               /* Treat negative and positive scores equally... */
+                               rc = fabs (ms->score);
+                       }
+               }
+
+               msg_debug_composites ("composite %s is already checked, result: %.2f",
+                               cd->composite->sym, rc);
+
+               return rc;
+       }
+
+       /* slen > 2 guarantees that the 3-byte prefix comparisons stay in bounds */
+       if (slen > 2 && G_UNLIKELY (memcmp (sym, "g:", 2) == 0)) {
+               rc = rspamd_composite_process_group (cd, atom, comp_atom, sym + 2, 0);
+       }
+       else if (slen > 2 && G_UNLIKELY (memcmp (sym, "g+:", 3) == 0)) {
+               /* Group, positive symbols only */
+               rc = rspamd_composite_process_group (cd, atom, comp_atom, sym + 3, 1);
+       }
+       else if (slen > 2 && G_UNLIKELY (memcmp (sym, "g-:", 3) == 0)) {
+               /* Group, negative symbols only */
+               rc = rspamd_composite_process_group (cd, atom, comp_atom, sym + 3, -1);
+       }
+       else {
+               rc = rspamd_composite_process_single_symbol (cd, sym, &ms, comp_atom);
+
+               if (rc) {
+                       rspamd_composite_process_symbol_removal (atom,
+                                       cd,
+                                       ms,
+                                       comp_atom->symbol);
+               }
+       }
+
+       msg_debug_composites ("%s: final result for composite %s is %.2f",
+                       cd->metric_res->name,
+                       cd->composite->sym, rc);
+
+       return rc;
+}
+
+/*
+ * Priority callback for composite atoms: every atom gets the same
+ * priority (0), i.e. no atom is preferred over another.
+ */
+static gint
+rspamd_composite_expr_priority (rspamd_expression_atom_t *atom)
+{
+       (void)atom;
+
+       return 0;
+}
+
+static void
+rspamd_composite_expr_destroy (rspamd_expression_atom_t *atom)
+{
+       /* Intentionally empty: composite atoms are released together with the pool */
+       (void)atom;
+}
+
+
+/*
+ * Evaluate one composite for the scan result described by `data`.
+ *
+ * @param key   composite symbol name
+ * @param value struct rspamd_composite to evaluate
+ * @param data  struct composites_data with the evaluation state
+ *
+ * cd->checked holds two bits per composite: bit (id * 2) means "already
+ * processed", bit (id * 2 + 1) means "evaluated to non-zero". When the
+ * expression yields a non-zero value, the composite symbol is inserted into
+ * the result with weight 1.0.
+ */
+static void
+composites_foreach_callback (gpointer key, gpointer value, void *data)
+{
+       struct composites_data *cd = (struct composites_data *)data;
+       struct rspamd_composite *comp = (struct rspamd_composite *)value;
+       /* Not always referenced directly; presumably required by msg_debug_composites */
+       struct rspamd_task *task = cd->task;
+       gdouble rc;
+
+       cd->composite = comp;
+
+       if (isset (cd->checked, comp->id * 2)) {
+               /* Nothing to do, this composite has been processed already */
+               return;
+       }
+
+       if (rspamd_symcache_is_checked (task, task->cfg->cache, key)) {
+               msg_debug_composites ("composite %s is checked in symcache but not "
+                               "in composites bitfield", comp->sym);
+               setbit (cd->checked, comp->id * 2);
+               clrbit (cd->checked, comp->id * 2 + 1);
+
+               return;
+       }
+
+       if (rspamd_task_find_symbol_result (task, key, cd->metric_res) != NULL) {
+               /*
+                * The symbol has been inserted already, skip the evaluation.
+                * NOTE(review): the result bit is cleared here although the symbol
+                * is present in the result - confirm that this is intended.
+                */
+               msg_debug_composites ("composite %s is already in metric "
+                               "in composites bitfield", comp->sym);
+               setbit (cd->checked, comp->id * 2);
+               clrbit (cd->checked, comp->id * 2 + 1);
+
+               return;
+       }
+
+       rc = rspamd_process_expression (comp->expr, RSPAMD_EXPRESSION_FLAG_NOOPT,
+                       cd);
+
+       /* Processed, regardless of the outcome */
+       setbit (cd->checked, comp->id * 2);
+
+       if (rc == 0) {
+               clrbit (cd->checked, comp->id * 2 + 1);
+
+               return;
+       }
+
+       /* Matched: remember the outcome and publish the composite symbol */
+       setbit (cd->checked, comp->id * 2 + 1);
+       rspamd_task_insert_result_full (task, key, 1.0, NULL,
+                       RSPAMD_SYMBOL_INSERT_SINGLE, cd->metric_res);
+}
+
+
+/*
+ * Returns TRUE if `node` or any of its ancestors in the expression tree is
+ * a negation operator.
+ */
+static gboolean
+composites_node_is_negated (GNode *node)
+{
+       for (; node != NULL; node = node->parent) {
+               if (rspamd_expression_node_is_op (node, OP_NOT)) {
+                       return TRUE;
+               }
+       }
+
+       return FALSE;
+}
+
+/*
+ * Hash table walker that applies the collected removal requests for a
+ * single symbol.
+ *
+ * @param key   symbol name (used for logging only)
+ * @param value head of the struct symbol_remove_data list for this symbol
+ * @param data  struct composites_data of the scan result being processed
+ *
+ * Only requests that come from composites which have matched and that are
+ * not located under a negation are taken into account. The score is removed
+ * unless some request wants to keep it, and the symbol is marked as ignored
+ * unless some request wants to keep it; a forced request overrides both and
+ * stops further requests from changing the decision.
+ */
+static void
+composites_remove_symbols (gpointer key, gpointer value, gpointer data)
+{
+       struct composites_data *cd = (struct composites_data *)data;
+       struct symbol_remove_data *rd = (struct symbol_remove_data *)value, *cur;
+       /* Presumably required by msg_debug_composites */
+       struct rspamd_task *task = cd->task;
+       struct rspamd_symbol_result *ms;
+       gboolean seen_valid = FALSE,
+                       drop_score = TRUE,
+                       drop_symbol = TRUE,
+                       forced = FALSE;
+       const gchar *score_reason = "no policy",
+               *symbol_reason = "no policy";
+
+       DL_FOREACH (rd, cur) {
+               if (!isset (cd->checked, cur->comp->id * 2 + 1)) {
+                       /* The requesting composite has not matched */
+                       continue;
+               }
+
+               /*
+                * Exclude all elements that have a negation among their parents:
+                * for `!A || B` both `!A` and `B` can match, but the symbol must
+                * *NOT* be removed in that case
+                */
+               if (composites_node_is_negated (cur->parent)) {
+                       continue;
+               }
+
+               seen_valid = TRUE;
+
+               if (forced) {
+                       /* A forced request has been seen, the decision is final */
+                       continue;
+               }
+
+               /*
+                * - if no composite wants to keep the symbol then it is removed
+                * - if no composite wants to keep the score then it is removed
+                */
+               if ((cur->action & RSPAMD_COMPOSITE_REMOVE_SYMBOL) == 0) {
+                       drop_symbol = FALSE;
+                       symbol_reason = cur->comp->sym;
+               }
+
+               if ((cur->action & RSPAMD_COMPOSITE_REMOVE_WEIGHT) == 0) {
+                       drop_score = FALSE;
+                       score_reason = cur->comp->sym;
+               }
+
+               if ((cur->action & RSPAMD_COMPOSITE_REMOVE_FORCED) != 0) {
+                       forced = TRUE;
+                       symbol_reason = cur->comp->sym;
+                       score_reason = cur->comp->sym;
+               }
+       }
+
+       ms = rspamd_task_find_symbol_result (task, rd->sym, cd->metric_res);
+
+       if (!seen_valid || ms == NULL || (ms->flags & RSPAMD_SYMBOL_RESULT_IGNORED)) {
+               return;
+       }
+
+       if (drop_score || forced) {
+               msg_debug_composites ("%s: %s remove symbol weight for %s (was %.2f), "
+                               "score removal affected by %s, symbol removal affected by %s",
+                               cd->metric_res->name,
+                               (forced ? "forced" : "normal"), key, ms->score,
+                               score_reason, symbol_reason);
+               cd->metric_res->score -= ms->score;
+               ms->score = 0.0;
+       }
+
+       if (drop_symbol || forced) {
+               ms->flags |= RSPAMD_SYMBOL_RESULT_IGNORED;
+               msg_debug_composites ("%s: %s remove symbol %s (score %.2f), "
+                               "score removal affected by %s, symbol removal affected by %s",
+                               cd->metric_res->name,
+                               (forced ? "forced" : "normal"), key, ms->score,
+                               score_reason, symbol_reason);
+       }
+}
+
+/*
+ * Process composites for every scan result of the task.
+ *
+ * The work is done in two passes: first all composites are evaluated for
+ * each scan result (collecting the removal requests), and only afterwards
+ * the collected requests are applied, in the same order in which the scan
+ * results were visited.
+ */
+static void
+composites_metric_callback (struct rspamd_task *task)
+{
+       struct composites_data *pending = NULL, *state;
+       struct rspamd_scan_result *res;
+
+       DL_FOREACH (task->result, res) {
+               /* Two bits per composite, see composites_foreach_callback */
+               guint nbits = g_hash_table_size (task->cfg->composite_symbols) * 2;
+
+               state = (struct composites_data *)rspamd_mempool_alloc (task->task_pool,
+                               sizeof (*state));
+               state->task = task;
+               state->metric_res = res;
+               state->symbols_to_remove = g_hash_table_new (rspamd_str_hash,
+                               rspamd_str_equal);
+               state->checked = rspamd_mempool_alloc0 (task->task_pool, NBYTES (nbits));
+
+               /* Evaluate all composites against this scan result */
+               rspamd_symcache_composites_foreach (task,
+                               task->cfg->cache,
+                               composites_foreach_callback,
+                               state);
+               LL_PREPEND (pending, state);
+       }
+
+       /* Entries were prepended, restore the original order */
+       LL_REVERSE (pending);
+
+       LL_FOREACH (pending, state) {
+               /* Apply the removal requests, then drop the table that held them */
+               g_hash_table_foreach (state->symbols_to_remove,
+                               composites_remove_symbols, state);
+               g_hash_table_unref (state->symbols_to_remove);
+       }
+}
+
+/*
+ * Public entry point: evaluate composites for a task. Does nothing when the
+ * task has no scan result or is marked as skipped.
+ */
+void
+rspamd_composites_process_task (struct rspamd_task *task)
+{
+       if (task->result == NULL || RSPAMD_TASK_IS_SKIPPED (task)) {
+               return;
+       }
+
+       composites_metric_callback (task);
+}
+
+
+/*
+ * Convert a textual policy name to enum rspamd_composite_policy.
+ *
+ * Accepted names (case sensitive):
+ *   "remove", "remove_all", "default" -> RSPAMD_COMPOSITE_POLICY_REMOVE_ALL
+ *   "remove_symbol"                   -> RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL
+ *   "remove_weight"                   -> RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT
+ *   "leave", "remove_none"            -> RSPAMD_COMPOSITE_POLICY_LEAVE
+ *
+ * @param string policy name, may be NULL
+ * @return the matching policy or RSPAMD_COMPOSITE_POLICY_UNKNOWN for NULL
+ * and for any name that is not listed above
+ */
+enum rspamd_composite_policy
+rspamd_composite_policy_from_str (const gchar *string)
+{
+       static const struct {
+               const gchar *name;
+               enum rspamd_composite_policy policy;
+       } known_policies[] = {
+               {"remove", RSPAMD_COMPOSITE_POLICY_REMOVE_ALL},
+               {"remove_all", RSPAMD_COMPOSITE_POLICY_REMOVE_ALL},
+               {"default", RSPAMD_COMPOSITE_POLICY_REMOVE_ALL},
+               {"remove_symbol", RSPAMD_COMPOSITE_POLICY_REMOVE_SYMBOL},
+               {"remove_weight", RSPAMD_COMPOSITE_POLICY_REMOVE_WEIGHT},
+               {"leave", RSPAMD_COMPOSITE_POLICY_LEAVE},
+               {"remove_none", RSPAMD_COMPOSITE_POLICY_LEAVE},
+       };
+       gsize i;
+
+       if (string == NULL) {
+               /* strcmp on a NULL pointer is undefined behaviour */
+               return RSPAMD_COMPOSITE_POLICY_UNKNOWN;
+       }
+
+       for (i = 0; i < sizeof (known_policies) / sizeof (known_policies[0]); i ++) {
+               if (strcmp (string, known_policies[i].name) == 0) {
+                       return known_policies[i].policy;
+               }
+       }
+
+       return RSPAMD_COMPOSITE_POLICY_UNKNOWN;
+}
diff --git a/src/libserver/composites/composites.h b/src/libserver/composites/composites.h
new file mode 100644 (file)
index 0000000..d39863b
--- /dev/null
@@ -0,0 +1,44 @@
+/*-
+ * Copyright 2016 Vsevolod Stakhov
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef SRC_LIBSERVER_COMPOSITES_H_
+#define SRC_LIBSERVER_COMPOSITES_H_
+
+#include "config.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+struct rspamd_task;
+
+/**
+ * Subr for composite expressions
+ */
+extern const struct rspamd_atom_subr composite_expr_subr;
+
+/**
+ * Process all results and form composite metrics from the existing metrics, as defined in the config
+ * @param task worker's task that represents the message received from the user
+ */
+void rspamd_composites_process_task (struct rspamd_task *task);
+
+enum rspamd_composite_policy rspamd_composite_policy_from_str (const gchar *string);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif /* SRC_LIBSERVER_COMPOSITES_H_ */
index aae374c2130b6036b325b044f3572a799df648b0..c9f3fb627a6b5324afbaafe7da887c5a39cde01c 100644 (file)
@@ -21,7 +21,7 @@
 #include "message.h"
 #include "lua/lua_common.h"
 #include "email_addr.h"
-#include "composites.h"
+#include "src/libserver/composites/composites.h"
 #include "stat_api.h"
 #include "unix-std.h"
 #include "utlist.h"
index 68acdd36834fca8299870487a7b5f557de14e9c6..af8964b32c772b939804c2ea118f21d00132decd 100644 (file)
@@ -15,7 +15,7 @@
  */
 #include "lua_common.h"
 #include "expression.h"
-#include "composites.h"
+#include "src/libserver/composites/composites.h"
 
 #ifdef HAVE_SYS_UTSNAME_H
 #endif
index 2631f1cecdd77e5b76e79c2ceda3979b62319437..c2f5efb42e5f6defa16b59758dde21488ffe114b 100644 (file)
@@ -16,7 +16,7 @@
 #include "lua_common.h"
 #include "libmime/message.h"
 #include "libutil/expression.h"
-#include "libserver/composites.h"
+#include "src/libserver/composites/composites.h"
 #include "libserver/cfg_file_private.h"
 #include "libmime/lang_detection.h"
 #include "lua/lua_map.h"