From: Vsevolod Stakhov Date: Wed, 4 Nov 2020 12:17:59 +0000 (+0000) Subject: [Feature] Composites: Improve composite atoms parser X-Git-Tag: 2.7~175 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=788ca58f26cee86c985ef45a53b78d81cb13fcdc;p=rspamd.git [Feature] Composites: Improve composite atoms parser --- diff --git a/src/libserver/composites.c b/src/libserver/composites.c index c1ee471f8..9f7f548ab 100644 --- a/src/libserver/composites.c +++ b/src/libserver/composites.c @@ -108,82 +108,267 @@ static rspamd_expression_atom_t * rspamd_composite_expr_parse (const gchar *line, gsize len, rspamd_mempool_t *pool, gpointer ud, GError **err) { - gsize clen; + gsize clen = 0; rspamd_expression_atom_t *res; struct rspamd_composite_atom *atom; + const gchar *p, *end; + enum composite_expr_state { + comp_state_read_symbol = 0, + comp_state_read_obrace, + comp_state_read_option, + comp_state_read_regexp, + comp_state_read_regexp_end, + comp_state_read_comma, + comp_state_read_ebrace, + comp_state_read_end + } state = comp_state_read_symbol; + + end = line + len; + p = line; + + /* Find length of the atom using a reduced state machine */ + while (p < end) { + if (state == comp_state_read_end) { + break; + } - /* - * Composites are just sequences of symbols - */ - clen = strcspn (line, "; \t()>len = clen; res->str = line; - atom = rspamd_mempool_alloc0 (pool, sizeof (*atom)); + /* Full state machine to fill a composite atom */ + const gchar *opt_start = NULL; + + while (p < end) { + struct rspamd_composite_option_match *opt_match; + + if (state == comp_state_read_end) { + break; + } + + switch (state) { + case comp_state_read_symbol: + clen = rspamd_memcspn (p, "[; \t()>symbol = rspamd_mempool_alloc (pool, clen + 1); + rspamd_strlcpy (atom->symbol, line, clen + 1); + + break; + case comp_state_read_obrace: + p ++; - /* Now check for options combinations */ - const gchar *obrace, *ebrace; + if (*p == '/') { + opt_start = p; + p ++; /* Starting slash */ + state = comp_state_read_regexp; + } + else { + state = comp_state_read_option; + opt_start = p; + } + + break; + case comp_state_read_regexp: + if (*p == '\\' && p + 1 < end) { + /* Escaping */ + p ++; + } + else if (*p == '/') { + /* End of regexp, possible flags */ + state = comp_state_read_regexp_end; + } + p ++; + break; + case comp_state_read_option: + if (*p == ',' || *p == ']') { + opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match)); + /* Plain match */ + gchar *opt_buf; + gint opt_len = p - opt_start; - if ((obrace = memchr (line, '[', clen)) != NULL && obrace > line) { - atom->symbol = rspamd_mempool_alloc (pool, obrace - line + 1); - rspamd_strlcpy (atom->symbol, line, obrace - line + 1); - ebrace = memchr (line, ']', clen); + opt_buf = rspamd_mempool_alloc (pool, opt_len + 1); + rspamd_strlcpy (opt_buf, opt_start, opt_len + 1); - if (ebrace != NULL && ebrace > obrace) { - /* We can make a list of options */ - gchar **opts = rspamd_string_len_split (obrace + 1, - ebrace - obrace - 1, ",", -1, pool); + opt_match->data.match = opt_buf; + opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN; - for (guint i = 0; opts[i] != NULL; i ++) { - struct rspamd_composite_option_match *opt_match; + DL_APPEND (atom->opts, opt_match); + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else { + state = comp_state_read_ebrace; + } + } + else { + p ++; + } + break; + case comp_state_read_regexp_end: + if (*p == ',' || *p == ']') { opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match)); + /* Plain match */ + gchar *opt_buf; + gint opt_len = p - opt_start; - if (opts[i][0] == '/' && strchr (opts[i] + 1, '/') != NULL) { - /* Regexp */ - rspamd_regexp_t *re; - GError *re_err = NULL; + opt_buf = rspamd_mempool_alloc (pool, opt_len + 1); + rspamd_strlcpy (opt_buf, opt_start, opt_len + 1); - re = rspamd_regexp_new (opts[i], NULL, &re_err); + rspamd_regexp_t *re; + GError *re_err = NULL; - if (re == NULL) { - msg_err_pool ("cannot create regexp from string %s: %e", - opts[i], re_err); + re = rspamd_regexp_new (opt_buf, NULL, &re_err); - g_error_free (re_err); - } - else { - rspamd_mempool_add_destructor (pool, - (rspamd_mempool_destruct_t)rspamd_regexp_unref, - re); - opt_match->data.re = re; - opt_match->type = RSPAMD_COMPOSITE_OPTION_RE; - - DL_APPEND (atom->opts, opt_match); - } + if (re == NULL) { + msg_err_pool ("cannot create regexp from string %s: %e", + opt_buf, re_err); + + g_error_free (re_err); } else { - /* Plain match */ - opt_match->data.match = opts[i]; - opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN; + rspamd_mempool_add_destructor (pool, + (rspamd_mempool_destruct_t)rspamd_regexp_unref, + re); + opt_match->data.re = re; + opt_match->type = RSPAMD_COMPOSITE_OPTION_RE; DL_APPEND (atom->opts, opt_match); } + + if (*p == ',') { + p++; + state = comp_state_read_comma; + } + else { + state = comp_state_read_ebrace; + } + } + else { + p ++; } + break; + case comp_state_read_comma: + if (!g_ascii_isspace (*p)) { + if (*p == '/') { + state = comp_state_read_regexp; + opt_start = p; + } + else if (*p == ']') { + state = comp_state_read_ebrace; + } + else { + opt_start = p; + state = comp_state_read_option; + } + } + else { + /* Skip spaces after comma */ + p ++; + } + break; + case comp_state_read_ebrace: + p ++; + state = comp_state_read_end; + break; + case comp_state_read_end: + g_assert_not_reached (); } } - else { - atom->symbol = rspamd_mempool_alloc (pool, clen + 1); - rspamd_strlcpy (atom->symbol, line, clen + 1); - } res->data = atom;