]> source.dussan.org Git - rspamd.git/commitdiff
[Feature] Composites: Improve composite atoms parser
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 4 Nov 2020 12:17:59 +0000 (12:17 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 4 Nov 2020 12:17:59 +0000 (12:17 +0000)
src/libserver/composites.c

index c1ee471f8b9dfafd53228a8b6282805a27207e83..9f7f548ab6a4d5c73804912bde95580d07fe73ce 100644 (file)
@@ -108,82 +108,267 @@ static rspamd_expression_atom_t *
 rspamd_composite_expr_parse (const gchar *line, gsize len,
                rspamd_mempool_t *pool, gpointer ud, GError **err)
 {
-       gsize clen;
+       gsize clen = 0;
        rspamd_expression_atom_t *res;
        struct rspamd_composite_atom *atom;
+       const gchar *p, *end;
+       enum composite_expr_state {
+               comp_state_read_symbol = 0,
+               comp_state_read_obrace,
+               comp_state_read_option,
+               comp_state_read_regexp,
+               comp_state_read_regexp_end,
+               comp_state_read_comma,
+               comp_state_read_ebrace,
+               comp_state_read_end
+       } state = comp_state_read_symbol;
+
+       end = line + len;
+       p = line;
+
+       /* Find length of the atom using a reduced state machine */
+       while (p < end) {
+               if (state == comp_state_read_end) {
+                       break;
+               }
 
-       /*
-        * Composites are just sequences of symbols
-        */
-       clen = strcspn (line, "; \t()><!|&\n");
-       if (clen == 0) {
-               /* Invalid composite atom */
-               g_set_error (err, rspamd_composites_quark (), 100, "Invalid composite: %s",
-                               line);
+               switch (state) {
+               case comp_state_read_symbol:
+                       clen = rspamd_memcspn (p, "[; \t()><!|&\n", len);
+                       p += clen;
+
+                       if (*p == '[') {
+                               state = comp_state_read_obrace;
+                       }
+                       else {
+                               state = comp_state_read_end;
+                       }
+                       break;
+               case comp_state_read_obrace:
+                       p ++;
+
+                       if (*p == '/') {
+                               p ++;
+                               state = comp_state_read_regexp;
+                       }
+                       else {
+                               state = comp_state_read_option;
+                       }
+                       break;
+               case comp_state_read_regexp:
+                       if (*p == '\\' && p + 1 < end) {
+                               /* Escaping */
+                               p ++;
+                       }
+                       else if (*p == '/') {
+                               /* End of regexp, possible flags */
+                               state = comp_state_read_regexp_end;
+                       }
+                       p ++;
+                       break;
+               case comp_state_read_option:
+               case comp_state_read_regexp_end:
+                       if (*p == ',') {
+                               p ++;
+                               state = comp_state_read_comma;
+                       }
+                       else if (*p == ']') {
+                               state = comp_state_read_ebrace;
+                       }
+                       else {
+                               p ++;
+                       }
+                       break;
+               case comp_state_read_comma:
+                       if (!g_ascii_isspace (*p)) {
+                               if (*p == '/') {
+                                       state = comp_state_read_regexp;
+                               }
+                               else if (*p == ']') {
+                                       state = comp_state_read_ebrace;
+                               }
+                               else {
+                                       state = comp_state_read_option;
+                               }
+                       }
+                       else {
+                               /* Skip spaces after comma */
+                               p ++;
+                       }
+                       break;
+               case comp_state_read_ebrace:
+                       p ++;
+                       state = comp_state_read_end;
+                       break;
+               case comp_state_read_end:
+                       g_assert_not_reached ();
+               }
+       }
+
+       if (state != comp_state_read_end) {
+               g_set_error (err, rspamd_composites_quark (), 100, "invalid composite: %s;"
+                                                                                                                  "parser stopped in state %d",
+                               line, state);
                return NULL;
        }
 
+       clen = p - line;
+       p = line;
+       state = comp_state_read_symbol;
+
+       atom = rspamd_mempool_alloc0 (pool, sizeof (*atom));
        res = rspamd_mempool_alloc0 (pool, sizeof (*res));
        res->len = clen;
        res->str = line;
 
-       atom = rspamd_mempool_alloc0 (pool, sizeof (*atom));
+       /* Full state machine to fill a composite atom */
+       const gchar *opt_start = NULL;
+
+       while (p < end) {
+               struct rspamd_composite_option_match *opt_match;
+
+               if (state == comp_state_read_end) {
+                       break;
+               }
+
+               switch (state) {
+               case comp_state_read_symbol:
+                       clen = rspamd_memcspn (p, "[; \t()><!|&\n", len);
+                       p += clen;
+
+                       if (*p == '[') {
+                               state = comp_state_read_obrace;
+                       }
+                       else {
+                               state = comp_state_read_end;
+                       }
+
+                       atom->symbol = rspamd_mempool_alloc (pool, clen + 1);
+                       rspamd_strlcpy (atom->symbol, line, clen + 1);
+
+                       break;
+               case comp_state_read_obrace:
+                       p ++;
 
-       /* Now check for options combinations */
-       const gchar *obrace, *ebrace;
+                       if (*p == '/') {
+                               opt_start = p;
+                               p ++; /* Starting slash */
+                               state = comp_state_read_regexp;
+                       }
+                       else {
+                               state = comp_state_read_option;
+                               opt_start = p;
+                       }
+
+                       break;
+               case comp_state_read_regexp:
+                       if (*p == '\\' && p + 1 < end) {
+                               /* Escaping */
+                               p ++;
+                       }
+                       else if (*p == '/') {
+                               /* End of regexp, possible flags */
+                               state = comp_state_read_regexp_end;
+                       }
+                       p ++;
+                       break;
+               case comp_state_read_option:
+                       if (*p == ',' || *p == ']') {
+                               opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match));
+                               /* Plain match */
+                               gchar *opt_buf;
+                               gint opt_len = p - opt_start;
 
-       if ((obrace = memchr (line, '[', clen)) != NULL && obrace > line) {
-               atom->symbol = rspamd_mempool_alloc (pool, obrace - line + 1);
-               rspamd_strlcpy (atom->symbol, line, obrace - line + 1);
-               ebrace = memchr (line, ']', clen);
+                               opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
+                               rspamd_strlcpy (opt_buf, opt_start, opt_len + 1);
 
-               if (ebrace != NULL && ebrace > obrace) {
-                       /* We can make a list of options */
-                       gchar **opts = rspamd_string_len_split (obrace + 1,
-                                       ebrace - obrace - 1, ",", -1, pool);
+                               opt_match->data.match = opt_buf;
+                               opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN;
 
-                       for (guint i = 0; opts[i] != NULL; i ++) {
-                               struct rspamd_composite_option_match *opt_match;
+                               DL_APPEND (atom->opts, opt_match);
 
+                               if (*p == ',') {
+                                       p++;
+                                       state = comp_state_read_comma;
+                               }
+                               else {
+                                       state = comp_state_read_ebrace;
+                               }
+                       }
+                       else {
+                               p ++;
+                       }
+                       break;
+               case comp_state_read_regexp_end:
+                       if (*p == ',' || *p == ']') {
                                opt_match = rspamd_mempool_alloc (pool, sizeof (*opt_match));
+                               /* Plain match */
+                               gchar *opt_buf;
+                               gint opt_len = p - opt_start;
 
-                               if (opts[i][0] == '/' && strchr (opts[i] + 1, '/') != NULL) {
-                                       /* Regexp */
-                                       rspamd_regexp_t *re;
-                                       GError *re_err = NULL;
+                               opt_buf = rspamd_mempool_alloc (pool, opt_len + 1);
+                               rspamd_strlcpy (opt_buf, opt_start, opt_len + 1);
 
-                                       re = rspamd_regexp_new (opts[i], NULL, &re_err);
+                               rspamd_regexp_t *re;
+                               GError *re_err = NULL;
 
-                                       if (re == NULL) {
-                                               msg_err_pool ("cannot create regexp from string %s: %e",
-                                                               opts[i], re_err);
+                               re = rspamd_regexp_new (opt_buf, NULL, &re_err);
 
-                                               g_error_free (re_err);
-                                       }
-                                       else {
-                                               rspamd_mempool_add_destructor (pool,
-                                                               (rspamd_mempool_destruct_t)rspamd_regexp_unref,
-                                                               re);
-                                               opt_match->data.re = re;
-                                               opt_match->type = RSPAMD_COMPOSITE_OPTION_RE;
-
-                                               DL_APPEND (atom->opts, opt_match);
-                                       }
+                               if (re == NULL) {
+                                       msg_err_pool ("cannot create regexp from string %s: %e",
+                                                       opt_buf, re_err);
+
+                                       g_error_free (re_err);
                                }
                                else {
-                                       /* Plain match */
-                                       opt_match->data.match = opts[i];
-                                       opt_match->type = RSPAMD_COMPOSITE_OPTION_PLAIN;
+                                       rspamd_mempool_add_destructor (pool,
+                                                       (rspamd_mempool_destruct_t)rspamd_regexp_unref,
+                                                       re);
+                                       opt_match->data.re = re;
+                                       opt_match->type = RSPAMD_COMPOSITE_OPTION_RE;
 
                                        DL_APPEND (atom->opts, opt_match);
                                }
+
+                               if (*p == ',') {
+                                       p++;
+                                       state = comp_state_read_comma;
+                               }
+                               else {
+                                       state = comp_state_read_ebrace;
+                               }
+                       }
+                       else {
+                               p ++;
                        }
+                       break;
+               case comp_state_read_comma:
+                       if (!g_ascii_isspace (*p)) {
+                               if (*p == '/') {
+                                       state = comp_state_read_regexp;
+                                       opt_start = p;
+                               }
+                               else if (*p == ']') {
+                                       state = comp_state_read_ebrace;
+                               }
+                               else {
+                                       opt_start = p;
+                                       state = comp_state_read_option;
+                               }
+                       }
+                       else {
+                               /* Skip spaces after comma */
+                               p ++;
+                       }
+                       break;
+               case comp_state_read_ebrace:
+                       p ++;
+                       state = comp_state_read_end;
+                       break;
+               case comp_state_read_end:
+                       g_assert_not_reached ();
                }
        }
-       else {
-               atom->symbol = rspamd_mempool_alloc (pool, clen + 1);
-               rspamd_strlcpy (atom->symbol, line, clen + 1);
-       }
 
        res->data = atom;