source.dussan.org Git - rspamd.git/commitdiff
Add expressions parsing code.
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 17 Mar 2015 15:01:52 +0000 (15:01 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 17 Mar 2015 15:01:52 +0000 (15:01 +0000)
src/libutil/expression.c
src/libutil/expression.h

index b7f6e1d576536eb4a1bbdc71196dbd24f77dd93c..b5b4dce5fd63bbc516ecf340ebb71ec963936b0c 100644 (file)
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#include <expression.h>
 #include "config.h"
+#include "expression.h"
+#include "regexp.h"
 
 enum rspamd_expression_op {
+       OP_INVALID = 0,
        OP_PLUS, /* || or + */
        OP_MULT, /* && or * */
        OP_NOT, /* ! */
        OP_LT, /* < */
        OP_GT, /* > */
        OP_LE, /* <= */
-       OP_GE /* >= */
+       OP_GE, /* >= */
+       OP_OBRACE, /* ( */
+       OP_CBRACE /* ) */
 };
 
 struct rspamd_expression_elt {
@@ -55,20 +59,26 @@ struct rspamd_expression_elt {
 struct rspamd_expression {
        struct rspamd_atom_subr *subr;
        GArray *expressions;
-       GPtrArray *expression_stack;
+       GArray *expression_stack;
 };
 
+/* Error domain attached to the GError objects raised by the expression code */
+static GQuark
+rspamd_expr_quark (void)
+{
+       GQuark domain = g_quark_from_static_string ("rspamd-expression");
+
+       return domain;
+}
+
 static void
 rspamd_expr_stack_push (struct rspamd_expression *expr,
-               struct rspamd_expression_elt *elt)
+               gpointer elt)
 {
-       g_ptr_array_add (expr->expression_stack, elt);
+       g_array_append_val (expr->expression_stack, elt);
 }
 
-static struct rspamd_expression_elt *
+static gpointer
 rspamd_expr_stack_pop (struct rspamd_expression *expr)
 {
-       struct rspamd_expression_elt *e;
+       gpointer e;
        gint idx;
 
        if (expr->expression_stack->len == 0) {
@@ -76,18 +86,191 @@ rspamd_expr_stack_pop (struct rspamd_expression *expr)
        }
 
        idx = expr->expression_stack->len - 1;
-       e = g_ptr_array_index (expr->expression_stack, idx);
-       g_ptr_array_remove_index_fast (expr->expression_stack, idx);
+       e = g_array_index (expr->expression_stack, gpointer, idx);
+       g_array_remove_index_fast (expr->expression_stack, idx);
 
        return e;
 }
 
+/*
+ * Map an operator to its numeric priority; a larger value means the
+ * operator binds tighter:
+ *   NOT = 5, MULT = 4, PLUS = 3, comparisons = 2, braces = 1.
+ * OP_INVALID yields -1 and any value outside of the enum yields 0.
+ */
+static gint
+rspamd_expr_logic_priority (enum rspamd_expression_op op)
+{
+       switch (op) {
+       case OP_NOT:
+               return 5;
+       case OP_MULT:
+               return 4;
+       case OP_PLUS:
+               return 3;
+       case OP_LT:
+       case OP_LE:
+       case OP_GT:
+       case OP_GE:
+               return 2;
+       case OP_OBRACE:
+       case OP_CBRACE:
+               return 1;
+       case OP_INVALID:
+               return -1;
+       }
+
+       /* Not one of the known operators */
+       return 0;
+}
+
+/*
+ * Tell whether a character is an operation symbol.
+ * Return TRUE for one of: ! & | ( ) > <
+ * Return FALSE for anything else (the character is treated as an operand)
+ * NOTE(review): '+' and '*' are not listed here although
+ * rspamd_expr_str_to_op accepts them - confirm whether that is intended.
+ */
+static gboolean
+rspamd_expr_is_operation_symbol (gchar a)
+{
+       if (a == '!' || a == '&' || a == '|') {
+               return TRUE;
+       }
+       if (a == '(' || a == ')') {
+               return TRUE;
+       }
+       if (a == '>' || a == '<') {
+               return TRUE;
+       }
+
+       return FALSE;
+}
+
+/*
+ * Parse the operator that starts at `a`; the input is bounded by `end`.
+ *
+ * Recognised forms: ! & && | || + * ( ) > >= < <= and the case-insensitive
+ * words "or", "and", "not" (a word is matched only when at least one more
+ * character follows it in the input).
+ *
+ * On success the operator is returned and *next points just past it.
+ * On failure OP_INVALID is returned and *next is left untouched.
+ */
+static enum rspamd_expression_op
+rspamd_expr_str_to_op (const gchar *a, const gchar *end, const gchar **next)
+{
+       enum rspamd_expression_op op = OP_INVALID;
+
+       g_assert (a < end);
+
+       switch (*a) {
+       case '!':
+       case '&':
+       case '|':
+       case '+':
+       case '*':
+       case '(':
+       case ')': {
+               /*
+                * Always advance past the operator, even when it is the last
+                * character of the input: otherwise the caller would re-read
+                * the same character. a[1] is inspected only when it lies
+                * inside the buffer.
+                */
+               if (a < end - 1 &&
+                               ((a[0] == '&' && a[1] == '&') ||
+                               (a[0] == '|' && a[1] == '|'))) {
+                       *next = a + 2;
+               }
+               else {
+                       *next = a + 1;
+               }
+               /* XXX: not especially effective */
+               switch (*a) {
+               case '!':
+                       op = OP_NOT;
+                       break;
+               case '&':
+               case '*':
+                       op = OP_MULT;
+                       break;
+               case '|':
+               case '+':
+                       op = OP_PLUS;
+                       break;
+               case ')':
+                       op = OP_CBRACE;
+                       break;
+               case '(':
+                       op = OP_OBRACE;
+                       break;
+               default:
+                       op = OP_INVALID;
+                       break;
+               }
+               break;
+       }
+       case 'O':
+       case 'o':
+               /* sizeof counts the trailing NUL: needs "or" plus one more char */
+               if ((gulong)(end - a) >= sizeof ("or") &&
+                               g_ascii_strncasecmp (a, "or", sizeof ("or") - 1) == 0) {
+                       *next = a + sizeof ("or") - 1;
+                       op = OP_PLUS;
+               }
+               break;
+       case 'A':
+       case 'a':
+               if ((gulong)(end - a) >= sizeof ("and") &&
+                               g_ascii_strncasecmp (a, "and", sizeof ("and") - 1) == 0) {
+                       *next = a + sizeof ("and") - 1;
+                       op = OP_MULT;
+               }
+               break;
+       case 'N':
+       case 'n':
+               if ((gulong)(end - a) >= sizeof ("not") &&
+                               g_ascii_strncasecmp (a, "not", sizeof ("not") - 1) == 0) {
+                       *next = a + sizeof ("not") - 1;
+                       op = OP_NOT;
+               }
+               break;
+       case '>':
+               if (a < end - 1 && a[1] == '=') {
+                       *next = a + 2;
+                       op = OP_GE;
+               }
+               else {
+                       *next = a + 1;
+                       op = OP_GT;
+               }
+               break;
+       case '<':
+               if (a < end - 1 && a[1] == '=') {
+                       *next = a + 2;
+                       op = OP_LE;
+               }
+               else {
+                       *next = a + 1;
+                       op = OP_LT;
+               }
+               break;
+       default:
+               op = OP_INVALID;
+               break;
+       }
+
+       return op;
+}
+
 gboolean
 rspamd_parse_expression (const gchar *line, gsize len,
                struct rspamd_atom_subr *subr, gpointer subr_data,
                rspamd_mempool_t *pool, GError **err,
                struct rspamd_expression **target)
 {
+       struct rspamd_expression *e;
+       struct rspamd_expression_elt elt;
+       rspamd_expression_atom_t *atom;
+       rspamd_regexp_t *num_re;
+       enum rspamd_expression_op op, op_stack;
+       const gchar *p, *c, *end;
+
+       enum {
+               PARSE_ATOM = 0,
+               PARSE_OP,
+               PARSE_LIM,
+               SKIP_SPACES
+       } state;
+
        g_assert (line != NULL);
        g_assert (subr != NULL && subr->parse != NULL);
 
@@ -95,6 +278,203 @@ rspamd_parse_expression (const gchar *line, gsize len,
                len = strlen (line);
        }
 
+       num_re = rspamd_regexp_cache_create (NULL, "/^\\d+\\s*[><]/", NULL, NULL);
+
+       p = line;
+       c = line;
+       end = line + len;
+       e = g_slice_alloc (sizeof (*e));
+       e->expressions = g_array_new (FALSE, FALSE,
+                       sizeof (struct rspamd_expression_elt));
+       e->expression_stack = g_array_sized_new (FALSE, FALSE, sizeof (gpointer), 32);
+
+       /* Shunting-yard algorithm */
+       while (p < end) {
+               switch (state) {
+               case PARSE_ATOM:
+                       if (g_ascii_isspace (*p)) {
+                               state = SKIP_SPACES;
+                       }
+                       else if (rspamd_expr_is_operation_symbol (*p)) {
+                               state = PARSE_ATOM;
+                       }
+                       else {
+                               /*
+                                * First of all, we check some pre-conditions:
+                                * 1) if we have 'and ' or 'or ' or 'not ' strings, they are op
+                                * 2) if we have full numeric string, then we check for the following:
+                                *  ^\d+\s*[><]$
+                                */
+                               if ((gulong)(end - p) > sizeof ("and ") &&
+                                       (g_ascii_strncasecmp (p, "and ", sizeof ("and ") - 1) == 0 ||
+                                       g_ascii_strncasecmp (p, "not ", sizeof ("not ") - 1) == 0 )) {
+                                       state = PARSE_OP;
+                               }
+                               else if ((gulong)(end - p) > sizeof ("or ") &&
+                                       g_ascii_strncasecmp (p, "or ", sizeof ("or ") - 1) == 0) {
+                                       state = PARSE_OP;
+                               }
+                               else if (rspamd_regexp_search (num_re, p, end - p, NULL, NULL,
+                                               FALSE)) {
+                                       c = p;
+                                       state = PARSE_LIM;
+                               }
+                               else {
+                                       /* Try to parse atom */
+                                       atom = subr->parse (p, end - p, pool, subr_data, err);
+                                       if (atom == NULL) {
+                                               /* We couldn't parse the atom, so go out */
+                                               goto err;
+                                       }
+                                       g_assert (atom->len != 0);
+                                       p = p + atom->len;
+
+                                       /* Push to output */
+                                       elt.type = ELT_ATOM;
+                                       elt.p.atom = atom;
+                                       g_array_append_val (e->expressions, elt);
+                               }
+                       }
+                       break;
+               case PARSE_LIM:
+                       if (g_ascii_isdigit (*p)) {
+                               p ++;
+                       }
+                       else {
+                               if (p - c > 0) {
+                                       elt.type = ELT_LIMIT;
+                                       elt.p.lim.val = strtoul (c, NULL, 10);
+                                       g_array_append_val (e->expressions, elt);
+                                       c = p;
+                                       state = SKIP_SPACES;
+                               }
+                               else {
+                                       g_set_error (err, rspamd_expr_quark(), 400, "Empty number");
+                                       goto err;
+                               }
+                       }
+                       break;
+               case PARSE_OP:
+                       op = rspamd_expr_str_to_op (p, end, &p);
+                       if (op == OP_INVALID) {
+                               g_set_error (err, rspamd_expr_quark(), 500, "Bad operator %c",
+                                               *p);
+                               goto err;
+                       }
+                       else if (op == OP_OBRACE) {
+                               /*
+                                * If the token is a left parenthesis, then push it onto
+                                * the stack.
+                                */
+                               rspamd_expr_stack_push (e, GINT_TO_POINTER (op));
+                       }
+                       else if (op == OP_CBRACE) {
+                               /*
+                                * Until the token at the top of the stack is a left
+                                * parenthesis, pop operators off the stack onto the
+                                * output queue.
+                                *
+                                * Pop the left parenthesis from the stack,
+                                * but not onto the output queue.
+                                *
+                                * If the stack runs out without finding a left parenthesis,
+                                * then there are mismatched parentheses.
+                                */
+                               do {
+                                       op = GPOINTER_TO_INT (rspamd_expr_stack_pop (e));
+
+                                       if (op == OP_INVALID) {
+                                               g_set_error (err, rspamd_expr_quark(), 600,
+                                                               "Braces mismatch");
+                                               goto err;
+                                       }
+
+                                       if (op != OP_OBRACE) {
+                                               elt.type = ELT_OP;
+                                               elt.p.op = op;
+                                               g_array_append_val (e->expressions, elt);
+                                       }
+
+                               } while (op != OP_OBRACE);
+                       }
+                       else {
+                               /*
+                                * While there is an operator token, o2, at the top of
+                                * the operator stack, and either:
+                                *
+                                * - o1 is left-associative and its precedence is less than
+                                * or equal to that of o2, or
+                                * - o1 is right associative, and has precedence less than
+                                * that of o2,
+                                *
+                                * then pop o2 off the operator stack, onto the output queue;
+                                *
+                                * push o1 onto the operator stack.
+                                */
+
+                               for (;;) {
+                                       op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e));
+
+                                       if (op_stack == OP_INVALID) {
+                                               /* Stack is empty */
+                                               break;
+                                       }
+
+                                       /* We ignore associativity for now */
+                                       if (op_stack != OP_OBRACE &&
+                                                       rspamd_expr_logic_priority (op) <=
+                                                       rspamd_expr_logic_priority(op_stack)) {
+                                               elt.type = ELT_OP;
+                                               elt.p.op = op_stack;
+                                               g_array_append_val (e->expressions, elt);
+                                       }
+                                       else {
+                                               /* Push op_stack back */
+                                               rspamd_expr_stack_push (e, GINT_TO_POINTER (op_stack));
+                                               break;
+                                       }
+                               }
+
+                               /* Push new operator itself */
+                               rspamd_expr_stack_push (e, GINT_TO_POINTER (op));
+                       }
+
+                       break;
+               case SKIP_SPACES:
+                       if (g_ascii_isspace (*p)) {
+                               p ++;
+                       }
+                       else if (rspamd_expr_is_operation_symbol (*p)) {
+                               state = PARSE_OP;
+                       }
+                       else {
+                               state = PARSE_ATOM;
+                       }
+               }
+       }
+
+       /* Now we process the stack and push operators to the output */
+       while ((op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e)))
+                       != OP_INVALID) {
+               if (op_stack != OP_OBRACE) {
+                       elt.type = ELT_OP;
+                       elt.p.op = op_stack;
+                       g_array_append_val (e->expressions, elt);
+               }
+               else {
+                       g_set_error (err, rspamd_expr_quark(), 600,
+                                       "Braces mismatch");
+                       goto err;
+               }
+       }
+
+       if (*target) {
+               *target = e;
+       }
+
+       return TRUE;
+
+err:
        return FALSE;
 }
 
index a99343e69cf750c120d13c31be616efbf08b6571..798d3a48c3b9ec400b8a78dabbbfd785be3e7dd6 100644 (file)
@@ -33,6 +33,8 @@ typedef struct rspamd_expression_atom_s {
        gpointer data;
        /* String representation of atom */
        const gchar *str;
+       /* Length of the string representation of atom */
+       gsize len;
        /* Relative priority */
        gint priority;
 } rspamd_expression_atom_t;