From: Vsevolod Stakhov Date: Tue, 17 Mar 2015 15:01:52 +0000 (+0000) Subject: Add expressions parsing code. X-Git-Tag: 0.9.0~488^2~7 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=dbe85000375aa0a02d38bb694b7e1d3877160c4f;p=rspamd.git Add expressions parsing code. --- diff --git a/src/libutil/expression.c b/src/libutil/expression.c index b7f6e1d57..b5b4dce5f 100644 --- a/src/libutil/expression.c +++ b/src/libutil/expression.c @@ -22,17 +22,21 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include #include "config.h" +#include "expression.h" +#include "regexp.h" enum rspamd_expression_op { + OP_INVALID = 0, OP_PLUS, /* || or + */ OP_MULT, /* && or * */ OP_NOT, /* ! */ OP_LT, /* < */ OP_GT, /* > */ OP_LE, /* <= */ - OP_GE /* >= */ + OP_GE, /* >= */ + OP_OBRACE, /* ( */ + OP_CBRACE /* ) */ }; struct rspamd_expression_elt { @@ -55,20 +59,26 @@ struct rspamd_expression_elt { struct rspamd_expression { struct rspamd_atom_subr *subr; GArray *expressions; - GPtrArray *expression_stack; + GArray *expression_stack; }; +static GQuark +rspamd_expr_quark (void) +{ + return g_quark_from_static_string ("rspamd-expression"); +} + static void rspamd_expr_stack_push (struct rspamd_expression *expr, - struct rspamd_expression_elt *elt) + gpointer elt) { - g_ptr_array_add (expr->expression_stack, elt); + g_array_append_val (expr->expression_stack, elt); } -static struct rspamd_expression_elt * +static gpointer rspamd_expr_stack_pop (struct rspamd_expression *expr) { - struct rspamd_expression_elt *e; + gpointer e; gint idx; if (expr->expression_stack->len == 0) { @@ -76,18 +86,191 @@ rspamd_expr_stack_pop (struct rspamd_expression *expr) } idx = expr->expression_stack->len - 1; - e = g_ptr_array_index (expr->expression_stack, idx); - g_ptr_array_remove_index_fast (expr->expression_stack, idx); + e = g_array_index (expr->expression_stack, gpointer, idx); + g_array_remove_index_fast (expr->expression_stack, idx); return e; } +/* + * Return operation priority + */ +static gint +rspamd_expr_logic_priority (enum rspamd_expression_op op) +{ + gint ret = 0; + + switch (op) { + case OP_NOT: + ret = 5; + break; + case OP_MULT: + ret = 4; + break; + case OP_PLUS: + ret = 3; + break; + case OP_GE: + case OP_GT: + case OP_LE: + case OP_LT: + ret = 2; + break; + case OP_OBRACE: + case OP_CBRACE: + ret = 1; + break; + case OP_INVALID: + ret = -1; + break; + } + + return ret; +} + +/* + * Return FALSE if symbol is not operation symbol (operand) + * Return TRUE if symbol is operation symbol + */ +static gboolean +rspamd_expr_is_operation_symbol (gchar a) +{ + switch (a) { + case '!': + case '&': + case '|': + case '(': + case ')': + case '>': + case '<': + return TRUE; + } + + return FALSE; +} + +/* Return character representation of operation */ +static enum rspamd_expression_op +rspamd_expr_str_to_op (const gchar *a, const gchar *end, const gchar **next) +{ + enum rspamd_expression_op op = OP_INVALID; + + g_assert (a < end); + + switch (*a) { + case '!': + case '&': + case '|': + case '+': + case '*': + case '(': + case ')': { + if (a < end - 1) { + if ((a[0] == '&' && a[1] == '&') || + (a[0] == '|' && a[1] == '|')) { + *next = a + 2; + } + else { + *next = a + 1; + } + } + /* XXX: not especially effective */ + switch (*a) { + case '!': + op = OP_NOT; + break; + case '&': + case '*': + op = OP_MULT; + break; + case '|': + case '+': + op = OP_PLUS; + break; + case ')': + op = OP_CBRACE; + break; + case '(': + op = OP_OBRACE; + break; + default: + op = OP_INVALID; + break; + } + break; + } + case 'O': + case 'o': + if ((gulong)(end - a) >= sizeof ("or") && + g_ascii_strncasecmp (a, "or", sizeof ("or") - 1) == 0) { + *next = a + sizeof ("or") - 1; + op = OP_PLUS; + } + break; + case 'A': + case 'a': + if ((gulong)(end - a) >= sizeof ("and") && + g_ascii_strncasecmp (a, "and", sizeof ("and") - 1) == 0) { + *next = a + sizeof ("and") - 1; + op = OP_MULT; + } + break; + case 'N': + case 'n': + if ((gulong)(end - a) >= sizeof ("not") && + g_ascii_strncasecmp (a, "not", sizeof ("not") - 1) == 0) { + *next = a + sizeof ("not") - 1; + op = OP_NOT; + } + break; + case '>': + if (a < end - 1 && a[1] == '=') { + *next = a + 2; + op = OP_GE; + } + else { + *next = a + 1; + op = OP_GT; + } + break; + case '<': + if (a < end - 1 && a[1] == '=') { + *next = a + 2; + op = OP_LE; + } + else { + *next = a + 1; + op = OP_LT; + } + break; + default: + op = OP_INVALID; + break; + } + + return op; +} + gboolean rspamd_parse_expression (const gchar *line, gsize len, struct rspamd_atom_subr *subr, gpointer subr_data, rspamd_mempool_t *pool, GError **err, struct rspamd_expression **target) { + struct rspamd_expression *e; + struct rspamd_expression_elt elt; + rspamd_expression_atom_t *atom; + rspamd_regexp_t *num_re; + enum rspamd_expression_op op, op_stack; + const gchar *p, *c, *end; + + enum { + PARSE_ATOM = 0, + PARSE_OP, + PARSE_LIM, + SKIP_SPACES + } state; + g_assert (line != NULL); g_assert (subr != NULL && subr->parse != NULL); @@ -95,6 +278,203 @@ rspamd_parse_expression (const gchar *line, gsize len, len = strlen (line); } + num_re = rspamd_regexp_cache_create (NULL, "/^\\d+\\s*[><]/", NULL, NULL); + + p = line; + c = line; + end = line + len; + e = g_slice_alloc (sizeof (*e)); + e->expressions = g_array_new (FALSE, FALSE, + sizeof (struct rspamd_expression_elt)); + e->expression_stack = g_array_sized_new (FALSE, FALSE, sizeof (gpointer), 32); + + /* Shunting-yard algorithm */ + while (p < end) { + switch (state) { + case PARSE_ATOM: + if (g_ascii_isspace (*p)) { + state = SKIP_SPACES; + } + else if (rspamd_expr_is_operation_symbol (*p)) { + state = PARSE_ATOM; + } + else { + /* + * First of all, we check some pre-conditions: + * 1) if we have 'and ' or 'or ' or 'not ' strings, they are op + * 2) if we have full numeric string, then we check for the following: + * ^\d+\s*[><]$ + */ + if ((gulong)(end - p) > sizeof ("and ") && + (g_ascii_strncasecmp (p, "and ", sizeof ("and ") - 1) == 0 || + g_ascii_strncasecmp (p, "not ", sizeof ("not ") - 1) == 0 )) { + state = PARSE_OP; + } + else if ((gulong)(end - p) > sizeof ("or ") && + g_ascii_strncasecmp (p, "or ", sizeof ("or ") - 1) == 0) { + state = PARSE_OP; + } + else if (rspamd_regexp_search (num_re, p, end - p, NULL, NULL, + FALSE)) { + c = p; + state = PARSE_LIM; + } + else { + /* Try to parse atom */ + atom = subr->parse (p, end - p, pool, subr_data, err); + if (atom == NULL) { + /* We couldn't parse the atom, so go out */ + goto err; + } + g_assert (atom->len != 0); + p = p + atom->len; + + /* Push to output */ + elt.type = ELT_ATOM; + elt.p.atom = atom; + g_array_append_val (e->expressions, elt); + } + } + break; + case PARSE_LIM: + if (g_ascii_isdigit (*p)) { + p ++; + } + else { + if (p - c > 0) { + elt.type = ELT_LIMIT; + elt.p.lim.val = strtoul (c, NULL, 10); + g_array_append_val (e->expressions, elt); + c = p; + state = SKIP_SPACES; + } + else { + g_set_error (err, rspamd_expr_quark(), 400, "Empty number"); + goto err; + } + } + break; + case PARSE_OP: + op = rspamd_expr_str_to_op (p, end, &p); + if (op == OP_INVALID) { + g_set_error (err, rspamd_expr_quark(), 500, "Bad operator %c", + *p); + goto err; + } + else if (op == OP_OBRACE) { + /* + * If the token is a left parenthesis, then push it onto + * the stack. + */ + rspamd_expr_stack_push (e, GINT_TO_POINTER (op)); + } + else if (op == OP_CBRACE) { + /* + * Until the token at the top of the stack is a left + * parenthesis, pop operators off the stack onto the + * output queue. + * + * Pop the left parenthesis from the stack, + * but not onto the output queue. + * + * If the stack runs out without finding a left parenthesis, + * then there are mismatched parentheses. + */ + do { + op = GPOINTER_TO_INT (rspamd_expr_stack_pop (e)); + + if (op == OP_INVALID) { + g_set_error (err, rspamd_expr_quark(), 600, + "Braces mismatch"); + goto err; + } + + if (op != OP_OBRACE) { + elt.type = ELT_OP; + elt.p.op = op; + g_array_append_val (e->expressions, elt); + } + + } while (op != OP_OBRACE); + } + else { + /* + * While there is an operator token, o2, at the top of + * the operator stack, and either: + * + * - o1 is left-associative and its precedence is less than + * or equal to that of o2, or + * - o1 is right associative, and has precedence less than + * that of o2, + * + * then pop o2 off the operator stack, onto the output queue; + * + * push o1 onto the operator stack. + */ + + for (;;) { + op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e)); + + if (op_stack == OP_INVALID) { + /* Stack is empty */ + break; + } + + /* We ignore associativity for now */ + if (op_stack != OP_OBRACE && + rspamd_expr_logic_priority (op) <= + rspamd_expr_logic_priority(op_stack)) { + elt.type = ELT_OP; + elt.p.op = op_stack; + g_array_append_val (e->expressions, elt); + } + else { + /* Push op_stack back */ + rspamd_expr_stack_push (e, GINT_TO_POINTER (op_stack)); + break; + } + } + + /* Push new operator itself */ + rspamd_expr_stack_push (e, GINT_TO_POINTER (op)); + } + + break; + case SKIP_SPACES: + if (g_ascii_isspace (*p)) { + p ++; + } + else if (rspamd_expr_is_operation_symbol (*p)) { + state = PARSE_OP; + } + else { + state = PARSE_ATOM; + } + } + } + + /* Now we process the stack and push operators to the output */ + while ((op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e))) + != OP_INVALID) { + if (op_stack != OP_OBRACE) { + elt.type = ELT_OP; + elt.p.op = op_stack; + g_array_append_val (e->expressions, elt); + } + else { + g_set_error (err, rspamd_expr_quark(), 600, + "Braces mismatch"); + goto err; + } + } + + if (*target) { + *target = e; + } + + return TRUE; + +err: return FALSE; } diff --git a/src/libutil/expression.h b/src/libutil/expression.h index a99343e69..798d3a48c 100644 --- a/src/libutil/expression.h +++ b/src/libutil/expression.h @@ -33,6 +33,8 @@ typedef struct rspamd_expression_atom_s { gpointer data; /* String representation of atom */ const gchar *str; + /* Length of the string representation of atom */ + gsize len; /* Relative priority */ gint priority; } rspamd_expression_atom_t;