From 6ea707966a87b98386250f43f6c4b2d840e0c5d2 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Wed, 17 Jun 2020 14:28:58 +0100 Subject: [PATCH] [Rework] Split operations processing and add more debug logs --- src/libutil/expression.c | 320 +++++++++++++++++++++++++--------- test/lua/unit/expressions.lua | 7 +- 2 files changed, 240 insertions(+), 87 deletions(-) diff --git a/src/libutil/expression.c b/src/libutil/expression.c index 4c07643ae..eeee40b75 100644 --- a/src/libutil/expression.c +++ b/src/libutil/expression.c @@ -135,6 +135,12 @@ rspamd_expr_op_to_str (enum rspamd_expression_op op) case OP_LT: op_str = "<"; break; + case OP_OBRACE: + op_str = "("; + break; + case OP_CBRACE: + op_str = ")"; + break; default: op_str = "???"; break; @@ -273,8 +279,6 @@ rspamd_expr_op_flags (enum rspamd_expression_op op) ret |= RSPAMD_EXPRESSION_BINARY|RSPAMD_EXPRESSION_COMPARISON; break; case OP_AND: - ret |= RSPAMD_EXPRESSION_NARY|RSPAMD_EXPRESSION_LOGICAL; - break; case OP_OR: ret |= RSPAMD_EXPRESSION_NARY|RSPAMD_EXPRESSION_LOGICAL; break; @@ -326,6 +330,8 @@ rspamd_expr_str_to_op (const gchar *a, const gchar *end, const gchar **next) case '|': case '+': case '*': + case '/': + case '-': case '(': case ')': { if (a < end - 1) { @@ -465,17 +471,20 @@ rspamd_expression_destroy (struct rspamd_expression *expr) } static gboolean -rspamd_ast_add_node (GPtrArray *operands, struct rspamd_expression_elt *op, - GError **err) +rspamd_ast_add_node (struct rspamd_expression *e, + GPtrArray *operands, + struct rspamd_expression_elt *op, + GError **err) { GNode *res, *a1, *a2, *test; - struct rspamd_expression_elt *test_elt; g_assert (op->type == ELT_OP); if (op->p.op.op_flags & RSPAMD_EXPRESSION_UNARY) { /* Unary operator */ + struct rspamd_expression_elt *test_elt; + res = g_node_new (op); a1 = rspamd_expr_stack_elt_pop (operands); @@ -492,9 +501,19 @@ rspamd_ast_add_node (GPtrArray *operands, struct rspamd_expression_elt *op, if (test_elt->type == ELT_ATOM) { test_elt->p.atom->parent = res; + msg_debug_expression ("added unary op %s to AST; operand: %*s", + rspamd_expr_op_to_str (op->p.op.op), + (int)test_elt->p.atom->len, test_elt->p.atom->str); + } + else { + msg_debug_expression ("added unary op %s to AST; operand type: %d", + rspamd_expr_op_to_str (op->p.op.op), + test_elt->type); } + } else { + struct rspamd_expression_elt *e1, *e2; /* For binary/nary operators we might want to examine chains */ a2 = rspamd_expr_stack_elt_pop (operands); a1 = rspamd_expr_stack_elt_pop (operands); @@ -521,23 +540,31 @@ rspamd_ast_add_node (GPtrArray *operands, struct rspamd_expression_elt *op, /* First try with a1 */ test = a1; - test_elt = test->data; + e1 = test->data; - if (test_elt->type == ELT_OP && test_elt->p.op.op == op->p.op.op) { + if (e1->type == ELT_OP && e1->p.op.op == op->p.op.op) { /* Add children */ g_node_append (test, a2); rspamd_expr_stack_elt_push (operands, a1); + + msg_debug_expression ("added nary op %s to AST merged with the first operand", + rspamd_expr_op_to_str (op->p.op.op)); + return TRUE; } /* Now test a2 */ test = a2; - test_elt = test->data; + e2 = test->data; - if (test_elt->type == ELT_OP && test_elt->p.op.op == op->p.op.op) { + if (e2->type == ELT_OP && e2->p.op.op == op->p.op.op) { /* Add children */ g_node_prepend (test, a1); rspamd_expr_stack_elt_push (operands, a2); + + msg_debug_expression ("added nary op %s to AST merged with the second operand", + rspamd_expr_op_to_str (op->p.op.op)); + return TRUE; } } @@ -547,14 +574,28 @@ rspamd_ast_add_node (GPtrArray *operands, struct rspamd_expression_elt *op, g_node_append (res, a1); g_node_append (res, a2); - test_elt = a1->data; - if (test_elt->type == ELT_ATOM) { - test_elt->p.atom->parent = res; + e1 = a1->data; + e2 = a2->data; + + if (e1->type == ELT_ATOM) { + e1->p.atom->parent = res; } - test_elt = a2->data; - if (test_elt->type == ELT_ATOM) { - test_elt->p.atom->parent = res; + if (e2->type == ELT_ATOM) { + e2->p.atom->parent = res; + } + + if (e1->type == ELT_ATOM && e2->type == ELT_ATOM) { + msg_debug_expression ("added binary op %s to AST; operands: (%*s; %*s)", + rspamd_expr_op_to_str (op->p.op.op), + (int) e1->p.atom->len, e1->p.atom->str, + (int) e2->p.atom->len, e2->p.atom->str); + } + else { + msg_debug_expression ("added binary op %s to AST; operands (types): (%d; %d)", + rspamd_expr_op_to_str (op->p.op.op), + e1->type, + e2->type); } } @@ -724,6 +765,7 @@ rspamd_parse_expression (const gchar *line, gsize len, e->log_id = g_malloc0 (RSPAMD_LOG_ID_LEN + 1); guint64 h = rspamd_cryptobox_fast_hash (line, len, 0xdeadbabe); rspamd_snprintf (e->log_id, RSPAMD_LOG_ID_LEN + 1, "%xL", h); + msg_debug_expression ("start to parse expression '%*s'", (int)len, line); /* Shunting-yard algorithm */ while (p < end) { @@ -759,9 +801,11 @@ rspamd_parse_expression (const gchar *line, gsize len, FALSE, NULL)) { state = PARSE_OP; + msg_debug_expression ("found divide operation"); continue; } + msg_debug_expression ("false divide operation"); /* Fallback to PARSE_ATOM state */ } else if (*p == '-') { @@ -775,6 +819,7 @@ rspamd_parse_expression (const gchar *line, gsize len, continue; } /* Fallback to PARSE_ATOM state */ + msg_debug_expression ("false minus operation"); } else { /* Generic operation */ @@ -841,7 +886,7 @@ rspamd_parse_expression (const gchar *line, gsize len, (int) (end - p), p); } - goto err; + goto error_label; } if (atom->str == NULL) { @@ -856,6 +901,8 @@ rspamd_parse_expression (const gchar *line, gsize len, g_array_append_val (e->expressions, elt); rspamd_expr_stack_elt_push (operand_stack, g_node_new (rspamd_expr_dup_elt (pool, &elt))); + msg_debug_expression ("found atom: %*s; pushed onto operand stack (%d size)", + (int)atom->len, atom->str, operand_stack->len); } break; @@ -875,12 +922,14 @@ rspamd_parse_expression (const gchar *line, gsize len, g_array_append_val (e->expressions, elt); rspamd_expr_stack_elt_push (operand_stack, g_node_new (rspamd_expr_dup_elt (pool, &elt))); + msg_debug_expression ("found limit: %.1f; pushed onto operand stack (%d size)", + elt.p.lim, operand_stack->len); c = p; state = SKIP_SPACES; } else { g_set_error (err, rspamd_expr_quark(), 400, "Empty number"); - goto err; + goto error_label; } } break; @@ -889,7 +938,7 @@ rspamd_parse_expression (const gchar *line, gsize len, if (op == OP_INVALID) { g_set_error (err, rspamd_expr_quark(), 500, "Bad operator %c", *p); - goto err; + goto error_label; } else if (op == OP_OBRACE) { /* @@ -897,6 +946,8 @@ rspamd_parse_expression (const gchar *line, gsize len, * the stack. */ rspamd_expr_stack_push (e, GINT_TO_POINTER (op)); + msg_debug_expression ("found obrace, pushed to operators stack (%d size)", + e->expression_stack->len); } else if (op == OP_CBRACE) { /* @@ -910,16 +961,21 @@ rspamd_parse_expression (const gchar *line, gsize len, * If the stack runs out without finding a left parenthesis, * then there are mismatched parentheses. */ + msg_debug_expression ("found cbrace, rewind operators stack (%d size)", + e->expression_stack->len); + do { op = GPOINTER_TO_INT (rspamd_expr_stack_pop (e)); if (op == OP_INVALID) { g_set_error (err, rspamd_expr_quark(), 600, "Braces mismatch"); - goto err; + goto error_label; } guint op_priority = rspamd_expr_logic_priority (op); + msg_debug_expression ("found op: %s; priority = %d", + rspamd_expr_op_to_str (op), op_priority); if (op != OP_OBRACE) { elt.type = ELT_OP; @@ -928,9 +984,9 @@ rspamd_parse_expression (const gchar *line, gsize len, elt.p.op.logical_priority = op_priority; g_array_append_val (e->expressions, elt); - if (!rspamd_ast_add_node (operand_stack, + if (!rspamd_ast_add_node (e, operand_stack, rspamd_expr_dup_elt (pool, &elt), err)) { - goto err; + goto error_label; } } @@ -956,34 +1012,54 @@ rspamd_parse_expression (const gchar *line, gsize len, if (op_stack == OP_INVALID) { /* Stack is empty */ + msg_debug_expression ("no operations in operators stack"); break; } /* We ignore associativity for now */ - guint op_priority = rspamd_expr_logic_priority (op); + guint op_priority = rspamd_expr_logic_priority (op), + stack_op_priority = rspamd_expr_logic_priority (op_stack); + + msg_debug_expression ("operators stack %d; operands stack: %d; " + "process operation '%s'(%d); pop operation '%s'(%d)", + e->expression_stack->len, + operand_stack->len, + rspamd_expr_op_to_str (op), op_priority, + rspamd_expr_op_to_str (op_stack), stack_op_priority); if (op_stack != OP_OBRACE && - op_priority < rspamd_expr_logic_priority (op_stack)) { + op_priority < stack_op_priority) { elt.type = ELT_OP; elt.p.op.op = op_stack; - elt.p.op.op_flags = rspamd_expr_op_flags (op); + elt.p.op.op_flags = rspamd_expr_op_flags (op_stack); elt.p.op.logical_priority = op_priority; g_array_append_val (e->expressions, elt); - if (!rspamd_ast_add_node (operand_stack, + if (!rspamd_ast_add_node (e, operand_stack, rspamd_expr_dup_elt (pool, &elt), err)) { - goto err; + goto error_label; } } else { /* Push op_stack back */ + msg_debug_expression ("operators stack %d; operands stack: %d; " + "process operation '%s'(%d); push back to stack '%s'(%d)", + e->expression_stack->len, + operand_stack->len, + rspamd_expr_op_to_str (op), op_priority, + rspamd_expr_op_to_str (op_stack), stack_op_priority); rspamd_expr_stack_push (e, GINT_TO_POINTER (op_stack)); break; } } /* Push new operator itself */ + msg_debug_expression ("operators stack %d; operands stack: %d; " + "process operation '%s'; push to stack", + e->expression_stack->len, + operand_stack->len, + rspamd_expr_op_to_str (op)); rspamd_expr_stack_push (e, GINT_TO_POINTER (op)); } @@ -1006,6 +1082,12 @@ rspamd_parse_expression (const gchar *line, gsize len, /* Now we process the stack and push operators to the output */ while ((op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e))) != OP_INVALID) { + msg_debug_expression ("operators stack %d; operands stack: %d; " + "rewind stack; op: %s", + e->expression_stack->len, + operand_stack->len, + rspamd_expr_op_to_str (op)); + if (op_stack != OP_OBRACE) { elt.type = ELT_OP; elt.p.op.op = op_stack; @@ -1013,22 +1095,22 @@ rspamd_parse_expression (const gchar *line, gsize len, elt.p.op.logical_priority = rspamd_expr_logic_priority (op_stack); g_array_append_val (e->expressions, elt); - if (!rspamd_ast_add_node (operand_stack, + if (!rspamd_ast_add_node (e, operand_stack, rspamd_expr_dup_elt (pool, &elt), err)) { - goto err; + goto error_label; } } else { g_set_error (err, rspamd_expr_quark(), 600, "Braces mismatch"); - goto err; + goto error_label; } } if (operand_stack->len != 1) { g_set_error (err, rspamd_expr_quark(), 601, - "Operators mismatch"); - goto err; + "Operators mismatch: %d elts in stack", operand_stack->len); + goto error_label; } e->ast = rspamd_expr_stack_elt_pop (operand_stack); @@ -1053,7 +1135,9 @@ rspamd_parse_expression (const gchar *line, gsize len, return TRUE; -err: +error_label: + msg_debug_expression ("fatal error: %e", *err); + while ((tmp = rspamd_expr_stack_elt_pop (operand_stack)) != NULL) { g_node_destroy (tmp); } @@ -1068,8 +1152,7 @@ err: * Node optimizer function: skip nodes that are not relevant */ static gboolean -rspamd_ast_node_done (struct rspamd_expression_elt *elt, - struct rspamd_expression_elt *parelt, gdouble acc) +rspamd_ast_node_done (struct rspamd_expression_elt *elt, gdouble acc) { gboolean ret = FALSE; @@ -1092,55 +1175,81 @@ rspamd_ast_node_done (struct rspamd_expression_elt *elt, return ret; } + static gdouble -rspamd_ast_do_op (struct rspamd_expression_elt *elt, gdouble val, gdouble acc) +rspamd_ast_do_unary_op (struct rspamd_expression_elt *elt, gdouble operand) { - gdouble ret = val; - + gdouble ret; g_assert (elt->type == ELT_OP); - if (isnan (acc)) { - switch (elt->p.op.op) { - case OP_PLUS: - case OP_MINUS: - case OP_OR: - case OP_AND: - case OP_MULT: - case OP_DIVIDE: - return val; - default: - acc = val; - break; - } - } - switch (elt->p.op.op) { case OP_NOT: - ret = fabs (val) > DOUBLE_EPSILON ? 0.0 : 1.0; - break; - case OP_PLUS: - ret = acc + val; - break; - case OP_MULT: - ret = acc * val; + ret = fabs (operand) > DOUBLE_EPSILON ? 0.0 : 1.0; break; + default: + g_assert_not_reached (); + } + + return ret; +} + +static gdouble +rspamd_ast_do_binary_op (struct rspamd_expression_elt *elt, gdouble op1, gdouble op2) +{ + gdouble ret; + + g_assert (elt->type == ELT_OP); + + switch (elt->p.op.op) { case OP_MINUS: - ret = acc - val; + ret = op1 - op2; break; case OP_DIVIDE: - ret = acc / val; + ret = op1 / op2; break; case OP_GE: - ret = acc >= val; + ret = op1 >= op2; break; case OP_GT: - ret = acc > val; + ret = op1 > op2; break; case OP_LE: - ret = acc <= val; + ret = op1 <= op2; break; case OP_LT: - ret = acc < val; + ret = op1 < op2; + break; + + case OP_NOT: + case OP_PLUS: + case OP_MULT: + case OP_AND: + case OP_OR: + default: + g_assert_not_reached(); + break; + } + + return ret; +} + +static gdouble +rspamd_ast_do_nary_op (struct rspamd_expression_elt *elt, gdouble val, gdouble acc) +{ + gdouble ret; + + g_assert (elt->type == ELT_OP); + + if (isnan (acc)) { + return val; + } + + switch (elt->p.op.op) { + case OP_PLUS: + ret = acc + val; + break; + case OP_MULT: + ret = acc * val; break; case OP_AND: ret = (acc * val); @@ -1149,7 +1258,14 @@ rspamd_ast_do_op (struct rspamd_expression_elt *elt, gdouble val, gdouble acc) ret = (acc + val); break; default: - g_assert (0); + case OP_NOT: + case OP_MINUS: + case OP_DIVIDE: + case OP_GE: + case OP_GT: + case OP_LE: + case OP_LT: + g_assert_not_reached(); break; } @@ -1160,15 +1276,15 @@ static gdouble rspamd_ast_process_node (struct rspamd_expression *e, GNode *node, struct rspamd_expr_process_data *process_data) { - struct rspamd_expression_elt *elt, *celt, *parelt = NULL; + struct rspamd_expression_elt *elt; GNode *cld; gdouble acc = NAN; gdouble t1, t2, val; gboolean calc_ticks = FALSE; + const gchar *op_name = NULL; elt = node->data; - switch (elt->type) { case ELT_ATOM: if (!(elt->flags & RSPAMD_EXPR_FLAG_PROCESSED)) { @@ -1213,25 +1329,61 @@ rspamd_ast_process_node (struct rspamd_expression *e, GNode *node, break; case ELT_OP: g_assert (node->children != NULL); - - DL_FOREACH (node->children, cld) { - celt = cld->data; - - val = rspamd_ast_process_node (e, cld, process_data); - msg_debug_expression ("before op: op=%s; acc=%.1f; val = %.2f", rspamd_expr_op_to_str(elt->p.op.op), - acc, val); - acc = rspamd_ast_do_op (elt, val, acc); - msg_debug_expression ("after op: op=%s; acc=%.1f; val = %.2f", rspamd_expr_op_to_str(elt->p.op.op), - acc, val); - - /* Check if we need to process further */ - if (!(process_data->flags & RSPAMD_EXPRESSION_FLAG_NOOPT)) { - if (rspamd_ast_node_done (elt, parelt, acc)) { - msg_debug_expression ("optimizer: done"); - return acc; + op_name = rspamd_expr_op_to_str (elt->p.op.op); + + if (elt->p.op.op_flags & RSPAMD_EXPRESSION_NARY) { + msg_debug_expression ("proceed nary operation %s", op_name); + /* Proceed all ops in chain */ + DL_FOREACH (node->children, cld) { + val = rspamd_ast_process_node (e, cld, process_data); + msg_debug_expression ("before op: op=%s; acc=%.1f; val = %.2f", op_name, + acc, val); + acc = rspamd_ast_do_nary_op (elt, val, acc); + msg_debug_expression ("after op: op=%s; acc=%.1f; val = %.2f", op_name, + acc, val); + + /* Check if we need to process further */ + if (!(process_data->flags & RSPAMD_EXPRESSION_FLAG_NOOPT)) { + if (rspamd_ast_node_done (elt, acc)) { + msg_debug_expression ("optimizer: done"); + return acc; + } } } } + else if (elt->p.op.op_flags & RSPAMD_EXPRESSION_BINARY) { + GNode *c1 = node->children, *c2; + + c2 = c1->next; + g_assert (c2->next == NULL); + gdouble val1, val2; + + msg_debug_expression ("proceed binary operation %s", + op_name); + val1 = rspamd_ast_process_node (e, c1, process_data); + val2 = rspamd_ast_process_node (e, c2, process_data); + + msg_debug_expression ("before op: op=%s; op1 = %.1f, op2 = %.1f", + op_name, val1, val2); + acc = rspamd_ast_do_binary_op (elt, val1, val2); + msg_debug_expression ("after op: op=%s; res=%.1f", + op_name, acc); + } + else if (elt->p.op.op_flags & RSPAMD_EXPRESSION_UNARY) { + GNode *c1 = node->children; + + g_assert (c1->next == NULL); + + msg_debug_expression ("proceed unary operation %s", + op_name); + val = rspamd_ast_process_node (e, c1, process_data); + + msg_debug_expression ("before op: op=%s; op1 = %.1f", + op_name, val); + acc = rspamd_ast_do_unary_op (elt, val); + msg_debug_expression ("after op: op=%s; res=%.1f", + op_name, acc); + } break; } diff --git a/test/lua/unit/expressions.lua b/test/lua/unit/expressions.lua index ce6c92dca..c3a771c48 100644 --- a/test/lua/unit/expressions.lua +++ b/test/lua/unit/expressions.lua @@ -41,6 +41,7 @@ context("Rspamd expressions", function() local pool = rspamd_mempool.create() local cases = { + {'A & (!B | C)', '(A) (B) ! (C) | &'}, {'A & B | !C', '(C) ! (A) (B) & |'}, {'A & (B | !C)', '(A) (B) (C) ! | &'}, {'A & B &', nil}, @@ -48,8 +49,8 @@ context("Rspamd expressions", function() {'(((A))', nil}, -- Balanced braces {'(((A)))', '(A)'}, - -- Plus and comparison operators (after 2.6 this is not optimized, maybe we can return previous behaviour some day) - {'A + B + C + D > 2', '(A) (B) (C) (D) + + + 2 >'}, + -- Plus and comparison operators + {'A + B + C + D > 2', '(A) (B) (C) (D) +(4) 2 >'}, -- Plus and logic operators {'((A + B + C + D) > 2) & D', '(D) (A) (B) (C) (D) +(4) 2 > &'}, -- Associativity @@ -91,7 +92,7 @@ context("Rspamd expressions", function() {'A & C & !(D || C || E)', 0}, {'A + B + C', 2}, {'A * 2.0 + B + C', 3}, - {'A * 2.0 + B - C', 0}, + {'A * 2.0 + B - C', 1}, {'A / 2.0 + B - C', -0.5}, } for _,c in ipairs(cases) do -- 2.39.5