Browse Source

Add expressions parsing code.

tags/0.9.0
Vsevolod Stakhov 9 years ago
parent
commit
dbe8500037
2 changed files with 391 additions and 9 deletions
  1. 389
    9
      src/libutil/expression.c
  2. 2
    0
      src/libutil/expression.h

+ 389
- 9
src/libutil/expression.c View File

@@ -22,17 +22,21 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <expression.h>
#include "config.h"
#include "expression.h"
#include "regexp.h"

/*
 * Operators understood by the expression parser.
 * OP_INVALID is deliberately zero: the parser converts values popped from
 * the operator stack with GPOINTER_TO_INT and compares them against
 * OP_INVALID to detect an empty stack.
 * NOTE(review): the OP_GE line without a trailing comma looks like the
 * pre-change version of the line that follows it (diff residue) - confirm.
 */
enum rspamd_expression_op {
OP_INVALID = 0,
OP_PLUS, /* || or + */
OP_MULT, /* && or * */
OP_NOT, /* ! */
OP_LT, /* < */
OP_GT, /* > */
OP_LE, /* <= */
OP_GE /* >= */
OP_GE, /* >= */
OP_OBRACE, /* ( */
OP_CBRACE /* ) */
};

struct rspamd_expression_elt {
@@ -55,20 +59,26 @@ struct rspamd_expression_elt {
/*
 * State of one parsed expression.
 * NOTE(review): expression_stack is listed twice (GPtrArray and GArray);
 * this looks like the old and new lines of the commit shown together -
 * confirm which one belongs to the final file.
 */
struct rspamd_expression {
/* Atom callbacks; not assigned anywhere in the visible parser code */
struct rspamd_atom_subr *subr;
/* Output queue; created in the parser with element size sizeof (struct rspamd_expression_elt) */
GArray *expressions;
GPtrArray *expression_stack;
/* Operator stack; created in the parser with element size sizeof (gpointer) */
GArray *expression_stack;
};

/*
 * Error domain attached to every GError raised by the expression code.
 */
static GQuark
rspamd_expr_quark (void)
{
	GQuark domain;

	domain = g_quark_from_static_string ("rspamd-expression");

	return domain;
}

/*
 * Put elt on top of expr->expression_stack.
 * NOTE(review): both the struct-pointer/GPtrArray and the gpointer/GArray
 * variants of the parameter and of the append call are present here; this
 * looks like old and new diff lines shown together - confirm.
 */
static void
rspamd_expr_stack_push (struct rspamd_expression *expr,
struct rspamd_expression_elt *elt)
gpointer elt)
{
g_ptr_array_add (expr->expression_stack, elt);
g_array_append_val (expr->expression_stack, elt);
}

/*
 * Remove the last element of expr->expression_stack and return it.
 * NOTE(review): old (GPtrArray) and new (GArray) lines of the commit are
 * interleaved here, and the body of the empty-stack branch is on a line
 * hidden by the hunk header below. The parser compares the popped value
 * with OP_INVALID (0) to detect an empty stack, so the hidden line
 * presumably returns NULL - confirm.
 */
static struct rspamd_expression_elt *
static gpointer
rspamd_expr_stack_pop (struct rspamd_expression *expr)
{
struct rspamd_expression_elt *e;
gpointer e;
gint idx;

if (expr->expression_stack->len == 0) {
@@ -76,18 +86,191 @@ rspamd_expr_stack_pop (struct rspamd_expression *expr)
}

/* The top of the stack is the last array slot */
idx = expr->expression_stack->len - 1;
e = g_ptr_array_index (expr->expression_stack, idx);
g_ptr_array_remove_index_fast (expr->expression_stack, idx);
e = g_array_index (expr->expression_stack, gpointer, idx);
g_array_remove_index_fast (expr->expression_stack, idx);

return e;
}

/*
 * Map an operator to its binding strength; a larger number binds tighter.
 *
 *   OP_NOT                 5
 *   OP_MULT                4
 *   OP_PLUS                3
 *   comparison operators   2
 *   braces                 1
 *   OP_INVALID            -1
 *
 * Any value that is not a member of the enum yields 0.
 */
static gint
rspamd_expr_logic_priority (enum rspamd_expression_op op)
{
	switch (op) {
	case OP_NOT:
		return 5;
	case OP_MULT:
		return 4;
	case OP_PLUS:
		return 3;
	case OP_LT:
	case OP_LE:
	case OP_GT:
	case OP_GE:
		return 2;
	case OP_OBRACE:
	case OP_CBRACE:
		return 1;
	case OP_INVALID:
		return -1;
	}

	/* Not a known operator */
	return 0;
}

/*
 * Tell whether a character starts a symbolic operator.
 *
 * Returns TRUE for '!', '&', '|', '+', '*', '(', ')', '>' and '<',
 * FALSE for everything else (such a character belongs to an operand or
 * to a word operator like "and").
 *
 * '+' and '*' are included because they are documented aliases of
 * '||' and '&&' and are decoded by rspamd_expr_str_to_op; without them
 * here the parser would hand them to the atom parser instead.
 */
static gboolean
rspamd_expr_is_operation_symbol (gchar a)
{
	switch (a) {
	case '!':
	case '&':
	case '|':
	case '+':
	case '*':
	case '(':
	case ')':
	case '>':
	case '<':
		return TRUE;
	default:
		break;
	}

	return FALSE;
}

/* Return character representation of operation */
static enum rspamd_expression_op
rspamd_expr_str_to_op (const gchar *a, const gchar *end, const gchar **next)
{
enum rspamd_expression_op op = OP_INVALID;

g_assert (a < end);

switch (*a) {
case '!':
case '&':
case '|':
case '+':
case '*':
case '(':
case ')': {
if (a < end - 1) {
if ((a[0] == '&' && a[1] == '&') ||
(a[0] == '|' && a[1] == '|')) {
*next = a + 2;
}
else {
*next = a + 1;
}
}
/* XXX: not especially effective */
switch (*a) {
case '!':
op = OP_NOT;
break;
case '&':
case '*':
op = OP_MULT;
break;
case '|':
case '+':
op = OP_PLUS;
break;
case ')':
op = OP_CBRACE;
break;
case '(':
op = OP_OBRACE;
break;
default:
op = OP_INVALID;
break;
}
break;
}
case 'O':
case 'o':
if ((gulong)(end - a) >= sizeof ("or") &&
g_ascii_strncasecmp (a, "or", sizeof ("or") - 1) == 0) {
*next = a + sizeof ("or") - 1;
op = OP_PLUS;
}
break;
case 'A':
case 'a':
if ((gulong)(end - a) >= sizeof ("and") &&
g_ascii_strncasecmp (a, "and", sizeof ("and") - 1) == 0) {
*next = a + sizeof ("and") - 1;
op = OP_MULT;
}
break;
case 'N':
case 'n':
if ((gulong)(end - a) >= sizeof ("not") &&
g_ascii_strncasecmp (a, "not", sizeof ("not") - 1) == 0) {
*next = a + sizeof ("not") - 1;
op = OP_NOT;
}
break;
case '>':
if (a < end - 1 && a[1] == '=') {
*next = a + 2;
op = OP_GE;
}
else {
*next = a + 1;
op = OP_GT;
}
break;
case '<':
if (a < end - 1 && a[1] == '=') {
*next = a + 2;
op = OP_LE;
}
else {
*next = a + 1;
op = OP_LT;
}
break;
default:
op = OP_INVALID;
break;
}

return op;
}

/*
 * Parse the infix expression in line[0 .. len) into postfix form.
 *
 * Operands are either atoms, produced by subr->parse (), or numeric
 * limits; operators are reordered with an operator stack (shunting-yard)
 * and everything is appended to e->expressions in output order.
 *
 * line      - expression text, must not be NULL
 * len       - length of line; strlen (line) is used as a fallback (the
 *             guarding condition sits on a line hidden by the hunk header
 *             below - presumably `len == 0`, confirm)
 * subr      - atom callbacks, subr->parse must not be NULL
 * subr_data - passed through to subr->parse
 * pool      - passed through to subr->parse
 * err       - receives the error for "Empty number", "Bad operator" and
 *             "Braces mismatch"; atom errors are reported by subr->parse
 * target    - destination for the parsed expression
 *
 * Returns TRUE on success and FALSE on any error.
 *
 * NOTE(review), all based on the visible code only:
 *  - `state` has no visible assignment before the first `switch (state)`.
 *  - e->subr has no visible assignment.
 *  - the `err:` path returns without releasing `e` or its two arrays.
 *  - `if (*target)` reads through `target` and stores the result only when
 *    the pointee is already non-NULL; a NULL check of `target` itself was
 *    probably intended - confirm.
 */
gboolean
rspamd_parse_expression (const gchar *line, gsize len,
struct rspamd_atom_subr *subr, gpointer subr_data,
rspamd_mempool_t *pool, GError **err,
struct rspamd_expression **target)
{
struct rspamd_expression *e;
struct rspamd_expression_elt elt;
rspamd_expression_atom_t *atom;
rspamd_regexp_t *num_re;
enum rspamd_expression_op op, op_stack;
const gchar *p, *c, *end;

/* Tokenizer states; p is the cursor, c marks the start of a number */
enum {
PARSE_ATOM = 0,
PARSE_OP,
PARSE_LIM,
SKIP_SPACES
} state;

g_assert (line != NULL);
g_assert (subr != NULL && subr->parse != NULL);

@@ -95,6 +278,203 @@ rspamd_parse_expression (const gchar *line, gsize len,
len = strlen (line);
}

/* Matches a decimal number followed by optional spaces and '<' or '>' */
num_re = rspamd_regexp_cache_create (NULL, "/^\\d+\\s*[><]/", NULL, NULL);

p = line;
c = line;
end = line + len;
e = g_slice_alloc (sizeof (*e));
e->expressions = g_array_new (FALSE, FALSE,
sizeof (struct rspamd_expression_elt));
/* Operators are stored as integers packed into gpointer-sized slots */
e->expression_stack = g_array_sized_new (FALSE, FALSE, sizeof (gpointer), 32);

/* Shunting-yard algorithm */
while (p < end) {
switch (state) {
case PARSE_ATOM:
if (g_ascii_isspace (*p)) {
state = SKIP_SPACES;
}
else if (rspamd_expr_is_operation_symbol (*p)) {
/*
 * NOTE(review): this branch leaves both p and state unchanged, so
 * the loop makes no progress from here; PARSE_OP was probably
 * intended - confirm.
 */
state = PARSE_ATOM;
}
else {
/*
* First of all, we check some pre-conditions:
* 1) if we have 'and ' or 'or ' or 'not ' strings, they are op
* 2) if we have full numeric string, then we check for the following:
* ^\d+\s*[><]$
*/
if ((gulong)(end - p) > sizeof ("and ") &&
(g_ascii_strncasecmp (p, "and ", sizeof ("and ") - 1) == 0 ||
g_ascii_strncasecmp (p, "not ", sizeof ("not ") - 1) == 0 )) {
state = PARSE_OP;
}
else if ((gulong)(end - p) > sizeof ("or ") &&
g_ascii_strncasecmp (p, "or ", sizeof ("or ") - 1) == 0) {
state = PARSE_OP;
}
else if (rspamd_regexp_search (num_re, p, end - p, NULL, NULL,
FALSE)) {
c = p;
state = PARSE_LIM;
}
else {
/* Try to parse atom */
atom = subr->parse (p, end - p, pool, subr_data, err);
if (atom == NULL) {
/* We couldn't parse the atom, so go out */
goto err;
}
/* A zero-length atom would stall the cursor */
g_assert (atom->len != 0);
p = p + atom->len;

/* Push to output */
elt.type = ELT_ATOM;
elt.p.atom = atom;
g_array_append_val (e->expressions, elt);
/* NOTE(review): state stays PARSE_ATOM after the atom is consumed */
}
}
break;
case PARSE_LIM:
if (g_ascii_isdigit (*p)) {
p ++;
}
else {
/* First non-digit: the number spans [c, p) */
if (p - c > 0) {
elt.type = ELT_LIMIT;
elt.p.lim.val = strtoul (c, NULL, 10);
g_array_append_val (e->expressions, elt);
c = p;
state = SKIP_SPACES;
}
else {
g_set_error (err, rspamd_expr_quark(), 400, "Empty number");
goto err;
}
}
break;
case PARSE_OP:
/* On success p is moved past the operator through the last argument */
op = rspamd_expr_str_to_op (p, end, &p);
if (op == OP_INVALID) {
g_set_error (err, rspamd_expr_quark(), 500, "Bad operator %c",
*p);
goto err;
}
else if (op == OP_OBRACE) {
/*
* If the token is a left parenthesis, then push it onto
* the stack.
*/
rspamd_expr_stack_push (e, GINT_TO_POINTER (op));
}
else if (op == OP_CBRACE) {
/*
* Until the token at the top of the stack is a left
* parenthesis, pop operators off the stack onto the
* output queue.
*
* Pop the left parenthesis from the stack,
* but not onto the output queue.
*
* If the stack runs out without finding a left parenthesis,
* then there are mismatched parentheses.
*/
do {
op = GPOINTER_TO_INT (rspamd_expr_stack_pop (e));

/* OP_INVALID here means the stack ran empty */
if (op == OP_INVALID) {
g_set_error (err, rspamd_expr_quark(), 600,
"Braces mismatch");
goto err;
}

if (op != OP_OBRACE) {
elt.type = ELT_OP;
elt.p.op = op;
g_array_append_val (e->expressions, elt);
}

} while (op != OP_OBRACE);
}
else {
/*
* While there is an operator token, o2, at the top of
* the operator stack, and either:
*
* - o1 is left-associative and its precedence is less than
* or equal to that of o2, or
* - o1 is right associative, and has precedence less than
* that of o2,
*
* then pop o2 off the operator stack, onto the output queue;
*
* push o1 onto the operator stack.
*/

for (;;) {
op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e));

if (op_stack == OP_INVALID) {
/* Stack is empty */
break;
}

/* We ignore associativity for now */
if (op_stack != OP_OBRACE &&
rspamd_expr_logic_priority (op) <=
rspamd_expr_logic_priority(op_stack)) {
elt.type = ELT_OP;
elt.p.op = op_stack;
g_array_append_val (e->expressions, elt);
}
else {
/* Push op_stack back */
rspamd_expr_stack_push (e, GINT_TO_POINTER (op_stack));
break;
}
}

/* Push new operator itself */
rspamd_expr_stack_push (e, GINT_TO_POINTER (op));
}

/*
 * NOTE(review): state is still PARSE_OP here, so the character that
 * follows the operator is decoded as an operator too - confirm whether
 * a switch to SKIP_SPACES was intended.
 */
break;
case SKIP_SPACES:
if (g_ascii_isspace (*p)) {
p ++;
}
else if (rspamd_expr_is_operation_symbol (*p)) {
state = PARSE_OP;
}
else {
state = PARSE_ATOM;
}
}
}

/* Now we process the stack and push operators to the output */
while ((op_stack = GPOINTER_TO_INT (rspamd_expr_stack_pop (e)))
!= OP_INVALID) {
if (op_stack != OP_OBRACE) {
elt.type = ELT_OP;
elt.p.op = op_stack;
g_array_append_val (e->expressions, elt);
}
else {
/* A left brace that was never closed */
g_set_error (err, rspamd_expr_quark(), 600,
"Braces mismatch");
goto err;
}
}

if (*target) {
*target = e;
}

return TRUE;

err:
return FALSE;
}


+ 2
- 0
src/libutil/expression.h View File

@@ -33,6 +33,8 @@ typedef struct rspamd_expression_atom_s {
gpointer data;
/* String representation of atom */
const gchar *str;
/* Length of the string representation of atom */
gsize len;
/* Relative priority */
gint priority;
} rspamd_expression_atom_t;

Loading…
Cancel
Save