aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime/expressions.c
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-04-21 16:25:51 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-04-21 16:25:51 +0100
commit: 61555065f3d1c8badcc9573691232f1b6e42988c (patch)
tree: 563d5b7cb8c468530f7e79c4da0a75267b1184e1 /src/libmime/expressions.c
parent: ad5bf825b7f33bc10311673991f0cc888e69c0b1 (diff)
download: rspamd-61555065f3d1c8badcc9573691232f1b6e42988c.tar.gz
download: rspamd-61555065f3d1c8badcc9573691232f1b6e42988c.zip
Rework project structure, remove trash files.
Diffstat (limited to 'src/libmime/expressions.c')
-rw-r--r-- src/libmime/expressions.c 1452
1 files changed, 1452 insertions, 0 deletions
diff --git a/src/libmime/expressions.c b/src/libmime/expressions.c
new file mode 100644
index 000000000..5d19626bb
--- /dev/null
+++ b/src/libmime/expressions.c
@@ -0,0 +1,1452 @@
+/*
+ * Copyright (c) 2009-2012, Vsevolod Stakhov
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL AUTHOR BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "util.h"
+#include "cfg_file.h"
+#include "main.h"
+#include "message.h"
+#include "fuzzy.h"
+#include "expressions.h"
+#include "html.h"
+#include "lua/lua_common.h"
+#include "diff.h"
+
+/*
+ * Forward declarations of the built-in expression functions that are
+ * referenced by rspamd_functions_list below. All of them share the
+ * rspamd_internal_func_t calling convention: (task, argument list, user data).
+ */
+gboolean rspamd_compare_encoding (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_header_exists (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_parts_distance (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_recipients_distance (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_has_only_html_part (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_is_recipients_sorted (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_compare_transfer_encoding (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_is_html_balanced (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_has_html_tag (struct rspamd_task *task, GList * args, void *unused);
+gboolean rspamd_has_fake_html (struct rspamd_task *task, GList * args, void *unused);
+
+/*
+ * List of internal functions of rspamd
+ * Sorted by name to use bsearch
+ *
+ * The table must stay ordered by strcmp() on the name field, because
+ * call_expression_function() looks entries up with bsearch() and fl_cmp().
+ */
+static struct _fl {
+ /* Function name as it is written in expressions; this is the sort key */
+ const gchar *name;
+ /* Handler invoked with (task, args, user_data) */
+ rspamd_internal_func_t func;
+ /* Opaque pointer passed as the last argument to func */
+ void *user_data;
+} rspamd_functions_list[] = {
+ {"compare_encoding", rspamd_compare_encoding, NULL},
+ {"compare_parts_distance", rspamd_parts_distance, NULL},
+ {"compare_recipients_distance", rspamd_recipients_distance, NULL},
+ {"compare_transfer_encoding", rspamd_compare_transfer_encoding, NULL},
+ {"has_fake_html", rspamd_has_fake_html, NULL},
+ {"has_html_tag", rspamd_has_html_tag, NULL},
+ {"has_only_html_part", rspamd_has_only_html_part, NULL},
+ {"header_exists", rspamd_header_exists, NULL},
+ {"is_html_balanced", rspamd_is_html_balanced, NULL},
+ {"is_recipients_sorted", rspamd_is_recipients_sorted, NULL}
+};
+
+/* Current function table: the static array above until a function is registered at runtime */
+static struct _fl *list_ptr = &rspamd_functions_list[0];
+/* Number of entries reachable through list_ptr */
+static guint32 functions_number = sizeof (rspamd_functions_list) / sizeof (struct _fl);
+/* TRUE once list_ptr points to heap memory owned by register_expression_function() */
+static gboolean list_allocated = FALSE;
+
+/*
+ * Comparator for qsort()/bsearch() over struct _fl entries:
+ * orders two entries by their function name using strcmp().
+ */
+static gint
+fl_cmp (const void *s1, const void *s2)
+{
+    const struct _fl *lhs = s1;
+    const struct _fl *rhs = s2;
+
+    return strcmp (lhs->name, rhs->name);
+}
+
+/*
+ * Cache for regular expressions that are used in functions.
+ *
+ * Looks `line` up in the hash table stored in the pool variable "re_cache"
+ * and returns the cached pointer, or NULL when nothing is cached.
+ * If the pool has no cache yet, an empty one is created (and destroyed
+ * together with the pool) and NULL is returned.
+ */
+void *
+re_cache_check (const gchar *line, rspamd_mempool_t *pool)
+{
+    GHashTable *cache = rspamd_mempool_get_variable (pool, "re_cache");
+
+    if (cache != NULL) {
+        return g_hash_table_lookup (cache, line);
+    }
+
+    /* First access for this pool: nothing can be cached yet */
+    cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+    rspamd_mempool_set_variable (pool, "re_cache", cache, (rspamd_mempool_destruct_t)g_hash_table_destroy);
+
+    return NULL;
+}
+
+/*
+ * Store `pointer` under the key `line` in the pool's "re_cache" table,
+ * creating the table on demand. The key string is not copied, so it has
+ * to stay valid for as long as the table exists.
+ */
+void
+re_cache_add (const gchar *line, void *pointer, rspamd_mempool_t *pool)
+{
+    GHashTable *cache = rspamd_mempool_get_variable (pool, "re_cache");
+
+    if (!cache) {
+        cache = g_hash_table_new (rspamd_str_hash, rspamd_str_equal);
+        rspamd_mempool_set_variable (pool, "re_cache", cache, (rspamd_mempool_destruct_t)g_hash_table_destroy);
+    }
+
+    g_hash_table_insert (cache, (gpointer)line, pointer);
+}
+
+/*
+ * Remove the entry keyed by `line` from the pool's "re_cache" table.
+ * Does nothing when the pool has no cache table.
+ */
+void
+re_cache_del (const gchar *line, rspamd_mempool_t *pool)
+{
+    GHashTable *cache = rspamd_mempool_get_variable (pool, "re_cache");
+
+    if (cache == NULL) {
+        return;
+    }
+
+    g_hash_table_remove (cache, line);
+}
+
+/*
+ * Functions for parsing expressions
+ */
+/* Singly linked stack of operator characters used while building the postfix form */
+struct expression_stack {
+ /* Operator character stored in this node ('!', '&', '|' or '(') */
+ gchar op;
+ /* Node below this one, NULL at the bottom of the stack */
+ struct expression_stack *next;
+};
+
+/*
+ * Push operator `op` on top of the stack `head`.
+ * The node is allocated from `pool`; the new top of the stack is returned.
+ */
+static struct expression_stack *
+push_expression_stack (rspamd_mempool_t * pool, struct expression_stack *head, gchar op)
+{
+    struct expression_stack *top = rspamd_mempool_alloc (pool, sizeof (struct expression_stack));
+
+    top->next = head;
+    top->op = op;
+
+    return top;
+}
+
+/*
+ * Pop the top node from the stack and return its operator character.
+ * Returns 0 when the stack is empty. The node itself is not released here.
+ */
+static gchar
+delete_expression_stack (struct expression_stack **head)
+{
+    struct expression_stack *top = *head;
+
+    if (top == NULL) {
+        return 0;
+    }
+
+    *head = top->next;
+
+    return top->op;
+}
+
+/*
+ * Return operation priority: '!' binds tightest (3), '&' and '|' share
+ * priority 2, '(' has priority 1 and anything else has priority 0.
+ */
+static gint
+logic_priority (gchar a)
+{
+    if (a == '!') {
+        return 3;
+    }
+    if (a == '|' || a == '&') {
+        return 2;
+    }
+    if (a == '(') {
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Return TRUE if the text at `a` starts with an operation symbol:
+ * one of the characters ! & | ( ) or one of the words "or", "and", "not"
+ * (case insensitive) immediately followed by a whitespace character.
+ * Return FALSE otherwise (the text is an operand).
+ */
+static gboolean
+is_operation_symbol (gchar *a)
+{
+    const gchar first = *a;
+
+    if (first == '!' || first == '&' || first == '|' || first == '(' || first == ')') {
+        return TRUE;
+    }
+
+    if (first == 'O' || first == 'o') {
+        /* The word must be terminated by whitespace to count as an operator */
+        if (g_ascii_strncasecmp (a, "or", sizeof ("or") - 1) == 0 && g_ascii_isspace (a[2])) {
+            return TRUE;
+        }
+    }
+    else if (first == 'A' || first == 'a') {
+        if (g_ascii_strncasecmp (a, "and", sizeof ("and") - 1) == 0 && g_ascii_isspace (a[3])) {
+            return TRUE;
+        }
+    }
+    else if (first == 'N' || first == 'n') {
+        if (g_ascii_strncasecmp (a, "not", sizeof ("not") - 1) == 0 && g_ascii_isspace (a[3])) {
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+
+/*
+ * Return character representation of the operation that starts at `a`
+ * and store the position just past it in `*next`.
+ * The words "or", "and" and "not" (case insensitive) map to '|', '&' and '!'.
+ * Returns '\0' and leaves `*next` untouched when `a` is not an operation.
+ */
+static gchar
+op_to_char (gchar *a, gchar **next)
+{
+    const gchar first = *a;
+
+    if (first == '!' || first == '&' || first == '|' || first == '(' || first == ')') {
+        *next = a + 1;
+        return first;
+    }
+
+    if (first == 'O' || first == 'o') {
+        if (g_ascii_strncasecmp (a, "or", sizeof ("or") - 1) == 0) {
+            *next = a + sizeof ("or") - 1;
+            return '|';
+        }
+    }
+    else if (first == 'A' || first == 'a') {
+        if (g_ascii_strncasecmp (a, "and", sizeof ("and") - 1) == 0) {
+            *next = a + sizeof ("and") - 1;
+            return '&';
+        }
+    }
+    else if (first == 'N' || first == 'n') {
+        if (g_ascii_strncasecmp (a, "not", sizeof ("not") - 1) == 0) {
+            *next = a + sizeof ("not") - 1;
+            return '!';
+        }
+    }
+
+    return '\0';
+}
+
+/*
+ * Return TRUE if symbol can be regexp flag, i.e. it is one of
+ * i m x s u o r H M P U X T S.
+ */
+static gboolean
+is_regexp_flag (gchar a)
+{
+    static const gchar known_flags[] = "imxsuorHMPUXTS";
+
+    /* strchr() would also match the terminating NUL, which is not a flag */
+    if (a == '\0') {
+        return FALSE;
+    }
+
+    return strchr (known_flags, a) != NULL ? TRUE : FALSE;
+}
+
+/*
+ * Append a new element to the end of the expression list `*head`.
+ * For EXPR_OPERATION elements the operator `op` is stored, for every other
+ * type the `operand` pointer is stored. `orig` is saved in the element as is.
+ * The element is allocated from `pool`.
+ */
+static void
+insert_expression (rspamd_mempool_t * pool, struct expression **head, gint type, gchar op, void *operand, const gchar *orig)
+{
+    struct expression *node, **tail;
+
+    node = rspamd_mempool_alloc (pool, sizeof (struct expression));
+    node->type = type;
+    node->orig = orig;
+    node->next = NULL;
+
+    if (type == EXPR_OPERATION) {
+        node->content.operation = op;
+    }
+    else {
+        node->content.operand = operand;
+    }
+
+    /* Walk to the link that terminates the list and hook the node there */
+    for (tail = head; *tail != NULL; tail = &(*tail)->next) {
+        /* nothing */
+    }
+    *tail = node;
+}
+
+/*
+ * Parse a function argument.
+ * If `line` contains an operation symbol at any position it is handed to
+ * parse_expression(); otherwise a single EXPR_STR element holding a pool
+ * copy of `line` is returned.
+ */
+static struct expression *
+maybe_parse_expression (rspamd_mempool_t * pool, gchar *line)
+{
+    struct expression *expr;
+    gchar *p = line;
+
+    while (*p) {
+        if (is_operation_symbol (p)) {
+            return parse_expression (pool, line);
+        }
+        p++;
+    }
+
+    expr = rspamd_mempool_alloc (pool, sizeof (struct expression));
+    expr->type = EXPR_STR;
+    expr->content.operand = rspamd_mempool_strdup (pool, line);
+    /* The pool allocation is not zeroed, so every field has to be set explicitly */
+    expr->orig = expr->content.operand;
+    expr->next = NULL;
+
+    return expr;
+}
+
+/*
+ * Make inverse polish record for specified expression
+ * Memory is allocated from given pool
+ *
+ * The parser is a character driven state machine. Operands (regexps,
+ * functions with their arguments, plain symbols) are appended to the result
+ * list as soon as they are complete, operators go through an operator stack
+ * ordered by logic_priority().
+ *
+ * Returns the head of the resulting list, or NULL if `line` or `pool` is NULL,
+ * if a closing bracket has no matching operator on the stack, or if the input
+ * ends in the middle of a token.
+ */
+struct expression *
+parse_expression (rspamd_mempool_t * pool, gchar *line)
+{
+    struct expression *expr = NULL;
+    struct expression_stack *stack = NULL;
+    struct expression_function *func = NULL;
+    struct expression *arg;
+    GQueue *function_stack;
+    gchar *p, *c, *str, op, newop, *copy, *next;
+    gboolean in_regexp = FALSE;
+    gint brackets = 0;
+
+    enum {
+        SKIP_SPACES,
+        READ_OPERATOR,
+        READ_REGEXP,
+        READ_REGEXP_FLAGS,
+        READ_FUNCTION,
+        READ_FUNCTION_ARGUMENT,
+    } state = SKIP_SPACES;
+
+    if (line == NULL || pool == NULL) {
+        return NULL;
+    }
+
+    msg_debug ("parsing expression {{ %s }}", line);
+
+    function_stack = g_queue_new ();
+    /* Every produced element keeps a pointer to this copy of the source text */
+    copy = rspamd_mempool_strdup (pool, line);
+    p = line;
+    c = p;
+    while (*p) {
+        switch (state) {
+        case SKIP_SPACES:
+            if (!g_ascii_isspace (*p)) {
+                if (is_operation_symbol (p)) {
+                    state = READ_OPERATOR;
+                }
+                else if (*p == '/') {
+                    c = ++p;
+                    state = READ_REGEXP;
+                }
+                else {
+                    c = p;
+                    state = READ_FUNCTION;
+                }
+            }
+            else {
+                p++;
+            }
+            break;
+        case READ_OPERATOR:
+            if (*p == ')') {
+                if (stack == NULL) {
+                    /* Unbalanced ')': release the queue before bailing out */
+                    g_queue_free (function_stack);
+                    return NULL;
+                }
+                /* Pop all operators from stack to nearest '(' or to head */
+                while (stack && stack->op != '(') {
+                    op = delete_expression_stack (&stack);
+                    if (op != '(') {
+                        insert_expression (pool, &expr, EXPR_OPERATION, op, NULL, copy);
+                    }
+                }
+                if (stack) {
+                    /* Drop the matching '(' itself */
+                    op = delete_expression_stack (&stack);
+                }
+            }
+            else if (*p == '(') {
+                /* Push it to stack */
+                stack = push_expression_stack (pool, stack, *p);
+            }
+            else {
+                if (stack == NULL) {
+                    newop = op_to_char (p, &next);
+                    if (newop != '\0') {
+                        stack = push_expression_stack (pool, stack, newop);
+                        p = next;
+                        state = SKIP_SPACES;
+                        continue;
+                    }
+                }
+                /* Check priority of logic operation */
+                else {
+                    newop = op_to_char (p, &next);
+                    if (newop != '\0') {
+                        if (logic_priority (stack->op) < logic_priority (newop)) {
+                            stack = push_expression_stack (pool, stack, newop);
+                        }
+                        else {
+                            /* Pop all operations that have higher priority than this one */
+                            while ((stack != NULL) && (logic_priority (stack->op) >= logic_priority (newop))) {
+                                op = delete_expression_stack (&stack);
+                                if (op != '(') {
+                                    insert_expression (pool, &expr, EXPR_OPERATION, op, NULL, copy);
+                                }
+                            }
+                            stack = push_expression_stack (pool, stack, newop);
+                        }
+                    }
+                    p = next;
+                    state = SKIP_SPACES;
+                    continue;
+                }
+            }
+            p++;
+            state = SKIP_SPACES;
+            break;
+
+        case READ_REGEXP:
+            /* An unescaped '/' terminates the regexp body */
+            if (*p == '/' && *(p - 1) != '\\') {
+                if (*(p + 1)) {
+                    p++;
+                }
+                state = READ_REGEXP_FLAGS;
+            }
+            else {
+                p++;
+            }
+            break;
+
+        case READ_REGEXP_FLAGS:
+            if (!is_regexp_flag (*p) || *(p + 1) == '\0') {
+                if (c != p) {
+                    if ((is_regexp_flag (*p) || *p == '/') && *(p + 1) == '\0') {
+                        p++;
+                    }
+                    /* Copy starts at c - 1 to include the opening '/' */
+                    str = rspamd_mempool_alloc (pool, p - c + 2);
+                    rspamd_strlcpy (str, c - 1, (p - c + 2));
+                    g_strstrip (str);
+                    msg_debug ("found regexp: %s", str);
+                    if (strlen (str) > 0) {
+                        insert_expression (pool, &expr, EXPR_REGEXP, 0, str, copy);
+                    }
+                }
+                c = p;
+                state = SKIP_SPACES;
+            }
+            else {
+                p++;
+            }
+            break;
+
+        case READ_FUNCTION:
+            if (*p == '/') {
+                /* In fact it is regexp */
+                state = READ_REGEXP;
+                c++;
+                p++;
+            }
+            else if (*p == '(') {
+                func = rspamd_mempool_alloc (pool, sizeof (struct expression_function));
+                func->name = rspamd_mempool_alloc (pool, p - c + 1);
+                func->args = NULL;
+                rspamd_strlcpy (func->name, c, (p - c + 1));
+                g_strstrip (func->name);
+                state = READ_FUNCTION_ARGUMENT;
+                g_queue_push_tail (function_stack, func);
+                insert_expression (pool, &expr, EXPR_FUNCTION, 0, func, copy);
+                c = ++p;
+            }
+            else if (is_operation_symbol (p)) {
+                /* In fact it is not function, but symbol */
+                if (c != p) {
+                    str = rspamd_mempool_alloc (pool, p - c + 1);
+                    rspamd_strlcpy (str, c, (p - c + 1));
+                    g_strstrip (str);
+                    if (strlen (str) > 0) {
+                        insert_expression (pool, &expr, EXPR_STR, 0, str, copy);
+                    }
+                }
+                state = READ_OPERATOR;
+            }
+            else if (*(p + 1) == '\0') {
+                /* In fact it is not function, but symbol */
+                p++;
+                if (c != p) {
+                    str = rspamd_mempool_alloc (pool, p - c + 1);
+                    rspamd_strlcpy (str, c, (p - c + 1));
+                    g_strstrip (str);
+                    if (strlen (str) > 0) {
+                        insert_expression (pool, &expr, EXPR_STR, 0, str, copy);
+                    }
+                }
+                state = SKIP_SPACES;
+            }
+            else {
+                p++;
+            }
+            break;
+
+        case READ_FUNCTION_ARGUMENT:
+            if (*p == '/' && !in_regexp) {
+                in_regexp = TRUE;
+                p++;
+                /*
+                 * Re-enter the loop so that the terminating NUL is checked
+                 * before the next character is examined and skipped; input
+                 * such as "f(/" would otherwise advance past the end of line.
+                 */
+                continue;
+            }
+            if (!in_regexp) {
+                /* Append argument to list */
+                if (*p == ',' || (*p == ')' && brackets == 0)) {
+                    arg = NULL;
+                    str = rspamd_mempool_alloc (pool, p - c + 1);
+                    rspamd_strlcpy (str, c, (p - c + 1));
+                    g_strstrip (str);
+                    /* Recursive call */
+                    arg = maybe_parse_expression (pool, str);
+                    func->args = g_list_append (func->args, arg);
+                    /* Pop function */
+                    if (*p == ')') {
+                        /* Last function in chain, goto skipping spaces state */
+                        func = g_queue_pop_tail (function_stack);
+                        if (g_queue_get_length (function_stack) == 0) {
+                            state = SKIP_SPACES;
+                        }
+                    }
+                    c = p + 1;
+                }
+                else if (*p == '(') {
+                    brackets++;
+                }
+                else if (*p == ')') {
+                    brackets--;
+                }
+            }
+            else if (*p == '/' && *(p - 1) != '\\') {
+                in_regexp = FALSE;
+            }
+            p++;
+            break;
+        }
+    }
+
+    g_queue_free (function_stack);
+    if (state != SKIP_SPACES) {
+        /* In fact we got bad expression */
+        msg_warn ("expression \"%s\" is invalid", line);
+        return NULL;
+    }
+    /* Pop everything from stack */
+    while (stack != NULL) {
+        op = delete_expression_stack (&stack);
+        if (op != '(') {
+            insert_expression (pool, &expr, EXPR_OPERATION, op, NULL, copy);
+        }
+    }
+
+    return expr;
+}
+
+/*
+ * Rspamd regexp utility functions
+ */
+/*
+ * Parse a textual regexp definition of the form [header=]/pattern/flags.
+ *
+ * pool     - memory pool used for the result and for destructor registration
+ * line     - text to parse, leading whitespace is skipped
+ * raw_mode - when TRUE the regexp is always compiled with G_REGEX_RAW
+ *
+ * Returns the parsed (or previously cached) regexp structure, or NULL when
+ * the line is NULL, empty, has no terminated /pattern/ or fails to compile.
+ */
+struct rspamd_regexp *
+parse_regexp (rspamd_mempool_t * pool, const gchar *line, gboolean raw_mode)
+{
+    const gchar *begin, *end, *p, *src, *start;
+    gchar *dbegin, *dend;
+    struct rspamd_regexp *result, *check;
+    gint regexp_flags = G_REGEX_OPTIMIZE | G_REGEX_NO_AUTO_CAPTURE;
+    GError *err = NULL;
+
+    if (line == NULL) {
+        msg_err ("cannot parse NULL line");
+        return NULL;
+    }
+
+    src = line;
+    result = rspamd_mempool_alloc0 (pool, sizeof (struct rspamd_regexp));
+    /* Skip whitespaces */
+    while (g_ascii_isspace (*line)) {
+        line++;
+    }
+    if (*line == '\0') {
+        msg_warn ("got empty regexp");
+        return NULL;
+    }
+    start = line;
+    /* First try to find header name */
+    begin = strchr (line, '/');
+    if (begin != NULL) {
+        /* Search backwards from the first '/' for the '=' that ends the header name */
+        p = begin;
+        end = NULL;
+        while (p != line) {
+            if (*p == '=') {
+                end = p;
+                break;
+            }
+            p --;
+        }
+        if (end) {
+            result->header = rspamd_mempool_alloc (pool, end - line + 1);
+            rspamd_strlcpy (result->header, line, end - line + 1);
+            result->type = REGEXP_HEADER;
+            line = end;
+        }
+    }
+    else {
+        result->header = rspamd_mempool_strdup (pool, line);
+        result->type = REGEXP_HEADER;
+        line = start;
+    }
+    /* Find begin of regexp */
+    while (*line && *line != '/') {
+        line++;
+    }
+    if (*line != '\0') {
+        begin = line + 1;
+    }
+    else if (result->header == NULL) {
+        /* Assume that line without // is just a header name */
+        result->header = rspamd_mempool_strdup (pool, line);
+        result->type = REGEXP_HEADER;
+        return result;
+    }
+    else {
+        /* We got header name earlier but have not found // expression, so it is invalid regexp */
+        msg_warn ("got no header name (eg. header=) but without corresponding regexp, %s", src);
+        return NULL;
+    }
+    /* Find end */
+    end = begin;
+    while (*end && (*end != '/' || *(end - 1) == '\\')) {
+        end++;
+    }
+    if (end == begin || *end != '/') {
+        msg_warn ("no trailing / in regexp %s", src);
+        return NULL;
+    }
+    /* Parse flags */
+    p = end + 1;
+    while (p != NULL) {
+        switch (*p) {
+        case 'i':
+            regexp_flags |= G_REGEX_CASELESS;
+            p++;
+            break;
+        case 'm':
+            regexp_flags |= G_REGEX_MULTILINE;
+            p++;
+            break;
+        case 's':
+            regexp_flags |= G_REGEX_DOTALL;
+            p++;
+            break;
+        case 'x':
+            regexp_flags |= G_REGEX_EXTENDED;
+            p++;
+            break;
+        case 'u':
+            regexp_flags |= G_REGEX_UNGREEDY;
+            p++;
+            break;
+        case 'o':
+            regexp_flags |= G_REGEX_OPTIMIZE;
+            p++;
+            break;
+        case 'r':
+            regexp_flags |= G_REGEX_RAW;
+            result->is_raw = TRUE;
+            p++;
+            break;
+        /* Type flags */
+        case 'H':
+            if (result->type == REGEXP_NONE) {
+                result->type = REGEXP_HEADER;
+            }
+            p++;
+            break;
+        case 'M':
+            if (result->type == REGEXP_NONE) {
+                result->type = REGEXP_MESSAGE;
+            }
+            p++;
+            break;
+        case 'P':
+            if (result->type == REGEXP_NONE) {
+                result->type = REGEXP_MIME;
+            }
+            p++;
+            break;
+        case 'U':
+            if (result->type == REGEXP_NONE) {
+                result->type = REGEXP_URL;
+            }
+            p++;
+            break;
+        case 'X':
+            if (result->type == REGEXP_NONE || result->type == REGEXP_HEADER) {
+                result->type = REGEXP_RAW_HEADER;
+            }
+            p++;
+            break;
+        case 'T':
+            result->is_test = TRUE;
+            p ++;
+            break;
+        case 'S':
+            result->is_strong = TRUE;
+            p ++;
+            break;
+        /* Stop flags parsing */
+        default:
+            p = NULL;
+            break;
+        }
+    }
+
+    result->regexp_text = rspamd_mempool_strdup (pool, start);
+    dbegin = result->regexp_text + (begin - start);
+    dend = result->regexp_text + (end - start);
+    /* Temporarily cut the text at the closing '/' so that dbegin is the bare pattern */
+    *dend = '\0';
+
+    if (raw_mode) {
+        regexp_flags |= G_REGEX_RAW;
+    }
+
+    /* Avoid multiply regexp structures for similar regexps */
+    /*
+     * NOTE(review): the lookup below uses the text truncated at the closing
+     * '/', while re_cache_add() at the end stores the key after the '/' has
+     * been restored. Confirm that both are meant to use the same key.
+     */
+    if ((check = (struct rspamd_regexp *)re_cache_check (result->regexp_text, pool)) != NULL) {
+        /* Additional check for headers */
+        if (result->type == REGEXP_HEADER || result->type == REGEXP_RAW_HEADER) {
+            if (result->header && check->header) {
+                if (strcmp (result->header, check->header) == 0) {
+                    return check;
+                }
+            }
+        }
+        else {
+            return check;
+        }
+    }
+
+    result->regexp = g_regex_new (dbegin, regexp_flags, 0, &err);
+    if (result->regexp == NULL) {
+        /* Fail before err is reused and before a NULL destructor is registered */
+        *dend = '/';
+        msg_warn ("could not read regexp: %s while reading regexp %s", err != NULL ? err->message : "unknown error", src);
+        g_clear_error (&err);
+        return NULL;
+    }
+    rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_regex_unref, (void *)result->regexp);
+
+    if ((regexp_flags & G_REGEX_RAW) != 0) {
+        /* Already raw: both pointers share one object and one destructor */
+        result->raw_regexp = result->regexp;
+    }
+    else {
+        result->raw_regexp = g_regex_new (dbegin, regexp_flags | G_REGEX_RAW, 0, &err);
+        if (result->raw_regexp == NULL) {
+            *dend = '/';
+            msg_warn ("could not read raw regexp: %s while reading regexp %s", err != NULL ? err->message : "unknown error", src);
+            g_clear_error (&err);
+            return NULL;
+        }
+        rspamd_mempool_add_destructor (pool, (rspamd_mempool_destruct_t) g_regex_unref, (void *)result->raw_regexp);
+    }
+
+    /* Restore the closing '/' in the stored text */
+    *dend = '/';
+
+    /* Add to cache for further usage */
+    re_cache_add (result->regexp_text, result, pool);
+    return result;
+}
+
+/*
+ * Look up `func->name` in the sorted function table and invoke the handler
+ * with the task, the parsed argument list and the handler's user data.
+ * Returns FALSE when no function with that name is registered.
+ * The `L` argument is not used here.
+ */
+gboolean
+call_expression_function (struct expression_function * func, struct rspamd_task * task, lua_State *L)
+{
+    struct _fl needle, *found;
+
+    /* Only the name takes part in the comparison */
+    needle.name = func->name;
+    found = bsearch (&needle, list_ptr, functions_number, sizeof (struct _fl), fl_cmp);
+
+    if (found != NULL) {
+        return found->func (task, func->args, found->user_data);
+    }
+
+    /* Try to check lua function */
+    return FALSE;
+}
+
+/*
+ * Convert a parsed function argument into an expression_argument.
+ *
+ * expr        - parsed argument (list in postfix order)
+ * task        - task whose pool is used for the result and which is passed
+ *               to nested function calls
+ * want_string - when TRUE only a single string/regexp element is accepted
+ *
+ * A single string or regexp element yields EXPRESSION_ARGUMENT_NORMAL with
+ * the operand as data. A single function (if !want_string) is called and
+ * yields EXPRESSION_ARGUMENT_BOOL. A longer list (if !want_string) is
+ * evaluated as a boolean postfix expression over function results; if it
+ * contains a string or regexp element the whole list is returned unevaluated
+ * as EXPRESSION_ARGUMENT_EXPR.
+ * Returns NULL on a NULL or unacceptable expression.
+ */
+struct expression_argument *
+get_function_arg (struct expression *expr, struct rspamd_task *task, gboolean want_string)
+{
+    GQueue *stack;
+    gsize cur, op1, op2;
+    struct expression_argument *res;
+    struct expression *it;
+
+    if (expr == NULL) {
+        msg_warn ("NULL expression passed");
+        return NULL;
+    }
+    if (expr->next == NULL) {
+        res = rspamd_mempool_alloc (task->task_pool, sizeof (struct expression_argument));
+        if (expr->type == EXPR_REGEXP || expr->type == EXPR_STR || expr->type == EXPR_REGEXP_PARSED) {
+            res->type = EXPRESSION_ARGUMENT_NORMAL;
+            res->data = expr->content.operand;
+        }
+        else if (expr->type == EXPR_FUNCTION && !want_string) {
+            res->type = EXPRESSION_ARGUMENT_BOOL;
+            cur = call_expression_function (expr->content.operand, task, NULL);
+            res->data = GSIZE_TO_POINTER (cur);
+        }
+        else {
+            msg_warn ("cannot parse argument: it contains operator or bool expression that is not wanted");
+            return NULL;
+        }
+        return res;
+    }
+    else if (!want_string) {
+        res = rspamd_mempool_alloc (task->task_pool, sizeof (struct expression_argument));
+        res->type = EXPRESSION_ARGUMENT_BOOL;
+        stack = g_queue_new ();
+        it = expr;
+
+        while (it) {
+            if (it->type == EXPR_REGEXP || it->type == EXPR_REGEXP_PARSED || it->type == EXPR_STR) {
+                /* Not a pure boolean expression: hand the whole list to the caller */
+                g_queue_free (stack);
+                res->type = EXPRESSION_ARGUMENT_EXPR;
+                res->data = expr;
+                return res;
+            }
+            else if (it->type == EXPR_FUNCTION) {
+                cur = (gsize) call_expression_function ((struct expression_function *)it->content.operand, task, NULL);
+                debug_task ("function %s returned %s", ((struct expression_function *)it->content.operand)->name, cur ? "true" : "false");
+                /* The result is the operand of the operators that follow */
+                g_queue_push_head (stack, GSIZE_TO_POINTER (cur));
+            }
+            else if (it->type == EXPR_OPERATION) {
+                if (g_queue_is_empty (stack)) {
+                    /* Queue has no operands for operation, exiting */
+                    debug_task ("invalid expression");
+                    g_queue_free (stack);
+                    return NULL;
+                }
+                switch (it->content.operation) {
+                case '!':
+                    op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+                    op1 = !op1;
+                    g_queue_push_head (stack, GSIZE_TO_POINTER (op1));
+                    break;
+                case '&':
+                    op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+                    op2 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+                    g_queue_push_head (stack, GSIZE_TO_POINTER (op1 && op2));
+                    break;
+                case '|':
+                    op1 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+                    op2 = GPOINTER_TO_SIZE (g_queue_pop_head (stack));
+                    g_queue_push_head (stack, GSIZE_TO_POINTER (op1 || op2));
+                    break;
+                default:
+                    it = it->next;
+                    continue;
+                }
+            }
+            if (it) {
+                it = it->next;
+            }
+        }
+        if (!g_queue_is_empty (stack)) {
+            res->data = g_queue_pop_head (stack);
+        }
+        else {
+            res->data = GSIZE_TO_POINTER (FALSE);
+        }
+
+        /* The queue was only a scratch evaluation stack */
+        g_queue_free (stack);
+
+        return res;
+    }
+
+    msg_warn ("invalid expression argument");
+
+    return NULL;
+}
+
+/*
+ * Register an additional expression function.
+ * A new table one element larger than the current one is allocated, the
+ * existing entries are copied into it, the new entry is appended and the
+ * table is sorted again by name. A previously heap allocated table is freed;
+ * the initial static table is never freed.
+ */
+void
+register_expression_function (const gchar *name, rspamd_internal_func_t func, void *user_data)
+{
+    static struct _fl *grown;
+    guint32 last;
+
+    functions_number++;
+    last = functions_number - 1;
+
+    grown = g_new (struct _fl, functions_number);
+    memcpy (grown, list_ptr, last * sizeof (struct _fl));
+    if (list_allocated) {
+        g_free (list_ptr);
+    }
+    list_allocated = TRUE;
+
+    grown[last].name = name;
+    grown[last].func = func;
+    grown[last].user_data = user_data;
+
+    /* Keep the table ordered for the bsearch() lookup */
+    qsort (grown, functions_number, sizeof (struct _fl), fl_cmp);
+    list_ptr = grown;
+}
+
+/*
+ * Expression function "compare_encoding".
+ * Currently only validates its first argument: returns FALSE when there are
+ * no arguments, no task, or the argument cannot be taken as a string, and
+ * TRUE otherwise. The actual comparison is not implemented yet.
+ */
+gboolean
+rspamd_compare_encoding (struct rspamd_task *task, GList * args, void *unused)
+{
+    struct expression_argument *arg;
+
+    if (args == NULL || task == NULL) {
+        return FALSE;
+    }
+
+    arg = get_function_arg (args->data, task, TRUE);
+    /* get_function_arg() returns NULL for arguments it cannot convert */
+    if (!arg || arg->type == EXPRESSION_ARGUMENT_BOOL) {
+        msg_warn ("invalid argument to function is passed");
+        return FALSE;
+    }
+
+    /* XXX: really write this function */
+    return TRUE;
+}
+
+/*
+ * Expression function "header_exists".
+ * Takes the header name as its first argument and returns TRUE when
+ * message_get_header() yields a non-empty list for it.
+ * Returns FALSE on missing task/arguments or on a non-string argument.
+ */
+gboolean
+rspamd_header_exists (struct rspamd_task * task, GList * args, void *unused)
+{
+    struct expression_argument *name_arg;
+    GList *found;
+
+    if (task == NULL || args == NULL) {
+        return FALSE;
+    }
+
+    name_arg = get_function_arg (args->data, task, TRUE);
+    if (name_arg == NULL || name_arg->type == EXPRESSION_ARGUMENT_BOOL) {
+        msg_warn ("invalid argument to function is passed");
+        return FALSE;
+    }
+
+    debug_task ("try to get header %s", (gchar *)name_arg->data);
+    found = message_get_header (task->task_pool, task->message, (gchar *)name_arg->data, FALSE);
+    if (found == NULL) {
+        return FALSE;
+    }
+
+    /* Only the presence matters, the list itself is not needed */
+    g_list_free (found);
+
+    return TRUE;
+}
+
+/*
+ * This function is designed to find difference between text/html and text/plain parts
+ * It takes one argument: difference threshold, if we have two text parts, compare
+ * its hashes and check for threshold, if value is greater than threshold, return TRUE
+ * and return FALSE otherwise.
+ *
+ * An optional second argument turns the check into a range test:
+ * TRUE is returned when min(t1, t2) <= diff < max(t1, t2).
+ * The computed value is stored in the task pool variable "parts_distance"
+ * and reused by later calls for the same task.
+ */
+gboolean
+rspamd_parts_distance (struct rspamd_task * task, GList * args, void *unused)
+{
+    gint threshold, threshold2 = -1, diff;
+    struct mime_text_part *p1, *p2;
+    GList *cur;
+    struct expression_argument *arg;
+    GMimeObject *parent;
+    const GMimeContentType *ct;
+    gint *pdiff;
+
+    if (args == NULL) {
+        debug_task ("no threshold is specified, assume it 100");
+        threshold = 100;
+    }
+    else {
+        arg = get_function_arg (args->data, task, TRUE);
+        if (arg == NULL || arg->data == NULL) {
+            /* The argument is not a plain string (e.g. a nested function) */
+            msg_info ("bad argument for threshold, assume it 100");
+            threshold = 100;
+        }
+        else {
+            /* Clear errno right before the conversion it is checked for */
+            errno = 0;
+            threshold = strtoul ((gchar *)arg->data, NULL, 10);
+            if (errno != 0) {
+                msg_info ("bad numeric value for threshold \"%s\", assume it 100", (gchar *)arg->data);
+                threshold = 100;
+            }
+        }
+        if (args->next) {
+            arg = get_function_arg (args->next->data, task, TRUE);
+            if (arg == NULL || arg->data == NULL) {
+                msg_info ("bad argument for second threshold, ignore it");
+                threshold2 = -1;
+            }
+            else {
+                errno = 0;
+                threshold2 = strtoul ((gchar *)arg->data, NULL, 10);
+                if (errno != 0) {
+                    msg_info ("bad numeric value for threshold \"%s\", ignore it", (gchar *)arg->data);
+                    threshold2 = -1;
+                }
+            }
+        }
+    }
+
+    /* Reuse the distance computed by an earlier call for this task */
+    if ((pdiff = rspamd_mempool_get_variable (task->task_pool, "parts_distance")) != NULL) {
+        diff = *pdiff;
+        if (diff != -1) {
+            if (threshold2 > 0) {
+                if (diff >= MIN (threshold, threshold2) && diff < MAX (threshold, threshold2)) {
+                    return TRUE;
+                }
+            }
+            else {
+                if (diff <= threshold) {
+                    return TRUE;
+                }
+            }
+            return FALSE;
+        }
+        else {
+            /* -1 marks a message whose parts could not be compared */
+            return FALSE;
+        }
+    }
+
+    if (g_list_length (task->text_parts) == 2) {
+        cur = g_list_first (task->text_parts);
+        p1 = cur->data;
+        cur = g_list_next (cur);
+        pdiff = rspamd_mempool_alloc (task->task_pool, sizeof (gint));
+        *pdiff = -1;
+
+        if (cur == NULL) {
+            msg_info ("bad parts list");
+            return FALSE;
+        }
+        p2 = cur->data;
+        /* First of all check parent object */
+        if (p1->parent && p1->parent == p2->parent) {
+            parent = p1->parent;
+            ct = g_mime_object_get_content_type (parent);
+#ifndef GMIME24
+            if (ct == NULL || ! g_mime_content_type_is_type (ct, "multipart", "alternative")) {
+#else
+            if (ct == NULL || ! g_mime_content_type_is_type ((GMimeContentType *)ct, "multipart", "alternative")) {
+#endif
+                debug_task ("two parts are not belong to multipart/alternative container, skip check");
+                rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
+                return FALSE;
+            }
+        }
+        else {
+            debug_task ("message contains two parts but they are in different multi-parts");
+            rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
+            return FALSE;
+        }
+        if (!p1->is_empty && !p2->is_empty) {
+            if (p1->diff_str != NULL && p2->diff_str != NULL) {
+                diff = compare_diff_distance_normalized (p1->diff_str, p2->diff_str);
+            }
+            else {
+                diff = fuzzy_compare_parts (p1, p2);
+            }
+            debug_task ("got likeliness between parts of %d%%, threshold is %d%%", diff, threshold);
+            *pdiff = diff;
+            rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
+            if (threshold2 > 0) {
+                if (diff >= MIN (threshold, threshold2) && diff < MAX (threshold, threshold2)) {
+                    return TRUE;
+                }
+            }
+            else {
+                if (diff <= threshold) {
+                    return TRUE;
+                }
+            }
+        }
+        else if ((p1->is_empty && !p2->is_empty) || (!p1->is_empty && p2->is_empty)) {
+            /* Empty and non empty parts are different */
+            *pdiff = 0;
+            rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
+            return TRUE;
+        }
+    }
+    else {
+        debug_task ("message has too many text parts, so do not try to compare them with each other");
+        /* pdiff is NULL on this path, so nothing is cached for later calls */
+        rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
+        return FALSE;
+    }
+
+    rspamd_mempool_set_variable (task->task_pool, "parts_distance", pdiff, NULL);
+    return FALSE;
+}
+
+/* A recipient address split at '@' for the recipient comparison functions */
+struct addr_list {
+ /* Text before the '@' */
+ const gchar *name;
+ /* Text after the '@' */
+ const gchar *addr;
+};
+
+/* Number of leading characters of the name part compared between recipients */
+#define COMPARE_RCPT_LEN 3
+/* Recipient lists shorter than this are not analysed */
+#define MIN_RCPT_TO_COMPARE 7
+
+/*
+ * Expression function "compare_recipients_distance".
+ * Takes a numeric threshold as its first argument. Every pair of recipients
+ * is compared: a pair counts as a hit when the first COMPARE_RCPT_LEN
+ * characters of the name parts match, or otherwise when the parts after '@'
+ * are equal (both case insensitive). Returns TRUE when
+ * (hits * num / 2) / total_pairs >= threshold.
+ * Returns FALSE on missing or invalid argument, when the task has no
+ * recipients or fewer than MIN_RCPT_TO_COMPARE of them.
+ */
+gboolean
+rspamd_recipients_distance (struct rspamd_task *task, GList * args, void *unused)
+{
+    struct expression_argument *arg;
+    InternetAddressList *cur;
+    InternetAddress *addr;
+    double threshold;
+    struct addr_list *ar;
+    gchar *c;
+    gint num, i, j, hits = 0, total = 0;
+
+    if (args == NULL) {
+        msg_warn ("no parameters to function");
+        return FALSE;
+    }
+
+    arg = get_function_arg (args->data, task, TRUE);
+    /* get_function_arg() returns NULL for arguments it cannot convert */
+    if (arg == NULL || arg->data == NULL) {
+        msg_warn ("invalid argument to function is passed");
+        return FALSE;
+    }
+    errno = 0;
+    threshold = strtod ((gchar *)arg->data, NULL);
+    if (errno != 0) {
+        msg_warn ("invalid numeric value '%s': %s", (gchar *)arg->data, strerror (errno));
+        return FALSE;
+    }
+
+    if (!task->rcpts) {
+        return FALSE;
+    }
+    num = internet_address_list_length (task->rcpts);
+    if (num < MIN_RCPT_TO_COMPARE) {
+        return FALSE;
+    }
+    /* Zeroed, so entries that are never filled keep NULL name/addr */
+    ar = rspamd_mempool_alloc0 (task->task_pool, num * sizeof (struct addr_list));
+
+    /* Fill array */
+    cur = task->rcpts;
+#ifdef GMIME24
+    for (i = 0; i < num; i ++) {
+        addr = internet_address_list_get_address (cur, i);
+        ar[i].name = rspamd_mempool_strdup (task->task_pool, internet_address_get_name (addr));
+        if (ar[i].name != NULL && (c = strchr (ar[i].name, '@')) != NULL) {
+            /* Split the copy in place into name and addr parts */
+            *c = '\0';
+            ar[i].addr = c + 1;
+        }
+    }
+#else
+    i = 0;
+    while (cur) {
+        addr = internet_address_list_get_address (cur);
+        if (addr && internet_address_get_type (addr) == INTERNET_ADDRESS_NAME) {
+            ar[i].name = rspamd_mempool_strdup (task->task_pool, internet_address_get_addr (addr));
+            if (ar[i].name != NULL && (c = strchr (ar[i].name, '@')) != NULL) {
+                /* Split the copy in place into name and addr parts */
+                *c = '\0';
+                ar[i].addr = c + 1;
+            }
+            cur = internet_address_list_next (cur);
+            i++;
+        }
+        else {
+            cur = internet_address_list_next (cur);
+        }
+    }
+#endif
+
+    /* Cycle all elements in array */
+    for (i = 0; i < num; i++) {
+        for (j = i + 1; j < num; j++) {
+            if (ar[i].name && ar[j].name && g_ascii_strncasecmp (ar[i].name, ar[j].name, COMPARE_RCPT_LEN) == 0) {
+                /* Common name part */
+                hits++;
+            }
+            else if (ar[i].addr && ar[j].addr && g_ascii_strcasecmp (ar[i].addr, ar[j].addr) == 0) {
+                /* Common address part, but different name */
+                hits++;
+            }
+            total++;
+        }
+    }
+
+    if ((double)(hits * num / 2.) / (double)total >= threshold) {
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+/*
+ * Expression function "has_only_html_part".
+ * Returns TRUE when the task has at least one text part and every text
+ * part is HTML; returns FALSE otherwise. Arguments are ignored.
+ */
+gboolean
+rspamd_has_only_html_part (struct rspamd_task * task, GList * args, void *unused)
+{
+    GList *node = g_list_first (task->text_parts);
+    struct mime_text_part *part;
+
+    if (node == NULL) {
+        /* No text parts at all */
+        return FALSE;
+    }
+
+    for (; node != NULL; node = g_list_next (node)) {
+        part = node->data;
+        if (!part->is_html) {
+            return FALSE;
+        }
+    }
+
+    return TRUE;
+}
+
+/*
+ * Check whether the addresses of list `ia` are in non-descending
+ * case-insensitive order. Lists shorter than MIN_RCPT_TO_COMPARE are
+ * reported as not sorted. Entries without an address string do not break
+ * the order by themselves.
+ */
+static gboolean
+is_recipient_list_sorted (const InternetAddressList * ia)
+{
+    const InternetAddressList *cur;
+    InternetAddress *addr;
+    gboolean sorted = TRUE;
+    struct addr_list current = { NULL, NULL };
+    struct addr_list previous = { NULL, NULL };
+#ifdef GMIME24
+    gint num, i;
+#endif
+
+    /* Do not check too short address lists */
+    if (internet_address_list_length ((InternetAddressList *)ia) < MIN_RCPT_TO_COMPARE) {
+        return FALSE;
+    }
+#ifdef GMIME24
+    num = internet_address_list_length ((InternetAddressList *)ia);
+    cur = ia;
+    for (i = 0; i < num; i ++) {
+        addr = internet_address_list_get_address ((InternetAddressList *)cur, i);
+        current.addr = (gchar *)internet_address_get_name (addr);
+        /* An element that sorts before its predecessor breaks the order */
+        if (previous.addr != NULL && current.addr != NULL &&
+                g_ascii_strcasecmp (current.addr, previous.addr) < 0) {
+            sorted = FALSE;
+            break;
+        }
+        previous.addr = current.addr;
+    }
+#else
+    for (cur = ia; cur != NULL; cur = internet_address_list_next (cur)) {
+        addr = internet_address_list_get_address (cur);
+        if (internet_address_get_type (addr) != INTERNET_ADDRESS_NAME) {
+            continue;
+        }
+        current.addr = internet_address_get_addr (addr);
+        /* An element that sorts before its predecessor breaks the order */
+        if (previous.addr != NULL && current.addr != NULL &&
+                g_ascii_strcasecmp (current.addr, previous.addr) < 0) {
+            sorted = FALSE;
+            break;
+        }
+        previous.addr = current.addr;
+    }
+#endif
+
+    return sorted;
+}
+
+/*
+ * Expression function: TRUE when at least one of the To, Bcc or Cc recipient
+ * lists of the message (examined in that order) passes
+ * is_recipient_list_sorted. The args and unused parameters are not read.
+ */
+gboolean
+rspamd_is_recipients_sorted (struct rspamd_task * task, GList * args, void *unused)
+{
+	gboolean sorted;
+
+	/* The first sorted list short-circuits the remaining lookups */
+	sorted = is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_TO))
+		|| is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_BCC))
+		|| is_recipient_list_sorted (g_mime_message_get_recipients (task->message, GMIME_RECIPIENT_TYPE_CC));
+
+	return sorted ? TRUE : FALSE;
+}
+
+/*
+ * Expression function: compare the transfer encoding of the message's
+ * top-level MIME part with the encoding named by the first argument.
+ *
+ * Returns TRUE when both encodings are equal. Returns FALSE when they differ,
+ * when no usable argument is given, when the argument maps to the "default"
+ * encoding value (treated as an unknown name), when the message has no MIME
+ * part, or when that part is not a GMimePart. A part reporting the default
+ * encoding is treated as 7bit.
+ */
+gboolean
+rspamd_compare_transfer_encoding (struct rspamd_task * task, GList * args, void *unused)
+{
+	GMimeObject *part;
+#ifndef GMIME24
+	GMimePartEncodingType enc_req, part_enc;
+#else
+	GMimeContentEncoding enc_req, part_enc;
+#endif
+	struct expression_argument *arg;
+	gboolean known_enc, res = FALSE;
+
+	if (args == NULL) {
+		msg_warn ("no parameters to function");
+		return FALSE;
+	}
+
+	arg = get_function_arg (args->data, task, TRUE);
+	/* Do not dereference a missing argument or feed a NULL string to the parser */
+	if (arg == NULL || arg->data == NULL) {
+		msg_warn ("invalid argument to function is passed");
+		return FALSE;
+	}
+
+#ifndef GMIME24
+	enc_req = g_mime_part_encoding_from_string (arg->data);
+	known_enc = (enc_req != GMIME_PART_ENCODING_DEFAULT);
+#else
+	enc_req = g_mime_content_encoding_from_string (arg->data);
+	known_enc = (enc_req != GMIME_CONTENT_ENCODING_DEFAULT);
+#endif
+	if (!known_enc) {
+		msg_warn ("bad encoding type: %s", (gchar *)arg->data);
+		return FALSE;
+	}
+
+	part = g_mime_message_get_mime_part (task->message);
+	if (part == NULL) {
+		return FALSE;
+	}
+
+	if (GMIME_IS_PART (part)) {
+#ifndef GMIME24
+		part_enc = g_mime_part_get_encoding (GMIME_PART (part));
+		if (part_enc == GMIME_PART_ENCODING_DEFAULT) {
+			/* Assume 7bit as default transfer encoding */
+			part_enc = GMIME_PART_ENCODING_7BIT;
+		}
+#else
+		part_enc = g_mime_part_get_content_encoding (GMIME_PART (part));
+		if (part_enc == GMIME_CONTENT_ENCODING_DEFAULT) {
+			/* Assume 7bit as default transfer encoding */
+			part_enc = GMIME_CONTENT_ENCODING_7BIT;
+		}
+#endif
+
+		debug_task ("got encoding in part: %d and compare with %d", (gint)part_enc, (gint)enc_req);
+		res = (part_enc == enc_req);
+	}
+
+#ifndef GMIME24
+	/* Builds without GMIME24 release the part on every path that obtained it */
+	g_object_unref (part);
+#endif
+
+	return res;
+}
+
+/*
+ * Expression function: FALSE when some non-empty HTML text part does not have
+ * is_balanced set, TRUE otherwise (including messages without such parts).
+ * The args and unused parameters are not read.
+ */
+gboolean
+rspamd_is_html_balanced (struct rspamd_task * task, GList * args, void *unused)
+{
+	GList *item;
+	struct mime_text_part *part;
+
+	for (item = g_list_first (task->text_parts); item != NULL; item = g_list_next (item)) {
+		part = item->data;
+		if (part->is_empty || !part->is_html) {
+			/* Only non-empty HTML parts take part in the check */
+			continue;
+		}
+		if (!part->is_balanced) {
+			return FALSE;
+		}
+	}
+
+	return TRUE;
+}
+
+struct html_callback_data { /* search state handed to search_html_node_callback */
+ struct html_tag *tag; /* tag to look for; nodes are matched by pointer equality */
+ gboolean *res; /* the callback stores TRUE through this pointer on a match */
+};
+
+/*
+ * GNode traversal callback used to look for a given HTML tag.
+ *
+ * data points to a struct html_callback_data. When the node carries data
+ * whose tag is the searched one, TRUE is stored through the res pointer and
+ * TRUE is returned (which asks g_node_traverse to stop); otherwise FALSE is
+ * returned.
+ */
+static gboolean
+search_html_node_callback (GNode * node, gpointer data)
+{
+	struct html_callback_data *ctx = data;
+	struct html_node *hnode = node->data;
+
+	if (hnode == NULL || hnode->tag != ctx->tag) {
+		return FALSE;
+	}
+
+	*ctx->res = TRUE;
+	return TRUE;
+}
+
+/*
+ * Expression function: check whether any non-empty HTML text part with a
+ * parsed node tree contains the tag named by the first argument.
+ *
+ * Returns TRUE on the first part whose tree holds the tag. Returns FALSE when
+ * no part matches, when no usable argument is given, or when the tag name is
+ * not known to get_tag_by_name.
+ */
+gboolean
+rspamd_has_html_tag (struct rspamd_task * task, GList * args, void *unused)
+{
+	struct mime_text_part *p;
+	GList *cur;
+	struct expression_argument *arg;
+	struct html_tag *tag;
+	gboolean res = FALSE;
+	struct html_callback_data cd;
+
+	if (args == NULL) {
+		msg_warn ("no parameters to function");
+		return FALSE;
+	}
+
+	arg = get_function_arg (args->data, task, TRUE);
+	/* Do not dereference a missing argument or look up a NULL tag name */
+	if (arg == NULL || arg->data == NULL) {
+		msg_warn ("invalid argument to function is passed");
+		return FALSE;
+	}
+
+	tag = get_tag_by_name (arg->data);
+	if (tag == NULL) {
+		msg_warn ("unknown tag type passed as argument: %s", (gchar *)arg->data);
+		return FALSE;
+	}
+
+	/* The callback reports a match by writing TRUE into res */
+	cd.res = &res;
+	cd.tag = tag;
+
+	for (cur = g_list_first (task->text_parts); cur != NULL && res == FALSE; cur = g_list_next (cur)) {
+		p = cur->data;
+		if (!p->is_empty && p->is_html && p->html_nodes) {
+			g_node_traverse (p->html_nodes, G_PRE_ORDER, G_TRAVERSE_ALL, -1, search_html_node_callback, &cd);
+		}
+	}
+
+	return res;
+}
+
+/*
+ * Expression function: TRUE when some non-empty text part is marked as HTML
+ * but has no html_nodes tree, FALSE otherwise. The args and unused
+ * parameters are not read.
+ */
+gboolean
+rspamd_has_fake_html (struct rspamd_task * task, GList * args, void *unused)
+{
+	GList *item;
+	struct mime_text_part *part;
+
+	for (item = g_list_first (task->text_parts); item != NULL; item = g_list_next (item)) {
+		part = item->data;
+		if (!part->is_empty && part->is_html && part->html_nodes == NULL) {
+			return TRUE;
+		}
+	}
+
+	return FALSE;
+}
+
+
+/*
+ * vi:ts=4
+ */