* Add function regexp_occurs_number that allows testing how many occurrences of a regexp...

author Vsevolod Stakhov <vsevolod@rambler-co.ru>

Fri, 18 Feb 2011 13:53:03 +0000 (16:53 +0300)

committer Vsevolod Stakhov <vsevolod@rambler-co.ru>

Fri, 18 Feb 2011 13:53:03 +0000 (16:53 +0300)
author Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 18 Feb 2011 13:53:03 +0000 (16:53 +0300)
committer Vsevolod Stakhov <vsevolod@rambler-co.ru>
Fri, 18 Feb 2011 13:53:03 +0000 (16:53 +0300)
diff --git a/src/filter.c b/src/filter.c

index 91b0196d4b94c827e1de7d4b4f5b519fc0a728ed..e0c4e89b8d2dca9bd505f75f4b8bda482a064783 100644 (file)
--- a/src/filter.c
+++ b/src/filter.c
@@ -44,7 +44,8 @@
  #endif
  
  static void
-insert_metric_result (struct worker_task *task, struct metric *metric, const gchar *symbol, double flag, GList * opts)
+insert_metric_result (struct worker_task *task, struct metric *metric, const gchar *symbol,
+               double flag, GList * opts, gboolean single)
  {
         struct metric_result           *metric_res;
         struct symbol                  *s;
@@ -81,28 +82,32 @@ insert_metric_result (struct worker_task *task, struct metric *metric, const gch
         }
  
         /* Add metric score */
-       metric_res->score += w;
+
  
         if ((s = g_hash_table_lookup (metric_res->symbols, symbol)) != NULL) {
-               if (s->options && opts && opts != s->options) {
-                       /* Append new options */
-                       s->options = g_list_concat (s->options, g_list_copy(opts));
-                       /* 
-                       * Note that there is no need to add new destructor of GList as elements of appended
-                       * GList are used directly, so just free initial GList
-                       */
-               }
-               else if (opts) {
-                       s->options = opts;
-                       memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_list_free, s->options);
-               }
+               if (!single) {
+                       if (s->options && opts && opts != s->options) {
+                               /* Append new options */
+                               s->options = g_list_concat (s->options, g_list_copy(opts));
+                               /*
+                                * Note that there is no need to add new destructor of GList as elements of appended
+                                * GList are used directly, so just free initial GList
+                                */
+                       }
+                       else if (opts) {
+                               s->options = opts;
+                               memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_list_free, s->options);
+                       }
  
-               s->score += w;
+                       s->score += w;
+                       metric_res->score += w;
+               }
         }
         else {
                 s = memory_pool_alloc (task->task_pool, sizeof (struct symbol));
                 s->score = w;
                 s->options = opts;
+               metric_res->score += w;
  
                 if (opts) {
                         memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_list_free, s->options);
@@ -114,8 +119,8 @@ insert_metric_result (struct worker_task *task, struct metric *metric, const gch
         
  }
  
-void
-insert_result (struct worker_task *task, const gchar *symbol, double flag, GList * opts)
+static void
+insert_result_common (struct worker_task *task, const gchar *symbol, double flag, GList * opts, gboolean single)
  {
         struct metric                  *metric;
         struct cache_item              *item;
@@ -127,13 +132,13 @@ insert_result (struct worker_task *task, const gchar *symbol, double flag, GList
                 
                 while (cur) {
                         metric = cur->data;
-                       insert_metric_result (task, metric, symbol, flag, opts);
+                       insert_metric_result (task, metric, symbol, flag, opts, single);
                         cur = g_list_next (cur);
                 }
         }
         else {
                 /* Insert symbol to default metric */
-               insert_metric_result (task, task->cfg->default_metric, symbol, flag, opts);
+               insert_metric_result (task, task->cfg->default_metric, symbol, flag, opts, single);
         }
  
         /* Process cache item */
@@ -161,6 +166,20 @@ insert_result (struct worker_task *task, const gchar *symbol, double flag, GList
         }
  }
  
+/* Insert result; when the symbol is already present its score and options are accumulated */
+void
+insert_result (struct worker_task *task, const gchar *symbol, double flag, GList * opts)
+{
+       insert_result_common (task, symbol, flag, opts, FALSE);
+}
+
+/* Insert result only once: a symbol that is already present in a metric is left unchanged */
+void
+insert_result_single (struct worker_task *task, const gchar *symbol, double flag, GList * opts)
+{
+       insert_result_common (task, symbol, flag, opts, TRUE);
+}
+
  /* 
   * Call perl or C module function for specified part of message 
   */
@@ -321,6 +340,7 @@ composites_foreach_callback (gpointer key, gpointer value, void *data)
         gsize                           cur, op1, op2;
         gchar                           logbuf[256];
         gint                            r;
+       struct symbol                  *ms;
  
         stack = g_queue_new ();
  
@@ -332,7 +352,7 @@ composites_foreach_callback (gpointer key, gpointer value, void *data)
                         }
                         else {
                                 cur = 1;
-                               symbols = g_list_append (symbols, expr->content.operand);
+                               symbols = g_list_prepend (symbols, expr->content.operand);
                         }
                         g_queue_push_head (stack, GSIZE_TO_POINTER (cur));
                 }
@@ -371,7 +391,12 @@ composites_foreach_callback (gpointer key, gpointer value, void *data)
                         s = g_list_first (symbols);
                         r = rspamd_snprintf (logbuf, sizeof (logbuf), "<%s>, insert symbol %s instead of symbols: ", cd->task->message_id, key);
                         while (s) {
+                               /* Adjust the metric score before the entry is removed; skip symbols that are already gone */
+                               ms = g_hash_table_lookup (cd->metric_res->symbols, s->data);
+                               if (ms != NULL) {
+                                       cd->metric_res->score -= ms->score;
+                               }
                                 g_hash_table_remove (cd->metric_res->symbols, s->data);
                                 if (s->next) {
                                         r += rspamd_snprintf (logbuf + r, sizeof (logbuf) -r, "%s, ", s->data);
                                 }
@@ -381,7 +403,9 @@ composites_foreach_callback (gpointer key, gpointer value, void *data)
                                 s = g_list_next (s);
                         }
                         /* Add new symbol */
-                       insert_result (cd->task, key, 1.0, NULL);
+                       insert_result_single (cd->task, key, 1.0, NULL);
+                       /* logbuf contains the message id and symbol names, never pass it as a format string */
+                       msg_info ("%s", logbuf);
                 }
         }
  
diff --git a/src/filter.h b/src/filter.h

index 2a1d97edd78cb92822a35c3b70030193c51bb032..924e1eac9773350f1efe85a1e7d7311c2b71c454 100644 (file)
--- a/src/filter.h
+++ b/src/filter.h
@@ -97,6 +97,16 @@ void process_statfiles (struct worker_task *task);
   */
  void insert_result (struct worker_task *task, const gchar *symbol, double flag, GList *opts);
  
+/**
+ * Insert a single result to task: a symbol that is already present in a metric
+ * keeps its current score and options
+ * @param task worker's task that represents the message from the user
+ * @param symbol symbol to insert
+ * @param flag numeric weight for symbol
+ * @param opts list of symbol's options
+ */
+void insert_result_single (struct worker_task *task, const gchar *symbol, double flag, GList *opts);
+
  /**
   * Process all results and form composite metrics from existent metrics as it is defined in config
   * @param task worker's task that present message from user
diff --git a/src/lua/lua_cfg_file.c b/src/lua/lua_cfg_file.c

index 8c44ab80972ad87e9bda9e05daf4fca06086c24d..8a3bc07aa348a5c791846f7593eced964f22897b 100644 (file)
--- a/src/lua/lua_cfg_file.c
+++ b/src/lua/lua_cfg_file.c
@@ -310,8 +310,8 @@ lua_post_load_config (struct config_file *cfg)
                         if (name != NULL && lua_isstring (L, -1)) {
                                 val = lua_tostring (L, -1);
                                 sym = memory_pool_strdup(cfg->cfg_pool, name);
-                               if ((expr = parse_expression (cfg->cfg_pool, sym)) == NULL) {
-                                       msg_err ("cannot parse composite expression: %s", sym);
+                               if ((expr = parse_expression (cfg->cfg_pool, memory_pool_strdup(cfg->cfg_pool, val))) == NULL) {
+                                       msg_err ("cannot parse composite expression: %s", val);
                                         continue;
                                 }
                                 /* Now check hash table for this composite */
diff --git a/src/plugins/regexp.c b/src/plugins/regexp.c

index 68c91f976d04cb2b5986dc4a5e960e39b5476aa9..d20c20a5a93a676a4d92df0d8bb548105b1e122d 100644 (file)
--- a/src/plugins/regexp.c
+++ b/src/plugins/regexp.c
@@ -87,6 +87,7 @@ static gint                     regexp_common_filter (struct worker_task *task);
  static gboolean                 rspamd_regexp_match_number (struct worker_task *task, GList * args, void *unused);
  static gboolean                 rspamd_raw_header_exists (struct worker_task *task, GList * args, void *unused);
  static gboolean                 rspamd_check_smtp_data (struct worker_task *task, GList * args, void *unused);
+static gboolean                 rspamd_regexp_occurs_number (struct worker_task *task, GList * args, void *unused);
  static void                     process_regexp_item (struct worker_task *task, void *user_data);
  
  static gint
@@ -105,6 +106,39 @@ regexp_dynamic_insert_result (struct worker_task *task, void *user_data)
         insert_result (task, symbol, 1, NULL);
  }
  
+/*
+ * Integer comparison predicates; process_regexp calls the selected one as f (passed, limit)
+ */
+typedef gboolean (*int_compare_func) (gint a, gint b);
+static gboolean
+op_equal (gint a, gint b)
+{
+       return a == b;
+}
+static gboolean
+op_more (gint a, gint b)
+{
+       return a > b;
+}
+static gboolean
+op_less (gint a, gint b)
+{
+       return a < b;
+}
+static gboolean
+op_more_equal (gint a, gint b)
+{
+       return a >= b;
+}
+static gboolean
+op_less_equal (gint a, gint b)
+{
+       return a <= b;
+}
+
+/*
+ * Process ip and mask of dynamic regexp
+ */
  static gboolean
  parse_regexp_ipmask (const gchar *begin, struct dynamic_map_item *addr)
  {
@@ -404,6 +438,7 @@ regexp_module_init (struct config_file *cfg, struct module_ctx **ctx)
  
         *ctx = (struct module_ctx *)regexp_module_ctx;
         register_expression_function ("regexp_match_number", rspamd_regexp_match_number, NULL);
+       register_expression_function ("regexp_occurs_number", rspamd_regexp_occurs_number, NULL);
         register_expression_function ("raw_header_exists", rspamd_raw_header_exists, NULL);
         register_expression_function ("check_smtp_data", rspamd_check_smtp_data, NULL);
  
@@ -603,12 +638,14 @@ tree_url_callback (gpointer key, gpointer value, void *data)
  }
  
  static                          gsize
-process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar *additional)
+process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar *additional,
+               gint limit, int_compare_func f)
  {
-       gchar                           *headerv, *c, t;
+       gchar                          *headerv, *c, t;
         struct mime_text_part          *part;
         GList                          *cur, *headerlist;
         GRegex                         *regexp;
+       GMatchInfo                     *info;
         GError                         *err = NULL;
         struct url_regexp_param         callback_param = {
                 .task = task,
@@ -618,7 +655,8 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
         };
         guint8                         *ct;
         gsize                           clen;
-       gint                            r;
+       gint                            r, passed = 0, start, end, old;
+       gboolean                        matched;
  
  
         if (re == NULL) {
@@ -654,6 +692,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 msg_warn ("bad error detected: %s has invalid regexp type", re->regexp_text);
                 return 0;
         case REGEXP_HEADER:
+               /* Check header's name */
                 if (re->header == NULL) {
                         msg_info ("header regexp without header name: '%s'", re->regexp_text);
                         task_cache_add (task, re, 0);
@@ -661,8 +700,10 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 }
                 debug_task ("checking header regexp: %s = %s", re->header, re->regexp_text);
  
+               /* Get list of specified headers */
                 headerlist = message_get_header (task->task_pool, task->message, re->header, re->is_strong);
                 if (headerlist == NULL) {
+                       /* Header is not found */
                         if (G_UNLIKELY (re->is_test)) {
                                 msg_info ("process test regexp %s for header %s returned FALSE: no header found", re->regexp_text, re->header);
                         }
@@ -671,21 +712,33 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 }
                 else {
                         memory_pool_add_destructor (task->task_pool, (pool_destruct_func) g_list_free, headerlist);
+                       /* Check whether we have regexp for it */
                         if (re->regexp == NULL) {
                                 debug_task ("regexp contains only header and it is found %s", re->header);
                                 task_cache_add (task, re, 1);
                                 return 1;
                         }
+                       /* Iterate through headers */
                         cur = headerlist;
                         while (cur) {
                                 debug_task ("found header \"%s\" with value \"%s\"", re->header, (const gchar *)cur->data);
  
+                               /* Try to match regexp */
                                 if (cur->data && g_regex_match_full (re->regexp, cur->data, -1, 0, 0, NULL, &err) == TRUE) {
                                         if (G_UNLIKELY (re->is_test)) {
                                                 msg_info ("process test regexp %s for header %s with value '%s' returned TRUE", re->regexp_text, re->header, (const gchar *)cur->data);
                                         }
-                                       task_cache_add (task, re, 1);
-                                       return 1;
+                                       if (f != NULL && limit > 1) {
+                                               /* If we have limit count, increase passed count and compare with limit */
+                                               if (f (++passed, limit)) {
+                                                       task_cache_add (task, re, 1);
+                                                       return 1;
+                                               }
+                                       }
+                                       else {
+                                               task_cache_add (task, re, 1);
+                                               return 1;
+                                       }
                                 }
                                 else if (G_UNLIKELY (re->is_test)) {
                                         msg_info ("process test regexp %s for header %s with value '%s' returned FALSE", re->regexp_text, re->header, (const gchar *)cur->data);
@@ -701,6 +754,7 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 break;
         case REGEXP_MIME:
                 debug_task ("checking mime regexp: %s", re->regexp_text);
+               /* Iterate through text parts */
                 cur = g_list_first (task->text_parts);
                 while (cur) {
                         part = (struct mime_text_part *)cur->data;
@@ -709,12 +763,14 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                                 cur = g_list_next (cur);
                                 continue;
                         }
+                       /* Check raw flags */
                         if (part->is_raw) {
                                 regexp = re->raw_regexp;
                         }
                         else {
                                 regexp = re->regexp;
                         }
+                       /* Select data for regexp */
                         if (re->is_raw) {
                                 ct = part->orig->data;
                                 clen = part->orig->len;
@@ -723,15 +779,40 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                                 ct = part->content->data;
                                 clen = part->content->len;
                         }
-                       if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, &err) == TRUE) {
-                               if (G_UNLIKELY (re->is_test)) {
-                                       msg_info ("process test regexp %s for mime part of length %d returned TRUE", re->regexp_text,
-                                                       (gint)clen);
+                       /* If we have limit, count every non-overlapping match in this part */
+                       matched = FALSE;
+                       if (f != NULL && limit > 1) {
+                               info = NULL;
+                               matched = g_regex_match_full (regexp, ct, clen, 0, 0, &info, &err);
+                               while (matched) {
+                                       if (G_UNLIKELY (re->is_test)) {
+                                               msg_info ("process test regexp %s for mime part of length %d returned TRUE",
+                                                               re->regexp_text,
+                                                               (gint)clen);
+                                       }
+                                       if (f (++passed, limit)) {
+                                               /* Free match info on the early-return path as well */
+                                               g_match_info_free (info);
+                                               task_cache_add (task, re, 1);
+                                               return 1;
+                                       }
+                                       /* Continue searching after the end of the current match */
+                                       matched = g_match_info_next (info, &err);
                                 }
-                               task_cache_add (task, re, 1);
-                               return 1;
+                               g_match_info_free (info);
                         }
-                       else if (G_UNLIKELY (re->is_test)) {
+                       else {
+                               if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, &err) == TRUE) {
+                                       if (G_UNLIKELY (re->is_test)) {
+                                               msg_info ("process test regexp %s for mime part of length %d returned TRUE", re->regexp_text,
+                                                               (gint)clen);
+                                       }
+                                       task_cache_add (task, re, 1);
+                                       return 1;
+                               }
+
+                       }
+                       if (!matched && G_UNLIKELY (re->is_test)) {
                                 msg_info ("process test regexp %s for mime part of length %d returned FALSE", re->regexp_text,
                                                 (gint)clen);
                         }
@@ -744,16 +829,45 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 return 0;
         case REGEXP_MESSAGE:
                 debug_task ("checking message regexp: %s", re->regexp_text);
-
-               if (g_regex_match_full (re->raw_regexp, task->msg->begin, task->msg->len, 0, 0, NULL, &err) == TRUE) {
-                       if (G_UNLIKELY (re->is_test)) {
-                               msg_info ("process test regexp %s for message of length %d returned TRUE", re->regexp_text, (gint)task->msg->len);
+               regexp = re->raw_regexp;
+               ct = task->msg->begin;
+               clen = task->msg->len;
+
+               /* If we have limit, count every non-overlapping match in the message */
+               matched = FALSE;
+               if (f != NULL && limit > 1) {
+                       info = NULL;
+                       matched = g_regex_match_full (regexp, ct, clen, 0, 0, &info, &err);
+                       while (matched) {
+                               if (G_UNLIKELY (re->is_test)) {
+                                       msg_info ("process test regexp %s for message part of length %d returned TRUE", re->regexp_text,
+                                                       (gint)clen);
+                               }
+                               if (f (++passed, limit)) {
+                                       /* Free match info on the early-return path as well */
+                                       g_match_info_free (info);
+                                       task_cache_add (task, re, 1);
+                                       return 1;
+                               }
+                               /* Continue searching after the end of the current match */
+                               matched = g_match_info_next (info, &err);
                         }
-                       task_cache_add (task, re, 1);
-                       return 1;
+                       g_match_info_free (info);
+               }
+               else {
+                       if (g_regex_match_full (regexp, ct, clen, 0, 0, NULL, &err) == TRUE) {
+                               if (G_UNLIKELY (re->is_test)) {
+                                       msg_info ("process test regexp %s for message part of length %d returned TRUE", re->regexp_text,
+                                                       (gint)clen);
+                               }
+                               task_cache_add (task, re, 1);
+                               return 1;
+                       }
+
                 }
-               else if (G_UNLIKELY (re->is_test)) {
-                       msg_info ("process test regexp %s for message of length %d returned FALSE", re->regexp_text, (gint)task->msg->len);
+               if (!matched && G_UNLIKELY (re->is_test)) {
+                       msg_info ("process test regexp %s for message part of length %d returned FALSE", re->regexp_text,
+                                       (gint)clen);
                 }
                 if (err != NULL) {
                         msg_info ("error occured while processing regexp \"%s\": %s", re->regexp_text, err->message);
@@ -762,6 +879,10 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 return 0;
         case REGEXP_URL:
                 debug_task ("checking url regexp: %s", re->regexp_text);
+               if (f != NULL && limit > 1) {
+                       /*XXX: add support of it */
+                       msg_warn ("numbered matches are not supported for url regexp");
+               }
                 cur = g_list_first (task->text_parts);
                 while (cur) {
                         part = (struct mime_text_part *)cur->data;
@@ -794,6 +915,10 @@ process_regexp (struct rspamd_regexp *re, struct worker_task *task, const gchar
                 return 0;
         case REGEXP_RAW_HEADER:
                 debug_task ("checking for raw header: %s with regexp: %s", re->header, re->regexp_text);
+               if (f != NULL && limit > 1) {
+                       /* XXX: add support for it */
+                       msg_warn ("numbered matches are not supported for raw header regexp");
+               }
                 if (task->raw_headers == NULL) {
                         debug_task ("cannot check for raw header in message, no headers found");
                         task_cache_add (task, re, 0);
@@ -924,7 +1049,7 @@ process_regexp_expression (struct expression *expr, gchar *symbol, struct worker
         while (it) {
                 if (it->type == EXPR_REGEXP_PARSED) {
                         /* Find corresponding symbol */
-                       cur = process_regexp ((struct rspamd_regexp *)it->content.operand, task, additional);
+                       cur = process_regexp ((struct rspamd_regexp *)it->content.operand, task, additional, 0, NULL);
                         debug_task ("regexp %s found", cur ? "is" : "is not");
                         if (try_optimize) {
                                 try_optimize = optimize_regexp_expression (&it, stack, cur);
@@ -1073,6 +1198,94 @@ rspamd_regexp_match_number (struct worker_task *task, GList * args, void *unused
         return res >= param_count;
  }
  
+/*
+ * Expression function regexp_occurs_number (regexp, limit): the limit is a number with an
+ * optional comparison prefix ('=', '>', '<', '>=' or '<='); a bare number means '='.
+ * The regexp is taken from the cache or parsed, then checked by process_regexp, which
+ * applies the comparison to its match counter only when the limit is greater than 1.
+ * Returns FALSE if the arguments are missing or malformed.
+ */
+static                          gboolean
+rspamd_regexp_occurs_number (struct worker_task *task, GList * args, void *unused)
+{
+       gulong                          num;
+       struct expression_argument     *arg;
+       struct rspamd_regexp           *re;
+       gchar                          *param, *err_str, op;
+       int_compare_func                f = NULL;
+
+       if (args == NULL || args->next == NULL) {
+               msg_warn ("wrong number of parameters to function, must be 2");
+               return FALSE;
+       }
+
+       arg = get_function_arg (args->data, task, TRUE);
+       if (arg == NULL || arg->data == NULL) {
+               msg_warn ("invalid regexp argument");
+               return FALSE;
+       }
+       if ((re = re_cache_check (arg->data, task->cfg->cfg_pool)) == NULL) {
+               re = parse_regexp (task->cfg->cfg_pool, arg->data, task->cfg->raw_mode);
+               if (!re) {
+                       msg_err ("cannot parse given regexp: %s", (gchar *)arg->data);
+                       return FALSE;
+               }
+       }
+
+       arg = get_function_arg (args->next->data, task, TRUE);
+       if (arg == NULL || arg->data == NULL) {
+               msg_warn ("invalid limit argument");
+               return FALSE;
+       }
+       param = arg->data;
+       op = *param;
+       if (g_ascii_isdigit (op)) {
+               /* Bare number: compare for equality */
+               op = '=';
+       }
+       else {
+               param ++;
+       }
+       switch (op) {
+       case '>':
+               if (*param == '=') {
+                       f = op_more_equal;
+                       param ++;
+               }
+               else {
+                       f = op_more;
+               }
+               break;
+       case '<':
+               if (*param == '=') {
+                       f = op_less_equal;
+                       param ++;
+               }
+               else {
+                       f = op_less;
+               }
+               break;
+       case '=':
+               f = op_equal;
+               break;
+       default:
+               msg_err ("wrong operation character: %c, assumed '=', '>', '<', '>=', '<=' or empty op", op);
+               return FALSE;
+       }
+
+       /* strtoul alone accepts an empty string and values that do not fit gint: reject both */
+       num = strtoul (param, &err_str, 10);
+       if (err_str == param || *err_str != 0) {
+               msg_err ("wrong numeric: %s at position: %s", param, err_str);
+               return FALSE;
+       }
+       if (num > (gulong)G_MAXINT) {
+               msg_err ("numeric value is too large: %s", param);
+               return FALSE;
+       }
+
+       return process_regexp (re, task, NULL, (gint)num, f);
+}
  static                          gboolean
  rspamd_raw_header_exists (struct worker_task *task, GList * args, void *unused)
  {
@@ -1271,7 +1463,7 @@ lua_regexp_match (lua_State *L)
                 }
                 re_cache_add ((gchar *)re_text, re, task->cfg->cfg_pool);
         }
-       r = process_regexp (re, task, NULL);
+       r = process_regexp (re, task, NULL, 0, NULL);
         lua_pushboolean (L, r == 1);
  
         return 1;
author	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Fri, 18 Feb 2011 13:53:03 +0000 (16:53 +0300)
committer	Vsevolod Stakhov <vsevolod@rambler-co.ru>
	Fri, 18 Feb 2011 13:53:03 +0000 (16:53 +0300)
src/filter.c		patch \| blob \| history
src/filter.h		patch \| blob \| history
src/lua/lua_cfg_file.c		patch \| blob \| history
src/plugins/regexp.c		patch \| blob \| history