From f074850e41e357aa7a2c5a86d2eef8121710cc47 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 21 Apr 2009 16:09:30 +0400 Subject: [PATCH] * Add initial version of chartable plugin: now it can only detects mixed unicode characters --- CMakeLists.txt | 3 +- README.utf8.txt | 1 + rspamd.conf.sample | 1 + src/message.c | 2 +- src/plugins/chartable.c | 178 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 183 insertions(+), 2 deletions(-) create mode 100644 src/plugins/chartable.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 66c9fce73..a8bcb87b1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -308,7 +308,8 @@ SET(CLASSIFIERSSRC src/classifiers/classifiers.c src/classifiers/winnow.c) SET(PLUGINSSRC src/plugins/surbl.c - src/plugins/regexp.c) + src/plugins/regexp.c + src/plugins/chartable.c) SET(TESTSRC test/rspamd_expression_test.c test/rspamd_memcached_test.c diff --git a/README.utf8.txt b/README.utf8.txt index b2461f20b..4d761da76 100644 --- a/README.utf8.txt +++ b/README.utf8.txt @@ -140,6 +140,7 @@ SOME_SYMBOL = "${to_blah} & !(${from_blah} | ${subject_blah})" между частями письма. Функция работает с сообщениями, содержащими 2 текстовые части (text/plain и text/html) и возвращает true тогда, когда эти части различаются более чем на n процентов. Если аргумент не указан, то по умолчанию ищется различие в 100% (полностью разные части). 
+ * compare_transfer_encoding - сравнивает Content-Transfer-Encoding с заданной строкой * content_type_compare_param - сравнивает параметр content-type заголовка с регулярным выражением или строкой: content_type_compare_param(Charset, /windows-\d+/) content_type_compare_param(Charset, ascii) diff --git a/rspamd.conf.sample b/rspamd.conf.sample index 9e70bfe1f..838fa6bd3 100644 --- a/rspamd.conf.sample +++ b/rspamd.conf.sample @@ -148,3 +148,4 @@ raw_mode = yes; url_filters = "surbl"; header_filters = "regexp"; +mime_filters = "chartable"; diff --git a/src/message.c b/src/message.c index 5ce4abb40..0c3d1ed8b 100644 --- a/src/message.c +++ b/src/message.c @@ -299,7 +299,7 @@ process_text_part (struct worker_task *task, GByteArray *part_content, GMimeCont text_part = memory_pool_alloc (task->task_pool, sizeof (struct mime_text_part)); text_part->orig = convert_text_to_utf (task, part_content, type, text_part); - text_part->content = part_content; + text_part->content = text_part->orig; text_part->is_html = FALSE; text_part->fuzzy = fuzzy_init_byte_array (text_part->content, task->task_pool); task->text_parts = g_list_prepend (task->text_parts, text_part); diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c new file mode 100644 index 000000000..95987e48a --- /dev/null +++ b/src/plugins/chartable.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2009, Rambler media + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY Rambler media ''AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Rambler BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/***MODULE:chartable
+ * rspamd module that makes marks based on symbol chains
+ */
+
+#include "../config.h"
+#include "../main.h"
+#include "../message.h"
+#include "../modules.h"
+#include "../cfg_file.h"
+#include "../expressions.h"
+
+#define DEFAULT_SYMBOL "R_CHARSET_MIXED"
+#define DEFAULT_THRESHOLD 0.1
+
+/*
+ * Per-module state of the chartable plugin.
+ *
+ * NOTE(review): the four leading callbacks presumably mirror the layout of
+ * struct module_ctx, because chartable_module_init() hands this structure out
+ * through a cast to that type - confirm against modules.h before reordering.
+ */
+struct chartable_ctx {
+	int (*header_filter)(struct worker_task *task);
+	int (*mime_filter)(struct worker_task *task);
+	int (*message_filter)(struct worker_task *task);
+	int (*url_filter)(struct worker_task *task);
+	/* Metric that receives the symbol */
+	char *metric;
+	/* Symbol inserted when a text part looks mixed */
+	char *symbol;
+	/* Ratio of mixed-script pairs above which a part is reported */
+	double threshold;
+
+	/* Pool that owns strings duplicated from the configuration */
+	memory_pool_t *chartable_pool;
+};
+
+static struct chartable_ctx *chartable_module_ctx = NULL;
+
+static int chartable_mime_filter (struct worker_task *task);
+
+/*
+ * Allocate the module context, register the mime filter and create the
+ * module memory pool.
+ *
+ * cfg is not used here; *ctx receives the new context.
+ * Always returns 0.
+ */
+int
+chartable_module_init (struct config_file *cfg, struct module_ctx **ctx)
+{
+	chartable_module_ctx = g_malloc (sizeof (struct chartable_ctx));
+
+	chartable_module_ctx->header_filter = NULL;
+	chartable_module_ctx->mime_filter = chartable_mime_filter;
+	chartable_module_ctx->message_filter = NULL;
+	chartable_module_ctx->url_filter = NULL;
+	/* Start from the defaults so the context is never read uninitialized */
+	chartable_module_ctx->metric = DEFAULT_METRIC;
+	chartable_module_ctx->symbol = DEFAULT_SYMBOL;
+	chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
+	chartable_module_ctx->chartable_pool = memory_pool_new (memory_pool_get_size ());
+
+	*ctx = (struct module_ctx *)chartable_module_ctx;
+
+	return 0;
+}
+
+
+/*
+ * Read the "metric", "symbol" and "threshold" options of the "chartable"
+ * module from cfg, falling back to DEFAULT_METRIC, DEFAULT_SYMBOL and
+ * DEFAULT_THRESHOLD when an option is missing or the threshold is not a
+ * usable number.
+ *
+ * Always returns TRUE.
+ */
+int
+chartable_module_config (struct config_file *cfg)
+{
+	char *value, *end;
+	double threshold;
+	int res = TRUE;
+
+	if ((value = get_module_opt (cfg, "chartable", "metric")) != NULL) {
+		chartable_module_ctx->metric = memory_pool_strdup (chartable_module_ctx->chartable_pool, value);
+		g_free (value);
+	}
+	else {
+		chartable_module_ctx->metric = DEFAULT_METRIC;
+	}
+	if ((value = get_module_opt (cfg, "chartable", "symbol")) != NULL) {
+		chartable_module_ctx->symbol = memory_pool_strdup (chartable_module_ctx->chartable_pool, value);
+		g_free (value);
+	}
+	else {
+		chartable_module_ctx->symbol = DEFAULT_SYMBOL;
+	}
+
+	chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
+	if ((value = get_module_opt (cfg, "chartable", "threshold")) != NULL) {
+		errno = 0;
+		end = value;
+		threshold = strtod (value, &end);
+		/*
+		 * errno only reports range errors; input without any digits is
+		 * detected by strtod() not advancing the end pointer.
+		 */
+		if (errno != 0 || end == value) {
+			msg_warn ("chartable_module_config: invalid numeric value '%s': %s", value,
+					errno != 0 ? strerror (errno) : "not a number");
+		}
+		else {
+			chartable_module_ctx->threshold = threshold;
+		}
+		/* Released the same way as the other option values above */
+		g_free (value);
+	}
+
+	return res;
+}
+
+/*
+ * Drop every string held in the module pool, create a fresh pool and read
+ * the configuration again.
+ *
+ * Returns the result of chartable_module_config().
+ */
+int
+chartable_module_reconfig (struct config_file *cfg)
+{
+	memory_pool_delete (chartable_module_ctx->chartable_pool);
+	chartable_module_ctx->chartable_pool = memory_pool_new (1024);
+
+	return chartable_module_config (cfg);
+}
+
+/*
+ * Decode one UTF-8 character from [*pos, end) into *out and advance *pos
+ * past it.
+ *
+ * Returns FALSE without touching *pos when the range is exhausted or when the
+ * bytes at *pos are not a complete, valid sequence, so a caller can never be
+ * moved beyond end.
+ */
+static gboolean
+chartable_next_char (const gchar **pos, const gchar *end, gunichar *out)
+{
+	gunichar uc;
+
+	if (*pos >= end) {
+		return FALSE;
+	}
+
+	uc = g_utf8_get_char_validated (*pos, (gssize)(end - *pos));
+	if (uc == (gunichar)-1 || uc == (gunichar)-2) {
+		/* Invalid or truncated sequence */
+		return FALSE;
+	}
+
+	*out = uc;
+	*pos = g_utf8_next_char (*pos);
+
+	return TRUE;
+}
+
+/*
+ * Decide whether a text part mixes unicode scripts.
+ *
+ * The content is walked in consecutive, non-overlapping pairs of characters.
+ * Every pair made of two alphanumeric characters is counted, and pairs whose
+ * characters belong to different scripts are counted as marks.
+ *
+ * Returns TRUE when marks / pairs is greater than the configured threshold.
+ * Returns FALSE for raw parts, for missing or empty content and when no
+ * alphanumeric pair was found. Scanning stops at the first byte sequence that
+ * cannot be decoded and the verdict is based on the text decoded so far.
+ *
+ * NOTE(review): digits count as alphanumeric here, so a letter followed by a
+ * digit is presumably reported as a script change - confirm whether that is
+ * intended.
+ */
+static gboolean
+check_part (struct mime_text_part *part)
+{
+	const gchar *p, *end;
+	gunichar c, t;
+	uint32_t mark = 0, total = 0;
+
+	/* Check the part before looking at its content */
+	if (part == NULL || part->is_raw || part->content == NULL || part->content->len == 0) {
+		return FALSE;
+	}
+
+	p = (const gchar *)part->content->data;
+	end = p + part->content->len;
+
+	while (chartable_next_char (&p, end, &c)) {
+		if (!chartable_next_char (&p, end, &t)) {
+			/* Odd trailing character or undecodable data */
+			break;
+		}
+		if (g_unichar_isalnum (c) && g_unichar_isalnum (t)) {
+			/* We have two unicode alphanumeric characters, so we can check their scripts */
+			if (g_unichar_get_script (c) != g_unichar_get_script (t)) {
+				mark ++;
+			}
+			total ++;
+		}
+	}
+
+	if (total == 0) {
+		/* Nothing to compare, avoid dividing zero by zero */
+		return FALSE;
+	}
+
+	return ((double)mark / (double)total) > chartable_module_ctx->threshold;
+}
+
+/*
+ * Mime filter callback: insert the configured symbol with weight 1 into the
+ * configured metric once for every text part of the task that check_part()
+ * reports as mixed.
+ *
+ * Does nothing except logging a warning when the configuration is in raw
+ * mode. Always returns 0.
+ */
+static int
+chartable_mime_filter (struct worker_task *task)
+{
+	GList *cur;
+
+	/* XXX: write translation tables for this */
+	if (task->cfg->raw_mode) {
+		msg_warn ("chartable_mime_filter: cannot work in non-unicode mode");
+		return 0;
+	}
+
+	for (cur = g_list_first (task->text_parts); cur != NULL; cur = g_list_next (cur)) {
+		if (check_part ((struct mime_text_part *)cur->data)) {
+			insert_result (task, chartable_module_ctx->metric, chartable_module_ctx->symbol, 1, NULL);
+		}
+	}
+
+	return 0;
+}
+
-- 
2.39.5