From 0d43251bc637a302e969653057207b3265d113eb Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 11 Jul 2016 15:21:57 +0100
Subject: [PATCH] [Rework] Make chartable module useful

Replace the whole-part check with per-word scoring: every normalized word
is scored for digit -> letter transitions and script changes, and the
symbol is inserted with the score averaged over the part's words.

A missing max_word_len option now falls back to DEFAULT_MAX_WORD_LEN
instead of resetting the threshold, single-byte words are scanned as
unsigned bytes, UTF-8 decoding is bounded by the word length, and parts
without a word array are skipped.
---
 src/plugins/chartable.c | 301 ++++++++++++++++++++++++++++++----------
 1 file changed, 233 insertions(+), 68 deletions(-)

diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 0cc6825f5..61991ca41 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -29,6 +29,27 @@
 #define DEFAULT_SYMBOL "R_CHARSET_MIXED"
 #define DEFAULT_THRESHOLD 0.1
 
+/* Words longer than this many symbols are never penalized */
+#define DEFAULT_MAX_WORD_LEN 10
+
+/* Logging helpers; each expects a `task` variable in the calling scope */
+#define msg_err_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
+        "chartable", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+#define msg_warn_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
+        "chartable", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+#define msg_info_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
+        "chartable", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+#define msg_debug_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \
+        "chartable", task->task_pool->tag.uid, \
+        G_STRFUNC, \
+        __VA_ARGS__)
+
 /* Initialization */
 gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
 gint chartable_module_config (struct rspamd_config *cfg);
@@ -47,6 +68,7 @@ struct chartable_ctx {
 	struct module_ctx ctx;
 	const gchar *symbol;
 	double threshold;
+	guint max_word_len;
 
 	rspamd_mempool_t *chartable_pool;
 };
@@ -60,6 +82,7 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
 	chartable_module_ctx = g_malloc (sizeof (struct chartable_ctx));
 
 	chartable_module_ctx->chartable_pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL);
+	chartable_module_ctx->max_word_len = DEFAULT_MAX_WORD_LEN;
 
 	*ctx = (struct module_ctx *)chartable_module_ctx;
 
@@ -94,6 +117,13 @@ chartable_module_config (struct rspamd_config *cfg)
 	else {
 		chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
 	}
+	if ((value =
+		rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != NULL) {
+		chartable_module_ctx->max_word_len = ucl_object_toint (value);
+	}
+	else {
+		chartable_module_ctx->max_word_len = DEFAULT_MAX_WORD_LEN;
+	}
 
 	rspamd_symbols_cache_add_symbol (cfg->cache,
 		chartable_module_ctx->symbol,
@@ -117,88 +147,226 @@ chartable_module_reconfig (struct rspamd_config *cfg)
 	return chartable_module_config (cfg);
 }
 
-static gboolean
-check_part (struct rspamd_mime_text_part *part, gboolean raw_mode)
+/*
+ * Score a single UTF-8 word by how suspiciously it mixes character classes.
+ * Each digit -> letter transition adds 1.0; each change of Unicode script
+ * between letters adds 1.0 / same_script_count, so a switch after a long
+ * uniform run costs less. Words with more symbols than max_word_len score 0;
+ * otherwise the result is capped at 4.0. `task` is used only for logging.
+ */
+static gdouble
+rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w)
 {
-	guchar *p, *p1;
-	gunichar c, t;
-	GUnicodeScript scc, sct;
-	guint32 mark = 0, total = 0, max = 0, i;
-	guint32 remain = part->content->len;
-	guint32 scripts[G_UNICODE_SCRIPT_NKO];
-	GUnicodeScript sel = 0;
-
-	p = part->content->data;
-
-	if (IS_PART_UTF (part) || raw_mode) {
-		while (remain > 1) {
-			if ((g_ascii_isalpha (*p) &&
-				(*(p + 1) & 0x80)) ||
-				((*p & 0x80) && g_ascii_isalpha (*(p + 1)))) {
-				mark++;
-				total++;
+	const gchar *p, *end;
+	gdouble badness = 0.0;
+	gunichar uc;
+	gint sc, last_sc;
+	guint same_script_count = 0, nsym = 0;
+	enum {
+		start_process = 0,
+		got_alpha,
+		got_digit,
+		got_unknown,
+	} state = start_process;
+
+	p = w->begin;
+	end = p + w->len;
+	last_sc = 0;
+
+	/* w is expected to be normalized UTF-8; decoding is still bounded by `end` */
+
+	while (p < end) {
+		uc = g_utf8_get_char_validated (p, end - p);
+
+		if (uc == (gunichar)-1 || uc == (gunichar)-2) {
+			/* Malformed or truncated sequence: stop instead of reading past the word */
+			break;
+		}
+
+		if (g_unichar_isalpha (uc)) {
+
+			if (state == got_digit) {
+				/* Penalize digit -> alpha transitions */
+				badness += 1.0;
 			}
-			/* Current and next symbols are of one class */
-			else if (((*p & 0x80) &&
-				(*(p + 1) & 0x80)) ||
-				(g_ascii_isalpha (*p) && g_ascii_isalpha (*(p + 1)))) {
-				total++;
+			else if (state == got_alpha) {
+				/* Check script */
+				sc = g_unichar_get_script (uc);
+
+				if (same_script_count > 0) {
+					if (sc != last_sc) {
+						badness += 1.0 / (gdouble)same_script_count;
+						last_sc = sc;
+						same_script_count = 1;
+					}
+					else {
+						same_script_count ++;
+					}
+				}
+				else {
+					last_sc = sc;
+					same_script_count = 1;
+				}
 			}
-			p++;
-			remain--;
+
+			state = got_alpha;
+
+		}
+		else if (g_unichar_isdigit (uc)) {
+			state = got_digit;
+			same_script_count = 0;
 		}
+		else {
+			/* We don't care about unknown characters here */
+			state = got_unknown;
+			same_script_count = 0;
+		}
+
+		nsym ++;
+		p = g_utf8_next_char (p);
+	}
+
+	/* Try to avoid FP for long words */
+	if (nsym > chartable_module_ctx->max_word_len) {
+		badness = 0;
 	}
 	else {
-		memset (&scripts, 0, sizeof (scripts));
-		while (remain > 0) {
-			c = g_utf8_get_char_validated (p, remain);
-			if (c == (gunichar) - 2 || c == (gunichar) - 1) {
-				/* Invalid characters detected, stop processing */
-				return FALSE;
-			}
+		if (badness > 4.0) {
+			badness = 4.0;
+		}
+	}
+
+	msg_debug_chartable ("word %T, badness: %.2f", w, badness);
+
+	return badness;
+}
 
-			scc = g_unichar_get_script (c);
-			if (scc < (gint)G_N_ELEMENTS (scripts)) {
-				scripts[scc]++;
+/*
+ * Single-byte counterpart of rspamd_chartable_process_word_utf: bytes are
+ * read as unsigned and classed as ASCII or non-ASCII (> 0x7f) rather than
+ * by Unicode script. A digit -> letter transition costs 2.0 here; words
+ * longer than max_word_len bytes score 0 and the result is capped at 4.0.
+ */
+static gdouble
+rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w)
+{
+	const guchar *p, *end;
+	gdouble badness = 0.0;
+	enum {
+		ascii = 1,
+		non_ascii
+	} sc, last_sc;
+	gint same_script_count = 0;
+	enum {
+		start_process = 0,
+		got_alpha,
+		got_digit,
+		got_unknown,
+	} state = start_process;
+
+	p = (const guchar *)w->begin;
+	end = p + w->len;
+	last_sc = 0;
+
+	if (w->len > chartable_module_ctx->max_word_len) {
+		return 0.0;
+	}
+
+	/* We assume that w is normalized */
+	while (p < end) {
+		if (g_ascii_isalpha (*p) || *p > 0x7f) {
+
+			if (state == got_digit) {
+				/* Penalize digit -> alpha transitions */
+				badness += 2.0;
 			}
-			p1 = g_utf8_next_char (p);
-			remain -= p1 - p;
-			p = p1;
-
-			if (remain > 0) {
-				t = g_utf8_get_char_validated (p, remain);
-				if (t == (gunichar) - 2 || t == (gunichar) - 1) {
-					/* Invalid characters detected, stop processing */
-					return FALSE;
-				}
-				sct = g_unichar_get_script (t);
-				if (g_unichar_isalpha (c) && g_unichar_isalpha (t)) {
-					/* We have two unicode alphanumeric characters, so we can check its script */
-					if (sct != scc) {
-						mark++;
+			else if (state == got_alpha) {
+				/* Check script */
+				sc = (*p > 0x7f) ? non_ascii : ascii;
+
+				if (same_script_count > 0) {
+					if (sc != last_sc) {
+						badness += 1.0 / (gdouble)same_script_count;
+						last_sc = sc;
+						same_script_count = 1;
 					}
-					total++;
+					else {
+						same_script_count ++;
+					}
+				}
+				else {
+					last_sc = sc;
+					same_script_count = 1;
 				}
-				p1 = g_utf8_next_char (p);
-				remain -= p1 - p;
-				p = p1;
 			}
+
+			state = got_alpha;
+
 		}
-		/* Detect the mostly charset of this part */
-		for (i = 0; i < G_N_ELEMENTS (scripts); i++) {
-			if (scripts[i] > max) {
-				max = scripts[i];
-				sel = i;
+		else if (g_ascii_isdigit (*p)) {
+			state = got_digit;
+			same_script_count = 0;
+		}
+		else {
+			/* We don't care about unknown characters here */
+			state = got_unknown;
+			same_script_count = 0;
+		}
+
+		p ++;
+	}
+
+	if (badness > 4.0) {
+		badness = 4.0;
+	}
+
+	msg_debug_chartable ("word %T, badness: %.2f", w, badness);
+
+	return badness;
+}
+
+/*
+ * Average the per-word badness over all normalized words of a text part
+ * (capped at 2.0) and insert the module symbol with that score when it
+ * exceeds the configured threshold. Parts without any words are skipped.
+ */
+static void
+rspamd_chartable_process_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *part)
+{
+	rspamd_ftok_t *w;
+	guint i;
+	gdouble cur_score = 0.0;
+
+	if (part->normalized_words == NULL || part->normalized_words->len == 0) {
+		return;
+	}
+
+	for (i = 0; i < part->normalized_words->len; i++) {
+		w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
+
+		if (w->len > 0) {
+
+			if (IS_PART_UTF (part)) {
+				cur_score += rspamd_chartable_process_word_utf (task, w);
+			}
+			else {
+				cur_score += rspamd_chartable_process_word_ascii (task, w);
 			}
 		}
-		part->script = sel;
 	}
 
-	if (total == 0) {
-		return 0;
+	cur_score /= (gdouble)part->normalized_words->len;
+
+	if (cur_score > 2.0) {
+		cur_score = 2.0;
 	}
 
-	return ((double)mark / (double)total) > chartable_module_ctx->threshold;
+	if (cur_score > chartable_module_ctx->threshold) {
+		rspamd_task_insert_result (task, chartable_module_ctx->symbol,
+				cur_score, NULL);
+
+	}
 }
 
 static void
@@ -209,10 +377,7 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
 
 	for (i = 0; i < task->text_parts->len; i ++) {
 		part = g_ptr_array_index (task->text_parts, i);
-
-		if (!IS_PART_EMPTY (part) && check_part (part, task->cfg->raw_mode)) {
-			rspamd_task_insert_result (task, chartable_module_ctx->symbol, 1, NULL);
-		}
+		rspamd_chartable_process_part (task, part);
 	}
 
 }
-- 
2.39.5