summaryrefslogtreecommitdiffstats
path: root/src/plugins/chartable.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-07-11 15:21:57 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2016-07-11 15:21:57 +0100
commit 0d43251bc637a302e969653057207b3265d113eb (patch)
tree c3396ecdbdfd537548657cc654159127da3c0a0d /src/plugins/chartable.c
parent db8b9b74f9f91b60d4aab211342c945ecb857700 (diff)
download rspamd-0d43251bc637a302e969653057207b3265d113eb.tar.gz
rspamd-0d43251bc637a302e969653057207b3265d113eb.zip
[Rework] Make chartable module useful
Diffstat (limited to 'src/plugins/chartable.c')
-rw-r--r-- src/plugins/chartable.c 276
1 files changed, 208 insertions, 68 deletions
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 0cc6825f5..61991ca41 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -29,6 +29,23 @@
#define DEFAULT_SYMBOL "R_CHARSET_MIXED"
#define DEFAULT_THRESHOLD 0.1
+/*
+ * Logging helpers for this module. Each one forwards its arguments to
+ * rspamd_default_log_function with a fixed GLib log level, the module name
+ * "chartable", the `tag.uid` of the task's memory pool and the name of the
+ * calling function (G_STRFUNC).
+ *
+ * NOTE: the expansion references `task->task_pool`, so a variable named
+ * `task` must be in scope wherever one of these macros is used.
+ */
+#define msg_err_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
 "chartable", task->task_pool->tag.uid, \
 G_STRFUNC, \
 __VA_ARGS__)
+#define msg_warn_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
 "chartable", task->task_pool->tag.uid, \
 G_STRFUNC, \
 __VA_ARGS__)
+#define msg_info_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_INFO, \
 "chartable", task->task_pool->tag.uid, \
 G_STRFUNC, \
 __VA_ARGS__)
+#define msg_debug_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \
 "chartable", task->task_pool->tag.uid, \
 G_STRFUNC, \
 __VA_ARGS__)
+
/* Initialization */
gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
gint chartable_module_config (struct rspamd_config *cfg);
@@ -47,6 +64,7 @@ struct chartable_ctx {
struct module_ctx ctx;
const gchar *symbol;
double threshold;
+ guint max_word_len;
rspamd_mempool_t *chartable_pool;
};
@@ -60,6 +78,7 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
chartable_module_ctx = g_malloc (sizeof (struct chartable_ctx));
chartable_module_ctx->chartable_pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL);
+ chartable_module_ctx->max_word_len = 10;
*ctx = (struct module_ctx *)chartable_module_ctx;
@@ -94,6 +113,13 @@ chartable_module_config (struct rspamd_config *cfg)
else {
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
}
+	/*
+	 * Optional "max_word_len" setting: words longer than this are not scored.
+	 * When the option is absent, fall back to the built-in default of 10 (the
+	 * same value chartable_module_init assigns) so that a reconfig without the
+	 * option resets it. This branch must not touch `threshold`: doing so would
+	 * silently discard a threshold configured just above.
+	 */
+	if ((value =
+		rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != NULL) {
+		chartable_module_ctx->max_word_len = ucl_object_toint (value);
+	}
+	else {
+		chartable_module_ctx->max_word_len = 10;
+	}
rspamd_symbols_cache_add_symbol (cfg->cache,
chartable_module_ctx->symbol,
@@ -117,88 +143,205 @@ chartable_module_reconfig (struct rspamd_config *cfg)
return chartable_module_config (cfg);
}
-static gboolean
-check_part (struct rspamd_mime_text_part *part, gboolean raw_mode)
+/*
+ * Compute a "badness" score for one UTF-8 word.
+ *
+ * The word is walked code point by code point:
+ *  - a letter directly following a digit adds 1.0;
+ *  - a letter following a letter is checked for a Unicode script change;
+ *    a change after a run of N same-script letters adds 1.0 / N;
+ *  - digits and any other characters reset the same-script run.
+ *
+ * Words with more than chartable_module_ctx->max_word_len code points score 0
+ * (to avoid false positives on long words); otherwise the score is capped
+ * at 4.0.
+ *
+ * @param task current task; used here only by the debug logging macro
+ * @param w word to examine (w->begin, w->len bytes), assumed normalized
+ * @return badness in the range [0.0, 4.0]
+ */
+static gdouble
+rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w)
 {
- guchar *p, *p1;
- gunichar c, t;
- GUnicodeScript scc, sct;
- guint32 mark = 0, total = 0, max = 0, i;
- guint32 remain = part->content->len;
- guint32 scripts[G_UNICODE_SCRIPT_NKO];
- GUnicodeScript sel = 0;
-
- p = part->content->data;
-
- if (IS_PART_UTF (part) || raw_mode) {
- while (remain > 1) {
- if ((g_ascii_isalpha (*p) &&
- (*(p + 1) & 0x80)) ||
- ((*p & 0x80) && g_ascii_isalpha (*(p + 1)))) {
- mark++;
- total++;
+	const gchar *p, *end;
+	gdouble badness = 0.0;
+	gunichar uc;
+	gint sc, last_sc;
+	guint same_script_count = 0, nsym = 0;
+	enum {
+		start_process = 0,
+		got_alpha,
+		got_digit,
+		got_unknown,
+	} state = start_process;
+
+	p = w->begin;
+	end = p + w->len;
+	last_sc = 0;
+
+	/* We assume that w is normalized */
+
+	while (p < end) {
+		/*
+		 * Bounded decode: never read beyond `end`, even when the last
+		 * sequence of the word is truncated or malformed.
+		 */
+		uc = g_utf8_get_char_validated (p, end - p);
+
+		if (uc == (gunichar) -1 || uc == (gunichar) -2) {
+			/* Invalid or incomplete sequence: keep what was scored so far */
+			break;
+		}
+
+		if (g_unichar_isalpha (uc)) {
+
+			if (state == got_digit) {
+				/* Penalize digit -> alpha transitions */
+				badness += 1.0;
 			}
- /* Current and next symbols are of one class */
- else if (((*p & 0x80) &&
- (*(p + 1) & 0x80)) ||
- (g_ascii_isalpha (*p) && g_ascii_isalpha (*(p + 1)))) {
- total++;
+			else if (state == got_alpha) {
+				/* Check script */
+				sc = g_unichar_get_script (uc);
+
+				if (same_script_count > 0) {
+					if (sc != last_sc) {
+						/* Short runs before a switch are penalized more */
+						badness += 1.0 / (gdouble)same_script_count;
+						last_sc = sc;
+						same_script_count = 1;
+					}
+					else {
+						same_script_count ++;
+					}
+				}
+				else {
+					last_sc = sc;
+					same_script_count = 1;
+				}
 			}
- p++;
- remain--;
+
+			state = got_alpha;
+
+		}
+		else if (g_unichar_isdigit (uc)) {
+			state = got_digit;
+			same_script_count = 0;
 		}
+		else {
+			/* We don't care about unknown characters here */
+			state = got_unknown;
+			same_script_count = 0;
+		}
+
+		nsym ++;
+		p = g_utf8_next_char (p);
+	}
+
+	/* Try to avoid FP for long words */
+	if (nsym > chartable_module_ctx->max_word_len) {
+		badness = 0;
 	}
 	else {
- memset (&scripts, 0, sizeof (scripts));
- while (remain > 0) {
- c = g_utf8_get_char_validated (p, remain);
- if (c == (gunichar) - 2 || c == (gunichar) - 1) {
- /* Invalid characters detected, stop processing */
- return FALSE;
- }
+		if (badness > 4.0) {
+			badness = 4.0;
+		}
+	}
+
+	msg_debug_chartable ("word %T, badness: %.2f", w, badness);
+
+	return badness;
+}
- scc = g_unichar_get_script (c);
- if (scc < (gint)G_N_ELEMENTS (scripts)) {
- scripts[scc]++;
+/*
+ * Compute a "badness" score for one word of a non-UTF part, byte by byte.
+ *
+ * Bytes are split into two classes: 7-bit ASCII letters and bytes with the
+ * high bit set (treated as letters of some 8-bit charset).
+ *  - a letter directly following a digit adds 2.0;
+ *  - a letter following a letter is checked for a class change; a change
+ *    after a run of N same-class letters adds 1.0 / N;
+ *  - digits and any other bytes reset the same-class run.
+ *
+ * Words longer than chartable_module_ctx->max_word_len bytes score 0;
+ * otherwise the score is capped at 4.0.
+ *
+ * @param task current task; used here only by the debug logging macro
+ * @param w word to examine (w->begin, w->len bytes), assumed normalized
+ * @return badness in the range [0.0, 4.0]
+ */
+static gdouble
+rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w)
+{
+	const gchar *p, *end;
+	gdouble badness = 0.0;
+	enum {
+		ascii = 1,
+		non_ascii
+	} sc, last_sc;
+	gint same_script_count = 0;
+	enum {
+		start_process = 0,
+		got_alpha,
+		got_digit,
+		got_unknown,
+	} state = start_process;
+
+	p = w->begin;
+	end = p + w->len;
+	last_sc = 0;
+
+	if (w->len > chartable_module_ctx->max_word_len) {
+		return 0.0;
+	}
+
+	/* We assume that w is normalized */
+	while (p < end) {
+		/*
+		 * gchar may be signed, in which case `*p > 0x7f` can never be true;
+		 * compare the byte as unsigned so that 8-bit letters are recognized.
+		 */
+		if (g_ascii_isalpha (*p) || (guchar)*p > 0x7f) {
+
+			if (state == got_digit) {
+				/* Penalize digit -> alpha transitions */
+				badness += 2.0;
 			}
- p1 = g_utf8_next_char (p);
- remain -= p1 - p;
- p = p1;
-
- if (remain > 0) {
- t = g_utf8_get_char_validated (p, remain);
- if (t == (gunichar) - 2 || t == (gunichar) - 1) {
- /* Invalid characters detected, stop processing */
- return FALSE;
- }
- sct = g_unichar_get_script (t);
- if (g_unichar_isalpha (c) && g_unichar_isalpha (t)) {
- /* We have two unicode alphanumeric characters, so we can check its script */
- if (sct != scc) {
- mark++;
+			else if (state == got_alpha) {
+				/* Check script */
+				sc = ((guchar)*p > 0x7f) ? non_ascii : ascii;
+
+				if (same_script_count > 0) {
+					if (sc != last_sc) {
+						/* Short runs before a switch are penalized more */
+						badness += 1.0 / (gdouble)same_script_count;
+						last_sc = sc;
+						same_script_count = 1;
 					}
- total++;
+					else {
+						same_script_count ++;
+					}
+				}
+				else {
+					last_sc = sc;
+					same_script_count = 1;
 				}
- p1 = g_utf8_next_char (p);
- remain -= p1 - p;
- p = p1;
 			}
+
+			state = got_alpha;
+
 		}
- /* Detect the mostly charset of this part */
- for (i = 0; i < G_N_ELEMENTS (scripts); i++) {
- if (scripts[i] > max) {
- max = scripts[i];
- sel = i;
+		else if (g_ascii_isdigit (*p)) {
+			state = got_digit;
+			same_script_count = 0;
+		}
+		else {
+			/* We don't care about unknown characters here */
+			state = got_unknown;
+			same_script_count = 0;
+		}
+
+		p ++;
+	}
+
+	if (badness > 4.0) {
+		badness = 4.0;
+	}
+
+	msg_debug_chartable ("word %T, badness: %.2f", w, badness);
+
+	return badness;
+}
+
+/*
+ * Score one text part and, if needed, add the module symbol to the task.
+ *
+ * Every non-empty normalized word is scored with the UTF or the byte-wise
+ * scorer (depending on IS_PART_UTF). The sum is divided by the total number
+ * of normalized words and capped at 2.0. If the result exceeds
+ * chartable_module_ctx->threshold, the symbol is inserted with that score.
+ *
+ * @param task task that receives the result
+ * @param part text part to examine; parts without normalized words are skipped
+ */
+static void
+rspamd_chartable_process_part (struct rspamd_task *task,
+		struct rspamd_mime_text_part *part)
+{
+	rspamd_ftok_t *w;
+	guint i;
+	gdouble cur_score = 0.0;
+
+	/*
+	 * The caller no longer filters out empty parts, so guard against a part
+	 * whose word array was never created (assumption: normalized_words can
+	 * be NULL for such parts) as well as against an empty array.
+	 */
+	if (part == NULL || part->normalized_words == NULL ||
+			part->normalized_words->len == 0) {
+		return;
+	}
+
+	for (i = 0; i < part->normalized_words->len; i++) {
+		w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);
+
+		if (w->len > 0) {
+
+			if (IS_PART_UTF (part)) {
+				cur_score += rspamd_chartable_process_word_utf (task, w);
+			}
+			else {
+				cur_score += rspamd_chartable_process_word_ascii (task, w);
 			}
 		}
- part->script = sel;
 	}
- if (total == 0) {
- return 0;
+	/* Average over all words; len is non-zero here (checked above) */
+	cur_score /= (gdouble)part->normalized_words->len;
+
+	if (cur_score > 2.0) {
+		cur_score = 2.0;
 	}
- return ((double)mark / (double)total) > chartable_module_ctx->threshold;
+	if (cur_score > chartable_module_ctx->threshold) {
+		rspamd_task_insert_result (task, chartable_module_ctx->symbol,
+			cur_score, NULL);
+
+	}
 }
static void
@@ -209,10 +352,7 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);
-
- if (!IS_PART_EMPTY (part) && check_part (part, task->cfg->raw_mode)) {
- rspamd_task_insert_result (task, chartable_module_ctx->symbol, 1, NULL);
- }
+ rspamd_chartable_process_part (task, part);
}
}