il y a 8 ans · 0d43251bc6
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -29,6 +29,23 @@
 #define DEFAULT_SYMBOL "R_CHARSET_MIXED"
 #define DEFAULT_THRESHOLD 0.1

 #define msg_err_chartable(...) rspamd_default_log_function (G_LOG_LEVEL_CRITICAL, \
        "chartable", task->task_pool->tag.uid, \
        G_STRFUNC, \
        __VA_ARGS__)
 #define msg_warn_chartable(...)   rspamd_default_log_function (G_LOG_LEVEL_WARNING, \
        "chartable", task->task_pool->tag.uid, \
        G_STRFUNC, \
        __VA_ARGS__)
 #define msg_info_chartable(...)   rspamd_default_log_function (G_LOG_LEVEL_INFO, \
        "chartable", task->task_pool->tag.uid, \
        G_STRFUNC, \
        __VA_ARGS__)
 #define msg_debug_chartable(...)  rspamd_default_log_function (G_LOG_LEVEL_DEBUG, \
        "chartable", task->task_pool->tag.uid, \
        G_STRFUNC, \
        __VA_ARGS__)

 /* Initialization */
 gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
 gint chartable_module_config (struct rspamd_config *cfg);
@@ -47,6 +64,7 @@ struct chartable_ctx {
 	struct module_ctx ctx;
 	const gchar *symbol;
 	double threshold;
 	guint max_word_len;

 	rspamd_mempool_t *chartable_pool;
 };
@@ -60,6 +78,7 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
 	chartable_module_ctx = g_malloc (sizeof (struct chartable_ctx));

 	chartable_module_ctx->chartable_pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL);
 	chartable_module_ctx->max_word_len = 10;

 	*ctx = (struct module_ctx *)chartable_module_ctx;

@@ -94,6 +113,13 @@ chartable_module_config (struct rspamd_config *cfg)
 	else {
 		chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
 	}
 	if ((value =
 			rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != NULL) {
 		chartable_module_ctx->max_word_len = ucl_object_toint (value);
 	}
 	else {
 		chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
 	}

 	rspamd_symbols_cache_add_symbol (cfg->cache,
 		chartable_module_ctx->symbol,
@@ -117,88 +143,205 @@ chartable_module_reconfig (struct rspamd_config *cfg)
 	return chartable_module_config (cfg);
 }

 static gboolean
 check_part (struct rspamd_mime_text_part *part, gboolean raw_mode)
 static gdouble
 rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w)
 {
 	guchar *p, *p1;
 	gunichar c, t;
 	GUnicodeScript scc, sct;
 	guint32 mark = 0, total = 0, max = 0, i;
 	guint32 remain = part->content->len;
 	guint32 scripts[G_UNICODE_SCRIPT_NKO];
 	GUnicodeScript sel = 0;

 	p = part->content->data;

 	if (IS_PART_UTF (part) || raw_mode) {
 		while (remain > 1) {
 			if ((g_ascii_isalpha (*p) &&
 				(*(p + 1) & 0x80)) ||
 				((*p & 0x80) && g_ascii_isalpha (*(p + 1)))) {
 				mark++;
 				total++;
 	const gchar *p, *end, *c;
 	gdouble badness = 0.0;
 	gunichar uc;
 	gint sc, last_sc;
 	guint same_script_count = 0, nsym = 0;
 	enum {
 		start_process = 0,
 		got_alpha,
 		got_digit,
 		got_unknown,
 	} state = start_process;

 	p = w->begin;
 	end = p + w->len;
 	c = p;
 	last_sc = 0;

 	/* We assume that w is normalized */

 	while (p < end) {
 		uc = g_utf8_get_char (p);

 		if (g_unichar_isalpha (uc)) {

 			if (state == got_digit) {
 				/* Penalize digit -> alpha translations */
 				badness += 1.0;
 			}
 			/* Current and next symbols are of one class */
 			else if (((*p & 0x80) &&
 				(*(p + 1) & 0x80)) ||
 				(g_ascii_isalpha (*p) && g_ascii_isalpha (*(p + 1)))) {
 				total++;
 			else if (state == got_alpha) {
 				/* Check script */
 				sc = g_unichar_get_script (uc);

 				if (same_script_count > 0) {
 					if (sc != last_sc) {
 						badness += 1.0 / (gdouble)same_script_count;
 						last_sc = sc;
 						same_script_count = 1;
 					}
 					else {
 						same_script_count ++;
 					}
 				}
 				else {
 					last_sc = sc;
 					same_script_count = 1;
 				}
 			}
 			p++;
 			remain--;

 			state = got_alpha;

 		}
 		else if (g_unichar_isdigit (uc)) {
 			state = got_digit;
 			same_script_count = 0;
 		}
 		else {
 			/* We don't care about unknown characters here */
 			state = got_unknown;
 			same_script_count = 0;
 		}

 		nsym ++;
 		p = g_utf8_next_char (p);
 	}

 	/* Try to avoid FP for long words */
 	if (nsym > chartable_module_ctx->max_word_len) {
 		badness = 0;
 	}
 	else {
 		memset (&scripts, 0, sizeof (scripts));
 		while (remain > 0) {
 			c = g_utf8_get_char_validated (p, remain);
 			if (c == (gunichar) - 2 || c == (gunichar) - 1) {
 				/* Invalid characters detected, stop processing */
 				return FALSE;
 			}
 		if (badness > 4.0) {
 			badness = 4.0;
 		}
 	}

 	msg_debug_chartable ("word %T, badness: %.2f", w, badness);

 	return badness;
 }

 			scc = g_unichar_get_script (c);
 			if (scc < (gint)G_N_ELEMENTS (scripts)) {
 				scripts[scc]++;
 static gdouble
 rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w)
 {
 	const gchar *p, *end, *c;
 	gdouble badness = 0.0;
 	enum {
 		ascii = 1,
 		non_ascii
 	} sc, last_sc;
 	gint same_script_count = 0;
 	enum {
 		start_process = 0,
 		got_alpha,
 		got_digit,
 		got_unknown,
 	} state = start_process;

 	p = w->begin;
 	end = p + w->len;
 	c = p;
 	last_sc = 0;

 	if (w->len > chartable_module_ctx->max_word_len) {
 		return 0.0;
 	}

 	/* We assume that w is normalized */
 	while (p < end) {
 		if (g_ascii_isalpha (*p) || *p > 0x7f) {

 			if (state == got_digit) {
 				/* Penalize digit -> alpha translations */
 				badness += 2.0;
 			}
 			p1 = g_utf8_next_char (p);
 			remain -= p1 - p;
 			p = p1;

 			if (remain > 0) {
 				t = g_utf8_get_char_validated (p, remain);
 				if (t == (gunichar) - 2 || t == (gunichar) - 1) {
 					/* Invalid characters detected, stop processing */
 					return FALSE;
 				}
 				sct = g_unichar_get_script (t);
 				if (g_unichar_isalpha (c) && g_unichar_isalpha (t)) {
 					/* We have two unicode alphanumeric characters, so we can check its script */
 					if (sct != scc) {
 						mark++;
 			else if (state == got_alpha) {
 				/* Check script */
 				sc = (*p > 0x7f) ? ascii : non_ascii;

 				if (same_script_count > 0) {
 					if (sc != last_sc) {
 						badness += 1.0 / (gdouble)same_script_count;
 						last_sc = sc;
 						same_script_count = 1;
 					}
 					total++;
 					else {
 						same_script_count ++;
 					}
 				}
 				else {
 					last_sc = sc;
 					same_script_count = 1;
 				}
 				p1 = g_utf8_next_char (p);
 				remain -= p1 - p;
 				p = p1;
 			}

 			state = got_alpha;

 		}
 		/* Detect the mostly charset of this part */
 		for (i = 0; i < G_N_ELEMENTS (scripts); i++) {
 			if (scripts[i] > max) {
 				max = scripts[i];
 				sel = i;
 		else if (g_ascii_isdigit (*p)) {
 			state = got_digit;
 			same_script_count = 0;
 		}
 		else {
 			/* We don't care about unknown characters here */
 			state = got_unknown;
 			same_script_count = 0;
 		}

 		p ++;
 	}

 	if (badness > 4.0) {
 		badness = 4.0;
 	}

 	msg_debug_chartable ("word %T, badness: %.2f", w, badness);

 	return badness;
 }

 static void
 rspamd_chartable_process_part (struct rspamd_task *task,
 		struct rspamd_mime_text_part *part)
 {
 	rspamd_ftok_t *w;
 	guint i;
 	gdouble cur_score = 0.0;

 	if (part->normalized_words->len == 0) {
 		return;
 	}

 	for (i = 0; i < part->normalized_words->len; i++) {
 		w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);

 		if (w->len > 0) {

 			if (IS_PART_UTF (part)) {
 				cur_score += rspamd_chartable_process_word_utf (task, w);
 			}
 			else {
 				cur_score += rspamd_chartable_process_word_ascii (task, w);
 			}
 		}
 		part->script = sel;
 	}

 	if (total == 0) {
 		return 0;
 	cur_score /= (gdouble)part->normalized_words->len;

 	if (cur_score > 2.0) {
 		cur_score = 2.0;
 	}

 	return ((double)mark / (double)total) > chartable_module_ctx->threshold;
 	if (cur_score > chartable_module_ctx->threshold) {
 		rspamd_task_insert_result (task, chartable_module_ctx->symbol,
 				cur_score, NULL);

 	}
 }

 static void
@@ -209,10 +352,7 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)

 	for (i = 0; i < task->text_parts->len; i ++) {
 		part = g_ptr_array_index (task->text_parts, i);

 		if (!IS_PART_EMPTY (part) && check_part (part, task->cfg->raw_mode)) {
 			rspamd_task_insert_result (task, chartable_module_ctx->symbol, 1, NULL);
 		}
 		rspamd_chartable_process_part (task, part);
 	}

 }