Browse Source

[Rework] Make chartable module useful

tags/1.3.0
Vsevolod Stakhov 7 years ago
parent
commit
0d43251bc6
1 changed files with 208 additions and 68 deletions
  1. 208
    68
      src/plugins/chartable.c

+ 208
- 68
src/plugins/chartable.c View File

@@ -29,6 +29,23 @@
#define DEFAULT_SYMBOL "R_CHARSET_MIXED"
#define DEFAULT_THRESHOLD 0.1

/*
 * Module-local logging helpers.
 *
 * Every helper forwards its printf-style arguments to
 * rspamd_default_log_function () together with the "chartable" module name,
 * the uid tag of the current task's memory pool and the name of the calling
 * function. A variable named `task` must therefore be in scope at each use.
 */
#define chartable_log_at_level(level, ...) \
	rspamd_default_log_function ((level), \
		"chartable", task->task_pool->tag.uid, \
		G_STRFUNC, \
		__VA_ARGS__)

/* Critical-level message */
#define msg_err_chartable(...) \
	chartable_log_at_level (G_LOG_LEVEL_CRITICAL, __VA_ARGS__)
/* Warning-level message */
#define msg_warn_chartable(...) \
	chartable_log_at_level (G_LOG_LEVEL_WARNING, __VA_ARGS__)
/* Info-level message */
#define msg_info_chartable(...) \
	chartable_log_at_level (G_LOG_LEVEL_INFO, __VA_ARGS__)
/* Debug-level message */
#define msg_debug_chartable(...) \
	chartable_log_at_level (G_LOG_LEVEL_DEBUG, __VA_ARGS__)

/* Initialization */
gint chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx);
gint chartable_module_config (struct rspamd_config *cfg);
@@ -47,6 +64,7 @@ struct chartable_ctx {
struct module_ctx ctx;
const gchar *symbol;
double threshold;
guint max_word_len;

rspamd_mempool_t *chartable_pool;
};
@@ -60,6 +78,7 @@ chartable_module_init (struct rspamd_config *cfg, struct module_ctx **ctx)
chartable_module_ctx = g_malloc (sizeof (struct chartable_ctx));

chartable_module_ctx->chartable_pool = rspamd_mempool_new (rspamd_mempool_suggest_size (), NULL);
chartable_module_ctx->max_word_len = 10;

*ctx = (struct module_ctx *)chartable_module_ctx;

@@ -94,6 +113,13 @@ chartable_module_config (struct rspamd_config *cfg)
else {
chartable_module_ctx->threshold = DEFAULT_THRESHOLD;
}
/*
 * Optional "max_word_len" setting: words with more symbols than this are
 * not scored at all.
 */
if ((value =
	rspamd_config_get_module_opt (cfg, "chartable", "max_word_len")) != NULL) {
	chartable_module_ctx->max_word_len = ucl_object_toint (value);
}
else {
	/*
	 * Fall back to the same default that chartable_module_init () uses.
	 * This branch must not touch `threshold`: doing so would silently
	 * discard a threshold configured just above whenever max_word_len
	 * is left unset.
	 */
	chartable_module_ctx->max_word_len = 10;
}

rspamd_symbols_cache_add_symbol (cfg->cache,
chartable_module_ctx->symbol,
@@ -117,88 +143,205 @@ chartable_module_reconfig (struct rspamd_config *cfg)
return chartable_module_config (cfg);
}

static gboolean
check_part (struct rspamd_mime_text_part *part, gboolean raw_mode)
/*
 * Computes a "badness" score for a single UTF-8 word.
 *
 * The word is walked character by character:
 *  - a letter that directly follows a digit adds 1.0;
 *  - a letter that directly follows another letter has its Unicode script
 *    compared with the script of the current run; a script change adds
 *    1.0 / (length of the run that just ended) and starts a new run;
 *  - digits and any other characters reset the current script run.
 *
 * Note that the script of the first letter after a non-letter is not
 * recorded: the run is started by the second consecutive letter.
 *
 * @param task task being processed (used for logging only)
 * @param w word to check; w->begin/w->len delimit the data, no terminating
 *          NUL is required
 * @return 0 for words longer than max_word_len symbols, otherwise the
 *         accumulated badness capped at 4.0
 */
static gdouble
rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_ftok_t *w)
{
	const gchar *p, *end;
	gdouble badness = 0.0;
	gunichar uc;
	gint sc, last_sc = 0;
	guint same_script_count = 0, nsym = 0;
	enum {
		start_process = 0,
		got_alpha,
		got_digit,
		got_unknown,
	} state = start_process;

	p = w->begin;
	end = p + w->len;

	/* We assume that w is normalized, but never read outside of [p, end) */
	while (p < end) {
		uc = g_utf8_get_char_validated (p, end - p);

		if (uc == (gunichar) -1 || uc == (gunichar) -2) {
			/*
			 * Invalid or truncated sequence: handle the byte as an unknown
			 * symbol and resynchronise on the next byte instead of trusting
			 * the lead byte's length.
			 */
			state = got_unknown;
			same_script_count = 0;
			nsym ++;
			p ++;

			continue;
		}

		if (g_unichar_isalpha (uc)) {

			if (state == got_digit) {
				/* Penalize digit -> alpha translations */
				badness += 1.0;
			}
			else if (state == got_alpha) {
				/* Check script */
				sc = g_unichar_get_script (uc);

				if (same_script_count > 0) {
					if (sc != last_sc) {
						/* Short runs of one script are penalized harder */
						badness += 1.0 / (gdouble)same_script_count;
						last_sc = sc;
						same_script_count = 1;
					}
					else {
						same_script_count ++;
					}
				}
				else {
					last_sc = sc;
					same_script_count = 1;
				}
			}

			state = got_alpha;

		}
		else if (g_unichar_isdigit (uc)) {
			state = got_digit;
			same_script_count = 0;
		}
		else {
			/* We don't care about unknown characters here */
			state = got_unknown;
			same_script_count = 0;
		}

		nsym ++;
		/* Safe: the sequence at p has been validated to end before `end` */
		p = g_utf8_next_char (p);
	}

	/* Try to avoid FP for long words */
	if (nsym > chartable_module_ctx->max_word_len) {
		badness = 0;
	}
	else {
		if (badness > 4.0) {
			badness = 4.0;
		}
	}

	msg_debug_chartable ("word %T, badness: %.2f", w, badness);

	return badness;
}

scc = g_unichar_get_script (c);
if (scc < (gint)G_N_ELEMENTS (scripts)) {
scripts[scc]++;
/*
 * Computes a "badness" score for a single word of a non-UTF part.
 *
 * The word is walked byte by byte. ASCII letters and bytes above 0x7f are
 * both treated as letters:
 *  - a letter that directly follows a digit adds 2.0;
 *  - a letter that directly follows another letter is classified as ASCII
 *    or 8-bit and compared with the class of the current run; a class change
 *    adds 1.0 / (length of the run that just ended) and starts a new run;
 *  - digits and any other bytes reset the current run.
 *
 * Bytes are read as unsigned values: with a plain (possibly signed) char the
 * `> 0x7f` test could never be true and 8-bit letters would be ignored.
 *
 * @param task task being processed (used for logging only)
 * @param w word to check; w->begin/w->len delimit the data
 * @return 0.0 for words longer than max_word_len bytes, otherwise the
 *         accumulated badness capped at 4.0
 */
static gdouble
rspamd_chartable_process_word_ascii (struct rspamd_task *task, rspamd_ftok_t *w)
{
	const guchar *p, *end;
	gdouble badness = 0.0;
	enum {
		ascii = 1,
		non_ascii
	} sc, last_sc = ascii;
	gint same_script_count = 0;
	enum {
		start_process = 0,
		got_alpha,
		got_digit,
		got_unknown,
	} state = start_process;

	/* Try to avoid FP for long words */
	if (w->len > chartable_module_ctx->max_word_len) {
		return 0.0;
	}

	p = (const guchar *)w->begin;
	end = p + w->len;

	/* We assume that w is normalized */
	while (p < end) {
		if (g_ascii_isalpha (*p) || *p > 0x7f) {

			if (state == got_digit) {
				/* Penalize digit -> alpha translations */
				badness += 2.0;
			}
			else if (state == got_alpha) {
				/* Check script */
				sc = (*p > 0x7f) ? non_ascii : ascii;

				if (same_script_count > 0) {
					if (sc != last_sc) {
						/* Short runs of one class are penalized harder */
						badness += 1.0 / (gdouble)same_script_count;
						last_sc = sc;
						same_script_count = 1;
					}
					else {
						same_script_count ++;
					}
				}
				else {
					last_sc = sc;
					same_script_count = 1;
				}
			}

			state = got_alpha;

		}
		else if (g_ascii_isdigit (*p)) {
			state = got_digit;
			same_script_count = 0;
		}
		else {
			/* We don't care about unknown characters here */
			state = got_unknown;
			same_script_count = 0;
		}

		p ++;
	}

	if (badness > 4.0) {
		badness = 4.0;
	}

	msg_debug_chartable ("word %T, badness: %.2f", w, badness);

	return badness;
}

/*
 * Scores one text part and inserts the module symbol when the score is
 * above the configured threshold.
 *
 * Every non-empty normalized word is scored with the UTF or the ASCII word
 * checker, depending on IS_PART_UTF (part). The sum is divided by the total
 * number of normalized words (empty ones included) and capped at 2.0; the
 * resulting value is used as the symbol weight.
 *
 * @param task task being processed
 * @param part text part to check; parts without normalized words are ignored
 */
static void
rspamd_chartable_process_part (struct rspamd_task *task,
		struct rspamd_mime_text_part *part)
{
	rspamd_ftok_t *w;
	guint i;
	gdouble cur_score = 0.0;

	/*
	 * The caller passes every text part without filtering, so guard against
	 * parts that carry no word array at all, not only an empty one.
	 */
	if (part == NULL || part->normalized_words == NULL ||
			part->normalized_words->len == 0) {
		return;
	}

	for (i = 0; i < part->normalized_words->len; i++) {
		w = &g_array_index (part->normalized_words, rspamd_ftok_t, i);

		if (w->len > 0) {

			if (IS_PART_UTF (part)) {
				cur_score += rspamd_chartable_process_word_utf (task, w);
			}
			else {
				cur_score += rspamd_chartable_process_word_ascii (task, w);
			}
		}
	}

	/* Average badness per word; len is known to be non-zero here */
	cur_score /= (gdouble)part->normalized_words->len;

	if (cur_score > 2.0) {
		cur_score = 2.0;
	}

	if (cur_score > chartable_module_ctx->threshold) {
		rspamd_task_insert_result (task, chartable_module_ctx->symbol,
				cur_score, NULL);

	}
}

static void
@@ -209,10 +352,7 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)

for (i = 0; i < task->text_parts->len; i ++) {
part = g_ptr_array_index (task->text_parts, i);

if (!IS_PART_EMPTY (part) && check_part (part, task->cfg->raw_mode)) {
rspamd_task_insert_result (task, chartable_module_ctx->symbol, 1, NULL);
}
rspamd_chartable_process_part (task, part);
}

}

Loading…
Cancel
Save