From ffa32bb64d0fc7bb1bc7f5087927fbc7ccf30651 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 23 Sep 2017 13:44:57 +0100 Subject: [PATCH] [Feature] Add more text attributes --- src/libmime/message.c | 10 ++++++++++ src/libmime/message.h | 2 ++ src/plugins/chartable.c | 28 ++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/src/libmime/message.c b/src/libmime/message.c index a22f51912..ce53c15f9 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -202,6 +202,7 @@ rspamd_extract_words (struct rspamd_task *task, gchar *temp_word; const guchar *r; guint i, nlen, total_len = 0, short_len = 0; + gdouble avg_len = 0; #ifdef WITH_SNOWBALL static GHashTable *stemmers = NULL; @@ -252,6 +253,8 @@ rspamd_extract_words (struct rspamd_task *task, #endif if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { + avg_len = avg_len + (w->len - avg_len) / (double)i; + if (r != NULL) { nlen = strlen (r); nlen = MIN (nlen, w->len); @@ -462,6 +465,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, part->non_ascii_chars ++; } else { + if (g_ascii_isupper (*p)) { + part->capital_letters ++; + } + else if (g_ascii_isdigit (*p)) { + part->numeric_characters ++; + } + part->ascii_chars ++; } } diff --git a/src/libmime/message.h b/src/libmime/message.h index 8dc06eb3a..3092f3da5 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -101,6 +101,8 @@ struct rspamd_mime_text_part { guint double_spaces; guint non_spaces; guint empty_lines; + guint capital_letters; + guint numeric_characters; }; enum rspamd_received_type { diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index 3391fa996..95145ac9c 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -170,7 +170,8 @@ chartable_module_reconfig (struct rspamd_config *cfg) static gdouble rspamd_chartable_process_word_utf (struct rspamd_task *task, rspamd_stat_token_t *w, - gboolean is_url) + gboolean is_url, + guint *ncap) { const gchar *p, *end; gdouble badness = 0.0; @@ -208,6 +209,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task, sc = UBLOCK_BASIC_LATIN; } + if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) { + if (ncap) { + (*ncap) ++; + } + } + if (state == got_digit) { /* Penalize digit -> alpha translations */ if (!is_url && sc != UBLOCK_BASIC_LATIN && @@ -363,7 +370,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, struct rspamd_mime_text_part *part) { rspamd_stat_token_t *w; - guint i; + guint i, ncap = 0; gdouble cur_score = 0.0; if (part == NULL || part->normalized_words == NULL || @@ -377,7 +384,8 @@ rspamd_chartable_process_part (struct rspamd_task *task, if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { if (IS_PART_UTF (part)) { - cur_score += rspamd_chartable_process_word_utf (task, w, FALSE); + cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, + &ncap); } else { cur_score += rspamd_chartable_process_word_ascii (task, w, FALSE); @@ -385,6 +393,13 @@ rspamd_chartable_process_part (struct rspamd_task *task, } } + /* + * TODO: perhaps, we should do this analysis somewhere else and get + * something like: representing classes for all + * symbols in the text + */ + part->capital_letters += ncap; + cur_score /= (gdouble)part->normalized_words->len; if (cur_score > 2.0) { @@ -425,7 +440,8 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused) if (words && words->len > 0) { for (i = 0; i < words->len; i++) { w = &g_array_index (words, rspamd_stat_token_t, i); - cur_score += rspamd_chartable_process_word_utf (task, w, FALSE); + cur_score += rspamd_chartable_process_word_utf (task, w, FALSE, + NULL); } cur_score /= (gdouble)words->len; @@ -471,7 +487,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused) w.len = u->hostlen; if (g_utf8_validate (w.begin, w.len, NULL)) { - cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE); + cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL); } else { cur_score += rspamd_chartable_process_word_ascii (task, &w, TRUE); @@ -494,7 +510,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused) w.len = u->hostlen; if (g_utf8_validate (w.begin, w.len, NULL)) { - cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE); + cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL); } else { cur_score += rspamd_chartable_process_word_ascii (task, &w, TRUE); -- 2.39.5