summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-09-23 13:44:57 +0100
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2017-09-23 13:44:57 +0100
commit: ffa32bb64d0fc7bb1bc7f5087927fbc7ccf30651 (patch)
tree: 38aac8587a9644d989b54c964be6e83946252a5e /src
parent: 011a2b7a5ded539e384c5851f1b12d131aedc0f1 (diff)
download: rspamd-ffa32bb64d0fc7bb1bc7f5087927fbc7ccf30651.tar.gz
rspamd-ffa32bb64d0fc7bb1bc7f5087927fbc7ccf30651.zip
[Feature] Add more text attributes
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/message.c 10
-rw-r--r-- src/libmime/message.h 2
-rw-r--r-- src/plugins/chartable.c 28
3 files changed, 34 insertions, 6 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index a22f51912..ce53c15f9 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -202,6 +202,7 @@ rspamd_extract_words (struct rspamd_task *task,
gchar *temp_word;
const guchar *r;
guint i, nlen, total_len = 0, short_len = 0;
+ gdouble avg_len = 0;
#ifdef WITH_SNOWBALL
static GHashTable *stemmers = NULL;
@@ -252,6 +253,8 @@ rspamd_extract_words (struct rspamd_task *task,
#endif
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+ avg_len = avg_len + (w->len - avg_len) / (double)i;
+
if (r != NULL) {
nlen = strlen (r);
nlen = MIN (nlen, w->len);
@@ -462,6 +465,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe,
part->non_ascii_chars ++;
}
else {
+ if (g_ascii_isupper (*p)) {
+ part->capital_letters ++;
+ }
+ else if (g_ascii_isdigit (*p)) {
+ part->numeric_characters ++;
+ }
+
part->ascii_chars ++;
}
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 8dc06eb3a..3092f3da5 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -101,6 +101,8 @@ struct rspamd_mime_text_part {
guint double_spaces;
guint non_spaces;
guint empty_lines;
+ guint capital_letters;
+ guint numeric_characters;
};
enum rspamd_received_type {
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 3391fa996..95145ac9c 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -170,7 +170,8 @@ chartable_module_reconfig (struct rspamd_config *cfg)
static gdouble
rspamd_chartable_process_word_utf (struct rspamd_task *task,
rspamd_stat_token_t *w,
- gboolean is_url)
+ gboolean is_url,
+ guint *ncap)
{
const gchar *p, *end;
gdouble badness = 0.0;
@@ -208,6 +209,12 @@ rspamd_chartable_process_word_utf (struct rspamd_task *task,
sc = UBLOCK_BASIC_LATIN;
}
+ if (sc != UBLOCK_BASIC_LATIN && u_isupper (uc)) {
+ if (ncap) {
+ (*ncap) ++;
+ }
+ }
+
if (state == got_digit) {
/* Penalize digit -> alpha translations */
if (!is_url && sc != UBLOCK_BASIC_LATIN &&
@@ -363,7 +370,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
struct rspamd_mime_text_part *part)
{
rspamd_stat_token_t *w;
- guint i;
+ guint i, ncap = 0;
gdouble cur_score = 0.0;
if (part == NULL || part->normalized_words == NULL ||
@@ -377,7 +384,8 @@ rspamd_chartable_process_part (struct rspamd_task *task,
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
if (IS_PART_UTF (part)) {
- cur_score += rspamd_chartable_process_word_utf (task, w, FALSE);
+ cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
+ &ncap);
}
else {
cur_score += rspamd_chartable_process_word_ascii (task, w, FALSE);
@@ -385,6 +393,13 @@ rspamd_chartable_process_part (struct rspamd_task *task,
}
}
+ /*
+ * TODO: perhaps, we should do this analysis somewhere else and get
+ * something like: <SYM_SC><SYM_SC><SYM_SC> representing classes for all
+ * symbols in the text
+ */
+ part->capital_letters += ncap;
+
cur_score /= (gdouble)part->normalized_words->len;
if (cur_score > 2.0) {
@@ -425,7 +440,8 @@ chartable_symbol_callback (struct rspamd_task *task, void *unused)
if (words && words->len > 0) {
for (i = 0; i < words->len; i++) {
w = &g_array_index (words, rspamd_stat_token_t, i);
- cur_score += rspamd_chartable_process_word_utf (task, w, FALSE);
+ cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
+ NULL);
}
cur_score /= (gdouble)words->len;
@@ -471,7 +487,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused)
w.len = u->hostlen;
if (g_utf8_validate (w.begin, w.len, NULL)) {
- cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE);
+ cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL);
}
else {
cur_score += rspamd_chartable_process_word_ascii (task, &w, TRUE);
@@ -494,7 +510,7 @@ chartable_url_symbol_callback (struct rspamd_task *task, void *unused)
w.len = u->hostlen;
if (g_utf8_validate (w.begin, w.len, NULL)) {
- cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE);
+ cur_score += rspamd_chartable_process_word_utf (task, &w, TRUE, NULL);
}
else {
cur_score += rspamd_chartable_process_word_ascii (task, &w, TRUE);