From 8857f667412c2db45c5d346575db3eb1cf398b04 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Fri, 30 Nov 2018 15:03:00 +0000 Subject: [PATCH] [Minor] Count words based on text words --- src/libmime/lang_detection.c | 6 +++--- src/libmime/message.c | 8 ++++++-- src/libmime/message.h | 1 + src/lua/lua_mimepart.c | 2 +- src/plugins/chartable.c | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index f5a175ad8..82e5fc2ff 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -1665,11 +1665,11 @@ rspamd_language_detector_detect (struct rspamd_task *task, } if (!ret) { - if (part->utf_words->len < default_short_text_limit) { + if (part->nwords < default_short_text_limit) { r = rs_detect_none; msg_debug_lang_det ("text is too short for trigramms detection: " "%d words; at least %d words required", - (int)part->utf_words->len, + (int)part->nwords, (int)default_short_text_limit); rspamd_language_detector_set_language (task, part, "en"); candidates = kh_init (rspamd_candidates_hash); @@ -1728,7 +1728,7 @@ rspamd_language_detector_detect (struct rspamd_task *task, cbd.std = std; cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; - if (part->utf_words->len < default_words / 2) { + if (part->nwords < default_words / 2) { cbd.flags |= RSPAMD_LANG_FLAG_SHORT; } } diff --git a/src/libmime/message.c b/src/libmime/message.c index 3c29b1170..411b3bf03 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -89,6 +89,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, } } + if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { + part->nwords ++; + } + if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE| RSPAMD_STAT_TOKEN_FLAG_NORMALISED| RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) { @@ -96,7 +100,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, } } - if (part->utf_words && part->utf_words->len) { + if (part->utf_words->len) { gdouble *avg_len_p, *short_len_p; avg_len_p = rspamd_mempool_get_variable (task->task_pool, @@ -1294,7 +1298,7 @@ rspamd_message_process (struct rspamd_task *task) rspamd_mime_part_extract_words (task, text_part); if (text_part->utf_words) { - total_words += text_part->utf_words->len; + total_words += text_part->nwords; } } diff --git a/src/libmime/message.h b/src/libmime/message.h index ed9dfef6e..29f777c3b 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -112,6 +112,7 @@ struct rspamd_mime_text_part { guint flags; guint nlines; guint spaces; + guint nwords; guint non_ascii_chars; guint ascii_chars; guint double_spaces; diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index 14111f760..3617a145b 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -756,7 +756,7 @@ lua_textpart_get_words_count (lua_State *L) lua_pushinteger (L, 0); } else { - lua_pushinteger (L, part->utf_words->len); + lua_pushinteger (L, part->nwords); } return 1; diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c index e48d19ea2..b6e42457a 100644 --- a/src/plugins/chartable.c +++ b/src/plugins/chartable.c @@ -594,7 +594,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, */ part->capital_letters += ncap; - cur_score /= (gdouble)part->utf_words->len; + cur_score /= (gdouble)part->nwords; if (cur_score > 2.0) { cur_score = 2.0; -- 2.39.5