summary | refs | log | tree | commit | diff | stats
path: root/src
diff options
context:
space:
mode:
author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-30 15:03:00 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-30 15:03:00 +0000
commit: 8857f667412c2db45c5d346575db3eb1cf398b04 (patch)
tree: 46bcc4157c54ee8811f6618760d1f0ae6c8b6624 /src
parent: 7fde20c6c535c8bee0ba418fb48cb93b0fe089bb (diff)
download: rspamd-8857f667412c2db45c5d346575db3eb1cf398b04.tar.gz
download: rspamd-8857f667412c2db45c5d346575db3eb1cf398b04.zip
[Minor] Count words based on text words
Diffstat (limited to 'src')
-rw-r--r--  src/libmime/lang_detection.c  6
-rw-r--r--  src/libmime/message.c         8
-rw-r--r--  src/libmime/message.h         1
-rw-r--r--  src/lua/lua_mimepart.c        2
-rw-r--r--  src/plugins/chartable.c       2
5 files changed, 12 insertions, 7 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index f5a175ad8..82e5fc2ff 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1665,11 +1665,11 @@ rspamd_language_detector_detect (struct rspamd_task *task,
}
if (!ret) {
- if (part->utf_words->len < default_short_text_limit) {
+ if (part->nwords < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det ("text is too short for trigramms detection: "
"%d words; at least %d words required",
- (int)part->utf_words->len,
+ (int)part->nwords,
(int)default_short_text_limit);
rspamd_language_detector_set_language (task, part, "en");
candidates = kh_init (rspamd_candidates_hash);
@@ -1728,7 +1728,7 @@ rspamd_language_detector_detect (struct rspamd_task *task,
cbd.std = std;
cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
- if (part->utf_words->len < default_words / 2) {
+ if (part->nwords < default_words / 2) {
cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
}
}
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 3c29b1170..411b3bf03 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -89,6 +89,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
}
}
+ if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ part->nwords ++;
+ }
+
if (w->flags & (RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE|
RSPAMD_STAT_TOKEN_FLAG_NORMALISED|
RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES)) {
@@ -96,7 +100,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
}
}
- if (part->utf_words && part->utf_words->len) {
+ if (part->utf_words->len) {
gdouble *avg_len_p, *short_len_p;
avg_len_p = rspamd_mempool_get_variable (task->task_pool,
@@ -1294,7 +1298,7 @@ rspamd_message_process (struct rspamd_task *task)
rspamd_mime_part_extract_words (task, text_part);
if (text_part->utf_words) {
- total_words += text_part->utf_words->len;
+ total_words += text_part->nwords;
}
}
diff --git a/src/libmime/message.h b/src/libmime/message.h
index ed9dfef6e..29f777c3b 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -112,6 +112,7 @@ struct rspamd_mime_text_part {
guint flags;
guint nlines;
guint spaces;
+ guint nwords;
guint non_ascii_chars;
guint ascii_chars;
guint double_spaces;
diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c
index 14111f760..3617a145b 100644
--- a/src/lua/lua_mimepart.c
+++ b/src/lua/lua_mimepart.c
@@ -756,7 +756,7 @@ lua_textpart_get_words_count (lua_State *L)
lua_pushinteger (L, 0);
}
else {
- lua_pushinteger (L, part->utf_words->len);
+ lua_pushinteger (L, part->nwords);
}
return 1;
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index e48d19ea2..b6e42457a 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -594,7 +594,7 @@ rspamd_chartable_process_part (struct rspamd_task *task,
*/
part->capital_letters += ncap;
- cur_score /= (gdouble)part->utf_words->len;
+ cur_score /= (gdouble)part->nwords;
if (cur_score > 2.0) {
cur_score = 2.0;