aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime/lang_detection.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2019-06-05 13:59:45 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2019-06-05 13:59:45 +0100
commit 4bba6e33a91047f1ea3a0360e6a4480d4b51d26f (patch)
tree 728d54fadc43872cf4ac113528911e1da0bf19ad /src/libmime/lang_detection.c
parent 5fea03d7fb799392ba290643b27e585e49d5e590 (diff)
download rspamd-4bba6e33a91047f1ea3a0360e6a4480d4b51d26f.tar.gz
rspamd-4bba6e33a91047f1ea3a0360e6a4480d4b51d26f.zip
[CritFix] Langdet: Fix language detection where no stop words found
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- src/libmime/lang_detection.c 23
1 files changed, 20 insertions, 3 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index aad01ec8a..0312d009b 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1718,13 +1718,30 @@ rspamd_language_detector_detect (struct rspamd_task *task,
}
if (!ret) {
- if (part->nwords < default_short_text_limit) {
+ if (part->utf_words->len < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det ("text is too short for trigramms detection: "
"%d words; at least %d words required",
- (int)part->nwords,
+ (int)part->utf_words->len,
(int)default_short_text_limit);
- rspamd_language_detector_set_language (task, part, "en");
+ switch (cat) {
+ case RSPAMD_LANGUAGE_CYRILLIC:
+ rspamd_language_detector_set_language (task, part, "ru");
+ break;
+ case RSPAMD_LANGUAGE_DEVANAGARI:
+ rspamd_language_detector_set_language (task, part, "hi");
+ break;
+ case RSPAMD_LANGUAGE_ARAB:
+ rspamd_language_detector_set_language (task, part, "ar");
+ break;
+ default:
+ case RSPAMD_LANGUAGE_LATIN:
+ rspamd_language_detector_set_language (task, part, "en");
+ break;
+ }
+ msg_debug_lang_det ("set %s language based on symbols category",
+ part->language);
+
candidates = kh_init (rspamd_candidates_hash);
}
else {