diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-06-05 13:59:45 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2019-06-05 13:59:45 +0100 |
commit | 4bba6e33a91047f1ea3a0360e6a4480d4b51d26f (patch) | |
tree | 728d54fadc43872cf4ac113528911e1da0bf19ad /src/libmime/lang_detection.c | |
parent | 5fea03d7fb799392ba290643b27e585e49d5e590 (diff) | |
download | rspamd-4bba6e33a91047f1ea3a0360e6a4480d4b51d26f.tar.gz rspamd-4bba6e33a91047f1ea3a0360e6a4480d4b51d26f.zip |
[CritFix] Langdet: Fix language detection where no stop words found
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 23 |
1 files changed, 20 insertions, 3 deletions
```diff
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index aad01ec8a..0312d009b 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1718,13 +1718,30 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 	}
 
 	if (!ret) {
-		if (part->nwords < default_short_text_limit) {
+		if (part->utf_words->len < default_short_text_limit) {
 			r = rs_detect_none;
 			msg_debug_lang_det ("text is too short for trigramms detection: "
 					"%d words; at least %d words required",
-					(int)part->nwords,
+					(int)part->utf_words->len,
 					(int)default_short_text_limit);
-			rspamd_language_detector_set_language (task, part, "en");
+			switch (cat) {
+			case RSPAMD_LANGUAGE_CYRILLIC:
+				rspamd_language_detector_set_language (task, part, "ru");
+				break;
+			case RSPAMD_LANGUAGE_DEVANAGARI:
+				rspamd_language_detector_set_language (task, part, "hi");
+				break;
+			case RSPAMD_LANGUAGE_ARAB:
+				rspamd_language_detector_set_language (task, part, "ar");
+				break;
+			default:
+			case RSPAMD_LANGUAGE_LATIN:
+				rspamd_language_detector_set_language (task, part, "en");
+				break;
+			}
+			msg_debug_lang_det ("set %s language based on symbols category",
+					part->language);
+
 			candidates = kh_init (rspamd_candidates_hash);
 		}
 		else {
```