source.dussan.org Git - rspamd.git/commitdiff
[CritFix] Langdet: Fix language detection where no stop words found
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 5 Jun 2019 12:59:45 +0000 (13:59 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Wed, 5 Jun 2019 12:59:45 +0000 (13:59 +0100)
src/libmime/lang_detection.c

index aad01ec8a8d37a906d10cb96136f85e18ed3874e..0312d009be660946cb0c0410055095e430f4eaa4 100644 (file)
@@ -1718,13 +1718,30 @@ rspamd_language_detector_detect (struct rspamd_task *task,
        }
 
        if (!ret) {
-               if (part->nwords < default_short_text_limit) {
+               if (part->utf_words->len < default_short_text_limit) {
                        r = rs_detect_none;
                        msg_debug_lang_det ("text is too short for trigramms detection: "
                                           "%d words; at least %d words required",
-                                       (int)part->nwords,
+                                       (int)part->utf_words->len,
                                        (int)default_short_text_limit);
-                       rspamd_language_detector_set_language (task, part, "en");
+                       switch (cat) {
+                       case RSPAMD_LANGUAGE_CYRILLIC:
+                               rspamd_language_detector_set_language (task, part, "ru");
+                               break;
+                       case RSPAMD_LANGUAGE_DEVANAGARI:
+                               rspamd_language_detector_set_language (task, part, "hi");
+                               break;
+                       case RSPAMD_LANGUAGE_ARAB:
+                               rspamd_language_detector_set_language (task, part, "ar");
+                               break;
+                       default:
+                       case RSPAMD_LANGUAGE_LATIN:
+                               rspamd_language_detector_set_language (task, part, "en");
+                               break;
+                       }
+                       msg_debug_lang_det ("set %s language based on symbols category",
+                                       part->language);
+
                        candidates = kh_init (rspamd_candidates_hash);
                }
                else {