source.dussan.org Git - rspamd.git/commitdiff
[Fix] Fix various corner cases for language detection
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 8 Sep 2018 15:40:05 +0000 (16:40 +0100)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 8 Sep 2018 15:40:05 +0000 (16:40 +0100)
src/libmime/lang_detection.c
src/libmime/message.c

index 64a602e7b7a6e959f9d00adc8ba39cf33e273636..fbc5f56c909dccb3d2e0b770cf899068df700a19 100644 (file)
@@ -1608,8 +1608,10 @@ rspamd_language_detector_detect (struct rspamd_task *task,
                                candidates);
 
                if (r == rs_detect_none) {
-                       msg_debug_lang_det ("no trigramms found, switch to nothing");
-               } else if (r == rs_detect_multiple) {
+                       msg_debug_lang_det ("no trigramms found, fallback to english");
+                       rspamd_language_detector_set_language (task, part, "en");
+               }
+               else if (r == rs_detect_multiple) {
                        /* Check our guess */
 
                        mean = 0.0;
@@ -1656,34 +1658,38 @@ rspamd_language_detector_detect (struct rspamd_task *task,
                }
 
                /* Now, convert hash to array and sort it */
-               result = g_ptr_array_sized_new (kh_size (candidates));
+               if (r != rs_detect_none && kh_size (candidates) > 0) {
+                       result = g_ptr_array_sized_new (kh_size (candidates));
 
-               kh_foreach_value (candidates, cand, {
-                       if (!isnan (cand->prob)) {
-                               msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
-                                               cand->prob);
-                               g_ptr_array_add (result, cand);
-                       }
-               });
+                       kh_foreach_value (candidates, cand, {
+                               if (!isnan (cand->prob)) {
+                                       msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
+                                                       cand->prob);
+                                       g_ptr_array_add (result, cand);
+                               }
+                       });
 
-               if (frequency_heuristic_applied) {
-                       g_ptr_array_sort_with_data (result,
-                                       rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
-               } else {
-                       g_ptr_array_sort (result, rspamd_language_detector_cmp);
-               }
+                       if (frequency_heuristic_applied) {
+                               g_ptr_array_sort_with_data (result,
+                                               rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
+                       } else {
+                               g_ptr_array_sort (result, rspamd_language_detector_cmp);
+                       }
 
-               kh_destroy (rspamd_candidates_hash, candidates);
+                       if (result->len > 0 && !frequency_heuristic_applied) {
+                               cand = g_ptr_array_index (result, 0);
+                               cand->elt->occurencies++;
+                               d->total_occurencies++;
+                       }
 
-               if (result->len > 0 && !frequency_heuristic_applied) {
-                       cand = g_ptr_array_index (result, 0);
-                       cand->elt->occurencies++;
-                       d->total_occurencies++;
+                       part->languages = result;
+                       ret = TRUE;
+               }
+               else if (part->languages == NULL) {
+                       rspamd_language_detector_set_language (task, part, "en");
                }
 
-               part->languages = result;
-
-               ret = TRUE;
+               kh_destroy (rspamd_candidates_hash, candidates);
        }
 
        end_ticks = rspamd_get_ticks (TRUE);
index 70a08a06ff55a5822cd79cbde7f0f2b513c7620b..0d4581ad7bd65d747b3556d5a9564258a03df92c 100644 (file)
@@ -223,13 +223,17 @@ rspamd_mime_part_detect_language (struct rspamd_task *task,
 {
        struct rspamd_lang_detector_res *lang;
 
-       if (part->utf_words && task->lang_det) {
+       if (!IS_PART_EMPTY (part) && part->utf_words && part->utf_words->len > 0 &&
+                       task->lang_det) {
                if (rspamd_language_detector_detect (task, task->lang_det, part)) {
                        lang = g_ptr_array_index (part->languages, 0);
                        part->language = lang->lang;
 
                        msg_info_task ("detected part language: %s", part->language);
                }
+               else {
+                       part->language = "en"; /* Safe fallback */
+               }
        }
 }