diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-08 16:40:05 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-08 16:40:05 +0100 |
commit | e81e8ffb30b953c42e52e5bf20d97f820e8b08e0 (patch) | |
tree | 9e8b0526d8f20b379ecf28cb06a414dcb02b32fc /src/libmime/lang_detection.c | |
parent | 96fea560db92cebad8837e1721b4c3ca147974cd (diff) | |
download | rspamd-e81e8ffb30b953c42e52e5bf20d97f820e8b08e0.tar.gz rspamd-e81e8ffb30b953c42e52e5bf20d97f820e8b08e0.zip |
[Fix] Fix various corner cases for language detection
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 54 |
1 files changed, 30 insertions, 24 deletions
```diff
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 64a602e7b..fbc5f56c9 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1608,8 +1608,10 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 						candidates);
 
 				if (r == rs_detect_none) {
-					msg_debug_lang_det ("no trigramms found, switch to nothing");
-				} else if (r == rs_detect_multiple) {
+					msg_debug_lang_det ("no trigramms found, fallback to english");
+					rspamd_language_detector_set_language (task, part, "en");
+				}
+				else if (r == rs_detect_multiple) {
 					/* Check our guess */
 
 					mean = 0.0;
@@ -1656,34 +1658,38 @@ rspamd_language_detector_detect (struct rspamd_task *task,
 			}
 
 			/* Now, convert hash to array and sort it */
-			result = g_ptr_array_sized_new (kh_size (candidates));
+			if (r != rs_detect_none && kh_size (candidates) > 0) {
+				result = g_ptr_array_sized_new (kh_size (candidates));
 
-			kh_foreach_value (candidates, cand, {
-				if (!isnan (cand->prob)) {
-					msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
-							cand->prob);
-					g_ptr_array_add (result, cand);
-				}
-			});
+				kh_foreach_value (candidates, cand, {
+					if (!isnan (cand->prob)) {
+						msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
+								cand->prob);
+						g_ptr_array_add (result, cand);
+					}
+				});
 
-			if (frequency_heuristic_applied) {
-				g_ptr_array_sort_with_data (result,
-						rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
-			} else {
-				g_ptr_array_sort (result, rspamd_language_detector_cmp);
-			}
+				if (frequency_heuristic_applied) {
+					g_ptr_array_sort_with_data (result,
+							rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
+				} else {
+					g_ptr_array_sort (result, rspamd_language_detector_cmp);
+				}
 
-			kh_destroy (rspamd_candidates_hash, candidates);
+				if (result->len > 0 && !frequency_heuristic_applied) {
+					cand = g_ptr_array_index (result, 0);
+					cand->elt->occurencies++;
+					d->total_occurencies++;
+				}
 
-			if (result->len > 0 && !frequency_heuristic_applied) {
-				cand = g_ptr_array_index (result, 0);
-				cand->elt->occurencies++;
-				d->total_occurencies++;
+				part->languages = result;
+				ret = TRUE;
+			}
+			else if (part->languages == NULL) {
+				rspamd_language_detector_set_language (task, part, "en");
 			}
 
-			part->languages = result;
-
-			ret = TRUE;
+			kh_destroy (rspamd_candidates_hash, candidates);
 		}
 
 		end_ticks = rspamd_get_ticks (TRUE);
```