about summary refs log tree commit diff stats
path: root/src/libmime/lang_detection.c
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-08 16:40:05 +0100
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-09-08 16:40:05 +0100
commit e81e8ffb30b953c42e52e5bf20d97f820e8b08e0 (patch)
tree 9e8b0526d8f20b379ecf28cb06a414dcb02b32fc /src/libmime/lang_detection.c
parent 96fea560db92cebad8837e1721b4c3ca147974cd (diff)
download rspamd-e81e8ffb30b953c42e52e5bf20d97f820e8b08e0.tar.gz
rspamd-e81e8ffb30b953c42e52e5bf20d97f820e8b08e0.zip
[Fix] Fix various corner cases for language detection
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- src/libmime/lang_detection.c 54
1 files changed, 30 insertions, 24 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 64a602e7b..fbc5f56c9 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1608,8 +1608,10 @@ rspamd_language_detector_detect (struct rspamd_task *task,
candidates);
if (r == rs_detect_none) {
- msg_debug_lang_det ("no trigramms found, switch to nothing");
- } else if (r == rs_detect_multiple) {
+ msg_debug_lang_det ("no trigramms found, fallback to english");
+ rspamd_language_detector_set_language (task, part, "en");
+ }
+ else if (r == rs_detect_multiple) {
/* Check our guess */
mean = 0.0;
@@ -1656,34 +1658,38 @@ rspamd_language_detector_detect (struct rspamd_task *task,
}
/* Now, convert hash to array and sort it */
- result = g_ptr_array_sized_new (kh_size (candidates));
+ if (r != rs_detect_none && kh_size (candidates) > 0) {
+ result = g_ptr_array_sized_new (kh_size (candidates));
- kh_foreach_value (candidates, cand, {
- if (!isnan (cand->prob)) {
- msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
- cand->prob);
- g_ptr_array_add (result, cand);
- }
- });
+ kh_foreach_value (candidates, cand, {
+ if (!isnan (cand->prob)) {
+ msg_debug_lang_det ("final probability %s -> %.2f", cand->lang,
+ cand->prob);
+ g_ptr_array_add (result, cand);
+ }
+ });
- if (frequency_heuristic_applied) {
- g_ptr_array_sort_with_data (result,
- rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
- } else {
- g_ptr_array_sort (result, rspamd_language_detector_cmp);
- }
+ if (frequency_heuristic_applied) {
+ g_ptr_array_sort_with_data (result,
+ rspamd_language_detector_cmp_heuristic, (gpointer) &cbd);
+ } else {
+ g_ptr_array_sort (result, rspamd_language_detector_cmp);
+ }
- kh_destroy (rspamd_candidates_hash, candidates);
+ if (result->len > 0 && !frequency_heuristic_applied) {
+ cand = g_ptr_array_index (result, 0);
+ cand->elt->occurencies++;
+ d->total_occurencies++;
+ }
- if (result->len > 0 && !frequency_heuristic_applied) {
- cand = g_ptr_array_index (result, 0);
- cand->elt->occurencies++;
- d->total_occurencies++;
+ part->languages = result;
+ ret = TRUE;
+ }
+ else if (part->languages == NULL) {
+ rspamd_language_detector_set_language (task, part, "en");
}
- part->languages = result;
-
- ret = TRUE;
+ kh_destroy (rspamd_candidates_hash, candidates);
}
end_ticks = rspamd_get_ticks (TRUE);