diff options
-rw-r--r-- | src/libmime/lang_detection.c | 171 |
1 files changed, 97 insertions, 74 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index c3a647507..4796e4834 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -1822,7 +1822,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, struct rspamd_lang_detector *d, struct rspamd_mime_text_part *part) { - khash_t(rspamd_candidates_hash) * candidates; + khash_t(rspamd_candidates_hash) *candidates = NULL; GPtrArray *result; double mean, std, start_ticks, end_ticks; unsigned int cand_len; @@ -1831,7 +1831,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, enum rspamd_language_detected_type r; struct rspamd_frequency_sort_cbdata cbd; /* Check if we have sorted candidates based on frequency */ - gboolean frequency_heuristic_applied = FALSE, ret = FALSE; + gboolean frequency_heuristic_applied = FALSE, ret = FALSE, internal_heuristic_applied = FALSE; if (!part->utf_stripped_content) { return FALSE; @@ -1854,6 +1854,8 @@ rspamd_language_detector_detect(struct rspamd_task *task, if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) { ret = TRUE; } + + internal_heuristic_applied = TRUE; } if (!ret) { @@ -1906,91 +1908,110 @@ rspamd_language_detector_detect(struct rspamd_task *task, rspamd_fasttext_predict_result_destroy(fasttext_predict_result); } - if (ndetected == 0) { - if (part->utf_words->len < default_short_text_limit) { - r = rs_detect_none; - msg_debug_lang_det("text is too short for trigrams detection: " - "%d words; at least %d words required", - (int) part->utf_words->len, - (int) default_short_text_limit); - switch (cat) { - case RSPAMD_LANGUAGE_CYRILLIC: - rspamd_language_detector_set_language(task, part, "ru", NULL); - break; - case RSPAMD_LANGUAGE_DEVANAGARI: - rspamd_language_detector_set_language(task, part, "hi", NULL); - break; - case RSPAMD_LANGUAGE_ARAB: - rspamd_language_detector_set_language(task, part, "ar", NULL); - break; - default: - case RSPAMD_LANGUAGE_LATIN: - rspamd_language_detector_set_language(task, part, "en", NULL); - break; + else { + /* Fasttext has failed to apply anything */ + r = rs_detect_none; + + if (!internal_heuristic_applied) { + /* Apply unicode scripts heuristic */ + if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) { + ret = TRUE; } - msg_debug_lang_det("set %s language based on symbols category", - part->language); - candidates = kh_init(rspamd_candidates_hash); + cat = rspamd_language_detector_get_category(part->unicode_scripts); + + if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) { + ret = TRUE; + } + + internal_heuristic_applied = TRUE; } - else { + + if (!ret) { + /* Apply trigramms detection */ candidates = kh_init(rspamd_candidates_hash); - kh_resize(rspamd_candidates_hash, candidates, 32); - - r = rspamd_language_detector_try_ngramm(task, - default_words, - d, - part->utf_words, - cat, - candidates, - part); - - if (r == rs_detect_none) { - msg_debug_lang_det("no trigrams found, fallback to english"); - rspamd_language_detector_set_language(task, part, "en", NULL); + if (part->utf_words->len < default_short_text_limit) { + r = rs_detect_none; + msg_debug_lang_det("text is too short for trigrams detection: " + "%d words; at least %d words required", + (int) part->utf_words->len, + (int) default_short_text_limit); + switch (cat) { + case RSPAMD_LANGUAGE_CYRILLIC: + rspamd_language_detector_set_language(task, part, "ru", NULL); + break; + case RSPAMD_LANGUAGE_DEVANAGARI: + rspamd_language_detector_set_language(task, part, "hi", NULL); + break; + case RSPAMD_LANGUAGE_ARAB: + rspamd_language_detector_set_language(task, part, "ar", NULL); + break; + default: + case RSPAMD_LANGUAGE_LATIN: + rspamd_language_detector_set_language(task, part, "en", NULL); + break; + } + msg_debug_lang_det("set %s language based on symbols category", + part->language); } - else if (r == rs_detect_multiple) { - /* Check our guess */ - - mean = 0.0; - std = 0.0; - cand_len = 0; - - /* Check distribution */ - kh_foreach_value(candidates, cand, { - if (!isnan(cand->prob)) { - mean += cand->prob; - cand_len++; - } - }); + else { + kh_resize(rspamd_candidates_hash, candidates, 32); + + r = rspamd_language_detector_try_ngramm(task, + default_words, + d, + part->utf_words, + cat, + candidates, + part); + + if (r == rs_detect_none) { + msg_debug_lang_det("no trigrams found, fallback to english"); + rspamd_language_detector_set_language(task, part, "en", NULL); + } + else if (r == rs_detect_multiple) { + /* Check our guess */ - if (cand_len > 0) { - mean /= cand_len; + mean = 0.0; + std = 0.0; + cand_len = 0; + /* Check distribution */ kh_foreach_value(candidates, cand, { - double err; if (!isnan(cand->prob)) { - err = cand->prob - mean; - std += fabs(err); + mean += cand->prob; + cand_len++; } }); - std /= cand_len; - } + if (cand_len > 0) { + mean /= cand_len; + + kh_foreach_value(candidates, cand, { + double err; + if (!isnan(cand->prob)) { + err = cand->prob - mean; + std += fabs(err); + } + }); + + std /= cand_len; + } - msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev", - cand_len, mean, std); + msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev", + cand_len, mean, std); - if (cand_len > 0 && std / fabs(mean) < 0.25) { - msg_debug_lang_det("apply frequency heuristic sorting"); - frequency_heuristic_applied = TRUE; - cbd.d = d; - cbd.mean = mean; - cbd.std = std; - cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + if (cand_len > 0 && std / fabs(mean) < 0.25) { + msg_debug_lang_det("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; + cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; - if (part->nwords < default_words / 2) { - cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + if (part->nwords < default_words / 2) { + cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + } } } } @@ -1998,7 +2019,7 @@ rspamd_language_detector_detect(struct rspamd_task *task, } /* Now, convert hash to array and sort it */ - if (r != rs_detect_none && kh_size(candidates) > 0) { + if (r != rs_detect_none && candidates != NULL && kh_size(candidates) > 0) { result = g_ptr_array_sized_new(kh_size(candidates)); kh_foreach_value(candidates, cand, { @@ -2037,7 +2058,9 @@ rspamd_language_detector_detect(struct rspamd_task *task, rspamd_language_detector_set_language(task, part, "en", NULL); } - kh_destroy(rspamd_candidates_hash, candidates); + if (candidates != NULL) { + kh_destroy(rspamd_candidates_hash, candidates); + } } /* Update internal stat */ |