about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- src/libmime/lang_detection.c 171
1 files changed, 97 insertions, 74 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index c3a647507..4796e4834 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1822,7 +1822,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
struct rspamd_lang_detector *d,
struct rspamd_mime_text_part *part)
{
- khash_t(rspamd_candidates_hash) * candidates;
+ khash_t(rspamd_candidates_hash) *candidates = NULL;
GPtrArray *result;
double mean, std, start_ticks, end_ticks;
unsigned int cand_len;
@@ -1831,7 +1831,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
enum rspamd_language_detected_type r;
struct rspamd_frequency_sort_cbdata cbd;
/* Check if we have sorted candidates based on frequency */
- gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
+ gboolean frequency_heuristic_applied = FALSE, ret = FALSE, internal_heuristic_applied = FALSE;
if (!part->utf_stripped_content) {
return FALSE;
@@ -1854,6 +1854,8 @@ rspamd_language_detector_detect(struct rspamd_task *task,
if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
ret = TRUE;
}
+
+ internal_heuristic_applied = TRUE;
}
if (!ret) {
@@ -1906,91 +1908,110 @@ rspamd_language_detector_detect(struct rspamd_task *task,
rspamd_fasttext_predict_result_destroy(fasttext_predict_result);
}
- if (ndetected == 0) {
- if (part->utf_words->len < default_short_text_limit) {
- r = rs_detect_none;
- msg_debug_lang_det("text is too short for trigrams detection: "
- "%d words; at least %d words required",
- (int) part->utf_words->len,
- (int) default_short_text_limit);
- switch (cat) {
- case RSPAMD_LANGUAGE_CYRILLIC:
- rspamd_language_detector_set_language(task, part, "ru", NULL);
- break;
- case RSPAMD_LANGUAGE_DEVANAGARI:
- rspamd_language_detector_set_language(task, part, "hi", NULL);
- break;
- case RSPAMD_LANGUAGE_ARAB:
- rspamd_language_detector_set_language(task, part, "ar", NULL);
- break;
- default:
- case RSPAMD_LANGUAGE_LATIN:
- rspamd_language_detector_set_language(task, part, "en", NULL);
- break;
+ else {
+ /* Fasttext has failed to apply anything */
+ r = rs_detect_none;
+
+ if (!internal_heuristic_applied) {
+ /* Apply unicode scripts heuristic */
+ if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
+ ret = TRUE;
}
- msg_debug_lang_det("set %s language based on symbols category",
- part->language);
- candidates = kh_init(rspamd_candidates_hash);
+ cat = rspamd_language_detector_get_category(part->unicode_scripts);
+
+ if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
+ ret = TRUE;
+ }
+
+ internal_heuristic_applied = TRUE;
}
- else {
+
+ if (!ret) {
+ /* Apply trigrams detection */
candidates = kh_init(rspamd_candidates_hash);
- kh_resize(rspamd_candidates_hash, candidates, 32);
-
- r = rspamd_language_detector_try_ngramm(task,
- default_words,
- d,
- part->utf_words,
- cat,
- candidates,
- part);
-
- if (r == rs_detect_none) {
- msg_debug_lang_det("no trigrams found, fallback to english");
- rspamd_language_detector_set_language(task, part, "en", NULL);
+ if (part->utf_words->len < default_short_text_limit) {
+ r = rs_detect_none;
+ msg_debug_lang_det("text is too short for trigrams detection: "
+ "%d words; at least %d words required",
+ (int) part->utf_words->len,
+ (int) default_short_text_limit);
+ switch (cat) {
+ case RSPAMD_LANGUAGE_CYRILLIC:
+ rspamd_language_detector_set_language(task, part, "ru", NULL);
+ break;
+ case RSPAMD_LANGUAGE_DEVANAGARI:
+ rspamd_language_detector_set_language(task, part, "hi", NULL);
+ break;
+ case RSPAMD_LANGUAGE_ARAB:
+ rspamd_language_detector_set_language(task, part, "ar", NULL);
+ break;
+ default:
+ case RSPAMD_LANGUAGE_LATIN:
+ rspamd_language_detector_set_language(task, part, "en", NULL);
+ break;
+ }
+ msg_debug_lang_det("set %s language based on symbols category",
+ part->language);
}
- else if (r == rs_detect_multiple) {
- /* Check our guess */
-
- mean = 0.0;
- std = 0.0;
- cand_len = 0;
-
- /* Check distribution */
- kh_foreach_value(candidates, cand, {
- if (!isnan(cand->prob)) {
- mean += cand->prob;
- cand_len++;
- }
- });
+ else {
+ kh_resize(rspamd_candidates_hash, candidates, 32);
+
+ r = rspamd_language_detector_try_ngramm(task,
+ default_words,
+ d,
+ part->utf_words,
+ cat,
+ candidates,
+ part);
+
+ if (r == rs_detect_none) {
+ msg_debug_lang_det("no trigrams found, fallback to english");
+ rspamd_language_detector_set_language(task, part, "en", NULL);
+ }
+ else if (r == rs_detect_multiple) {
+ /* Check our guess */
- if (cand_len > 0) {
- mean /= cand_len;
+ mean = 0.0;
+ std = 0.0;
+ cand_len = 0;
+ /* Check distribution */
kh_foreach_value(candidates, cand, {
- double err;
if (!isnan(cand->prob)) {
- err = cand->prob - mean;
- std += fabs(err);
+ mean += cand->prob;
+ cand_len++;
}
});
- std /= cand_len;
- }
+ if (cand_len > 0) {
+ mean /= cand_len;
+
+ kh_foreach_value(candidates, cand, {
+ double err;
+ if (!isnan(cand->prob)) {
+ err = cand->prob - mean;
+ std += fabs(err);
+ }
+ });
+
+ std /= cand_len;
+ }
- msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
- cand_len, mean, std);
+ msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
+ cand_len, mean, std);
- if (cand_len > 0 && std / fabs(mean) < 0.25) {
- msg_debug_lang_det("apply frequency heuristic sorting");
- frequency_heuristic_applied = TRUE;
- cbd.d = d;
- cbd.mean = mean;
- cbd.std = std;
- cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
+ if (cand_len > 0 && std / fabs(mean) < 0.25) {
+ msg_debug_lang_det("apply frequency heuristic sorting");
+ frequency_heuristic_applied = TRUE;
+ cbd.d = d;
+ cbd.mean = mean;
+ cbd.std = std;
+ cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
- if (part->nwords < default_words / 2) {
- cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ if (part->nwords < default_words / 2) {
+ cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
+ }
}
}
}
@@ -1998,7 +2019,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
}
/* Now, convert hash to array and sort it */
- if (r != rs_detect_none && kh_size(candidates) > 0) {
+ if (r != rs_detect_none && candidates != NULL && kh_size(candidates) > 0) {
result = g_ptr_array_sized_new(kh_size(candidates));
kh_foreach_value(candidates, cand, {
@@ -2037,7 +2058,9 @@ rspamd_language_detector_detect(struct rspamd_task *task,
rspamd_language_detector_set_language(task, part, "en", NULL);
}
- kh_destroy(rspamd_candidates_hash, candidates);
+ if (candidates != NULL) {
+ kh_destroy(rspamd_candidates_hash, candidates);
+ }
}
/* Update internal stat */