diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 15:47:15 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 15:47:15 +0100 |
commit | 264b9f2c480a1b0240acb8183a8d7470691aff11 (patch) | |
tree | aeecf4738499e48c0405903abebc5434975b39ba /src/libmime/lang_detection.c | |
parent | fea5bdc79758530a3c28970c9c19d05e9932de74 (diff) | |
download | rspamd-264b9f2c480a1b0240acb8183a8d7470691aff11.tar.gz rspamd-264b9f2c480a1b0240acb8183a8d7470691aff11.zip |
[Project] Implement fasttext language detection
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 169 |
1 files changed, 108 insertions, 61 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 09591438e..211dfe48b 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -1801,88 +1801,132 @@ rspamd_language_detector_detect (struct rspamd_task *task, } if (!ret) { - if (part->utf_words->len < default_short_text_limit) { - r = rs_detect_none; - msg_debug_lang_det ("text is too short for trigrams detection: " - "%d words; at least %d words required", + unsigned ndetected = 0; + if (rspamd_lang_detection_fasttext_is_enabled(d->fasttext_detector)) { + rspamd_fasttext_predict_result_t fasttext_predict_result; + fasttext_predict_result = rspamd_lang_detection_fasttext_detect(d->fasttext_detector, + part->utf_stripped_content->data, + part->utf_stripped_content->len, 4); + + ndetected = rspamd_lang_detection_fasttext_get_nlangs(fasttext_predict_result); + + if (ndetected > 0) { + candidates = kh_init (rspamd_candidates_hash); + kh_resize (rspamd_candidates_hash, candidates, ndetected); + + /* Now fill all results where probability is above threshold */ + float max_prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, 0); + + for (unsigned int i = 0; i < ndetected; i ++) { + float prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + if (prob > max_prob * 0.75) { + char *lang = rspamd_mempool_strdup(task->task_pool, + rspamd_lang_detection_fasttext_get_lang(fasttext_predict_result, i)); + int tmp; + khiter_t k = kh_put (rspamd_candidates_hash, candidates, lang, &tmp); + + kh_value(candidates, k) = rspamd_mempool_alloc0(task->task_pool, sizeof(*cand)); + cand = kh_value(candidates, k); + cand->lang = lang; + cand->prob = rspamd_lang_detection_fasttext_get_prob(fasttext_predict_result, i); + } + } + + if (kh_size(candidates) == 1) { + r = rs_detect_single; + } + else if (kh_size(candidates) > 1) { + r = rs_detect_multiple; + } + else { + r = rs_detect_none; + } + } + } + if (ndetected == 0) { + if (part->utf_words->len < default_short_text_limit) { + r = rs_detect_none; + msg_debug_lang_det ("text is too short for trigrams detection: " + "%d words; at least %d words required", (int)part->utf_words->len, (int)default_short_text_limit); - switch (cat) { - case RSPAMD_LANGUAGE_CYRILLIC: - rspamd_language_detector_set_language (task, part, "ru", NULL); - break; - case RSPAMD_LANGUAGE_DEVANAGARI: - rspamd_language_detector_set_language (task, part, "hi", NULL); - break; - case RSPAMD_LANGUAGE_ARAB: - rspamd_language_detector_set_language (task, part, "ar", NULL); - break; - default: - case RSPAMD_LANGUAGE_LATIN: - rspamd_language_detector_set_language (task, part, "en", NULL); - break; - } - msg_debug_lang_det ("set %s language based on symbols category", + switch (cat) { + case RSPAMD_LANGUAGE_CYRILLIC: + rspamd_language_detector_set_language (task, part, "ru", NULL); + break; + case RSPAMD_LANGUAGE_DEVANAGARI: + rspamd_language_detector_set_language (task, part, "hi", NULL); + break; + case RSPAMD_LANGUAGE_ARAB: + rspamd_language_detector_set_language (task, part, "ar", NULL); + break; + default: + case RSPAMD_LANGUAGE_LATIN: + rspamd_language_detector_set_language (task, part, "en", NULL); + break; + } + msg_debug_lang_det ("set %s language based on symbols category", part->language); - candidates = kh_init (rspamd_candidates_hash); - } - else { - candidates = kh_init (rspamd_candidates_hash); - kh_resize (rspamd_candidates_hash, candidates, 32); + candidates = kh_init (rspamd_candidates_hash); + } + else { + candidates = kh_init (rspamd_candidates_hash); + kh_resize (rspamd_candidates_hash, candidates, 32); - r = rspamd_language_detector_try_ngramm (task, + r = rspamd_language_detector_try_ngramm (task, default_words, d, part->utf_words, cat, candidates); - if (r == rs_detect_none) { - msg_debug_lang_det ("no trigrams found, fallback to english"); - rspamd_language_detector_set_language (task, part, "en", NULL); - } else if (r == rs_detect_multiple) { - /* Check our guess */ - - mean = 0.0; - std = 0.0; - cand_len = 0; - - /* Check distribution */ - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - mean += cand->prob; - cand_len++; - } - }); + if (r == rs_detect_none) { + msg_debug_lang_det ("no trigrams found, fallback to english"); + rspamd_language_detector_set_language (task, part, "en", NULL); + } else if (r == rs_detect_multiple) { + /* Check our guess */ - if (cand_len > 0) { - mean /= cand_len; + mean = 0.0; + std = 0.0; + cand_len = 0; + /* Check distribution */ kh_foreach_value (candidates, cand, { - gdouble err; if (!isnan (cand->prob)) { - err = cand->prob - mean; - std += fabs (err); + mean += cand->prob; + cand_len++; } }); - std /= cand_len; - } + if (cand_len > 0) { + mean /= cand_len; - msg_debug_lang_det ("trigrams checked, %d candidates, %.3f mean, %.4f stddev", + kh_foreach_value (candidates, cand, { + gdouble err; + if (!isnan (cand->prob)) { + err = cand->prob - mean; + std += fabs (err); + } + }); + + std /= cand_len; + } + + msg_debug_lang_det ("trigrams checked, %d candidates, %.3f mean, %.4f stddev", cand_len, mean, std); - if (cand_len > 0 && std / fabs (mean) < 0.25) { - msg_debug_lang_det ("apply frequency heuristic sorting"); - frequency_heuristic_applied = TRUE; - cbd.d = d; - cbd.mean = mean; - cbd.std = std; - cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + if (cand_len > 0 && std / fabs (mean) < 0.25) { + msg_debug_lang_det ("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; + cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; - if (part->nwords < default_words / 2) { - cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + if (part->nwords < default_words / 2) { + cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + } } } } @@ -1909,7 +1953,9 @@ rspamd_language_detector_detect (struct rspamd_task *task, if (result->len > 0 && !frequency_heuristic_applied) { cand = g_ptr_array_index (result, 0); - cand->elt->occurrences++; + if (cand->elt) { + cand->elt->occurrences++; + } d->total_occurrences++; } @@ -1918,6 +1964,7 @@ rspamd_language_detector_detect (struct rspamd_task *task, } part->languages = result; + part->language = ((struct rspamd_lang_detector_res *)g_ptr_array_index (result, 0))->lang; ret = TRUE; } else if (part->languages == NULL) { |