Przeglądaj źródła

Merge pull request #4951 from rspamd/vstakhov-langdet-fix

Apply detection phase if fasttext could not detect language
pull/4953/head
Vsevolod Stakhov 1 miesiąc temu
rodzic
commit
c3e5d3d6fc
No account linked to committer's email address
1 zmieniony plik, 97 dodań i 74 usunięcia
  1. 97
    74
      src/libmime/lang_detection.c

+ 97
- 74
src/libmime/lang_detection.c Wyświetl plik

@@ -1822,7 +1822,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
struct rspamd_lang_detector *d,
struct rspamd_mime_text_part *part)
{
khash_t(rspamd_candidates_hash) * candidates;
khash_t(rspamd_candidates_hash) *candidates = NULL;
GPtrArray *result;
double mean, std, start_ticks, end_ticks;
unsigned int cand_len;
@@ -1831,7 +1831,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
enum rspamd_language_detected_type r;
struct rspamd_frequency_sort_cbdata cbd;
/* Check if we have sorted candidates based on frequency */
gboolean frequency_heuristic_applied = FALSE, ret = FALSE;
gboolean frequency_heuristic_applied = FALSE, ret = FALSE, internal_heuristic_applied = FALSE;

if (!part->utf_stripped_content) {
return FALSE;
@@ -1854,6 +1854,8 @@ rspamd_language_detector_detect(struct rspamd_task *task,
if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
ret = TRUE;
}

internal_heuristic_applied = TRUE;
}

if (!ret) {
@@ -1906,91 +1908,110 @@ rspamd_language_detector_detect(struct rspamd_task *task,

rspamd_fasttext_predict_result_destroy(fasttext_predict_result);
}
if (ndetected == 0) {
if (part->utf_words->len < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det("text is too short for trigrams detection: "
"%d words; at least %d words required",
(int) part->utf_words->len,
(int) default_short_text_limit);
switch (cat) {
case RSPAMD_LANGUAGE_CYRILLIC:
rspamd_language_detector_set_language(task, part, "ru", NULL);
break;
case RSPAMD_LANGUAGE_DEVANAGARI:
rspamd_language_detector_set_language(task, part, "hi", NULL);
break;
case RSPAMD_LANGUAGE_ARAB:
rspamd_language_detector_set_language(task, part, "ar", NULL);
break;
default:
case RSPAMD_LANGUAGE_LATIN:
rspamd_language_detector_set_language(task, part, "en", NULL);
break;
else {
/* Fasttext has failed to apply anything */
r = rs_detect_none;

if (!internal_heuristic_applied) {
/* Apply unicode scripts heuristic */
if (rspamd_language_detector_try_uniscript(task, part, nchinese, nspecial)) {
ret = TRUE;
}
msg_debug_lang_det("set %s language based on symbols category",
part->language);

candidates = kh_init(rspamd_candidates_hash);
cat = rspamd_language_detector_get_category(part->unicode_scripts);

if (!ret && rspamd_language_detector_try_stop_words(task, d, part, cat)) {
ret = TRUE;
}

internal_heuristic_applied = TRUE;
}
else {

if (!ret) {
/* Apply trigramms detection */
candidates = kh_init(rspamd_candidates_hash);
kh_resize(rspamd_candidates_hash, candidates, 32);

r = rspamd_language_detector_try_ngramm(task,
default_words,
d,
part->utf_words,
cat,
candidates,
part);

if (r == rs_detect_none) {
msg_debug_lang_det("no trigrams found, fallback to english");
rspamd_language_detector_set_language(task, part, "en", NULL);
if (part->utf_words->len < default_short_text_limit) {
r = rs_detect_none;
msg_debug_lang_det("text is too short for trigrams detection: "
"%d words; at least %d words required",
(int) part->utf_words->len,
(int) default_short_text_limit);
switch (cat) {
case RSPAMD_LANGUAGE_CYRILLIC:
rspamd_language_detector_set_language(task, part, "ru", NULL);
break;
case RSPAMD_LANGUAGE_DEVANAGARI:
rspamd_language_detector_set_language(task, part, "hi", NULL);
break;
case RSPAMD_LANGUAGE_ARAB:
rspamd_language_detector_set_language(task, part, "ar", NULL);
break;
default:
case RSPAMD_LANGUAGE_LATIN:
rspamd_language_detector_set_language(task, part, "en", NULL);
break;
}
msg_debug_lang_det("set %s language based on symbols category",
part->language);
}
else if (r == rs_detect_multiple) {
/* Check our guess */

mean = 0.0;
std = 0.0;
cand_len = 0;

/* Check distribution */
kh_foreach_value(candidates, cand, {
if (!isnan(cand->prob)) {
mean += cand->prob;
cand_len++;
}
});
else {
kh_resize(rspamd_candidates_hash, candidates, 32);

r = rspamd_language_detector_try_ngramm(task,
default_words,
d,
part->utf_words,
cat,
candidates,
part);

if (r == rs_detect_none) {
msg_debug_lang_det("no trigrams found, fallback to english");
rspamd_language_detector_set_language(task, part, "en", NULL);
}
else if (r == rs_detect_multiple) {
/* Check our guess */

if (cand_len > 0) {
mean /= cand_len;
mean = 0.0;
std = 0.0;
cand_len = 0;

/* Check distribution */
kh_foreach_value(candidates, cand, {
double err;
if (!isnan(cand->prob)) {
err = cand->prob - mean;
std += fabs(err);
mean += cand->prob;
cand_len++;
}
});

std /= cand_len;
}
if (cand_len > 0) {
mean /= cand_len;

kh_foreach_value(candidates, cand, {
double err;
if (!isnan(cand->prob)) {
err = cand->prob - mean;
std += fabs(err);
}
});

std /= cand_len;
}

msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
cand_len, mean, std);
msg_debug_lang_det("trigrams checked, %d candidates, %.3f mean, %.4f stddev",
cand_len, mean, std);

if (cand_len > 0 && std / fabs(mean) < 0.25) {
msg_debug_lang_det("apply frequency heuristic sorting");
frequency_heuristic_applied = TRUE;
cbd.d = d;
cbd.mean = mean;
cbd.std = std;
cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;
if (cand_len > 0 && std / fabs(mean) < 0.25) {
msg_debug_lang_det("apply frequency heuristic sorting");
frequency_heuristic_applied = TRUE;
cbd.d = d;
cbd.mean = mean;
cbd.std = std;
cbd.flags = RSPAMD_LANG_FLAG_DEFAULT;

if (part->nwords < default_words / 2) {
cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
if (part->nwords < default_words / 2) {
cbd.flags |= RSPAMD_LANG_FLAG_SHORT;
}
}
}
}
@@ -1998,7 +2019,7 @@ rspamd_language_detector_detect(struct rspamd_task *task,
}

/* Now, convert hash to array and sort it */
if (r != rs_detect_none && kh_size(candidates) > 0) {
if (r != rs_detect_none && candidates != NULL && kh_size(candidates) > 0) {
result = g_ptr_array_sized_new(kh_size(candidates));

kh_foreach_value(candidates, cand, {
@@ -2037,7 +2058,9 @@ rspamd_language_detector_detect(struct rspamd_task *task,
rspamd_language_detector_set_language(task, part, "en", NULL);
}

kh_destroy(rspamd_candidates_hash, candidates);
if (candidates != NULL) {
kh_destroy(rspamd_candidates_hash, candidates);
}
}

/* Update internal stat */

Ładowanie…
Anuluj
Zapisz