diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-08 13:36:58 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2020-02-08 13:36:58 +0000 |
commit | cd1deb3c19e84b759924c5208d2e1405be3b857a (patch) | |
tree | 035bfb8eb740c53172ac3a435e716aab91bdfa27 /src | |
parent | c946bde60c5cd38c91997234ef9692b02c950fa9 (diff) | |
download | rspamd-cd1deb3c19e84b759924c5208d2e1405be3b857a.tar.gz rspamd-cd1deb3c19e84b759924c5208d2e1405be3b857a.zip |
[Minor] Add some more heuristics for stop words detection
Diffstat (limited to 'src')
-rw-r--r-- | src/libmime/lang_detection.c | 40 | ||||
-rw-r--r-- | src/libmime/lang_detection.h | 1 |
2 files changed, 40 insertions, 1 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index f27d71a76..a178b1bf8 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -446,6 +446,9 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, if (strcmp (fl, "diacritics") == 0) { nelt->flags |= RS_LANGUAGE_DIACRITICS; } + else if (strcmp (fl, "ascii") == 0) { + nelt->flags |= RS_LANGUAGE_ASCII; + } else { msg_debug_config ("unknown flag %s of language %s", fl, nelt->name); } @@ -1668,7 +1671,8 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task, struct rspamd_stop_word_elt *elt; struct rspamd_sw_cbdata cbdata; gboolean ret = FALSE; - static const int stop_words_threshold = 4; + static const int stop_words_threshold = 4, /* minimum stop words count */ + strong_confidence_threshold = 10 /* we are sure that this is enough */; elt = &d->stop_words[cat]; cbdata.res = kh_init (rspamd_sw_hash); @@ -1683,18 +1687,52 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task, gint cur_matches; double max_rate = G_MINDOUBLE; struct rspamd_language_elt *cur_lang, *sel = NULL; + gboolean ignore_ascii = FALSE, ignore_latin = FALSE; + again: kh_foreach (cbdata.res, cur_lang, cur_matches, { + if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) { + /* Restart matches */ + ignore_ascii = TRUE; + sel = NULL; + max_rate = G_MINDOUBLE; + msg_debug_lang_det ("ignore ascii after finding %d stop words from %s", + cur_matches, cur_lang->name); + goto again; + } + + if (!ignore_latin && cur_lang->category != RSPAMD_LANGUAGE_LATIN) { + /* Restart matches */ + ignore_latin = TRUE; + sel = NULL; + max_rate = G_MINDOUBLE; + msg_debug_lang_det ("ignore latin after finding stop %d words from %s", + cur_matches, cur_lang->name); + goto again; + } + if (cur_matches < stop_words_threshold) { continue; } + if (cur_matches < strong_confidence_threshold) { + /* Ignore mixed languages when not enough confidence */ + if (ignore_ascii && (cur_lang->flags & RS_LANGUAGE_ASCII)) { + continue; + } + + if (ignore_latin && cur_lang->category == RSPAMD_LANGUAGE_LATIN) { + continue; + } + } + double rate = (double)cur_matches / (double)cur_lang->stop_words; if (rate > max_rate) { max_rate = rate; sel = cur_lang; } + msg_debug_lang_det ("found %d stop words from %s: %3f rate", cur_matches, cur_lang->name, rate); }); diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h index b1382e6ad..6c3234848 100644 --- a/src/libmime/lang_detection.h +++ b/src/libmime/lang_detection.h @@ -56,6 +56,7 @@ enum rspamd_language_elt_flags { RS_LANGUAGE_TIER1 = (1 << 3), RS_LANGUAGE_TIER0 = (1 << 4), RS_LANGUAGE_DIACRITICS = (1 << 5), + RS_LANGUAGE_ASCII = (1 << 6), }; struct rspamd_lang_detector_res { |