diff options
-rw-r--r-- | src/libmime/lang_detection.c | 21 |
1 files changed, 19 insertions, 2 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index b3188e6dd..c340a1b33 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -67,8 +67,11 @@ static const gchar *unigramms_langs[] = { /* * Top languages */ +static const gchar *tier0_langs[] = { + "en", +}; static const gchar *tier1_langs[] = { - "en", "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja", + "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja", "ko", "pt", "ru", "pl", "tk", "th", "ar" }; @@ -78,7 +81,7 @@ enum rspamd_language_elt_flags { RS_LANGUAGE_UNISCRIPT = (1 << 1), RS_LANGUAGE_UNIGRAMM = (1 << 2), RS_LANGUAGE_TIER1 = (1 << 3), - RS_LANGUAGE_TIER2 = (1 << 4), + RS_LANGUAGE_TIER0 = (1 << 4), }; struct rspamd_language_elt { @@ -343,6 +346,11 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, nelt->flags |= RS_LANGUAGE_TIER1; } + if (rspamd_language_search_str (nelt->name, tier0_langs, + G_N_ELEMENTS (tier0_langs))) { + nelt->flags |= RS_LANGUAGE_TIER0; + } + it = NULL; ngramms = g_ptr_array_sized_new (freqs->len); @@ -972,6 +980,7 @@ struct rspamd_frequency_sort_cbdata { gdouble mean; }; +static const gdouble tier0_adjustment = 1.2; static const gdouble tier1_adjustment = 0.8; static const gdouble frequency_adjustment = 0.8; @@ -1021,6 +1030,14 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, probb_adjusted += cbd->std * tier1_adjustment; } + if (canda->elt->flags & RS_LANGUAGE_TIER0) { + proba_adjusted += cbd->std * tier0_adjustment; + } + + if (candb->elt->flags & RS_LANGUAGE_TIER0) { + probb_adjusted += cbd->std * tier0_adjustment; + } + if (proba_adjusted > probb_adjusted) { return -1; } else if (probb_adjusted > proba_adjusted) { |