aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/libmime/lang_detection.c21
1 files changed, 19 insertions, 2 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index b3188e6dd..c340a1b33 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -67,8 +67,11 @@ static const gchar *unigramms_langs[] = {
/*
* Top languages
*/
+static const gchar *tier0_langs[] = {
+ "en",
+};
static const gchar *tier1_langs[] = {
- "en", "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
+ "fr", "it", "de", "es", "nl", "zh-CN", "zh-TW", "ja",
"ko", "pt", "ru", "pl", "tk", "th", "ar"
};
@@ -78,7 +81,7 @@ enum rspamd_language_elt_flags {
RS_LANGUAGE_UNISCRIPT = (1 << 1),
RS_LANGUAGE_UNIGRAMM = (1 << 2),
RS_LANGUAGE_TIER1 = (1 << 3),
- RS_LANGUAGE_TIER2 = (1 << 4),
+ RS_LANGUAGE_TIER0 = (1 << 4),
};
struct rspamd_language_elt {
@@ -343,6 +346,11 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
nelt->flags |= RS_LANGUAGE_TIER1;
}
+ if (rspamd_language_search_str (nelt->name, tier0_langs,
+ G_N_ELEMENTS (tier0_langs))) {
+ nelt->flags |= RS_LANGUAGE_TIER0;
+ }
+
it = NULL;
ngramms = g_ptr_array_sized_new (freqs->len);
@@ -972,6 +980,7 @@ struct rspamd_frequency_sort_cbdata {
gdouble mean;
};
+static const gdouble tier0_adjustment = 1.2;
static const gdouble tier1_adjustment = 0.8;
static const gdouble frequency_adjustment = 0.8;
@@ -1021,6 +1030,14 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
probb_adjusted += cbd->std * tier1_adjustment;
}
+ if (canda->elt->flags & RS_LANGUAGE_TIER0) {
+ proba_adjusted += cbd->std * tier0_adjustment;
+ }
+
+ if (candb->elt->flags & RS_LANGUAGE_TIER0) {
+ probb_adjusted += cbd->std * tier0_adjustment;
+ }
+
if (proba_adjusted > probb_adjusted) {
return -1;
} else if (probb_adjusted > proba_adjusted) {