aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-02-07 09:13:41 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-02-07 09:13:41 +0000
commit a727cbdcdb40178352e22b389645ebd6baf4dffc (patch)
tree 13eb4c3658dc9aaebc0f4a66470bc586de39d6c0 /src
parent cdc6ea2e0dae1e0a6a409dc2e8bb2dde90274371 (diff)
download rspamd-a727cbdcdb40178352e22b389645ebd6baf4dffc.tar.gz
rspamd-a727cbdcdb40178352e22b389645ebd6baf4dffc.zip
[Minor] Add more filtering for ngramm chains
Diffstat (limited to 'src')
-rw-r--r-- src/libmime/lang_detection.c 51
1 files changed, 51 insertions, 0 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index cb2427724..4e5103341 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -102,6 +102,8 @@ struct rspamd_ngramm_elt {
struct rspamd_ngramm_chain {
GPtrArray *languages;
+ gdouble mean; /* mean of the elements' prob; set by rspamd_language_detector_process_chain for chains with more than 3 languages */
+ gdouble std; /* sample standard deviation of the elements' prob; set together with mean */
gchar *utf;
};
@@ -551,6 +553,45 @@ rspamd_ucl_array_find_str (const gchar *str, const ucl_object_t *ar)
return FALSE;
}
+/**
+ * Post-process a single ngramm chain.
+ *
+ * For a chain that has more than 3 languages, the mean and the sample
+ * standard deviation of the per-language probabilities are computed with
+ * Welford's online algorithm and stored in the chain; every language whose
+ * probability is below the mean is then removed from the chain.
+ * For a chain with 3 or fewer languages, each probability is multiplied by 4.
+ *
+ * @param cfg not referenced directly here; presumably picked up by the
+ *            msg_debug_lang_det_cfg logging macro (confirm against its definition)
+ * @param chain chain to process; chain->languages and the probabilities of
+ *              its elements are modified in place
+ */
+static void
+rspamd_language_detector_process_chain (struct rspamd_config *cfg,
+		struct rspamd_ngramm_chain *chain)
+{
+	struct rspamd_ngramm_elt *elt;
+	guint i, nelts;
+	gdouble delta, mean = 0, delta2, m2 = 0, std;
+
+	/* Remember the initial size: the array shrinks during filtering */
+	nelts = chain->languages->len;
+
+	if (nelts > 3) {
+		PTR_ARRAY_FOREACH (chain->languages, i, elt) {
+			delta = elt->prob - mean;
+			mean += delta / (i + 1);
+			delta2 = elt->prob - mean;
+			m2 += delta * delta2;
+		}
+
+		/* nelts > 3 here, so the divisor is never zero */
+		std = sqrt (m2 / (nelts - 1));
+		chain->mean = mean;
+		chain->std = std;
+
+		/*
+		 * Now, filter elements that are lower than mean.
+		 * g_ptr_array_remove_index_fast moves the last element into the
+		 * freed slot, so the index is advanced only when nothing was
+		 * removed; otherwise the moved element would never be checked.
+		 */
+		i = 0;
+
+		while (i < chain->languages->len) {
+			elt = g_ptr_array_index (chain->languages, i);
+
+			if (elt->prob < mean) {
+				/* Log before removal: elt must not be touched afterwards */
+				msg_debug_lang_det_cfg ("remove %s from %s; prob: %.4f; mean: %.4f, std: %.4f",
+						elt->elt->name, chain->utf, elt->prob, mean, std);
+				g_ptr_array_remove_index_fast (chain->languages, i);
+			}
+			else {
+				i ++;
+			}
+		}
+	}
+	else {
+		/* This ngramm is shared by 3 languages at most, increase its weight */
+		PTR_ARRAY_FOREACH (chain->languages, i, elt) {
+			elt->prob *= 4.0;
+			msg_debug_lang_det_cfg ("increase weight of %s in %s; prob: %.4f",
+					elt->elt->name, chain->utf, elt->prob);
+		}
+	}
+}
+
struct rspamd_lang_detector*
rspamd_language_detector_init (struct rspamd_config *cfg)
{
@@ -561,6 +602,9 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
size_t i, short_text_limit = default_short_text_limit;
UErrorCode uc_err = U_ZERO_ERROR;
GString *languages_pattern;
+ GHashTableIter it;
+ gpointer k, v;
+ struct rspamd_ngramm_chain *chain;
gchar *fname;
struct rspamd_lang_detector *ret = NULL;
@@ -620,6 +664,13 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
g_free (fname);
}
+ g_hash_table_iter_init (&it, ret->trigramms);
+
+ while (g_hash_table_iter_next (&it, &k, &v)) {
+ chain = (struct rspamd_ngramm_chain *)v;
+ rspamd_language_detector_process_chain (cfg, chain);
+ }
+
msg_info_config ("loaded %d languages, %d unicode only languages, "
"%d unigramms, "
"%d trigramms",