[Project] Improve weighting in lang_detection

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 27 Jan 2018 14:46:54 +0000 (14:46 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 27 Jan 2018 14:46:54 +0000 (14:46 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 27 Jan 2018 14:46:54 +0000 (14:46 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 27 Jan 2018 14:46:54 +0000 (14:46 +0000)
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c

index 98c4a10efedac42c4732143fadc83decc611ec09..04bff86b337bfbd07c68c17303fed077d736ed40 100644 (file)
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -338,6 +338,11 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                         total = nelt->trigramms_total;
                 }
  
+               if (rspamd_language_search_str (nelt->name, tier1_langs,
+                               G_N_ELEMENTS (tier1_langs))) {
+                       nelt->flags |= RS_LANGUAGE_TIER1;
+               }
+
                 it = NULL;
                 ngramms = g_ptr_array_sized_new (freqs->len);
  
@@ -741,6 +746,7 @@ rspamd_language_detector_detect_word (struct rspamd_task *task,
         }
  }
  
+static const gdouble cutoff_limit = -8.0;
  /*
   * Converts frequencies to log probabilities, filter those candidates who
   * has the lowest probabilities
@@ -762,12 +768,20 @@ rspamd_language_detector_filter_negligible (struct rspamd_task *task,
                 cand = (struct rspamd_lang_detector_res *)v;
  
                 if (cand->prob == 0) {
+                       msg_debug_lang_det ("exclude language %s: %.3f",
+                                       cand->lang, cand->prob, max_prob);
                         g_hash_table_iter_remove (&it);
+                       filtered ++;
                 }
                 else {
                         cand->prob = log2 (cand->prob);
-
-                       if (cand->prob > max_prob) {
+                       if (cand->prob < cutoff_limit) {
+                               msg_debug_lang_det ("exclude language %s: %.3f, cutoff limit: %.3f",
+                                               cand->lang, cand->prob, cutoff_limit);
+                               g_hash_table_iter_remove (&it);
+                               filtered ++;
+                       }
+                       else if (cand->prob > max_prob) {
                                 max_prob = cand->prob;
                         }
                 }
@@ -958,6 +972,9 @@ struct rspamd_frequency_sort_cbdata {
         gdouble mean;
  };
  
+static const gdouble tier1_adjustment = 0.25;
+static const gdouble frequency_adjustment = 0.25;
+
  static gint
  rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
                 gpointer ud)
@@ -988,8 +1005,21 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b,
                 freqb = ((gdouble)candb->elt->occurencies) /
                                 (gdouble)cbd->d->total_occurencies;
  
-               proba_adjusted = canda->prob * freqa;
-               probb_adjusted = candb->prob * freqb;
+               proba_adjusted = canda->prob;
+               probb_adjusted = candb->prob;
+
+               if (isnormal (freqa) && isnormal (freqb)) {
+                       proba_adjusted += cbd->std * (frequency_adjustment * freqa);
+                       probb_adjusted += cbd->std * (frequency_adjustment * freqb);
+               }
+
+               if (canda->elt->flags & RS_LANGUAGE_TIER1) {
+                       proba_adjusted += cbd->std * tier1_adjustment;
+               }
+
+               if (candb->elt->flags & RS_LANGUAGE_TIER1) {
+                       probb_adjusted += cbd->std * tier1_adjustment;
+               }
  
                 if (proba_adjusted > probb_adjusted) {
                         return -1;
@@ -1024,8 +1054,6 @@ rspamd_language_detector_detect (struct rspamd_task *task,
         candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
                         NULL, g_free);
  
-       msg_debug_lang_det ("text is less than %z words: %z, start with trigramms",
-                       d->short_text_limit, words_len);
         r = rspamd_language_detector_try_ngramm (task, default_words, d,
                         ucs_tokens, rs_trigramm,
                         candidates);
@@ -1038,8 +1066,9 @@ rspamd_language_detector_detect (struct rspamd_task *task,
         }
         else if (r == rs_detect_multiple) {
                 /* Check our guess */
-               msg_debug_lang_det ("unigramms pass finished, found %d candidates",
+               msg_debug_lang_det ("trigramms pass finished, found %d candidates",
                                 (gint)g_hash_table_size (candidates));
+
                 mean = 0.0;
                 std = 0.0;
                 g_hash_table_iter_init (&it, candidates);
@@ -1065,7 +1094,7 @@ rspamd_language_detector_detect (struct rspamd_task *task,
                 msg_debug_lang_det ("trigramms checked, %.3f mean, %.4f stddev",
                                 mean, std);
  
-               if (std / fabs (mean) < 0.1) {
+               if (std / fabs (mean) < 0.25) {
                         msg_debug_lang_det ("apply frequency heuristic sorting");
                         frequency_heuristic_applied = TRUE;
                         cbd.d = d;
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 27 Jan 2018 14:46:54 +0000 (14:46 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 27 Jan 2018 14:46:54 +0000 (14:46 +0000)