[Fix] Various improvements in language detection

author Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 13 Jan 2018 20:13:18 +0000 (20:13 +0000)

committer Vsevolod Stakhov <vsevolod@highsecure.ru>

Sat, 13 Jan 2018 20:13:18 +0000 (20:13 +0000)
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 13 Jan 2018 20:13:18 +0000 (20:13 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sat, 13 Jan 2018 20:13:18 +0000 (20:13 +0000)
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c

index 66901e6b93b4f67af8fae99886833ae0fe63a337..ead12b8e8e6519bc455cea9831e529382cb5f315 100644 (file)
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,7 +24,8 @@
  #include <math.h>
  
  static const gsize default_short_text_limit = 200;
-static const gsize default_words = 20;
+static const gsize default_words = 30;
+static const gdouble update_prob = 0.6;
  static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages";
  
  struct rspamd_language_elt {
@@ -503,9 +504,17 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
         /* Split words */
         while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
                         != -1) {
-               if (!rspamd_language_detector_process_ngramm_update (d, window,
-                               type, candidates)) {
-                       ret = FALSE;
+
+               if (rspamd_random_double_fast () > update_prob) {
+                       if (!rspamd_language_detector_process_ngramm_update (d, window,
+                                       type, candidates)) {
+                               ret = FALSE;
+                       }
+               }
+               else {
+                       /* Try to do a full update in case we are missing some candidates */
+                       rspamd_language_detector_process_ngramm_full (d, window, type,
+                                       candidates);
                 }
         }
  
@@ -576,10 +585,10 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
                 cand = (struct rspamd_lang_detector_res *) v;
  
                 /*
-                * Probabilities are logarifmic, so if prob1 - prob2 > 4, it means that
+                * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
                  * prob2 is 2^4 less than prob1
                  */
-               if (max_prob - cand->prob > 256) {
+               if (max_prob - cand->prob > 4) {
                         g_hash_table_iter_remove (&it);
                 }
         }
@@ -636,55 +645,134 @@ rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
         return 0;
  }
  
+enum rspamd_language_detected_type {
+       rs_detect_none = 0,
+       rs_detect_single,
+       rs_detect_multiple,
+};
+
+static enum rspamd_language_detected_type
+rspamd_language_detector_try_ngramm (struct rspamd_lang_detector *d,
+               GArray *ucs_tokens,
+               enum rspamd_language_gramm_type type,
+               GHashTable *candidates)
+{
+       guint cand_len;
+
+       rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
+                       type, TRUE);
+
+       cand_len = g_hash_table_size (candidates);
+
+       if (cand_len == 0) {
+               return rs_detect_none;
+       }
+       else if (cand_len == 1) {
+               return rs_detect_single;
+       }
+
+       return rs_detect_multiple;
+}
+
  GPtrArray *
  rspamd_language_detector_detect (struct rspamd_lang_detector *d,
                 GArray *ucs_tokens, gsize words_len)
  {
-       GHashTable *candidates;
+       GHashTable *candidates, *tcandidates;
         GPtrArray *result;
         GHashTableIter it;
         gpointer k, v;
+       gdouble mean, std;
         struct rspamd_lang_detector_res *cand;
-       guint cand_len, prev_len;
+       enum rspamd_language_detected_type r;
  
         candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
                         NULL, g_free);
         if (words_len < d->short_text_limit) {
                 /* For short text, start directly from trigramms */
-               rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
-                               rs_trigramm, TRUE);
+               r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
+                               candidates);
+
+               if (r == rs_detect_none) {
+                       r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
+                                       candidates);
+
+                       if (r == rs_detect_none) {
+                               r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
+                                               candidates);
+                       }
+               }
         }
         else {
                 /* Start with unigramms */
-               rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
-                               rs_unigramm, TRUE);
-               cand_len = g_hash_table_size (candidates);
-
-               if (cand_len > 1) {
-                       /* Try bigramms */
-                       rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
-                                       rs_unigramm, FALSE);
-
-                       cand_len = g_hash_table_size (candidates);
-                       if (cand_len > 1) {
-                               prev_len = cand_len;
-                               /* Try trigramms */
-                               GHashTable *ncandidates;
-                               ncandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
-                                               NULL, g_free);
-                               rspamd_language_detector_detect_type (d, ucs_tokens, ncandidates,
-                                               rs_trigramm, TRUE);
-                               cand_len = g_hash_table_size (ncandidates);
-
-                               if (cand_len < prev_len) {
-                                       g_hash_table_unref (candidates);
-                                       candidates = ncandidates;
+               r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_unigramm,
+                               candidates);
+
+               switch (r) {
+               case rs_detect_none:
+               case rs_detect_single:
+                       /* No unigramms found or single set found, no reason to continue */;
+                       break;
+               case rs_detect_multiple:
+                       /* Try to improve guess */
+                       tcandidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
+                                       NULL, g_free);
+                       r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_bigramm,
+                                       tcandidates);
+
+                       switch (r) {
+                       case rs_detect_none:
+                               /* Revert to unigramms result */
+                               g_hash_table_unref (tcandidates);
+                               break;
+                       case rs_detect_single:
+                               /* We have good enough result, return it */
+                               g_hash_table_unref (candidates);
+                               candidates = tcandidates;
+                               break;
+                       case rs_detect_multiple:
+                               mean = 0.0;
+                               std = 0.0;
+                               g_hash_table_iter_init (&it, tcandidates);
+
+                               /* Check distribution */
+                               while (g_hash_table_iter_next (&it, &k, &v)) {
+                                       cand = (struct rspamd_lang_detector_res *) v;
+                                       mean += cand->prob;
                                 }
-                               else {
-                                       /* Not a better guess */
-                                       g_hash_table_unref (ncandidates);
+
+                               mean /= g_hash_table_size (tcandidates);
+
+                               g_hash_table_iter_init (&it, tcandidates);
+                               while (g_hash_table_iter_next (&it, &k, &v)) {
+                                       gdouble err;
+                                       cand = (struct rspamd_lang_detector_res *) v;
+                                       err = cand->prob - mean;
+                                       std += err * err;
+                               }
+
+                               std /= g_hash_table_size (tcandidates);
+                               g_hash_table_unref (candidates);
+                               candidates = tcandidates;
+
+                               if (std < mean / 100) {
+                                       /* Try trigramms */
+                                       tcandidates = g_hash_table_new_full (rspamd_str_hash,
+                                                       rspamd_str_equal,
+                                                       NULL, g_free);
+
+                                       r = rspamd_language_detector_try_ngramm (d, ucs_tokens, rs_trigramm,
+                                                       tcandidates);
+
+                                       if (r != rs_detect_none) {
+                                               /* TODO: check if we have better distribution here */
+                                               g_hash_table_unref (candidates);
+                                               candidates = tcandidates;
+                                       }
                                 }
+                               break;
                         }
+                       break;
                 }
         }
  
@@ -694,7 +782,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
  
         while (g_hash_table_iter_next (&it, &k, &v)) {
                 cand = (struct rspamd_lang_detector_res *) v;
-               msg_err ("%s -> %.2f", cand->lang, cand->prob);
+               msg_debug ("%s -> %.2f", cand->lang, cand->prob);
                 g_ptr_array_add (result, cand);
                 g_hash_table_iter_steal (&it);
         }
diff --git a/src/libmime/message.c b/src/libmime/message.c

index 4bac7706200c9dfb5edf325630f8e7abfdea51c8..2a7801100166509ef67a931a25c66fec8c60d27d 100644 (file)
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -98,11 +98,13 @@ rspamd_extract_words (struct rspamd_task *task,
                         for (i = 0; i < part->normalized_words->len; i++) {
                                 w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
  
-
-                               rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
-                                               w, &ucs_w);
-                               g_array_append_val (part->ucs32_words, ucs_w);
-                               ucs_len += ucs_w.len;
+                               if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+                                       rspamd_language_detector_to_ucs (task->lang_det,
+                                                       task->task_pool,
+                                                       w, &ucs_w);
+                                       g_array_append_val (part->ucs32_words, ucs_w);
+                                       ucs_len += ucs_w.len;
+                               }
                         }
  
                         part->languages = rspamd_language_detector_detect (task->lang_det,
author	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 13 Jan 2018 20:13:18 +0000 (20:13 +0000)
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>
	Sat, 13 Jan 2018 20:13:18 +0000 (20:13 +0000)
src/libmime/lang_detection.c		patch \| blob \| history
src/libmime/message.c		patch \| blob \| history