source.dussan.org Git - rspamd.git/commitdiff
[Fix] Use n_words attribute from ngramms
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 14 Jan 2018 22:20:39 +0000 (22:20 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 14 Jan 2018 22:20:39 +0000 (22:20 +0000)
src/libmime/lang_detection.c

index 6e3635a95acd4d9f771cfb0967bf8da7cea65868..65f7d9b462749270d7185fc309598be7be8e77fa 100644 (file)
@@ -87,7 +87,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 {
        struct ucl_parser *parser;
        ucl_object_t *top;
-       const ucl_object_t *freqs, *cur;
+       const ucl_object_t *freqs, *n_words, *cur;
        ucl_object_iter_t it = NULL;
        UErrorCode uc_err = U_ZERO_ERROR;
        struct rspamd_language_elt *nelt;
@@ -171,6 +171,21 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
                }
        }
 
+       n_words = ucl_object_lookup (top, "n_words");
+
+       if (n_words == NULL || ucl_object_type (n_words) != UCL_ARRAY ||
+                       n_words->len != 3) {
+               msg_warn_config ("cannot find n_words in language %s", nelt->name);
+       }
+       else {
+               nelt->unigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
+                               0));
+               nelt->bigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
+                               1));
+               nelt->trigramms_total = ucl_object_toint (ucl_array_find_index (n_words,
+                               2));
+       }
+
        msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
                        nelt->name,
                        (gint)g_hash_table_size (nelt->unigramms),
@@ -588,7 +603,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
                 * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that
                 * prob2 is 2^4 less than prob1
                 */
-               if (max_prob - cand->prob > 4) {
+               if (max_prob - cand->prob > 1.5) {
                        g_hash_table_iter_remove (&it);
                }
        }
@@ -755,6 +770,8 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
                                g_hash_table_unref (candidates);
                                candidates = tcandidates;
 
+                               msg_err ("bigramms checked, %.3f mean, %.4f stddev", mean, std);
+
                                if (std / fabs (mean) < 0.3) {
                                        /* Try trigramms */
                                        tcandidates = g_hash_table_new_full (rspamd_str_hash,
@@ -782,7 +799,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 
        while (g_hash_table_iter_next (&it, &k, &v)) {
                cand = (struct rspamd_lang_detector_res *) v;
-               msg_debug ("%s -> %.2f", cand->lang, cand->prob);
+               msg_err ("%s -> %.2f", cand->lang, cand->prob);
                g_ptr_array_add (result, cand);
                g_hash_table_iter_steal (&it);
        }