From: Vsevolod Stakhov Date: Sun, 14 Jan 2018 22:20:39 +0000 (+0000) Subject: [Fix] Use n_words attribute from ngramms X-Git-Tag: 1.7.0~280 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=3178969987c7d61219d04274c39dd4a452e99bb4;p=rspamd.git [Fix] Use n_words attribute from ngramms --- diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 6e3635a95..65f7d9b46 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -87,7 +87,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, { struct ucl_parser *parser; ucl_object_t *top; - const ucl_object_t *freqs, *cur; + const ucl_object_t *freqs, *n_words, *cur; ucl_object_iter_t it = NULL; UErrorCode uc_err = U_ZERO_ERROR; struct rspamd_language_elt *nelt; @@ -171,6 +171,21 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, } } + n_words = ucl_object_lookup (top, "n_words"); + + if (n_words == NULL || ucl_object_type (n_words) != UCL_ARRAY || + n_words->len != 3) { + msg_warn_config ("cannot find n_words in language %s", nelt->name); + } + else { + nelt->unigramms_total = ucl_object_toint (ucl_array_find_index (n_words, + 0)); + nelt->bigramms_total = ucl_object_toint (ucl_array_find_index (n_words, + 1)); + nelt->trigramms_total = ucl_object_toint (ucl_array_find_index (n_words, + 2)); + } + msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms", nelt->name, (gint)g_hash_table_size (nelt->unigramms), @@ -588,7 +603,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates) * Probabilities are logarithmic, so if prob1 - prob2 > 4, it means that * prob2 is 2^4 less than prob1 */ - if (max_prob - cand->prob > 4) { + if (max_prob - cand->prob > 1.5) { g_hash_table_iter_remove (&it); } } @@ -755,6 +770,8 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, g_hash_table_unref (candidates); candidates = tcandidates; + msg_err ("bigramms checked, %.3f mean, %.4f stddev", mean, std); + if (std / fabs (mean) < 0.3) { /* Try trigramms */ tcandidates = g_hash_table_new_full (rspamd_str_hash, @@ -782,7 +799,7 @@ rspamd_language_detector_detect (struct rspamd_lang_detector *d, while (g_hash_table_iter_next (&it, &k, &v)) { cand = (struct rspamd_lang_detector_res *) v; - msg_debug ("%s -> %.2f", cand->lang, cand->prob); + msg_err ("%s -> %.2f", cand->lang, cand->prob); g_ptr_array_add (result, cand); g_hash_table_iter_steal (&it); }