From 2ef4d7f2356eb44e71d2906c6ed6ad08c23773c4 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sat, 8 Sep 2018 16:46:49 +0100 Subject: [PATCH] [Minor] Do not apply ngramms detection for short texts --- src/libmime/lang_detection.c | 103 +++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 47 deletions(-) diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index fbc5f56c9..dfd62d4e5 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -27,7 +27,7 @@ #include #include -static const gsize default_short_text_limit = 200; +static const gsize default_short_text_limit = 20; static const gsize default_words = 80; static const gdouble update_prob = 0.6; static const gchar *default_languages_path = RSPAMD_PLUGINSDIR "/languages"; @@ -1597,62 +1597,71 @@ rspamd_language_detector_detect (struct rspamd_task *task, } if (!ret) { - candidates = kh_init (rspamd_candidates_hash); - kh_resize (rspamd_candidates_hash, candidates, 32); - - r = rspamd_language_detector_try_ngramm (task, - default_words, - d, - part->utf_words, - cat, - candidates); - - if (r == rs_detect_none) { - msg_debug_lang_det ("no trigramms found, fallback to english"); + if (part->utf_words->len < default_short_text_limit) { + r = rs_detect_none; + msg_debug_lang_det ("text is too short for trigramms detection: " + "%d words; at least %d words required", + (int)part->utf_words->len, + (int)default_short_text_limit); rspamd_language_detector_set_language (task, part, "en"); } - else if (r == rs_detect_multiple) { - /* Check our guess */ - - mean = 0.0; - std = 0.0; - cand_len = 0; - - /* Check distirbution */ - kh_foreach_value (candidates, cand, { - if (!isnan (cand->prob)) { - mean += cand->prob; - cand_len++; - } - }); - - if (cand_len > 0) { - mean /= cand_len; - + else { + candidates = kh_init (rspamd_candidates_hash); + kh_resize (rspamd_candidates_hash, candidates, 32); + + r = rspamd_language_detector_try_ngramm (task, + default_words, + d, + part->utf_words, + cat, + candidates); + + if (r == rs_detect_none) { + msg_debug_lang_det ("no trigramms found, fallback to english"); + rspamd_language_detector_set_language (task, part, "en"); + } else if (r == rs_detect_multiple) { + /* Check our guess */ + + mean = 0.0; + std = 0.0; + cand_len = 0; + + /* Check distirbution */ kh_foreach_value (candidates, cand, { - gdouble err; if (!isnan (cand->prob)) { - err = cand->prob - mean; - std += fabs (err); + mean += cand->prob; + cand_len++; } }); - std /= cand_len; - } + if (cand_len > 0) { + mean /= cand_len; - msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev", - cand_len, mean, std); + kh_foreach_value (candidates, cand, { + gdouble err; + if (!isnan (cand->prob)) { + err = cand->prob - mean; + std += fabs (err); + } + }); - if (cand_len > 0 && std / fabs (mean) < 0.25) { - msg_debug_lang_det ("apply frequency heuristic sorting"); - frequency_heuristic_applied = TRUE; - cbd.d = d; - cbd.mean = mean; - cbd.std = std; - cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + std /= cand_len; + } - if (part->utf_words->len < default_words / 2) { - cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + msg_debug_lang_det ("trigramms checked, %d candidates, %.3f mean, %.4f stddev", + cand_len, mean, std); + + if (cand_len > 0 && std / fabs (mean) < 0.25) { + msg_debug_lang_det ("apply frequency heuristic sorting"); + frequency_heuristic_applied = TRUE; + cbd.d = d; + cbd.mean = mean; + cbd.std = std; + cbd.flags = RSPAMD_LANG_FLAG_DEFAULT; + + if (part->utf_words->len < default_words / 2) { + cbd.flags |= RSPAMD_LANG_FLAG_SHORT; + } } } } -- 2.39.5