From 92e1b614db2bba173c3352455c3454249d357c9d Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Fri, 2 Aug 2019 18:28:29 +0100
Subject: [PATCH] [Minor] Langdet: Add threshold for stop words

---
 src/libmime/lang_detection.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 74c6f7247..9ccd7bef5 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1650,6 +1650,7 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 	struct rspamd_stop_word_elt *elt;
 	struct rspamd_sw_cbdata cbdata;
 	gboolean ret = FALSE;
+	static const int stop_words_threshold = 4;
 
 	elt = &d->stop_words[cat];
 	cbdata.res = kh_init (rspamd_sw_hash);
@@ -1667,7 +1668,12 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 		struct rspamd_language_elt *cur_lang;
 
 		kh_foreach (cbdata.res, cur_lang, cur_matches, {
+			if (cur_matches < stop_words_threshold) {
+				continue;
+			}
+
 			double rate = (double)cur_matches / (double)cur_lang->stop_words;
+
 			if (rate > max_rate) {
 				max_rate = rate;
 				sel = cur_lang->name;
-- 
2.39.5