diff options
author | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 16:00:52 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@rspamd.com> | 2023-04-29 16:00:52 +0100 |
commit | 070120ed1370ac7179cf4945195294df6a26b4dc (patch) | |
tree | fda9fd949d1ef76012363949daebc1a6b388f31e /src/libmime/lang_detection.c | |
parent | 264b9f2c480a1b0240acb8183a8d7470691aff11 (diff) | |
download | rspamd-070120ed1370ac7179cf4945195294df6a26b4dc.tar.gz rspamd-070120ed1370ac7179cf4945195294df6a26b4dc.zip |
[Fix] Ignore non-unique stop words
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 38 |
1 files changed, 30 insertions, 8 deletions
```diff
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 211dfe48b..d8e81e075 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -1583,7 +1583,10 @@ rspamd_langelt_equal_func (gconstpointer v, gconstpointer v2)
 	return strcmp (elt1->name, elt2->name) == 0;
 }
 
-KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, int, 1,
+/* This hash set stores a word index in the language to avoid duplicate stop words */
+KHASH_INIT (rspamd_sw_res_set, int, char, 0, kh_int_hash_func, kh_int_hash_equal);
+
+KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, khash_t(rspamd_sw_res_set) *, 1,
 		rspamd_langelt_hash_func, rspamd_langelt_equal_func);
 
 struct rspamd_sw_cbdata {
@@ -1652,9 +1655,20 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 	gint nwords = 1;
 
 	if (k != kh_end (cbdata->res)) {
-		nwords = ++ kh_value (cbdata->res, k);
+		khiter_t set_k;
+		int tt;
+
+		set_k = kh_get(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum);
+		nwords = kh_size(kh_value(cbdata->res, k));
 
-		if (kh_value (cbdata->res, k) > max_stop_words) {
+		if (set_k == kh_end(kh_value(cbdata->res, k))) {
+			/* New word */
+			set_k = kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
+			msg_debug_lang_det ("found new word %*s from %s language (%d stop words found so far)",
+					(int)(next - prev - 1), prev + 1, r->elt->name, nwords);
+		}
+
+		if (nwords > max_stop_words) {
 			return 1;
 		}
 	}
@@ -1662,11 +1676,12 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp,
 		gint tt;
 
 		k = kh_put (rspamd_sw_hash, cbdata->res, r->elt, &tt);
-		kh_value (cbdata->res, k) = 1;
-	}
+		kh_value(cbdata->res, k) = kh_init(rspamd_sw_res_set);
+		kh_put(rspamd_sw_res_set, kh_value(cbdata->res, k), strnum, &tt);
 
-	msg_debug_lang_det ("found word %*s from %s language (%d stop words found so far)",
+		msg_debug_lang_det ("found new word %*s from %s language (%d stop words found so far)",
 			(int)(next - prev - 1), prev + 1, r->elt->name, nwords);
+	}
 
 	return 0;
 }
@@ -1693,13 +1708,15 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 			&cbdata, NULL);
 
 	if (kh_size (cbdata.res) > 0) {
-		gint cur_matches;
+		khash_t(rspamd_sw_res_set) *cur_res;
 		double max_rate = G_MINDOUBLE;
 		struct rspamd_language_elt *cur_lang, *sel = NULL;
 		gboolean ignore_ascii = FALSE, ignore_latin = FALSE;
 
 again:
-		kh_foreach (cbdata.res, cur_lang, cur_matches, {
+		kh_foreach (cbdata.res, cur_lang, cur_res, {
+			int cur_matches = kh_size(cur_res);
+
 			if (!ignore_ascii && (cur_lang->flags & RS_LANGUAGE_DIACRITICS)) {
 				/* Restart matches */
 				ignore_ascii = TRUE;
@@ -1746,6 +1763,11 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task,
 					cur_matches, cur_lang->name, rate);
 		});
 
+		/* Cleanup */
+		kh_foreach (cbdata.res, cur_lang, cur_res, {
+			kh_destroy (rspamd_sw_res_set, cur_res);
+		});
+
 		if (max_rate > 0 && sel) {
 			msg_debug_lang_det ("set language based on stop words script %s, %.3f found",
 				sel->name, max_rate);
```