diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-10-06 10:29:32 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-10-06 10:29:32 +0100 |
commit | aeea55d92d3c417fcbafee474cdc16648a83a10c (patch) | |
tree | 253c434c1e0f0fdbcc33d7b0acff52f7dda25ca3 /src/libmime/lang_detection.c | |
parent | 4bbaebccdeca31bb4f4aafee8b088f18175bf3fe (diff) | |
download | rspamd-aeea55d92d3c417fcbafee474cdc16648a83a10c.tar.gz rspamd-aeea55d92d3c417fcbafee474cdc16648a83a10c.zip |
[Fix] Fix boundaries detection and rework stop words algorithm
Issue: #2541
Closes: #2541
Diffstat (limited to 'src/libmime/lang_detection.c')
-rw-r--r-- | src/libmime/lang_detection.c | 55 |
1 files changed, 39 insertions, 16 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index f4811f18b..7cf9ffec4 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -87,6 +87,7 @@ struct rspamd_language_elt { enum rspamd_language_elt_flags flags; enum rspamd_language_category category; guint trigramms_words; + guint stop_words; gdouble mean; gdouble std; guint occurencies; /* total number of parts with this language */ @@ -447,6 +448,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) { rspamd_multipattern_add_pattern (d->stop_words[cat].mp, ucl_object_tostring (w), 0); + nelt->stop_words ++; nstop ++; } @@ -592,7 +594,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, (gint)nelt->trigramms_words, total, std, mean, - skipped, loaded, nstop, + skipped, loaded, nelt->stop_words, rspamd_language_detector_print_flags (nelt)); g_ptr_array_add (d->languages, nelt); @@ -1446,8 +1448,24 @@ rspamd_language_detector_try_uniscript (struct rspamd_task *task, return FALSE; } +static guint +rspamd_langelt_hash_func (gconstpointer key) +{ + const struct rspamd_language_elt *elt = (const struct rspamd_language_elt *)key; + return rspamd_cryptobox_fast_hash (elt->name, strlen (elt->name), + rspamd_hash_seed ()); +} -KHASH_MAP_INIT_STR (rspamd_sw_hash, int); +static gboolean +rspamd_langelt_equal_func (gconstpointer v, gconstpointer v2) +{ + const struct rspamd_language_elt *elt1 = (const struct rspamd_language_elt *)v, + *elt2 = (const struct rspamd_language_elt *)v2; + return strcmp (elt1->name, elt2->name) == 0; +} + +KHASH_INIT (rspamd_sw_hash, struct rspamd_language_elt *, int, 1, + rspamd_langelt_hash_func, rspamd_langelt_equal_func); struct rspamd_sw_cbdata { khash_t (rspamd_sw_hash) *res; @@ -1480,7 +1498,7 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, void *context) { /* Check if boundary */ - const gchar *prev, *next; + const gchar *prev = text, *next = text + len; struct rspamd_stop_word_range *r; struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context; khiter_t k; @@ -1492,8 +1510,9 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, return 0; } } - else if (match_pos < len) { - next = text + match_pos + 1; + + if (match_pos < len) { + next = text + match_pos; if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) { return 0; @@ -1503,10 +1522,9 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, /* We have a word on the boundary, check range */ r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data, cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp); - g_assert (r != NULL); - k = kh_get (rspamd_sw_hash, cbdata->res, r->elt->name); + k = kh_get (rspamd_sw_hash, cbdata->res, r->elt); if (k != kh_end (cbdata->res)) { kh_value (cbdata->res, k) ++; @@ -1514,7 +1532,7 @@ rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, else { gint tt; - k = kh_put (rspamd_sw_hash, cbdata->res, r->elt->name, &tt); + k = kh_put (rspamd_sw_hash, cbdata->res, r->elt, &tt); kh_value (cbdata->res, k) = 1; } @@ -1540,19 +1558,24 @@ rspamd_language_detector_try_stop_words (struct rspamd_task *task, &cbdata, NULL); if (kh_size (cbdata.res) > 0) { - gint max = G_MININT, cur_matches; - const gchar *sel = NULL, *cur_lang; + gint cur_matches; + double max_rate = G_MINDOUBLE; + const gchar *sel = NULL; + struct rspamd_language_elt *cur_lang; kh_foreach (cbdata.res, cur_lang, cur_matches, { - if (cur_matches > max) { - max = cur_matches; - sel = cur_lang; + double rate = (double)cur_matches / (double)cur_lang->stop_words; + if (rate > max_rate) { + max_rate = rate; + sel = cur_lang->name; } + msg_debug_lang_det ("found %d stop words from %s: %3f rate", + cur_matches, cur_lang->name, rate); }); - if (max > 0 && sel) { - msg_debug_lang_det ("set language based on stop words script %s, %d found", - sel, max); + if (max_rate > 0 && sel) { + msg_debug_lang_det ("set language based on stop words script %s, %.3f found", + sel, max_rate); rspamd_language_detector_set_language (task, part, sel); |