diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-07 19:57:43 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-07 19:57:43 +0100 |
commit | d540364334a9571a396dc35041dbe619f6f84e16 (patch) | |
tree | 2cb1165f71d5e9d96e7ea03b4494849ca489460c /src/libmime | |
parent | 9d58fcefc399d5aebb130d787fcf9706a393a558 (diff) | |
download | rspamd-d540364334a9571a396dc35041dbe619f6f84e16.tar.gz rspamd-d540364334a9571a396dc35041dbe619f6f84e16.zip |
[Fix] Fix stop words detection and loading logic
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/lang_detection.c | 22 |
1 file changed, 17 insertions, 5 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index 05bae8192..0e4d3e8eb 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -352,7 +352,8 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, struct rspamd_language_ucs_elt *ucs_elt; khash_t (rspamd_trigram_hash) *htb = NULL; gchar *pos; - guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, loaded; + guint total = 0, total_latin = 0, total_ngramms = 0, i, skipped, + loaded, nstop = 0; gdouble mean = 0, std = 0, delta = 0, delta2 = 0, m2 = 0; enum rspamd_language_category cat = RSPAMD_LANGUAGE_MAX; @@ -446,6 +447,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) { rspamd_multipattern_add_pattern (d->stop_words[cat].mp, ucl_object_tostring (w), 0); + nstop ++; } stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); @@ -584,13 +586,13 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, msg_info_config ("loaded %s language, %d trigramms, " "%d ngramms loaded; " - "std=%.2f, mean=%.2f, skipped=%d, loaded=%d; " + "std=%.2f, mean=%.2f, skipped=%d, loaded=%d, stop_words=%d; " "(%s)", nelt->name, (gint)nelt->trigramms_words, total, std, mean, - skipped, loaded, + skipped, loaded, nstop, rspamd_language_detector_print_flags (nelt)); g_ptr_array_add (d->languages, nelt); @@ -717,7 +719,7 @@ rspamd_language_detector_init (struct rspamd_config *cfg) languages_pattern = g_string_sized_new (PATH_MAX); rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path); - parser = ucl_parser_new (UCL_PARSER_DEFAULT|UCL_PARSER_ZEROCOPY); + parser = ucl_parser_new (UCL_PARSER_DEFAULT); if (ucl_parser_add_file (parser, languages_pattern->str)) { stop_words = ucl_parser_get_object (parser); @@ -772,11 +774,19 @@ rspamd_language_detector_init (struct rspamd_config *cfg) } for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { + GError *err = 
NULL; + kh_foreach_value (ret->trigramms[i], schain, { chain = &schain; rspamd_language_detector_process_chain (cfg, chain); }); + if (!rspamd_multipattern_compile (ret->stop_words[i].mp, &err)) { + msg_err_config ("cannot compile stop words for %d language group: %e", + i, err); + g_error_free (err); + } + total += kh_size (ret->trigramms[i]); } @@ -1589,7 +1599,7 @@ rspamd_language_detector_detect (struct rspamd_task *task, cat = rspamd_language_detector_get_category (part->unicode_scripts); - if (rspamd_language_detector_try_stop_words (task, d, part, cat)) { + if (!ret && rspamd_language_detector_try_stop_words (task, d, part, cat)) { ret = TRUE; } @@ -1679,6 +1689,8 @@ rspamd_language_detector_detect (struct rspamd_task *task, } part->languages = result; + + ret = TRUE; } end_ticks = rspamd_get_ticks (TRUE); |