diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-07 18:40:54 +0100 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2018-09-07 18:40:54 +0100 |
commit | 863d2260fa59165388c906098e23b608329668d6 (patch) | |
tree | b83353862f369da8a8417665008b9fc67f62619a | |
parent | 4c460de1a26ba805803808ca5eea76842aa3945a (diff) | |
download | rspamd-863d2260fa59165388c906098e23b608329668d6.tar.gz rspamd-863d2260fa59165388c906098e23b608329668d6.zip |
[Feature] Add preliminary stop words detection support
-rw-r--r-- | CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/libmime/lang_detection.c | 201 |
2 files changed, 199 insertions, 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 4c6d17c07..e75e85d7b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1353,6 +1353,7 @@ FILE(GLOB LANGUAGES "${CMAKE_CURRENT_SOURCE_DIR}/contrib/languages-data/*.json") FOREACH(_LANG ${LANGUAGES}) INSTALL(FILES "${_LANG}" DESTINATION ${PLUGINSDIR}/languages) ENDFOREACH() +INSTALL(FILES "${CMAKE_CURRENT_SOURCE_DIR}/contrib/languages-data/stop_words" DESTINATION ${PLUGINSDIR}/languages) # Lua config FILE(GLOB_RECURSE LUA_CONFIGS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/rules" diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c index aadee2b6d..05bae8192 100644 --- a/src/libmime/lang_detection.c +++ b/src/libmime/lang_detection.c @@ -17,6 +17,7 @@ #include "lang_detection.h" #include "libutil/logger.h" #include "libcryptobox/cryptobox.h" +#include "libutil/multipattern.h" #include "ucl.h" #include "khash.h" #include <glob.h> @@ -103,6 +104,17 @@ struct rspamd_ngramm_chain { gchar *utf; }; +struct rspamd_stop_word_range { + guint start; + guint stop; + struct rspamd_language_elt *elt; +}; + +struct rspamd_stop_word_elt { + struct rspamd_multipattern *mp; + GArray *ranges; /* of rspamd_stop_word_range */ +}; + #define msg_debug_lang_det(...) rspamd_conditional_debug_fast (NULL, NULL, \ rspamd_langdet_log_id, "langdet", task->task_pool->tag.uid, \ G_STRFUNC, \ @@ -163,6 +175,7 @@ KHASH_INIT (rspamd_candidates_hash, const gchar *, struct rspamd_lang_detector { GPtrArray *languages; khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */ + struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX]; UConverter *uchar_converter; gsize short_text_limit; gsize total_occurencies; /* number of all languages found */ @@ -327,7 +340,8 @@ rspamd_language_detector_cmp_ngramm (gconstpointer a, gconstpointer b) static void rspamd_language_detector_read_file (struct rspamd_config *cfg, struct rspamd_lang_detector *d, - const gchar *path) + const gchar *path, + const ucl_object_t *stop_words) { struct ucl_parser *parser; ucl_object_t *top; @@ -417,6 +431,36 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, } } + if (stop_words) { + const ucl_object_t *specific_stop_words; + + specific_stop_words = ucl_object_lookup (stop_words, nelt->name); + + if (specific_stop_words) { + it = NULL; + const ucl_object_t *w; + guint start, stop; + + start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); + + while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) { + rspamd_multipattern_add_pattern (d->stop_words[cat].mp, + ucl_object_tostring (w), 0); + } + + stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp); + + struct rspamd_stop_word_range r; + + r.start = start; + r.stop = stop; + r.elt = nelt; + + g_array_append_val (d->stop_words[cat].ranges, r); + it = NULL; + } + } + nelt->category = cat; htb = d->trigramms[cat]; @@ -626,6 +670,8 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d) for (guint i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { kh_destroy (rspamd_trigram_hash, d->trigramms[i]); + rspamd_multipattern_destroy (d->stop_words[i].mp); + g_array_free (d->stop_words[i].ranges, TRUE); } if (d->languages) { @@ -647,6 +693,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg) struct rspamd_ngramm_chain *chain, schain; gchar *fname; struct rspamd_lang_detector *ret = NULL; + struct ucl_parser *parser; + ucl_object_t *stop_words; section = ucl_object_lookup (cfg->rcl_obj, "lang_detection"); @@ -668,6 +716,22 @@ rspamd_language_detector_init (struct rspamd_config *cfg) } languages_pattern = g_string_sized_new (PATH_MAX); + rspamd_printf_gstring (languages_pattern, "%s/stop_words", languages_path); + parser = ucl_parser_new (UCL_PARSER_DEFAULT|UCL_PARSER_ZEROCOPY); + + if (ucl_parser_add_file (parser, languages_pattern->str)) { + stop_words = ucl_parser_get_object (parser); + } + else { + msg_err_config ("cannot read stop words from %s: %s", + languages_pattern->str, + ucl_parser_get_error (parser)); + stop_words = NULL; + } + + ucl_parser_free (parser); + languages_pattern->len = 0; + rspamd_printf_gstring (languages_pattern, "%s/*.json", languages_path); memset (&gl, 0, sizeof (gl)); @@ -683,6 +747,10 @@ rspamd_language_detector_init (struct rspamd_config *cfg) /* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */ for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) { ret->trigramms[i] = kh_init (rspamd_trigram_hash); + ret->stop_words[i].mp = rspamd_multipattern_create ( + RSPAMD_MULTIPATTERN_ICASE|RSPAMD_MULTIPATTERN_UTF8); + ret->stop_words[i].ranges = g_array_new (FALSE, FALSE, + sizeof (struct rspamd_stop_word_range)); } g_assert (uc_err == U_ZERO_ERROR); @@ -693,7 +761,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg) if (!rspamd_ucl_array_find_str (fname, languages_disable) || (languages_enable == NULL || rspamd_ucl_array_find_str (fname, languages_enable))) { - rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i]); + rspamd_language_detector_read_file (cfg, ret, gl.gl_pathv[i], + stop_words); } else { msg_info_config ("skip language file %s: disabled", fname); @@ -1370,6 +1439,125 @@ rspamd_language_detector_try_uniscript (struct rspamd_task *task, return FALSE; } + +KHASH_MAP_INIT_STR (rspamd_sw_hash, int); + +struct rspamd_sw_cbdata { + khash_t (rspamd_sw_hash) *res; + GArray *ranges; +}; + +static gint +rspamd_ranges_cmp (const void *k, const void *memb) +{ + gint pos = GPOINTER_TO_INT (k); + const struct rspamd_stop_word_range *r = (struct rspamd_stop_word_range *)memb; + + if (pos >= r->start && pos < r->stop) { + return 0; + } + else if (pos < r->start) { + return -1; + } + + return 1; +} + +static gint +rspamd_language_detector_sw_cb (struct rspamd_multipattern *mp, + guint strnum, + gint match_start, + gint match_pos, + const gchar *text, + gsize len, + void *context) +{ + /* Check if boundary */ + const gchar *prev, *next; + struct rspamd_stop_word_range *r; + struct rspamd_sw_cbdata *cbdata = (struct rspamd_sw_cbdata *)context; + khiter_t k; + + if (match_start > 0) { + prev = text + match_start - 1; + + if (!(g_ascii_isspace (*prev) || g_ascii_ispunct (*prev))) { + return 0; + } + } + else if (match_pos < len) { + next = text + match_pos + 1; + + if (!(g_ascii_isspace (*next) || g_ascii_ispunct (*next))) { + return 0; + } + } + + /* We have a word on the boundary, check range */ + r = bsearch (GINT_TO_POINTER (strnum), cbdata->ranges->data, + cbdata->ranges->len, sizeof (*r), rspamd_ranges_cmp); + + g_assert (r != NULL); + + k = kh_get (rspamd_sw_hash, cbdata->res, r->elt->name); + + if (k != kh_end (cbdata->res)) { + kh_value (cbdata->res, k) ++; + } + else { + gint tt; + + k = kh_put (rspamd_sw_hash, cbdata->res, r->elt->name, &tt); + kh_value (cbdata->res, k) = 1; + } + + return 0; +} + +static gboolean +rspamd_language_detector_try_stop_words (struct rspamd_task *task, + struct rspamd_lang_detector *d, + struct rspamd_mime_text_part *part, + enum rspamd_language_category cat) +{ + struct rspamd_stop_word_elt *elt; + struct rspamd_sw_cbdata cbdata; + gboolean ret = FALSE; + + elt = &d->stop_words[cat]; + cbdata.res = kh_init (rspamd_sw_hash); + cbdata.ranges = elt->ranges; + + rspamd_multipattern_lookup (elt->mp, part->utf_stripped_content->data, + part->utf_stripped_content->len, rspamd_language_detector_sw_cb, + &cbdata, NULL); + + if (kh_size (cbdata.res) > 0) { + gint max = G_MININT, cur_matches; + const gchar *sel = NULL, *cur_lang; + + kh_foreach (cbdata.res, cur_lang, cur_matches, { + if (cur_matches > max) { + max = cur_matches; + sel = cur_lang; + } + }); + + if (max > 0 && sel) { + msg_debug_lang_det ("set language based on stop words script %s, %d found", + sel, max); + rspamd_language_detector_set_language (task, part, + sel); + + ret = TRUE; + } + } + + kh_destroy (rspamd_sw_hash, cbdata.res); + + return ret; +} + gboolean rspamd_language_detector_detect (struct rspamd_task *task, struct rspamd_lang_detector *d, @@ -1379,6 +1567,7 @@ rspamd_language_detector_detect (struct rspamd_task *task, GPtrArray *result; gdouble mean, std, start_ticks, end_ticks; guint cand_len; + enum rspamd_language_category cat; struct rspamd_lang_detector_res *cand; enum rspamd_language_detected_type r; struct rspamd_frequency_sort_cbdata cbd; @@ -1398,6 +1587,12 @@ rspamd_language_detector_detect (struct rspamd_task *task, ret = TRUE; } + cat = rspamd_language_detector_get_category (part->unicode_scripts); + + if (rspamd_language_detector_try_stop_words (task, d, part, cat)) { + ret = TRUE; + } + if (!ret) { candidates = kh_init (rspamd_candidates_hash); kh_resize (rspamd_candidates_hash, candidates, 32); @@ -1406,7 +1601,7 @@ rspamd_language_detector_detect (struct rspamd_task *task, default_words, d, part->utf_words, - rspamd_language_detector_get_category (part->unicode_scripts), + cat, candidates); if (r == rs_detect_none) { |