From f4d3c21d16cd1c71f7a4bb7772e77e768a7ab8d1 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Thu, 15 Nov 2018 14:47:18 +0000
Subject: [PATCH] [Feature] Store stop words and allow to query them

---
Review notes (text between "---" and the first "diff --git" is not part of
the commit):
 - rspamd_language_detector_read_file: pass 0 again as the third argument of
   rspamd_multipattern_add_pattern(). The pre-image passed a literal 0 there
   together with a NUL-terminated pattern, so that argument is presumably the
   flags word and not a length; confirm against libutil/multipattern.h.
 - rspamd_language_detector_read_file: free the freshly allocated token when
   kh_put() reports that the key is already present (0) or that the insert
   failed (-1). The table is shared by all languages, and different words may
   share one stem, so duplicate keys are expected.
 - rspamd_language_detector_dtor: destroy the stop_words_norm table itself
   after its keys have been freed.
 - lang_detection.h: document the parameters and the matching rules of
   rspamd_language_detector_is_stop_word().
 - The three context "#include" lines of the first hunk lost their header
   names in this copy, and the index hashes still describe the blobs of the
   original submission; restore/regenerate both from the tree before
   applying.

 src/libmime/lang_detection.c | 83 ++++++++++++++++++++++++++++++++++--
 src/libmime/lang_detection.h | 13 ++++++
 src/libutil/str_util.c       | 24 ++++++++++-
 src/libutil/str_util.h       |  2 +
 4 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index e4dc2ebf4..500b0dfdf 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -20,6 +20,8 @@
 #include "libutil/multipattern.h"
 #include "ucl.h"
 #include "khash.h"
+#include "libstemmer.h"
+
 #include
 #include
 #include
@@ -172,11 +174,15 @@ KHASH_INIT (rspamd_trigram_hash, const UChar *, struct rspamd_ngramm_chain, true
 KHASH_INIT (rspamd_candidates_hash, const gchar *,
 		struct rspamd_lang_detector_res *, true,
 		rspamd_str_hash, rspamd_str_equal);
+KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *,
+		char, false,
+		rspamd_ftok_hash, rspamd_ftok_equal);
 
 struct rspamd_lang_detector {
 	GPtrArray *languages;
 	khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */
 	struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
+	khash_t(rspamd_stopwords_hash) *stop_words_norm;
 	UConverter *uchar_converter;
 	gsize short_text_limit;
 	gsize total_occurencies; /* number of all languages found */
@@ -439,17 +445,64 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 		specific_stop_words = ucl_object_lookup (stop_words, nelt->name);
 
 		if (specific_stop_words) {
+			struct sb_stemmer *stem = NULL;
 			it = NULL;
 			const ucl_object_t *w;
 			guint start, stop;
 
+			stem = sb_stemmer_new (nelt->name, "UTF_8");
 			start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
 
 			while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) {
+				gsize wlen;
+				const char *word = ucl_object_tolstring (w, &wlen);
+				const char *saved;
+
 				rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
-						ucl_object_tostring (w), 0);
+						word, 0);
 				nelt->stop_words ++;
 				nstop ++;
+
+				/* Also lemmatise and store normalised */
+				if (stem) {
+					const char *nw = sb_stemmer_stem (stem, word, wlen);
+
+
+					if (nw) {
+						saved = nw;
+						wlen = strlen (nw);
+					}
+					else {
+						saved = word;
+					}
+				}
+				else {
+					saved = word;
+				}
+
+				if (saved) {
+					gint rc;
+					rspamd_ftok_t *tok;
+					gchar *dst;
+
+					tok = g_malloc (sizeof (*tok) + wlen + 1);
+					dst = ((gchar *)tok) + sizeof (*tok);
+					rspamd_strlcpy (dst, saved, wlen + 1);
+					tok->begin = dst;
+					tok->len = wlen;
+
+					kh_put (rspamd_stopwords_hash, d->stop_words_norm,
+							tok, &rc);
+
+					if (rc <= 0) {
+						/* Key already present or insert failed: table does not own tok */
+						g_free (tok);
+					}
+				}
+			}
+
+			if (stem) {
+				sb_stemmer_delete (stem);
 			}
 
 			stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
@@ -668,6 +721,8 @@ static void
 rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
 {
 	if (d) {
+		rspamd_ftok_t *tok;
+
 		if (d->uchar_converter) {
 			ucnv_close (d->uchar_converter);
 		}
@@ -681,6 +736,11 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
 		if (d->languages) {
 			g_ptr_array_free (d->languages, TRUE);
 		}
+
+		kh_foreach_key (d->stop_words_norm, tok, {
+			g_free (tok); /* String is embedded and freed automatically */
+		});
+		kh_destroy (rspamd_stopwords_hash, d->stop_words_norm);
 	}
 }
 
@@ -748,6 +808,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
 	ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
 	ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
 	ret->short_text_limit = short_text_limit;
+	ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
+
 	/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
 	for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
 		ret->trigramms[i] = kh_init (rspamd_trigram_hash);
@@ -1749,4 +1811,23 @@ void
 rspamd_language_detector_unref (struct rspamd_lang_detector* d)
 {
 	REF_RELEASE (d);
+}
+
+gboolean
+rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
+		const gchar *word, gsize wlen)
+{
+	khiter_t k;
+	rspamd_ftok_t search;
+
+	search.begin = word;
+	search.len = wlen;
+
+	k = kh_get (rspamd_stopwords_hash, d->stop_words_norm, &search);
+
+	if (k != kh_end (d->stop_words_norm)) {
+		return TRUE;
+	}
+
+	return FALSE;
 }
\ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 50fe19b6e..204bdf9af 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -84,4 +84,17 @@ gboolean rspamd_language_detector_detect (struct rspamd_task *task,
 		struct rspamd_lang_detector *d,
 		struct rspamd_mime_text_part *part);
 
+/**
+ * Returns TRUE if the specified word is known to be a stop word.
+ * The lookup is an exact, case-sensitive byte comparison against the
+ * normalised (stemmed when a stemmer exists for the language) stop words
+ * of all loaded languages.
+ * @param d language detector, must not be NULL
+ * @param word word to check, does not need to be NUL terminated
+ * @param wlen length of word in bytes
+ * @return TRUE if the word is a stored stop word, FALSE otherwise
+ */
+gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
+		const gchar *word, gsize wlen);
+
 #endif
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 4f7aa1c97..f798d9eeb 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -296,19 +296,39 @@ rspamd_ftok_icase_hash (gconstpointer key)
 	return rspamd_icase_hash (f->begin, f->len, rspamd_hash_seed ());
 }
 
+gboolean
+rspamd_ftok_equal (gconstpointer v, gconstpointer v2)
+{
+	const rspamd_ftok_t *f1 = v, *f2 = v2;
+
+	if (f1->len == f2->len &&
+			memcmp (f1->begin, f2->begin, f1->len) == 0) {
+		return TRUE;
+	}
+
+	return FALSE;
+}
+
+guint
+rspamd_ftok_hash (gconstpointer key)
+{
+	const rspamd_ftok_t *f = key;
+
+	return t1ha (f->begin, f->len, rspamd_hash_seed ());
+}
+
 gboolean
 rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2)
 {
 	const GString *f1 = v, *f2 = v2;
 	if (f1->len == f2->len &&
-		rspamd_lc_cmp (f1->str, f2->str, f1->len) == 0) {
+			rspamd_lc_cmp (f1->str, f2->str, f1->len) == 0) {
 		return TRUE;
 	}
 
 	return FALSE;
 }
 
-
 guint
 rspamd_gstring_icase_hash (gconstpointer key)
 {
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 73637a62c..100b64b88 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -61,6 +61,8 @@ gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
  */
 guint rspamd_ftok_icase_hash (gconstpointer key);
 gboolean rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2);
+guint rspamd_ftok_hash (gconstpointer key);
+gboolean rspamd_ftok_equal (gconstpointer v, gconstpointer v2);
 guint rspamd_gstring_icase_hash (gconstpointer key);
 gboolean rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2);
 
-- 
2.39.5