about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-15 14:47:18 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2018-11-15 14:47:18 +0000
commit f4d3c21d16cd1c71f7a4bb7772e77e768a7ab8d1 (patch)
tree 84b5d411e9192dfb3a003f115faf58ba89afacbc
parent 3dadbb5159db3a59342834082144690588aa61db (diff)
download rspamd-f4d3c21d16cd1c71f7a4bb7772e77e768a7ab8d1.tar.gz
rspamd-f4d3c21d16cd1c71f7a4bb7772e77e768a7ab8d1.zip
[Feature] Store stop words and allow to query them
-rw-r--r-- src/libmime/lang_detection.c 77
-rw-r--r-- src/libmime/lang_detection.h 10
-rw-r--r-- src/libutil/str_util.c 24
-rw-r--r-- src/libutil/str_util.h 2
4 files changed, 110 insertions, 3 deletions
diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index e4dc2ebf4..500b0dfdf 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -20,6 +20,8 @@
#include "libutil/multipattern.h"
#include "ucl.h"
#include "khash.h"
+#include "libstemmer.h"
+
#include <glob.h>
#include <unicode/utf8.h>
#include <unicode/ucnv.h>
@@ -172,11 +174,15 @@ KHASH_INIT (rspamd_trigram_hash, const UChar *, struct rspamd_ngramm_chain, true
KHASH_INIT (rspamd_candidates_hash, const gchar *,
struct rspamd_lang_detector_res *, true,
rspamd_str_hash, rspamd_str_equal);
+/*
+ * Hash set of stop words: the khash is_map flag is false, so the `char`
+ * value type is only a placeholder. Keys are rspamd_ftok_t pointers that are
+ * hashed with rspamd_ftok_hash and compared with rspamd_ftok_equal.
+ */
+KHASH_INIT (rspamd_stopwords_hash, rspamd_ftok_t *,
+ char, false,
+ rspamd_ftok_hash, rspamd_ftok_equal);
struct rspamd_lang_detector {
GPtrArray *languages;
khash_t(rspamd_trigram_hash) *trigramms[RSPAMD_LANGUAGE_MAX]; /* trigramms frequencies */
struct rspamd_stop_word_elt stop_words[RSPAMD_LANGUAGE_MAX];
+ khash_t(rspamd_stopwords_hash) *stop_words_norm;
UConverter *uchar_converter;
gsize short_text_limit;
gsize total_occurencies; /* number of all languages found */
@@ -439,17 +445,59 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
specific_stop_words = ucl_object_lookup (stop_words, nelt->name);
if (specific_stop_words) {
+ struct sb_stemmer *stem = NULL;
it = NULL;
const ucl_object_t *w;
guint start, stop;
+ stem = sb_stemmer_new (nelt->name, "UTF_8");
start = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
while ((w = ucl_object_iterate (specific_stop_words, &it, true)) != NULL) {
+ gsize wlen;
+ const char *word = ucl_object_tolstring (w, &wlen);
+ const char *saved;
+
rspamd_multipattern_add_pattern (d->stop_words[cat].mp,
- ucl_object_tostring (w), 0);
+ word, wlen);
nelt->stop_words ++;
nstop ++;
+
+ /* Also lemmatise and store normalised */
+ if (stem) {
+ const char *nw = sb_stemmer_stem (stem, word, wlen);
+
+
+ if (nw) {
+ saved = nw;
+ wlen = strlen (nw);
+ }
+ else {
+ saved = word;
+ }
+ }
+ else {
+ saved = word;
+ }
+
+	if (saved) {
+		gint rc;
+		rspamd_ftok_t *tok;
+		gchar *dst;
+
+		/*
+		 * The token header and its NUL-terminated text share a single
+		 * allocation, so one g_free () releases both.
+		 */
+		tok = g_malloc (sizeof (*tok) + wlen + 1);
+		dst = ((gchar *)tok) + sizeof (*tok);
+		rspamd_strlcpy (dst, saved, wlen + 1);
+		tok->begin = dst;
+		tok->len = wlen;
+
+		kh_put (rspamd_stopwords_hash, d->stop_words_norm,
+				tok, &rc);
+
+		/*
+		 * kh_put () reports 0 when an equal key is already stored and -1
+		 * on failure; in both cases the set did not take our token.
+		 * Duplicates are expected: one set is shared by every language and
+		 * stemming can map several words to the same form. Release the
+		 * unused copy instead of leaking it.
+		 */
+		if (rc <= 0) {
+			g_free (tok);
+		}
+	}
+ }
+
+ if (stem) {
+ sb_stemmer_delete (stem);
}
stop = rspamd_multipattern_get_npatterns (d->stop_words[cat].mp);
@@ -668,6 +716,8 @@ static void
rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
{
if (d) {
+ rspamd_ftok_t *tok;
+
if (d->uchar_converter) {
ucnv_close (d->uchar_converter);
}
@@ -681,6 +731,10 @@ rspamd_language_detector_dtor (struct rspamd_lang_detector *d)
if (d->languages) {
g_ptr_array_free (d->languages, TRUE);
}
+
+	if (d->stop_words_norm) {
+		/*
+		 * Every key is one allocation holding both the token and its
+		 * text (string is embedded and freed automatically).
+		 */
+		kh_foreach_key (d->stop_words_norm, tok, {
+			g_free (tok);
+		});
+
+		/* Keys are released; now free the table's own storage too */
+		kh_destroy (rspamd_stopwords_hash, d->stop_words_norm);
+		d->stop_words_norm = NULL;
+	}
}
}
@@ -748,6 +802,8 @@ rspamd_language_detector_init (struct rspamd_config *cfg)
ret->languages = g_ptr_array_sized_new (gl.gl_pathc);
ret->uchar_converter = ucnv_open ("UTF-8", &uc_err);
ret->short_text_limit = short_text_limit;
+ ret->stop_words_norm = kh_init (rspamd_stopwords_hash);
+
/* Map from ngramm in ucs32 to GPtrArray of rspamd_language_elt */
for (i = 0; i < RSPAMD_LANGUAGE_MAX; i ++) {
ret->trigramms[i] = kh_init (rspamd_trigram_hash);
@@ -1749,4 +1805,23 @@ void
rspamd_language_detector_unref (struct rspamd_lang_detector* d)
{
REF_RELEASE (d);
+}
+
+/*
+ * Checks whether the `wlen` bytes starting at `word` are present in the
+ * detector's normalised stop words set. Returns TRUE on a hit and FALSE
+ * otherwise.
+ */
+gboolean
+rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
+		const gchar *word, gsize wlen)
+{
+	rspamd_ftok_t needle;
+	khiter_t pos;
+
+	/* Wrap the caller's buffer in a token; nothing is copied */
+	needle.begin = word;
+	needle.len = wlen;
+	pos = kh_get (rspamd_stopwords_hash, d->stop_words_norm, &needle);
+
+	/* kh_get () yields kh_end () when the key is absent */
+	return (pos == kh_end (d->stop_words_norm)) ? FALSE : TRUE;
} \ No newline at end of file
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index 50fe19b6e..204bdf9af 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -84,4 +84,14 @@ gboolean rspamd_language_detector_detect (struct rspamd_task *task,
struct rspamd_lang_detector *d,
struct rspamd_mime_text_part *part);
+/**
+ * Returns TRUE if the specified word is known to be a stop word.
+ * The lookup is an exact, case-sensitive byte comparison against the stop
+ * words stored by the detector (stored in stemmed form when a stemmer could
+ * be created for the language), so `word` presumably has to be normalised
+ * the same way by the caller - TODO confirm against callers.
+ * @param d language detector whose stop words set is queried
+ * @param word start of the word; only `wlen` bytes are examined
+ * @param wlen length of the word in bytes
+ * @return TRUE if the word is found in the stop words set, FALSE otherwise
+ */
+gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
+ const gchar *word, gsize wlen);
+
#endif
diff --git a/src/libutil/str_util.c b/src/libutil/str_util.c
index 4f7aa1c97..f798d9eeb 100644
--- a/src/libutil/str_util.c
+++ b/src/libutil/str_util.c
@@ -297,18 +297,38 @@ rspamd_ftok_icase_hash (gconstpointer key)
}
+/*
+ * Equality callback for rspamd_ftok_t keys: two tokens match when their
+ * lengths are the same and their bytes are identical (case sensitive).
+ */
gboolean
+rspamd_ftok_equal (gconstpointer v, gconstpointer v2)
+{
+	const rspamd_ftok_t *lhs = v;
+	const rspamd_ftok_t *rhs = v2;
+
+	/* Tokens of different length can never match; skip the byte compare */
+	if (lhs->len != rhs->len) {
+		return FALSE;
+	}
+
+	return (memcmp (lhs->begin, rhs->begin, lhs->len) == 0) ? TRUE : FALSE;
+}
+
+/*
+ * Hash callback for rspamd_ftok_t keys: hashes exactly `len` bytes starting
+ * at `begin` with t1ha, seeded by rspamd_hash_seed ().
+ */
+guint
+rspamd_ftok_hash (gconstpointer key)
+{
+	const rspamd_ftok_t *tok = (const rspamd_ftok_t *) key;
+	guint digest;
+
+	digest = t1ha (tok->begin, tok->len, rspamd_hash_seed ());
+
+	return digest;
+}
+
+gboolean
rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2)
{
const GString *f1 = v, *f2 = v2;
if (f1->len == f2->len &&
- rspamd_lc_cmp (f1->str, f2->str, f1->len) == 0) {
+ rspamd_lc_cmp (f1->str, f2->str, f1->len) == 0) {
return TRUE;
}
return FALSE;
}
-
guint
rspamd_gstring_icase_hash (gconstpointer key)
{
diff --git a/src/libutil/str_util.h b/src/libutil/str_util.h
index 73637a62c..100b64b88 100644
--- a/src/libutil/str_util.h
+++ b/src/libutil/str_util.h
@@ -61,6 +61,8 @@ gboolean rspamd_str_equal (gconstpointer v, gconstpointer v2);
*/
guint rspamd_ftok_icase_hash (gconstpointer key);
gboolean rspamd_ftok_icase_equal (gconstpointer v, gconstpointer v2);
+guint rspamd_ftok_hash (gconstpointer key);
+gboolean rspamd_ftok_equal (gconstpointer v, gconstpointer v2);
guint rspamd_gstring_icase_hash (gconstpointer key);
gboolean rspamd_gstring_icase_equal (gconstpointer v, gconstpointer v2);