From fc4c42b43cb9ab5dd2cc417fb8d87790c7741518 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 1 Jan 2018 20:28:59 +0000
Subject: [PATCH] [Project] Add unigramms to language detection as well

---
Review notes (ignored by git am, not part of the commit message):
- Line structure and indentation were restored from a whitespace-collapsed
  copy of this patch; use `git apply --ignore-whitespace` if the context
  does not match the tree byte for byte.
- rspamd_language_detector_to_ucs now checks the ICU status code instead of
  relying on the sign of the returned length, and resets ucs_token->begin
  when the conversion fails.
- The header documentation of rspamd_language_detector_to_ucs now covers the
  pool argument and the unit of ucs_token->len.
- src/libmime/lang_detection.c now ends with a newline.
- Hunk sizes and the diffstat were updated for the lines added above; the
  post-image blob ids in the index lines of lang_detection.[ch] were not
  recomputed.

 src/libmime/lang_detection.c | 57 ++++++++++++++++++++++++++++++++++--
 src/libmime/lang_detection.h | 18 ++++++++++++
 src/libmime/message.c        | 18 ++++++++++--
 src/libmime/message.h        |  1 +
 src/libserver/task.c         |  3 ++
 src/worker.c                 |  2 +-
 6 files changed, 93 insertions(+), 6 deletions(-)

diff --git a/src/libmime/lang_detection.c b/src/libmime/lang_detection.c
index 730733dfb..3dd0a8d63 100644
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,8 +24,10 @@
 
 struct rspamd_language_elt {
 	const gchar *name; /* e.g. "en" or "ru" */
+	guint unigramms_total; /* total frequencies for unigramms */
+	GHashTable *unigramms; /* unigramms frequencies */
 	guint bigramms_total; /* total frequencies for bigramms */
-	GHashTable *bigramms; /* bigrams frequencies */
+	GHashTable *bigramms; /* bigramms frequencies */
 	guint trigramms_total; /* total frequencies for trigramms */
 	GHashTable *trigramms; /* trigramms frequencies */
 };
@@ -35,6 +37,18 @@ struct rspamd_lang_detector {
 	UConverter *uchar_converter;
 };
 
+static guint
+rspamd_unigram_hash (gconstpointer key)
+{
+	return rspamd_cryptobox_fast_hash (key, sizeof (UChar), rspamd_hash_seed ());
+}
+
+static gboolean
+rspamd_unigram_equal (gconstpointer v, gconstpointer v2)
+{
+	return memcmp (v, v2, sizeof (UChar)) == 0;
+}
+
 static guint
 rspamd_bigram_hash (gconstpointer key)
 {
@@ -101,6 +115,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 	pos = strchr (nelt->name, '.');
 	g_assert (pos != NULL);
 	*pos = '\0';
+	nelt->unigramms = g_hash_table_new (rspamd_unigram_hash, rspamd_unigram_equal);
 	nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal);
 	nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal);
 
@@ -138,14 +153,21 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 						GUINT_TO_POINTER (freq));
 				nelt->trigramms_total += freq;
 			}
+			else if (nsym == 1) {
+				g_hash_table_insert (nelt->unigramms, ucs_key,
+						GUINT_TO_POINTER (freq));
+				nelt->unigramms_total += freq;
+			}
 			else if (nsym > 3) {
 				msg_warn_config ("have more than 3 characters in key: %d", nsym);
 			}
 		}
 	}
 
-	msg_info_config ("loaded %s language, %d digramms, %d trigramms",
-			nelt->name, (gint)g_hash_table_size (nelt->bigramms),
+	msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
+			nelt->name,
+			(gint)g_hash_table_size (nelt->unigramms),
+			(gint)g_hash_table_size (nelt->bigramms),
 			(gint)g_hash_table_size (nelt->trigramms));
 
 	g_ptr_array_add (d->languages, nelt);
@@ -202,3 +224,32 @@ end:
 
 	return ret;
 }
+
+
+void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
+		rspamd_mempool_t *pool,
+		rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
+{
+	UChar *out;
+	int32_t nsym;
+	UErrorCode uc_err = U_ZERO_ERROR;
+
+	ucs_token->flags = utf_token->flags;
+	out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1));
+	nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1),
+			utf_token->begin, utf_token->len, &uc_err);
+
+	/*
+	 * ucnv_toUChars reports failures via uc_err, not via a negative
+	 * return value, so the status must be checked explicitly
+	 */
+	if (U_SUCCESS (uc_err) && nsym >= 0) {
+		ucs_token->begin = (const gchar *) out;
+		ucs_token->len = nsym;
+	}
+	else {
+		/* Never leave a stale or uninitialised pointer in the token */
+		ucs_token->begin = NULL;
+		ucs_token->len = 0;
+	}
+}
diff --git a/src/libmime/lang_detection.h b/src/libmime/lang_detection.h
index aa1bf5494..c0d05cf1d 100644
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -19,9 +19,27 @@
 
 #include "config.h"
 #include "libserver/cfg_file.h"
+#include "libstat/stat_api.h"
 
 struct rspamd_lang_detector;
 
+/**
+ * Create new language detector object using configuration object
+ * @param cfg
+ * @return
+ */
 struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config *cfg);
 
+/**
+ * Convert a token from utf8 to ICU UChar (UTF-16) representation
+ * @param d language detector that owns the converter
+ * @param pool memory pool used to allocate the converted text
+ * @param utf_token source token
+ * @param ucs_token destination token; its len is counted in UChar units
+ * and is set to 0 if the conversion fails
+ */
+void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
+		rspamd_mempool_t *pool,
+		rspamd_stat_token_t *utf_token,
+		rspamd_stat_token_t *ucs_token);
 #endif
diff --git a/src/libmime/message.c b/src/libmime/message.c
index a2ea7d685..672d78806 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -24,6 +24,7 @@
 #include "smtp_parsers.h"
 #include "mime_parser.h"
 #include "mime_encoding.h"
+#include "lang_detection.h"
 #include "libutil/multipattern.h"
 #include "libserver/mempool_vars_internal.h"
 
@@ -204,10 +205,10 @@ rspamd_extract_words (struct rspamd_task *task,
 #ifdef WITH_SNOWBALL
 	struct sb_stemmer *stem = NULL;
 #endif
-	rspamd_stat_token_t *w;
+	rspamd_stat_token_t *w, ucs_w;
 	gchar *temp_word;
 	const guchar *r;
-	guint i, nlen, total_len = 0, short_len = 0;
+	guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
 	gdouble avg_len = 0;
 
 #ifdef WITH_SNOWBALL
@@ -257,10 +258,23 @@ rspamd_extract_words (struct rspamd_task *task,
 	part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
 			sizeof (guint64), part->normalized_words->len);
 
+	if (IS_PART_UTF (part) && task->lang_det) {
+		part->ucs32_words = g_array_sized_new (FALSE, FALSE,
+				sizeof (rspamd_stat_token_t), part->normalized_words->len);
+	}
+
 	for (i = 0; i < part->normalized_words->len; i ++) {
 		guint64 h;
 
 		w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);
+
+		if (part->ucs32_words) {
+			rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
+					w, &ucs_w);
+			g_array_append_val (part->ucs32_words, ucs_w);
+			ucs_len += ucs_w.len;
+		}
+
 		r = NULL;
 #ifdef WITH_SNOWBALL
 		if (stem) {
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 3092f3da5..90f86b3bd 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -93,6 +93,7 @@ struct rspamd_mime_text_part {
 	GList *exceptions;	/**< list of offsets of urls */
 	struct rspamd_mime_part *mime_part;
 	GArray *normalized_words;
+	GArray *ucs32_words;
 	GArray *normalized_hashes;
 	guint nlines;
 	guint spaces;
diff --git a/src/libserver/task.c b/src/libserver/task.c
index 2c014a7d1..7b665d983 100644
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -239,6 +239,9 @@ rspamd_task_free (struct rspamd_task *task)
 			if (tp->normalized_hashes) {
 				g_array_free (tp->normalized_hashes, TRUE);
 			}
+			if (tp->ucs32_words) {
+				g_array_free (tp->ucs32_words, TRUE);
+			}
 		}
 
 		if (task->rcpt_envelope) {
diff --git a/src/worker.c b/src/worker.c
index e0d2b4a0b..8b01205eb 100644
--- a/src/worker.c
+++ b/src/worker.c
@@ -660,7 +660,7 @@ rspamd_worker_init_scanner (struct rspamd_worker *worker,
 			rspamd_worker_monitored_handler,
 			worker->srv->cfg);
 
-	*plang_det = worker->srv->cfg;
+	*plang_det = worker->srv->cfg->lang_det;
 }
 
 /*
-- 
2.39.5