@@ -24,8 +24,10 @@ | |||
struct rspamd_language_elt { | |||
const gchar *name; /* e.g. "en" or "ru" */ | |||
guint unigramms_total; /* total frequencies for unigramms */ | |||
GHashTable *unigramms; /* unigramms frequencies */ | |||
guint bigramms_total; /* total frequencies for bigramms */ | |||
GHashTable *bigramms; /* bigrams frequencies */ | |||
GHashTable *bigramms; /* bigramms frequencies */ | |||
guint trigramms_total; /* total frequencies for trigramms */ | |||
GHashTable *trigramms; /* trigramms frequencies */ | |||
}; | |||
@@ -35,6 +37,18 @@ struct rspamd_lang_detector { | |||
UConverter *uchar_converter; | |||
}; | |||
static guint | |||
rspamd_unigram_hash (gconstpointer key) | |||
{ | |||
return rspamd_cryptobox_fast_hash (key, sizeof (UChar), rspamd_hash_seed ()); | |||
} | |||
static gboolean | |||
rspamd_unigram_equal (gconstpointer v, gconstpointer v2) | |||
{ | |||
return memcmp (v, v2, sizeof (UChar)) == 0; | |||
} | |||
static guint | |||
rspamd_bigram_hash (gconstpointer key) | |||
{ | |||
@@ -101,6 +115,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, | |||
pos = strchr (nelt->name, '.'); | |||
g_assert (pos != NULL); | |||
*pos = '\0'; | |||
nelt->unigramms = g_hash_table_new (rspamd_unigram_hash, rspamd_unigram_equal); | |||
nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal); | |||
nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal); | |||
@@ -138,14 +153,21 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg, | |||
GUINT_TO_POINTER (freq)); | |||
nelt->trigramms_total += freq; | |||
} | |||
else if (nsym == 1) { | |||
g_hash_table_insert (nelt->unigramms, ucs_key, | |||
GUINT_TO_POINTER (freq)); | |||
nelt->unigramms_total += freq; | |||
} | |||
else if (nsym > 3) { | |||
msg_warn_config ("have more than 3 characters in key: %d", nsym); | |||
} | |||
} | |||
} | |||
msg_info_config ("loaded %s language, %d digramms, %d trigramms", | |||
nelt->name, (gint)g_hash_table_size (nelt->bigramms), | |||
msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms", | |||
nelt->name, | |||
(gint)g_hash_table_size (nelt->unigramms), | |||
(gint)g_hash_table_size (nelt->bigramms), | |||
(gint)g_hash_table_size (nelt->trigramms)); | |||
g_ptr_array_add (d->languages, nelt); | |||
@@ -202,3 +224,26 @@ end: | |||
return ret; | |||
} | |||
void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, | |||
rspamd_mempool_t *pool, | |||
rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token) | |||
{ | |||
UChar *out; | |||
int32_t nsym; | |||
UErrorCode uc_err = U_ZERO_ERROR; | |||
ucs_token->flags = utf_token->flags; | |||
out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1)); | |||
nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1), | |||
utf_token->begin, utf_token->len, &uc_err); | |||
if (nsym >= 0) { | |||
ucs_token->begin = (const gchar *) out; | |||
ucs_token->len = nsym; | |||
} | |||
else { | |||
ucs_token->len = 0; | |||
} | |||
} |
@@ -19,9 +19,25 @@ | |||
#include "config.h" | |||
#include "libserver/cfg_file.h" | |||
#include "libstat/stat_api.h" | |||
struct rspamd_lang_detector; | |||
/**
 * Build a new language detector from a configuration object.
 * @param cfg configuration object the detector is created from
 * @return pointer to the newly created detector
 * (NOTE(review): failure behaviour is not visible from this header - confirm)
 */
struct rspamd_lang_detector *rspamd_language_detector_init (
		struct rspamd_config *cfg);
/** | |||
* Convert string from utf8 to ucs32 | |||
* @param d | |||
* @param utf_token | |||
* @param ucs_token | |||
*/ | |||
void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, | |||
rspamd_mempool_t *pool, | |||
rspamd_stat_token_t *utf_token, | |||
rspamd_stat_token_t *ucs_token); | |||
#endif |
@@ -24,6 +24,7 @@ | |||
#include "smtp_parsers.h" | |||
#include "mime_parser.h" | |||
#include "mime_encoding.h" | |||
#include "lang_detection.h" | |||
#include "libutil/multipattern.h" | |||
#include "libserver/mempool_vars_internal.h" | |||
@@ -204,10 +205,10 @@ rspamd_extract_words (struct rspamd_task *task, | |||
#ifdef WITH_SNOWBALL | |||
struct sb_stemmer *stem = NULL; | |||
#endif | |||
rspamd_stat_token_t *w; | |||
rspamd_stat_token_t *w, ucs_w; | |||
gchar *temp_word; | |||
const guchar *r; | |||
guint i, nlen, total_len = 0, short_len = 0; | |||
guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0; | |||
gdouble avg_len = 0; | |||
#ifdef WITH_SNOWBALL | |||
@@ -257,10 +258,23 @@ rspamd_extract_words (struct rspamd_task *task, | |||
part->normalized_hashes = g_array_sized_new (FALSE, FALSE, | |||
sizeof (guint64), part->normalized_words->len); | |||
if (IS_PART_UTF (part) && task->lang_det) { | |||
part->ucs32_words = g_array_sized_new (FALSE, FALSE, | |||
sizeof (rspamd_stat_token_t), part->normalized_words->len); | |||
} | |||
for (i = 0; i < part->normalized_words->len; i ++) { | |||
guint64 h; | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
if (part->ucs32_words) { | |||
rspamd_language_detector_to_ucs (task->lang_det, task->task_pool, | |||
w, &ucs_w); | |||
g_array_append_val (part->ucs32_words, ucs_w); | |||
ucs_len += ucs_w.len; | |||
} | |||
r = NULL; | |||
#ifdef WITH_SNOWBALL | |||
if (stem) { |
@@ -93,6 +93,7 @@ struct rspamd_mime_text_part { | |||
GList *exceptions; /**< list of offsets of urls */ | |||
struct rspamd_mime_part *mime_part; | |||
GArray *normalized_words; | |||
GArray *ucs32_words; | |||
GArray *normalized_hashes; | |||
guint nlines; | |||
guint spaces; |
@@ -239,6 +239,9 @@ rspamd_task_free (struct rspamd_task *task) | |||
if (tp->normalized_hashes) { | |||
g_array_free (tp->normalized_hashes, TRUE); | |||
} | |||
if (tp->ucs32_words) { | |||
g_array_free (tp->ucs32_words, TRUE); | |||
} | |||
} | |||
if (task->rcpt_envelope) { |
@@ -660,7 +660,7 @@ rspamd_worker_init_scanner (struct rspamd_worker *worker, | |||
rspamd_worker_monitored_handler, | |||
worker->srv->cfg); | |||
*plang_det = worker->srv->cfg; | |||
*plang_det = worker->srv->cfg->lang_det; | |||
} | |||
/* |