6 years ago · fc4c42b43c
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -24,8 +24,10 @@

 /*
  * N-gram statistics loaded for a single language.
  *
  * Each hash table maps an n-gram key to its frequency (stored in the value
  * pointer via GUINT_TO_POINTER); the matching *_total field accumulates the
  * sum of all frequencies inserted into that table.
  */
 struct rspamd_language_elt {
 	const gchar *name; /* e.g. "en" or "ru" */
 	guint unigramms_total; /* total frequencies for unigramms */
 	GHashTable *unigramms; /* unigramms frequencies */
 	guint bigramms_total; /* total frequencies for bigramms */
 	GHashTable *bigramms; /* bigramms frequencies */
 	guint trigramms_total; /* total frequencies for trigramms */
 	GHashTable *trigramms; /* trigramms frequencies */
 };
@@ -35,6 +37,18 @@ struct rspamd_lang_detector {
 	UConverter *uchar_converter;
 };

 /*
  * GHashTable hash callback for unigram keys: hashes exactly one UChar
  * worth of bytes at `key` with the process-wide hash seed.
  */
 static guint
 rspamd_unigram_hash (gconstpointer key)
 {
 	const gsize key_bytes = sizeof (UChar);

 	return rspamd_cryptobox_fast_hash (key, key_bytes, rspamd_hash_seed ());
 }

 /*
  * GHashTable equality callback for unigram keys: two keys are equal when
  * their first sizeof (UChar) bytes are identical.
  */
 static gboolean
 rspamd_unigram_equal (gconstpointer v, gconstpointer v2)
 {
 	const int diff = memcmp (v, v2, sizeof (UChar));

 	return !diff;
 }

 static guint
 rspamd_bigram_hash (gconstpointer key)
 {
@@ -101,6 +115,7 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 	pos = strchr (nelt->name, '.');
 	g_assert (pos != NULL);
 	*pos = '\0';
 	nelt->unigramms = g_hash_table_new (rspamd_unigram_hash, rspamd_unigram_equal);
 	nelt->bigramms = g_hash_table_new (rspamd_bigram_hash, rspamd_bigram_equal);
 	nelt->trigramms = g_hash_table_new (rspamd_trigram_hash, rspamd_trigram_equal);

@@ -138,14 +153,21 @@ rspamd_language_detector_read_file (struct rspamd_config *cfg,
 						GUINT_TO_POINTER (freq));
 				nelt->trigramms_total += freq;
 			}
 			else if (nsym == 1) {
 				g_hash_table_insert (nelt->unigramms, ucs_key,
 						GUINT_TO_POINTER (freq));
 				nelt->unigramms_total += freq;
 			}
 			else if (nsym > 3) {
 				msg_warn_config ("have more than 3 characters in key: %d", nsym);
 			}
 		}
 	}

 	msg_info_config ("loaded %s language, %d digramms, %d trigramms",
 			nelt->name, (gint)g_hash_table_size (nelt->bigramms),
 	msg_info_config ("loaded %s language, %d unigramms, %d digramms, %d trigramms",
 			nelt->name,
 			(gint)g_hash_table_size (nelt->unigramms),
 			(gint)g_hash_table_size (nelt->bigramms),
 			(gint)g_hash_table_size (nelt->trigramms));

 	g_ptr_array_add (d->languages, nelt);
@@ -202,3 +224,26 @@ end:

 	return ret;
 }


 /*
  * Convert `utf_token` into ICU UChar code units using the detector's
  * converter and store the result in `ucs_token`.
  *
  * The output buffer is allocated from `pool` with room for
  * utf_token->len + 1 UChar elements. `ucs_token->flags` is copied from the
  * input token. `ucs_token->begin` always points at the pool buffer, so the
  * output token is fully initialised even when conversion fails; in that
  * case `ucs_token->len` is 0. On success `len` is the number of UChar
  * units written (not a byte count).
  */
 void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
 		rspamd_mempool_t *pool,
 		rspamd_stat_token_t *utf_token, rspamd_stat_token_t *ucs_token)
 {
 	UChar *out;
 	int32_t nsym;
 	UErrorCode uc_err = U_ZERO_ERROR;

 	ucs_token->flags = utf_token->flags;
 	out = rspamd_mempool_alloc (pool, sizeof (*out) * (utf_token->len + 1));
 	nsym = ucnv_toUChars (d->uchar_converter, out, (utf_token->len + 1),
 			utf_token->begin, utf_token->len, &uc_err);

 	/* Callers store the token unconditionally, so never leave begin unset */
 	ucs_token->begin = (const gchar *) out;

 	/*
 	 * ucnv_toUChars reports failure through uc_err, not through its return
 	 * value; U_SUCCESS also accepts warnings such as a missing terminator.
 	 */
 	if (U_SUCCESS (uc_err) && nsym >= 0) {
 		ucs_token->len = nsym;
 	}
 	else {
 		ucs_token->len = 0;
 	}
 }
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -19,9 +19,25 @@

 #include "config.h"
 #include "libserver/cfg_file.h"
 #include "libstat/stat_api.h"

 struct rspamd_lang_detector;

 /**
 * Create new language detector object using configuration object
 * @param cfg configuration object the detector is created from
 * @return pointer to the detector object; behaviour on failure and ownership
 * of the result are not visible from this header -- TODO confirm
 */
 struct rspamd_lang_detector* rspamd_language_detector_init (struct rspamd_config *cfg);
 /**
 * Convert string from utf8 to ICU UChar code units
 * @param d language detector whose converter performs the conversion
 * @param pool memory pool the output buffer is allocated from
 * @param utf_token input token; its begin, len and flags are read
 * @param ucs_token output token; flags are copied from utf_token, begin and
 * len describe the converted buffer (len counts UChar units, not bytes)
 */
 void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
 		rspamd_mempool_t *pool,
 		rspamd_stat_token_t *utf_token,
 		rspamd_stat_token_t *ucs_token);

 #endif
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -24,6 +24,7 @@
 #include "smtp_parsers.h"
 #include "mime_parser.h"
 #include "mime_encoding.h"
 #include "lang_detection.h"
 #include "libutil/multipattern.h"
 #include "libserver/mempool_vars_internal.h"

@@ -204,10 +205,10 @@ rspamd_extract_words (struct rspamd_task *task,
 #ifdef WITH_SNOWBALL
 	struct sb_stemmer *stem = NULL;
 #endif
 	rspamd_stat_token_t *w;
 	rspamd_stat_token_t *w, ucs_w;
 	gchar *temp_word;
 	const guchar *r;
 	guint i, nlen, total_len = 0, short_len = 0;
 	guint i, nlen, total_len = 0, short_len = 0, ucs_len = 0;
 	gdouble avg_len = 0;

 #ifdef WITH_SNOWBALL
@@ -257,10 +258,23 @@ rspamd_extract_words (struct rspamd_task *task,
 		part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
 				sizeof (guint64), part->normalized_words->len);

 		if (IS_PART_UTF (part) && task->lang_det) {
 			part->ucs32_words =  g_array_sized_new (FALSE, FALSE,
 					sizeof (rspamd_stat_token_t), part->normalized_words->len);
 		}

 		for (i = 0; i < part->normalized_words->len; i ++) {
 			guint64 h;

 			w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i);

 			if (part->ucs32_words) {
 				rspamd_language_detector_to_ucs (task->lang_det, task->task_pool,
 						w, &ucs_w);
 				g_array_append_val (part->ucs32_words, ucs_w);
 				ucs_len += ucs_w.len;
 			}

 			r = NULL;
 #ifdef WITH_SNOWBALL
 			if (stem) {
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -93,6 +93,7 @@ struct rspamd_mime_text_part {
 	GList *exceptions;	/**< list of offsets of urls						*/
 	struct rspamd_mime_part *mime_part;
 	GArray *normalized_words;
 	GArray *ucs32_words;
 	GArray *normalized_hashes;
 	guint nlines;
 	guint spaces;
--- a/src/libserver/task.c
+++ b/src/libserver/task.c
@@ -239,6 +239,9 @@ rspamd_task_free (struct rspamd_task *task)
 			if (tp->normalized_hashes) {
 				g_array_free (tp->normalized_hashes, TRUE);
 			}
 			if (tp->ucs32_words) {
 				g_array_free (tp->ucs32_words, TRUE);
 			}
 		}

 		if (task->rcpt_envelope) {
--- a/src/worker.c
+++ b/src/worker.c
@@ -660,7 +660,7 @@ rspamd_worker_init_scanner (struct rspamd_worker *worker,
 			rspamd_worker_monitored_handler,
 			worker->srv->cfg);

 	*plang_det = worker->srv->cfg;
 	*plang_det = worker->srv->cfg->lang_det;
 }

 /*