6 years ago · 88950e4e4f
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -380,7 +380,8 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
 		UChar *window, enum rspamd_language_gramm_type type,
 		GHashTable *candidates)
 {
 	guint i, freq;
 	guint i;
 	gdouble freq, class_freq;
 	struct rspamd_language_elt *elt;
 	struct rspamd_lang_detector_res *cand;
 	GHashTable *ngramms;
@@ -391,16 +392,20 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
 		switch (type) {
 		case rs_unigramm:
 			ngramms = elt->unigramms;
 			class_freq = elt->unigramms_total;
 			break;
 		case rs_bigramm:
 			ngramms = elt->bigramms;
 			class_freq = elt->bigramms_total;
 			break;
 		case rs_trigramm:
 			ngramms = elt->trigramms;
 			class_freq = elt->trigramms_total;
 			break;
 		}

 		freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
 		freq = ((gdouble)GPOINTER_TO_UINT (
 				g_hash_table_lookup (ngramms, window))) / class_freq;
 		cand = g_hash_table_lookup (candidates, elt->name);

 		if (cand == NULL) {
@@ -419,12 +424,12 @@ rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
 /*
 * Check only candidates, if none found, switch to full version
 */
 static void
 static gboolean
 rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
 		UChar *window, enum rspamd_language_gramm_type type,
 		GHashTable *candidates)
 {
 	guint freq, total_freq = 0;
 	gdouble freq, total_freq = 0.0, class_freq;
 	struct rspamd_language_elt *elt;
 	struct rspamd_lang_detector_res *cand;
 	GHashTableIter it;
@@ -440,16 +445,20 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
 		switch (type) {
 		case rs_unigramm:
 			ngramms = elt->unigramms;
 			class_freq = elt->unigramms_total;
 			break;
 		case rs_bigramm:
 			ngramms = elt->bigramms;
 			class_freq = elt->bigramms_total;
 			break;
 		case rs_trigramm:
 			ngramms = elt->trigramms;
 			class_freq = elt->trigramms_total;
 			break;
 		}

 		freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
 		freq = ((gdouble)GPOINTER_TO_UINT (
 				g_hash_table_lookup (ngramms, window))) / class_freq;

 		cand->prob += freq;
 		total_freq += freq;
@@ -458,7 +467,11 @@ rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
 	if (total_freq == 0) {
 		/* Nothing found , do full scan which will also update candidates */
 		rspamd_language_detector_process_ngramm_full (d, window, type, candidates);

 		return FALSE;
 	}

 	return TRUE;
 }

 static gboolean
@@ -469,6 +482,7 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
 	guint wlen;
 	UChar window[3];
 	goffset cur = 0;
 	gboolean ret = TRUE;

 	switch (type) {
 	case rs_unigramm:
@@ -485,8 +499,13 @@ rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
 	/* Split words */
 	while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
 			!= -1) {
 		rspamd_language_detector_process_ngramm_update (d, window, type, candidates);
 		if (!rspamd_language_detector_process_ngramm_update (d, window,
 				type, candidates)) {
 			ret = FALSE;
 		}
 	}

 	return ret;
 }

 static void
@@ -539,7 +558,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
 			g_hash_table_iter_remove (&it);
 		}
 		else {
 			cand->prob = log2 (cand->prob / cand->total_words);
 			cand->prob = log2 (cand->prob);

 			if (cand->prob > max_prob) {
 				max_prob = cand->prob;
@@ -547,6 +566,7 @@ rspamd_language_detector_filter_negligible (GHashTable *candidates)
 		}
 	}

 	g_hash_table_iter_init (&it, candidates);
 	/* Filter step */
 	while (g_hash_table_iter_next (&it, &k, &v)) {
 		cand = (struct rspamd_lang_detector_res *) v;
@@ -565,7 +585,8 @@ static void
 rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
 		GPtrArray *ucs_tokens,
 		GHashTable *candidates,
 		enum rspamd_language_gramm_type type)
 		enum rspamd_language_gramm_type type,
 		gboolean start_over)
 {
 	guint nparts = MIN (ucs_tokens->len, default_words);
 	goffset *selected_words;
@@ -577,7 +598,13 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,

 	/* Deal with the first word in a special case */
 	tok = g_ptr_array_index (ucs_tokens, selected_words[0]);
 	rspamd_language_detector_detect_word (d, tok, candidates, type);

 	if (start_over) {
 		rspamd_language_detector_detect_word (d, tok, candidates, type);
 	}
 	else {
 		rspamd_language_detector_update_guess (d, tok, candidates, type);
 	}

 	for (i = 1; i < nparts; i ++) {
 		tok = g_ptr_array_index (ucs_tokens, selected_words[i]);
@@ -588,20 +615,87 @@ rspamd_language_detector_detect_type (struct rspamd_lang_detector *d,
 	rspamd_language_detector_filter_negligible (candidates);
 }

 const gchar *
 /*
  * Comparator for g_ptr_array_sort (): each argument is the address of an
  * array slot that holds a struct rspamd_lang_detector_res pointer.
  * Candidates are ordered by prob, highest first, which is the order
  * documented for the result of rspamd_language_detector_detect ().
  */
 static gint
 rspamd_language_detector_cmp (gconstpointer a, gconstpointer b)
 {
 	const struct rspamd_lang_detector_res
 			*canda = *(const struct rspamd_lang_detector_res * const *)a,
 			*candb = *(const struct rspamd_lang_detector_res * const *)b;

 	/* Larger prob sorts earlier (descending order) */
 	if (canda->prob > candb->prob) {
 		return -1;
 	}
 	else if (candb->prob > canda->prob) {
 		return 1;
 	}

 	return 0;
 }

 /*
  * Builds the list of language candidates for a tokenised text.
  *
  * Texts with words_len below d->short_text_limit are scored with trigramms
  * only. Longer texts are narrowed step by step: unigramms first, then
  * bigramms over the surviving candidates, and finally a fresh trigramm pass
  * whose result is adopted only if it leaves fewer candidates than before.
  *
  * Returns a new GPtrArray of struct rspamd_lang_detector_res pointers sorted
  * with rspamd_language_detector_cmp (). The array is created with g_free as
  * its element destructor, so the results are released together with it.
  */
 GPtrArray *
 rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 		GPtrArray *ucs_tokens, gsize words_len)
 {
 	GHashTable *candidates;
 	GPtrArray *result;
 	GHashTableIter it;
 	gpointer k, v;
 	struct rspamd_lang_detector_res *cand;
 	guint cand_len, prev_len;

 	candidates = g_hash_table_new_full (rspamd_str_hash, rspamd_str_equal,
 			NULL, g_free);

 	if (words_len < d->short_text_limit) {
 		/* For short text, start directly from trigramms */
 		rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
 				rs_trigramm, TRUE);
 	}
 	else {
 		/* Start with unigramms */
 		rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
 				rs_unigramm, TRUE);
 		cand_len = g_hash_table_size (candidates);

 		if (cand_len > 1) {
 			/* Try bigramms, refining the existing candidates */
 			rspamd_language_detector_detect_type (d, ucs_tokens, candidates,
 					rs_bigramm, FALSE);

 			cand_len = g_hash_table_size (candidates);
 			if (cand_len > 1) {
 				GHashTable *ncandidates;

 				prev_len = cand_len;
 				/* Try trigramms from scratch in a separate table */
 				ncandidates = g_hash_table_new_full (rspamd_str_hash,
 						rspamd_str_equal, NULL, g_free);
 				rspamd_language_detector_detect_type (d, ucs_tokens, ncandidates,
 						rs_trigramm, TRUE);
 				cand_len = g_hash_table_size (ncandidates);

 				if (cand_len < prev_len) {
 					/* Trigramms narrowed the guess: keep the new table */
 					g_hash_table_unref (candidates);
 					candidates = ncandidates;
 				}
 				else {
 					/* Not a better guess */
 					g_hash_table_unref (ncandidates);
 				}
 			}
 		}
 	}

 	/* Now, convert hash to array and sort it */
 	result = g_ptr_array_new_full (g_hash_table_size (candidates), g_free);
 	g_hash_table_iter_init (&it, candidates);

 	while (g_hash_table_iter_next (&it, &k, &v)) {
 		cand = (struct rspamd_lang_detector_res *) v;
 		g_ptr_array_add (result, cand);
 		/* Steal so the table's g_free does not release what the array now owns */
 		g_hash_table_iter_steal (&it);
 	}

 	g_ptr_array_sort (result, rspamd_language_detector_cmp);
 	g_hash_table_unref (candidates);

 	return result;
 }
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -26,7 +26,6 @@ struct rspamd_language_elt;

 /* One language candidate as stored in the detector's candidates table */
 struct rspamd_lang_detector_res {
 	/*
 	 * Candidate score: n-gramm frequencies are accumulated here and the value
 	 * is later replaced by a log2 in rspamd_language_detector_filter_negligible
 	 */
 	gdouble prob;
 	/*
 	 * Appears as a divisor of prob in a log2 normalisation expression
 	 * NOTE(review): confirm this field is still in use
 	 */
 	gdouble total_words;
 	/* NOTE(review): presumably the language name/code; confirm where it is set */
 	const gchar *lang;
 	/* NOTE(review): presumably the language element this result belongs to */
 	struct rspamd_language_elt *elt;
 };
@@ -53,9 +52,9 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d,
 * @param d
 * @param ucs_tokens
 * @param words_len
 * @return language code or NULL if language has not been detected
 * @return array of struct rspamd_lang_detector_res sorted by freq descending
 */
 const gchar * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 GPtrArray * rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 		GPtrArray *ucs_tokens, gsize words_len);

 #endif