6 years ago · f581bcea91
--- a/src/libmime/lang_detection.c
+++ b/src/libmime/lang_detection.c
@@ -303,13 +303,207 @@ rspamd_language_detector_random_select (GPtrArray *ucs_tokens, guint nwords,
 	}
 }

 enum rspamd_language_gramm_type {
 	rs_unigramm = 0,
 	rs_bigramm,
 	rs_trigramm
 };

 static goffset
 rspamd_language_detector_next_ngramm (rspamd_stat_token_t *tok, UChar *window,
 		guint wlen, goffset cur_off)
 {
 	guint i;

 	if (wlen > 1) {
 		/* Deal with spaces at the beginning and ending */

 		if (cur_off == 0) {
 			window[0] = (UChar)' ';

 			for (i = 0; i < wlen - 1; i ++) {
 				window[i + 1] = *(((UChar *)tok->begin) + i);
 			}
 		}
 		else if (cur_off + wlen == tok->len + 1) {
 			/* Add trailing space */
 			for (i = 0; i < wlen - 1; i ++) {
 				window[i] = *(((UChar *)tok->begin) + cur_off + i);
 			}
 			window[wlen - 1] = (UChar)' ';
 		}
 		else if (cur_off + wlen > tok->len + 1) {
 			/* No more fun */
 			return -1;
 		}

 		/* Normal case */
 		for (i = 0; i < wlen; i ++) {
 			window[i] = *(((UChar *)tok->begin) + cur_off + i);
 		}
 	}
 	else {
 		if (tok->len >= cur_off) {
 			return -1;
 		}

 		window[0] = *(((UChar *)tok->begin) + cur_off);
 	}

 	return cur_off + 1;
 }

 /*
 * Do full guess for a specific ngramm, checking all languages defined
 */
 static void
 rspamd_language_detector_process_ngramm_full (struct rspamd_lang_detector *d,
 		UChar *window, enum rspamd_language_gramm_type type,
 		GHashTable *candidates)
 {
 	guint i, freq;
 	struct rspamd_language_elt *elt;
 	struct rspamd_lang_detector_res *cand;
 	GHashTable *ngramms;

 	for (i = 0; i < d->languages->len; i ++) {
 		elt = g_ptr_array_index (d->languages, i);

 		switch (type) {
 		case rs_unigramm:
 			ngramms = elt->unigramms;
 			break;
 		case rs_bigramm:
 			ngramms = elt->bigramms;
 			break;
 		case rs_trigramm:
 			ngramms = elt->trigramms;
 			break;
 		}

 		freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));
 		cand = g_hash_table_lookup (candidates, elt->name);

 		if (cand == NULL) {
 			cand = g_malloc (sizeof (*cand));
 			cand->elt = elt;
 			cand->lang = elt->name;
 			cand->prob = freq;
 		}
 		else {
 			/* Update guess */
 			cand->prob += freq;
 		}
 	}
 }

 /*
 * Check only candidates, if none found, switch to full version
 */
 static void
 rspamd_language_detector_process_ngramm_update (struct rspamd_lang_detector *d,
 		UChar *window, enum rspamd_language_gramm_type type,
 		GHashTable *candidates)
 {
 	guint freq, total_freq = 0;
 	struct rspamd_language_elt *elt;
 	struct rspamd_lang_detector_res *cand;
 	GHashTableIter it;
 	gpointer k, v;
 	GHashTable *ngramms;

 	g_hash_table_iter_init (&it, candidates);

 	while (g_hash_table_iter_next (&it, &k, &v)) {
 		cand = (struct rspamd_lang_detector_res *)v;
 		elt = cand->elt;

 		switch (type) {
 		case rs_unigramm:
 			ngramms = elt->unigramms;
 			break;
 		case rs_bigramm:
 			ngramms = elt->bigramms;
 			break;
 		case rs_trigramm:
 			ngramms = elt->trigramms;
 			break;
 		}

 		freq = GPOINTER_TO_UINT (g_hash_table_lookup (ngramms, window));

 		cand->prob += freq;
 		total_freq += freq;
 	}

 	if (total_freq == 0) {
 		/* Nothing found , do full scan which will also update candidates */
 		rspamd_language_detector_process_ngramm_full (d, window, type, candidates);
 	}
 }

 static gboolean
 rspamd_language_detector_update_guess (struct rspamd_lang_detector *d,
 		rspamd_stat_token_t *tok, GHashTable *candidates,
 		enum rspamd_language_gramm_type type)
 {
 	guint wlen;
 	UChar window[3];
 	goffset cur = 0;

 	switch (type) {
 	case rs_unigramm:
 		wlen = 1;
 		break;
 	case rs_bigramm:
 		wlen = 2;
 		break;
 	case rs_trigramm:
 		wlen = 3;
 		break;
 	}

 	/* Split words */
 	while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
 			!= -1) {

 	}
 }

 static void
 rspamd_language_detector_detect_word (struct rspamd_lang_detector *d,
 		rspamd_stat_token_t *tok, GHashTable *candidates,
 		enum rspamd_language_gramm_type type)
 {
 	guint wlen;
 	UChar window[3];
 	goffset cur = 0;

 	switch (type) {
 	case rs_unigramm:
 		wlen = 1;
 		break;
 	case rs_bigramm:
 		wlen = 2;
 		break;
 	case rs_trigramm:
 		wlen = 3;
 		break;
 	}

 	/* Split words */
 	while ((cur = rspamd_language_detector_next_ngramm (tok, window, wlen, cur))
 			!= -1) {

 	}
 }

 const gchar *
 rspamd_language_detector_detect (struct rspamd_lang_detector *d,
 		GPtrArray *ucs_tokens, gsize words_len)
 {
 	if (words_len < d->short_text_limit) {
 		/* For short text, start directly from trigramms */
 		return rspamd_language_detector_detect_trigramm ();
 	}

 	/* Start with unigramms */
--- a/src/libmime/lang_detection.h
+++ b/src/libmime/lang_detection.h
@@ -22,10 +22,12 @@
 #include "libstat/stat_api.h"

 struct rspamd_lang_detector;
 struct rspamd_language_elt;

 struct rspamd_lang_detector_res {
 	gdouble prob;
 	const gchar *lang;
 	struct rspamd_language_elt *elt;
 };

 /**