Rework language detection.

author: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-23 15:28:27 +0000
committer: Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-23 15:28:27 +0000
commit: 3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (patch)
tree: 5868e328c9ed4226a52fd0bd12dd8dfc0e4ec5f0 /src
parent: 39b8dcb94620669ae369ab559175dde1a5c103b7 (diff)
download: rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.tar.gz
rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.zip
3 files changed, 152 insertions, 155 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 6140f3c24..3c5f83047 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1025,6 +1025,148 @@ convert_text_to_utf (struct rspamd_task *task,
 	return result_array;
 }
 
+struct language_match {	/* one row of the script -> language lookup table */
+	const char *code;	/* language code copied to part->lang_code, "" if none */
+	const char *name;	/* language name copied to part->language, "" if none */
+	GUnicodeScript script;	/* Unicode script this row describes; bsearch() key */
+};
+
+static int
+language_elts_cmp (const void *a, const void *b)
+{
+	/* bsearch() comparator: a is the GUnicodeScript key, b is a table row */
+	const struct language_match *elt = b;
+
+	return (*(const GUnicodeScript *)a - elt->script);
+}
+
+/*
+ * Guess the dominant script of a UTF-8 part from its first max_chars letters
+ * and store script, language code and name in it; other parts are left as is.
+ */
+static void
+detect_text_language (struct mime_text_part *part)
+{
+	/* Keep sorted by GUnicodeScript value: bsearch() below relies on it */
+	static const struct language_match language_codes[] = {
+			{ "", "english", G_UNICODE_SCRIPT_COMMON },
+			{ "", "", G_UNICODE_SCRIPT_INHERITED },
+			{ "ar", "arabic", G_UNICODE_SCRIPT_ARABIC },
+			{ "hy", "armenian", G_UNICODE_SCRIPT_ARMENIAN },
+			{ "bn", "chineese", G_UNICODE_SCRIPT_BENGALI }, /* NOTE(review): name looks copied from HAN - confirm */
+			{ "", "", G_UNICODE_SCRIPT_BOPOMOFO },
+			{ "chr", "", G_UNICODE_SCRIPT_CHEROKEE },
+			{ "cop", "",  G_UNICODE_SCRIPT_COPTIC  },
+			{ "ru", "russian",  G_UNICODE_SCRIPT_CYRILLIC },
+			/* Deseret was used to write English */
+			{ "", "",  G_UNICODE_SCRIPT_DESERET },
+			{ "hi", "",  G_UNICODE_SCRIPT_DEVANAGARI },
+			{ "am", "",  G_UNICODE_SCRIPT_ETHIOPIC },
+			{ "ka", "",  G_UNICODE_SCRIPT_GEORGIAN },
+			{ "", "",  G_UNICODE_SCRIPT_GOTHIC },
+			{ "el", "greek",  G_UNICODE_SCRIPT_GREEK },
+			{ "gu", "",  G_UNICODE_SCRIPT_GUJARATI },
+			{ "pa", "",  G_UNICODE_SCRIPT_GURMUKHI },
+			{ "han", "chineese",  G_UNICODE_SCRIPT_HAN },
+			{ "ko", "",  G_UNICODE_SCRIPT_HANGUL },
+			{ "he", "hebrew",  G_UNICODE_SCRIPT_HEBREW },
+			{ "ja", "",  G_UNICODE_SCRIPT_HIRAGANA },
+			{ "kn", "",  G_UNICODE_SCRIPT_KANNADA },
+			{ "ja", "",  G_UNICODE_SCRIPT_KATAKANA },
+			{ "km", "",  G_UNICODE_SCRIPT_KHMER },
+			{ "lo", "",  G_UNICODE_SCRIPT_LAO },
+			{ "en", "english",  G_UNICODE_SCRIPT_LATIN },
+			{ "ml", "",  G_UNICODE_SCRIPT_MALAYALAM },
+			{ "mn", "",  G_UNICODE_SCRIPT_MONGOLIAN },
+			{ "my", "",  G_UNICODE_SCRIPT_MYANMAR },
+			/* Ogham was used to write old Irish */
+			{ "", "",  G_UNICODE_SCRIPT_OGHAM },
+			{ "", "",  G_UNICODE_SCRIPT_OLD_ITALIC },
+			{ "or", "",  G_UNICODE_SCRIPT_ORIYA },
+			{ "", "",  G_UNICODE_SCRIPT_RUNIC },
+			{ "si", "",  G_UNICODE_SCRIPT_SINHALA },
+			{ "syr", "",  G_UNICODE_SCRIPT_SYRIAC },
+			{ "ta", "",  G_UNICODE_SCRIPT_TAMIL },
+			{ "te", "",  G_UNICODE_SCRIPT_TELUGU },
+			{ "dv", "",  G_UNICODE_SCRIPT_THAANA },
+			{ "th", "",  G_UNICODE_SCRIPT_THAI },
+			{ "bo", "",  G_UNICODE_SCRIPT_TIBETAN },
+			{ "iu", "",  G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL },
+			{ "", "",  G_UNICODE_SCRIPT_YI },
+			{ "tl", "",  G_UNICODE_SCRIPT_TAGALOG },
+			/* Philippine languages/scripts */
+			{ "hnn", "",  G_UNICODE_SCRIPT_HANUNOO },
+			{ "bku", "",  G_UNICODE_SCRIPT_BUHID },
+			{ "tbw", "",  G_UNICODE_SCRIPT_TAGBANWA },
+			{ "", "",  G_UNICODE_SCRIPT_BRAILLE },
+			{ "", "",  G_UNICODE_SCRIPT_CYPRIOT },
+			{ "", "",  G_UNICODE_SCRIPT_LIMBU },
+			/* Used for Somali (so) in the past */
+			{ "", "",  G_UNICODE_SCRIPT_OSMANYA },
+			/* The Shavian alphabet was designed for English */
+			{ "", "",  G_UNICODE_SCRIPT_SHAVIAN },
+			{ "", "",  G_UNICODE_SCRIPT_LINEAR_B },
+			{ "", "",  G_UNICODE_SCRIPT_TAI_LE },
+			{ "uga", "",  G_UNICODE_SCRIPT_UGARITIC },
+			{ "", "",  G_UNICODE_SCRIPT_NEW_TAI_LUE },
+			{ "bug", "",  G_UNICODE_SCRIPT_BUGINESE },
+			{ "", "",  G_UNICODE_SCRIPT_GLAGOLITIC },
+			/* Used for Berber (ber), but Arabic script is more common */
+			{ "", "",  G_UNICODE_SCRIPT_TIFINAGH },
+			{ "syl", "",  G_UNICODE_SCRIPT_SYLOTI_NAGRI },
+			{ "peo", "",  G_UNICODE_SCRIPT_OLD_PERSIAN },
+			{ "", "",  G_UNICODE_SCRIPT_KHAROSHTHI },
+			{ "", "",  G_UNICODE_SCRIPT_UNKNOWN },
+			{ "", "",  G_UNICODE_SCRIPT_BALINESE },
+			{ "", "",  G_UNICODE_SCRIPT_CUNEIFORM },
+			{ "", "",  G_UNICODE_SCRIPT_PHOENICIAN },
+			{ "", "",  G_UNICODE_SCRIPT_PHAGS_PA },
+			{ "nqo", "", G_UNICODE_SCRIPT_NKO }
+	};
+	const struct language_match *lm;
+	const int max_chars = 32;
+
+	if (part != NULL && part->is_utf) {
+		/* Count the script of each of the first max_chars alphabetic characters */
+		const gchar *p, *pp;
+		gunichar c;
+		gint32 remain = part->content->len, max = 0, processed = 0, i;
+		gint32 scripts[G_N_ELEMENTS (language_codes)];
+		GUnicodeScript scc, sel = G_UNICODE_SCRIPT_COMMON; /* no letters -> COMMON */
+
+		p = part->content->data;
+		memset (scripts, 0, sizeof (scripts));
+		while (remain > 0 && processed < max_chars) {
+			c = g_utf8_get_char_validated (p, remain);
+			if (c == (gunichar) -2 || c == (gunichar) -1) {
+				break;
+			}
+			if (g_unichar_isalpha (c)) {
+				scc = g_unichar_get_script (c);
+				if ((gint)scc >= 0 && (gint)scc < (gint)G_N_ELEMENTS (scripts)) {
+					scripts[scc]++;
+				}
+				processed ++;
+			}
+			pp = g_utf8_next_char (p);
+			remain -= pp - p;
+			p = pp;
+		}
+		for (i = 0; i < (gint)G_N_ELEMENTS (scripts); i++) {
+			if (scripts[i] > max) {
+				max = scripts[i];
+				sel = i;
+			}
+		}
+		part->script = sel;
+		lm = bsearch (&sel, language_codes, G_N_ELEMENTS (language_codes),
+				sizeof (language_codes[0]), &language_elts_cmp);
+		/* bsearch() yields NULL if the script has no row: fall back to "" */
+		part->lang_code = lm != NULL ? lm->code : "";
+		part->language = lm != NULL ? lm->name : "";
+	}
+}
+
 static void
 process_text_part (struct rspamd_task *task,
 	GByteArray *part_content,
@@ -1035,9 +1177,6 @@ process_text_part (struct rspamd_task *task,
 {
 	struct mime_text_part *text_part;
 	const gchar *cd;
-	gchar *pos;
-	gsize l;
-	rspamd_fstring_t token, buf;
 
 	/* Skip attachements */
 #ifndef GMIME24
@@ -1128,32 +1267,10 @@ process_text_part (struct rspamd_task *task,
 	}
 
 	/* Post process part */
-	buf.begin = text_part->content->data;
-	buf.len = text_part->content->len;
-	buf.size = buf.len;
-	token.begin = NULL;
-	token.len = 0;
-
-	text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
-	while ((pos = rspamd_tokenizer_get_word (&buf,
-			&token, &text_part->urls_offset)) != NULL) {
-		if (text_part->is_utf) {
-			l = g_utf8_strlen (token.begin, token.len);
-		}
-		else {
-			l = token.len;
-		}
-		/*
-		 * XXX: make this configurable
-		 */
-		if (l < 4) {
-			token.begin = pos;
-			continue;
-		}
-		g_array_append_val (text_part->words, token);
-
-		token.begin = pos;
-	}
+	detect_text_language (text_part);
+	text_part->words = rspamd_tokenize_text (text_part->content->data,
+			text_part->content->len, text_part->is_utf, 4,
+			&text_part->urls_offset);
 }
 
 #ifdef GMIME24
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 8287db9b0..7b3e03883 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -26,6 +26,9 @@ struct mime_text_part {
 	gboolean is_balanced;
 	gboolean is_empty;
 	gboolean is_utf;
+	GUnicodeScript script;
+	const gchar *lang_code;
+	const gchar *language;
 	const gchar *real_charset;
 	GByteArray *orig;
 	GByteArray *content;
@@ -34,7 +37,6 @@ struct mime_text_part {
 	rspamd_fuzzy_t *fuzzy;
 	rspamd_fuzzy_t *double_fuzzy;
 	GMimeObject *parent;
-	GUnicodeScript script;
 	rspamd_fstring_t *diff_str;
 	GArray *words;
 };
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 9a61dd5c5..df7640e46 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -1989,133 +1989,11 @@ static gint
 lua_textpart_get_language (lua_State * L)
 {
 	struct mime_text_part *part = lua_check_textpart (L);
-	static const gchar languages[][4] = {
-		"",        /* G_UNICODE_SCRIPT_COMMON */
-		"",        /* G_UNICODE_SCRIPT_INHERITED */
-		"ar",      /* G_UNICODE_SCRIPT_ARABIC */
-		"hy",      /* G_UNICODE_SCRIPT_ARMENIAN */
-		"bn",      /* G_UNICODE_SCRIPT_BENGALI */
-		/* Used primarily in Taiwan, but not part of the standard
-		 * zh-tw orthography  */
-		"",        /* G_UNICODE_SCRIPT_BOPOMOFO */
-		"chr",     /* G_UNICODE_SCRIPT_CHEROKEE */
-		"cop",     /* G_UNICODE_SCRIPT_COPTIC */
-		"ru",      /* G_UNICODE_SCRIPT_CYRILLIC */
-		/* Deseret was used to write English */
-		"",        /* G_UNICODE_SCRIPT_DESERET */
-		"hi",      /* G_UNICODE_SCRIPT_DEVANAGARI */
-		"am",      /* G_UNICODE_SCRIPT_ETHIOPIC */
-		"ka",      /* G_UNICODE_SCRIPT_GEORGIAN */
-		"",        /* G_UNICODE_SCRIPT_GOTHIC */
-		"el",      /* G_UNICODE_SCRIPT_GREEK */
-		"gu",      /* G_UNICODE_SCRIPT_GUJARATI */
-		"pa",      /* G_UNICODE_SCRIPT_GURMUKHI */
-		"han",     /* G_UNICODE_SCRIPT_HAN */
-		"ko",      /* G_UNICODE_SCRIPT_HANGUL */
-		"he",      /* G_UNICODE_SCRIPT_HEBREW */
-		"ja",      /* G_UNICODE_SCRIPT_HIRAGANA */
-		"kn",      /* G_UNICODE_SCRIPT_KANNADA */
-		"ja",      /* G_UNICODE_SCRIPT_KATAKANA */
-		"km",      /* G_UNICODE_SCRIPT_KHMER */
-		"lo",      /* G_UNICODE_SCRIPT_LAO */
-		"en",      /* G_UNICODE_SCRIPT_LATIN */
-		"ml",      /* G_UNICODE_SCRIPT_MALAYALAM */
-		"mn",      /* G_UNICODE_SCRIPT_MONGOLIAN */
-		"my",      /* G_UNICODE_SCRIPT_MYANMAR */
-		/* Ogham was used to write old Irish */
-		"",        /* G_UNICODE_SCRIPT_OGHAM */
-		"",        /* G_UNICODE_SCRIPT_OLD_ITALIC */
-		"or",      /* G_UNICODE_SCRIPT_ORIYA */
-		"",        /* G_UNICODE_SCRIPT_RUNIC */
-		"si",      /* G_UNICODE_SCRIPT_SINHALA */
-		"syr",     /* G_UNICODE_SCRIPT_SYRIAC */
-		"ta",      /* G_UNICODE_SCRIPT_TAMIL */
-		"te",      /* G_UNICODE_SCRIPT_TELUGU */
-		"dv",      /* G_UNICODE_SCRIPT_THAANA */
-		"th",      /* G_UNICODE_SCRIPT_THAI */
-		"bo",      /* G_UNICODE_SCRIPT_TIBETAN */
-		"iu",      /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
-		"",        /* G_UNICODE_SCRIPT_YI */
-		"tl",      /* G_UNICODE_SCRIPT_TAGALOG */
-		/* Phillipino languages/scripts */
-		"hnn",     /* G_UNICODE_SCRIPT_HANUNOO */
-		"bku",     /* G_UNICODE_SCRIPT_BUHID */
-		"tbw",     /* G_UNICODE_SCRIPT_TAGBANWA */
-
-		"",        /* G_UNICODE_SCRIPT_BRAILLE */
-		"",        /* G_UNICODE_SCRIPT_CYPRIOT */
-		"",        /* G_UNICODE_SCRIPT_LIMBU */
-		/* Used for Somali (so) in the past */
-		"",        /* G_UNICODE_SCRIPT_OSMANYA */
-		/* The Shavian alphabet was designed for English */
-		"",        /* G_UNICODE_SCRIPT_SHAVIAN */
-		"",        /* G_UNICODE_SCRIPT_LINEAR_B */
-		"",        /* G_UNICODE_SCRIPT_TAI_LE */
-		"uga",     /* G_UNICODE_SCRIPT_UGARITIC */
-
-		"",        /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
-		"bug",     /* G_UNICODE_SCRIPT_BUGINESE */
-		/* The original script for Old Church Slavonic (chu), later
-		 * written with Cyrillic */
-		"",        /* G_UNICODE_SCRIPT_GLAGOLITIC */
-		/* Used for for Berber (ber), but Arabic script is more common */
-		"",        /* G_UNICODE_SCRIPT_TIFINAGH */
-		"syl",     /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
-		"peo",     /* G_UNICODE_SCRIPT_OLD_PERSIAN */
-		"",        /* G_UNICODE_SCRIPT_KHAROSHTHI */
-
-		"",        /* G_UNICODE_SCRIPT_UNKNOWN */
-		"",        /* G_UNICODE_SCRIPT_BALINESE */
-		"",        /* G_UNICODE_SCRIPT_CUNEIFORM */
-		"",        /* G_UNICODE_SCRIPT_PHOENICIAN */
-		"",        /* G_UNICODE_SCRIPT_PHAGS_PA */
-		"nqo"      /* G_UNICODE_SCRIPT_NKO */
-	};
-	const gchar *sel;
 
 	if (part != NULL) {
-		if (part->is_utf && (part->script == G_UNICODE_SCRIPT_UNKNOWN ||
-				part->script == G_UNICODE_SCRIPT_COMMON)) {
-			/* Try to detect encoding by several symbols */
-			const gchar *p, *pp;
-			gunichar c;
-			gint32 remain = part->content->len, max = 0, processed = 0;
-			gint32 scripts[G_UNICODE_SCRIPT_NKO];
-			GUnicodeScript scc, sel;
-
-			p = part->content->data;
-			memset (scripts, 0, sizeof (scripts));
-
-			while (remain > 0 && processed < 10) {
-				c = g_utf8_get_char_validated (p, remain);
-				if (c == (gunichar) -2 || c == (gunichar) -1) {
-					break;
-				}
-				scc = g_unichar_get_script (c);
-				if (scc < (gint)G_N_ELEMENTS (scripts)) {
-					scripts[scc]++;
-				}
-				pp = g_utf8_next_char (p);
-				remain -= pp - p;
-				p = pp;
-				processed ++;
-			}
-			for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
-				if (scripts[remain] > max) {
-					max = scripts[remain];
-					sel = remain;
-				}
-			}
-			part->script = sel;
-		}
-
-		if (part->script > 0 && part->script <
-				(gint)G_N_ELEMENTS (languages)) {
-			sel = languages[part->script];
-			if (*sel != '\0') {
-				lua_pushstring (L, sel);
-				return 1;
-			}
+		if (part->lang_code != NULL && part->lang_code[0] != '\0') {
+			lua_pushstring (L, part->lang_code);
+			return 1;
 		}
 	}
author	Vsevolod Stakhov <vsevolod@highsecure.ru>	2014-12-23 15:28:27 +0000
committer	Vsevolod Stakhov <vsevolod@highsecure.ru>	2014-12-23 15:28:27 +0000
commit	3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (patch)
tree	5868e328c9ed4226a52fd0bd12dd8dfc0e4ec5f0 /src
parent	39b8dcb94620669ae369ab559175dde1a5c103b7 (diff)
download	rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.tar.gz rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.zip