From 6568e9dda41208cbacbe2ac23061fa158faea3f1 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Date: Mon, 11 Jul 2011 20:38:47 +0400
Subject: [PATCH] * Add simple language detection logic for text parts
 (Unicode script based)

---
 src/lua/lua_common.h    |   2 +-
 src/lua/lua_task.c      | 101 ++++++++++++++++++++++++++++++++++++++++
 src/message.h           |   1 +
 src/plugins/chartable.c |  16 ++++++-
 4 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 284b34e8e..4427dcae9 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -16,7 +16,7 @@
 
 extern const luaL_reg null_reg[];
 
-#define RSPAMD_LUA_API_VERSION 7
+#define RSPAMD_LUA_API_VERSION 8
 
 /* Common utility functions */
 void lua_newclass (lua_State *L, const gchar *classname, const struct luaL_reg *func);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 2bb13083b..8c90feb4b 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -109,12 +109,14 @@ LUA_FUNCTION_DEF (textpart, get_content);
 LUA_FUNCTION_DEF (textpart, is_empty);
 LUA_FUNCTION_DEF (textpart, is_html);
 LUA_FUNCTION_DEF (textpart, get_fuzzy);
+LUA_FUNCTION_DEF (textpart, get_language);
 
 static const struct luaL_reg    textpartlib_m[] = {
 	LUA_INTERFACE_DEF (textpart, get_content),
 	LUA_INTERFACE_DEF (textpart, is_empty),
 	LUA_INTERFACE_DEF (textpart, is_html),
 	LUA_INTERFACE_DEF (textpart, get_fuzzy),
+	LUA_INTERFACE_DEF (textpart, get_language),
 	{"__tostring", lua_class_tostring},
 	{NULL, NULL}
 };
@@ -1240,6 +1242,105 @@ lua_textpart_get_fuzzy (lua_State * L)
 	return 1;
 }
 
+/*
+ * Lua method textpart:get_language (): pushes the language code that the
+ * table below maps part->script to, or nil when there is no such code
+ */
+static gint
+lua_textpart_get_language (lua_State * L)
+{
+	struct mime_text_part          *part = lua_check_textpart (L);
+	static const gchar              languages[][4] = {
+			"",    /* G_UNICODE_SCRIPT_COMMON */
+			"",    /* G_UNICODE_SCRIPT_INHERITED */
+			"ar",  /* G_UNICODE_SCRIPT_ARABIC */
+			"hy",  /* G_UNICODE_SCRIPT_ARMENIAN */
+			"bn",  /* G_UNICODE_SCRIPT_BENGALI */
+			/* Used primarily in Taiwan, but not part of the standard zh-tw orthography */
+			"",    /* G_UNICODE_SCRIPT_BOPOMOFO */
+			"chr", /* G_UNICODE_SCRIPT_CHEROKEE */
+			"cop", /* G_UNICODE_SCRIPT_COPTIC */
+			"ru",  /* G_UNICODE_SCRIPT_CYRILLIC */
+			/* Deseret was used to write English */
+			"",    /* G_UNICODE_SCRIPT_DESERET */
+			"hi",  /* G_UNICODE_SCRIPT_DEVANAGARI */
+			"am",  /* G_UNICODE_SCRIPT_ETHIOPIC */
+			"ka",  /* G_UNICODE_SCRIPT_GEORGIAN */
+			"",    /* G_UNICODE_SCRIPT_GOTHIC */
+			"el",  /* G_UNICODE_SCRIPT_GREEK */
+			"gu",  /* G_UNICODE_SCRIPT_GUJARATI */
+			"pa",  /* G_UNICODE_SCRIPT_GURMUKHI */
+			"",    /* G_UNICODE_SCRIPT_HAN */
+			"ko",  /* G_UNICODE_SCRIPT_HANGUL */
+			"he",  /* G_UNICODE_SCRIPT_HEBREW */
+			"ja",  /* G_UNICODE_SCRIPT_HIRAGANA */
+			"kn",  /* G_UNICODE_SCRIPT_KANNADA */
+			"ja",  /* G_UNICODE_SCRIPT_KATAKANA */
+			"km",  /* G_UNICODE_SCRIPT_KHMER */
+			"lo",  /* G_UNICODE_SCRIPT_LAO */
+			"en",  /* G_UNICODE_SCRIPT_LATIN */
+			"ml",  /* G_UNICODE_SCRIPT_MALAYALAM */
+			"mn",  /* G_UNICODE_SCRIPT_MONGOLIAN */
+			"my",  /* G_UNICODE_SCRIPT_MYANMAR */
+			/* Ogham was used to write old Irish */
+			"",    /* G_UNICODE_SCRIPT_OGHAM */
+			"",    /* G_UNICODE_SCRIPT_OLD_ITALIC */
+			"or",  /* G_UNICODE_SCRIPT_ORIYA */
+			"",    /* G_UNICODE_SCRIPT_RUNIC */
+			"si",  /* G_UNICODE_SCRIPT_SINHALA */
+			"syr", /* G_UNICODE_SCRIPT_SYRIAC */
+			"ta",  /* G_UNICODE_SCRIPT_TAMIL */
+			"te",  /* G_UNICODE_SCRIPT_TELUGU */
+			"dv",  /* G_UNICODE_SCRIPT_THAANA */
+			"th",  /* G_UNICODE_SCRIPT_THAI */
+			"bo",  /* G_UNICODE_SCRIPT_TIBETAN */
+			"iu",  /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
+			"",    /* G_UNICODE_SCRIPT_YI */
+			"tl",  /* G_UNICODE_SCRIPT_TAGALOG */
+			/* Philippine languages/scripts */
+			"hnn", /* G_UNICODE_SCRIPT_HANUNOO */
+			"bku", /* G_UNICODE_SCRIPT_BUHID */
+			"tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
+
+			"",    /* G_UNICODE_SCRIPT_BRAILLE */
+			"",    /* G_UNICODE_SCRIPT_CYPRIOT */
+			"",    /* G_UNICODE_SCRIPT_LIMBU */
+			/* Used for Somali (so) in the past */
+			"",    /* G_UNICODE_SCRIPT_OSMANYA */
+			/* The Shavian alphabet was designed for English */
+			"",    /* G_UNICODE_SCRIPT_SHAVIAN */
+			"",    /* G_UNICODE_SCRIPT_LINEAR_B */
+			"",    /* G_UNICODE_SCRIPT_TAI_LE */
+			"uga", /* G_UNICODE_SCRIPT_UGARITIC */
+
+			"",    /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
+			"bug", /* G_UNICODE_SCRIPT_BUGINESE */
+			/* Original script for Old Church Slavonic (chu), later written with Cyrillic */
+			"",    /* G_UNICODE_SCRIPT_GLAGOLITIC */
+			/* Used for Berber (ber), but Arabic script is more common */
+			"",    /* G_UNICODE_SCRIPT_TIFINAGH */
+			"syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
+			"peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
+			"",    /* G_UNICODE_SCRIPT_KHAROSHTHI */
+
+			"",    /* G_UNICODE_SCRIPT_UNKNOWN */
+			"",    /* G_UNICODE_SCRIPT_BALINESE */
+			"",    /* G_UNICODE_SCRIPT_CUNEIFORM */
+			"",    /* G_UNICODE_SCRIPT_PHOENICIAN */
+			"",    /* G_UNICODE_SCRIPT_PHAGS_PA */
+			"nqo"  /* G_UNICODE_SCRIPT_NKO */
+	};
+	/* Exactly one value may be pushed, as a single result is returned to Lua */
+	if (part != NULL && part->script > 0 && part->script < G_N_ELEMENTS (languages)
+			&& languages[part->script][0] != '\0') {
+		lua_pushstring (L, languages[part->script]);
+	}
+	else {
+		lua_pushnil (L);
+	}
+	return 1;
+}
+
 /* Image functions */
 static gint
 lua_image_get_width (lua_State *L)
diff --git a/src/message.h b/src/message.h
index 5f19ab892..15fd90188 100644
--- a/src/message.h
+++ b/src/message.h
@@ -34,6 +34,7 @@ struct mime_text_part {
 	fuzzy_hash_t *fuzzy;
 	fuzzy_hash_t *double_fuzzy;
 	GMimeObject *parent;
+	GUnicodeScript script;
 };
 
 struct received_header {
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 7432e9f61..ea30fca87 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -116,8 +116,10 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 	guchar                          *p, *p1;
 	gunichar                        c, t;
 	GUnicodeScript                  scc, sct;
-	guint32                         mark = 0, total = 0;
+	guint32                         mark = 0, total = 0, max = 0, i;
 	guint32                         remain = part->content->len;
+	guint32                         scripts[G_UNICODE_SCRIPT_NKO];
+	GUnicodeScript                  sel = 0;
 
 	p = part->content->data;
 
@@ -136,6 +138,7 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 		}
 	}
 	else {
+		memset (&scripts, 0, sizeof (scripts));
 		while (remain > 0) {
 			c = g_utf8_get_char_validated (p, remain);
 			if (c == (gunichar) -2 || c == (gunichar) -1) {
@@ -144,6 +147,9 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 			}
 
 			scc = g_unichar_get_script (c);
+			if (scc < G_N_ELEMENTS (scripts)) {
+				scripts[scc] ++;
+			}
 			p1 = g_utf8_next_char (p);
 			remain -= p1 - p;
 			p = p1;
@@ -167,6 +173,14 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 				p = p1;
 			}
 		}
+		/* Detect the most frequent Unicode script of this part */
+		for (i = 0; i < G_N_ELEMENTS (scripts); i ++) {
+			if (scripts[i] > max) {
+				max = scripts[i];
+				sel = i;
+			}
+		}
+		part->script = sel;
 	}
 
 	return ((double)mark / (double)total) > chartable_module_ctx->threshold;
-- 
2.39.5