aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-11 20:38:47 +0400
committerVsevolod Stakhov <vsevolod@rambler-co.ru>2011-07-11 20:38:47 +0400
commit6568e9dda41208cbacbe2ac23061fa158faea3f1 (patch)
treef60e4dcebcbf9549c5236dbb5cdbc9656da5a39f
parentb14402cd4ed5bf9b3efc0cc9d50c812b66a31f57 (diff)
downloadrspamd-6568e9dda41208cbacbe2ac23061fa158faea3f1.tar.gz
rspamd-6568e9dda41208cbacbe2ac23061fa158faea3f1.zip
* Add a simple logic of language detection for text parts (unicode script based)
-rw-r--r--src/lua/lua_common.h2
-rw-r--r--src/lua/lua_task.c101
-rw-r--r--src/message.h1
-rw-r--r--src/plugins/chartable.c16
4 files changed, 118 insertions, 2 deletions
diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 284b34e8e..4427dcae9 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -16,7 +16,7 @@
extern const luaL_reg null_reg[];
-#define RSPAMD_LUA_API_VERSION 7
+#define RSPAMD_LUA_API_VERSION 8
/* Common utility functions */
void lua_newclass (lua_State *L, const gchar *classname, const struct luaL_reg *func);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 2bb13083b..8c90feb4b 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -109,12 +109,14 @@ LUA_FUNCTION_DEF (textpart, get_content);
LUA_FUNCTION_DEF (textpart, is_empty);
LUA_FUNCTION_DEF (textpart, is_html);
LUA_FUNCTION_DEF (textpart, get_fuzzy);
+LUA_FUNCTION_DEF (textpart, get_language);
static const struct luaL_reg textpartlib_m[] = {
LUA_INTERFACE_DEF (textpart, get_content),
LUA_INTERFACE_DEF (textpart, is_empty),
LUA_INTERFACE_DEF (textpart, is_html),
LUA_INTERFACE_DEF (textpart, get_fuzzy),
+ LUA_INTERFACE_DEF (textpart, get_language),
{"__tostring", lua_class_tostring},
{NULL, NULL}
};
@@ -1240,6 +1242,105 @@ lua_textpart_get_fuzzy (lua_State * L)
return 1;
}
+/**
+ * Lua handler registered as the `get_language` method of the textpart class.
+ *
+ * Translates the Unicode script stored in part->script into a short language
+ * code by indexing a static table whose entries are laid out in GUnicodeScript
+ * order (index 0 is G_UNICODE_SCRIPT_COMMON, the last one is
+ * G_UNICODE_SCRIPT_NKO). Scripts that have no sensible single-language mapping
+ * are represented by an empty string.
+ *
+ * Exactly one value is left for Lua: the language code string when the part is
+ * valid, its script index is greater than zero, lies inside the table and maps
+ * to a non-empty code; nil in every other case.
+ *
+ * NOTE(review): the "nqo" entry can only be selected if the code that fills
+ * part->script is able to count index G_UNICODE_SCRIPT_NKO - confirm that the
+ * counting array in chartable's check_part is large enough for that index.
+ *
+ * @param L Lua state; the text part is obtained via lua_check_textpart ()
+ * @return number of results pushed for Lua (always 1)
+ */
+static gint
+lua_textpart_get_language (lua_State * L)
+{
+	struct mime_text_part *part = lua_check_textpart (L);
+	/* Each slot holds at most 3 characters plus the terminating NUL */
+	static const gchar languages[][4] = {
+		"",    /* G_UNICODE_SCRIPT_COMMON */
+		"",    /* G_UNICODE_SCRIPT_INHERITED */
+		"ar",  /* G_UNICODE_SCRIPT_ARABIC */
+		"hy",  /* G_UNICODE_SCRIPT_ARMENIAN */
+		"bn",  /* G_UNICODE_SCRIPT_BENGALI */
+		/* Used primarily in Taiwan, but not part of the standard
+		 * zh-tw orthography */
+		"",    /* G_UNICODE_SCRIPT_BOPOMOFO */
+		"chr", /* G_UNICODE_SCRIPT_CHEROKEE */
+		"cop", /* G_UNICODE_SCRIPT_COPTIC */
+		"ru",  /* G_UNICODE_SCRIPT_CYRILLIC */
+		/* Deseret was used to write English */
+		"",    /* G_UNICODE_SCRIPT_DESERET */
+		"hi",  /* G_UNICODE_SCRIPT_DEVANAGARI */
+		"am",  /* G_UNICODE_SCRIPT_ETHIOPIC */
+		"ka",  /* G_UNICODE_SCRIPT_GEORGIAN */
+		"",    /* G_UNICODE_SCRIPT_GOTHIC */
+		"el",  /* G_UNICODE_SCRIPT_GREEK */
+		"gu",  /* G_UNICODE_SCRIPT_GUJARATI */
+		"pa",  /* G_UNICODE_SCRIPT_GURMUKHI */
+		"",    /* G_UNICODE_SCRIPT_HAN */
+		"ko",  /* G_UNICODE_SCRIPT_HANGUL */
+		"he",  /* G_UNICODE_SCRIPT_HEBREW */
+		"ja",  /* G_UNICODE_SCRIPT_HIRAGANA */
+		"kn",  /* G_UNICODE_SCRIPT_KANNADA */
+		"ja",  /* G_UNICODE_SCRIPT_KATAKANA */
+		"km",  /* G_UNICODE_SCRIPT_KHMER */
+		"lo",  /* G_UNICODE_SCRIPT_LAO */
+		"en",  /* G_UNICODE_SCRIPT_LATIN */
+		"ml",  /* G_UNICODE_SCRIPT_MALAYALAM */
+		"mn",  /* G_UNICODE_SCRIPT_MONGOLIAN */
+		"my",  /* G_UNICODE_SCRIPT_MYANMAR */
+		/* Ogham was used to write old Irish */
+		"",    /* G_UNICODE_SCRIPT_OGHAM */
+		"",    /* G_UNICODE_SCRIPT_OLD_ITALIC */
+		"or",  /* G_UNICODE_SCRIPT_ORIYA */
+		"",    /* G_UNICODE_SCRIPT_RUNIC */
+		"si",  /* G_UNICODE_SCRIPT_SINHALA */
+		"syr", /* G_UNICODE_SCRIPT_SYRIAC */
+		"ta",  /* G_UNICODE_SCRIPT_TAMIL */
+		"te",  /* G_UNICODE_SCRIPT_TELUGU */
+		"dv",  /* G_UNICODE_SCRIPT_THAANA */
+		"th",  /* G_UNICODE_SCRIPT_THAI */
+		"bo",  /* G_UNICODE_SCRIPT_TIBETAN */
+		"iu",  /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
+		"",    /* G_UNICODE_SCRIPT_YI */
+		"tl",  /* G_UNICODE_SCRIPT_TAGALOG */
+		/* Philippine languages/scripts */
+		"hnn", /* G_UNICODE_SCRIPT_HANUNOO */
+		"bku", /* G_UNICODE_SCRIPT_BUHID */
+		"tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
+
+		"",    /* G_UNICODE_SCRIPT_BRAILLE */
+		"",    /* G_UNICODE_SCRIPT_CYPRIOT */
+		"",    /* G_UNICODE_SCRIPT_LIMBU */
+		/* Used for Somali (so) in the past */
+		"",    /* G_UNICODE_SCRIPT_OSMANYA */
+		/* The Shavian alphabet was designed for English */
+		"",    /* G_UNICODE_SCRIPT_SHAVIAN */
+		"",    /* G_UNICODE_SCRIPT_LINEAR_B */
+		"",    /* G_UNICODE_SCRIPT_TAI_LE */
+		"uga", /* G_UNICODE_SCRIPT_UGARITIC */
+
+		"",    /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
+		"bug", /* G_UNICODE_SCRIPT_BUGINESE */
+		/* The original script for Old Church Slavonic (chu), later
+		 * written with Cyrillic */
+		"",    /* G_UNICODE_SCRIPT_GLAGOLITIC */
+		/* Used for Berber (ber), but Arabic script is more common */
+		"",    /* G_UNICODE_SCRIPT_TIFINAGH */
+		"syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
+		"peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
+		"",    /* G_UNICODE_SCRIPT_KHAROSHTHI */
+
+		"",    /* G_UNICODE_SCRIPT_UNKNOWN */
+		"",    /* G_UNICODE_SCRIPT_BALINESE */
+		"",    /* G_UNICODE_SCRIPT_CUNEIFORM */
+		"",    /* G_UNICODE_SCRIPT_PHOENICIAN */
+		"",    /* G_UNICODE_SCRIPT_PHAGS_PA */
+		"nqo"  /* G_UNICODE_SCRIPT_NKO */
+	};
+	const gchar *sel;
+
+	if (part != NULL && part->script > 0 && part->script < G_N_ELEMENTS (languages)) {
+		sel = languages[part->script];
+		if (*sel != '\0') {
+			lua_pushstring (L, sel);
+			/* Return right away: falling through would push nil on top of
+			 * the string, and Lua would receive that nil as the result */
+			return 1;
+		}
+	}
+
+	/* No part, no usable script, or no language mapped to the script */
+	lua_pushnil (L);
+	return 1;
+}
+
/* Image functions */
static gint
lua_image_get_width (lua_State *L)
diff --git a/src/message.h b/src/message.h
index 5f19ab892..15fd90188 100644
--- a/src/message.h
+++ b/src/message.h
@@ -34,6 +34,7 @@ struct mime_text_part {
fuzzy_hash_t *fuzzy;
fuzzy_hash_t *double_fuzzy;
GMimeObject *parent;
+ GUnicodeScript script;
};
struct received_header {
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 7432e9f61..ea30fca87 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -116,8 +116,10 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
guchar *p, *p1;
gunichar c, t;
GUnicodeScript scc, sct;
- guint32 mark = 0, total = 0;
+ guint32 mark = 0, total = 0, max = 0, i;
guint32 remain = part->content->len;
+ guint32 scripts[G_UNICODE_SCRIPT_NKO];
+ GUnicodeScript sel = 0;
p = part->content->data;
@@ -136,6 +138,7 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
}
}
else {
+ memset (&scripts, 0, sizeof (scripts));
while (remain > 0) {
c = g_utf8_get_char_validated (p, remain);
if (c == (gunichar) -2 || c == (gunichar) -1) {
@@ -144,6 +147,9 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
}
scc = g_unichar_get_script (c);
+ if (scc < G_N_ELEMENTS (scripts)) {
+ scripts[scc] ++;
+ }
p1 = g_utf8_next_char (p);
remain -= p1 - p;
p = p1;
@@ -167,6 +173,14 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
p = p1;
}
}
+ /* Detect the most frequent Unicode script of this part */
+ for (i = 0; i < G_N_ELEMENTS (scripts); i ++) {
+ if (scripts[i] > max) {
+ max = scripts[i];
+ sel = i;
+ }
+ }
+ part->script = sel;
}
return ((double)mark / (double)total) > chartable_module_ctx->threshold;