From 6568e9dda41208cbacbe2ac23061fa158faea3f1 Mon Sep 17 00:00:00 2001
From: Vsevolod Stakhov
Date: Mon, 11 Jul 2011 20:38:47 +0400
Subject: [PATCH] * Add a simple logic of language detection for text parts (unicode script based)

---
 src/lua/lua_common.h    |   2 +-
 src/lua/lua_task.c      | 108 ++++++++++++++++++++++++++++++++++++++++
 src/message.h           |   1 +
 src/plugins/chartable.c |  16 ++++++-
 4 files changed, 125 insertions(+), 2 deletions(-)

diff --git a/src/lua/lua_common.h b/src/lua/lua_common.h
index 284b34e8e..4427dcae9 100644
--- a/src/lua/lua_common.h
+++ b/src/lua/lua_common.h
@@ -16,7 +16,7 @@
 
 extern const luaL_reg null_reg[];
 
-#define RSPAMD_LUA_API_VERSION 7
+#define RSPAMD_LUA_API_VERSION 8
 
 /* Common utility functions */
 void lua_newclass (lua_State *L, const gchar *classname, const struct luaL_reg *func);
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 2bb13083b..8c90feb4b 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -109,12 +109,14 @@ LUA_FUNCTION_DEF (textpart, get_content);
 LUA_FUNCTION_DEF (textpart, is_empty);
 LUA_FUNCTION_DEF (textpart, is_html);
 LUA_FUNCTION_DEF (textpart, get_fuzzy);
+LUA_FUNCTION_DEF (textpart, get_language);
 
 static const struct luaL_reg textpartlib_m[] = {
 	LUA_INTERFACE_DEF (textpart, get_content),
 	LUA_INTERFACE_DEF (textpart, is_empty),
 	LUA_INTERFACE_DEF (textpart, is_html),
 	LUA_INTERFACE_DEF (textpart, get_fuzzy),
+	LUA_INTERFACE_DEF (textpart, get_language),
 	{"__tostring", lua_class_tostring},
 	{NULL, NULL}
 };
@@ -1240,6 +1242,112 @@ lua_textpart_get_fuzzy (lua_State * L)
 	return 1;
 }
 
+/**
+ * Get the language of a text part as guessed from its dominant unicode script
+ * @param L lua state (the text part is fetched by lua_check_textpart)
+ * @return 1 value on the lua stack: language code string or nil if unknown
+ */
+static gint
+lua_textpart_get_language (lua_State * L)
+{
+	struct mime_text_part *part = lua_check_textpart (L);
+	static const gchar languages[][4] = {
+		"",    /* G_UNICODE_SCRIPT_COMMON */
+		"",    /* G_UNICODE_SCRIPT_INHERITED */
+		"ar",  /* G_UNICODE_SCRIPT_ARABIC */
+		"hy",  /* G_UNICODE_SCRIPT_ARMENIAN */
+		"bn",  /* G_UNICODE_SCRIPT_BENGALI */
+		/* Used primarily in Taiwan, but not part of the standard
+		 * zh-tw orthography */
+		"",    /* G_UNICODE_SCRIPT_BOPOMOFO */
+		"chr", /* G_UNICODE_SCRIPT_CHEROKEE */
+		"cop", /* G_UNICODE_SCRIPT_COPTIC */
+		"ru",  /* G_UNICODE_SCRIPT_CYRILLIC */
+		/* Deseret was used to write English */
+		"",    /* G_UNICODE_SCRIPT_DESERET */
+		"hi",  /* G_UNICODE_SCRIPT_DEVANAGARI */
+		"am",  /* G_UNICODE_SCRIPT_ETHIOPIC */
+		"ka",  /* G_UNICODE_SCRIPT_GEORGIAN */
+		"",    /* G_UNICODE_SCRIPT_GOTHIC */
+		"el",  /* G_UNICODE_SCRIPT_GREEK */
+		"gu",  /* G_UNICODE_SCRIPT_GUJARATI */
+		"pa",  /* G_UNICODE_SCRIPT_GURMUKHI */
+		"",    /* G_UNICODE_SCRIPT_HAN */
+		"ko",  /* G_UNICODE_SCRIPT_HANGUL */
+		"he",  /* G_UNICODE_SCRIPT_HEBREW */
+		"ja",  /* G_UNICODE_SCRIPT_HIRAGANA */
+		"kn",  /* G_UNICODE_SCRIPT_KANNADA */
+		"ja",  /* G_UNICODE_SCRIPT_KATAKANA */
+		"km",  /* G_UNICODE_SCRIPT_KHMER */
+		"lo",  /* G_UNICODE_SCRIPT_LAO */
+		"en",  /* G_UNICODE_SCRIPT_LATIN */
+		"ml",  /* G_UNICODE_SCRIPT_MALAYALAM */
+		"mn",  /* G_UNICODE_SCRIPT_MONGOLIAN */
+		"my",  /* G_UNICODE_SCRIPT_MYANMAR */
+		/* Ogham was used to write old Irish */
+		"",    /* G_UNICODE_SCRIPT_OGHAM */
+		"",    /* G_UNICODE_SCRIPT_OLD_ITALIC */
+		"or",  /* G_UNICODE_SCRIPT_ORIYA */
+		"",    /* G_UNICODE_SCRIPT_RUNIC */
+		"si",  /* G_UNICODE_SCRIPT_SINHALA */
+		"syr", /* G_UNICODE_SCRIPT_SYRIAC */
+		"ta",  /* G_UNICODE_SCRIPT_TAMIL */
+		"te",  /* G_UNICODE_SCRIPT_TELUGU */
+		"dv",  /* G_UNICODE_SCRIPT_THAANA */
+		"th",  /* G_UNICODE_SCRIPT_THAI */
+		"bo",  /* G_UNICODE_SCRIPT_TIBETAN */
+		"iu",  /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
+		"",    /* G_UNICODE_SCRIPT_YI */
+		"tl",  /* G_UNICODE_SCRIPT_TAGALOG */
+		/* Philippine languages/scripts */
+		"hnn", /* G_UNICODE_SCRIPT_HANUNOO */
+		"bku", /* G_UNICODE_SCRIPT_BUHID */
+		"tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
+
+		"",    /* G_UNICODE_SCRIPT_BRAILLE */
+		"",    /* G_UNICODE_SCRIPT_CYPRIOT */
+		"",    /* G_UNICODE_SCRIPT_LIMBU */
+		/* Used for Somali (so) in the past */
+		"",    /* G_UNICODE_SCRIPT_OSMANYA */
+		/* The Shavian alphabet was designed for English */
+		"",    /* G_UNICODE_SCRIPT_SHAVIAN */
+		"",    /* G_UNICODE_SCRIPT_LINEAR_B */
+		"",    /* G_UNICODE_SCRIPT_TAI_LE */
+		"uga", /* G_UNICODE_SCRIPT_UGARITIC */
+
+		"",    /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
+		"bug", /* G_UNICODE_SCRIPT_BUGINESE */
+		/* The original script for Old Church Slavonic (chu), later
+		 * written with Cyrillic */
+		"",    /* G_UNICODE_SCRIPT_GLAGOLITIC */
+		/* Used for Berber (ber), but Arabic script is more common */
+		"",    /* G_UNICODE_SCRIPT_TIFINAGH */
+		"syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
+		"peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
+		"",    /* G_UNICODE_SCRIPT_KHAROSHTHI */
+
+		"",    /* G_UNICODE_SCRIPT_UNKNOWN */
+		"",    /* G_UNICODE_SCRIPT_BALINESE */
+		"",    /* G_UNICODE_SCRIPT_CUNEIFORM */
+		"",    /* G_UNICODE_SCRIPT_PHOENICIAN */
+		"",    /* G_UNICODE_SCRIPT_PHAGS_PA */
+		"nqo"  /* G_UNICODE_SCRIPT_NKO */
+	};
+	const gchar *sel;
+
+	if (part != NULL && part->script > 0 && part->script < G_N_ELEMENTS (languages)) {
+		sel = languages[part->script];
+		if (*sel != '\0') {
+			lua_pushstring (L, sel);
+			return 1;
+		}
+	}
+
+	/* No part, script out of the table range or no language for the script */
+	lua_pushnil (L);
+	return 1;
+}
+
 /* Image functions */
 static gint
 lua_image_get_width (lua_State *L)
diff --git a/src/message.h b/src/message.h
index 5f19ab892..15fd90188 100644
--- a/src/message.h
+++ b/src/message.h
@@ -34,6 +34,7 @@ struct mime_text_part {
 	fuzzy_hash_t *fuzzy;
 	fuzzy_hash_t *double_fuzzy;
 	GMimeObject *parent;
+	GUnicodeScript script;
 };
 
 struct received_header {
diff --git a/src/plugins/chartable.c b/src/plugins/chartable.c
index 7432e9f61..ea30fca87 100644
--- a/src/plugins/chartable.c
+++ b/src/plugins/chartable.c
@@ -116,8 +116,10 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 	guchar *p, *p1;
 	gunichar c, t;
 	GUnicodeScript scc, sct;
-	guint32 mark = 0, total = 0;
+	guint32 mark = 0, total = 0, max = 0, i;
 	guint32 remain = part->content->len;
+	guint32 scripts[G_UNICODE_SCRIPT_NKO + 1]; /* G_UNICODE_SCRIPT_NKO is a valid index too */
+	GUnicodeScript sel = 0;
 
 	p = part->content->data;
 
@@ -136,6 +138,7 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 		}
 	}
 	else {
+		memset (&scripts, 0, sizeof (scripts));
 		while (remain > 0) {
 			c = g_utf8_get_char_validated (p, remain);
 			if (c == (gunichar) -2 || c == (gunichar) -1) {
@@ -144,6 +147,9 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 			}
 
 			scc = g_unichar_get_script (c);
+			if (scc < G_N_ELEMENTS (scripts)) {
+				scripts[scc] ++;
+			}
 			p1 = g_utf8_next_char (p);
 			remain -= p1 - p;
 			p = p1;
@@ -167,6 +173,14 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
 				p = p1;
 			}
 		}
+		/* Detect the most frequent script of this part */
+		for (i = 0; i < G_N_ELEMENTS (scripts); i ++) {
+			if (scripts[i] > max) {
+				max = scripts[i];
+				sel = i;
+			}
+		}
+		part->script = sel;
 	}
 
 	return ((double)mark / (double)total) > chartable_module_ctx->threshold;
-- 
2.39.5