source.dussan.org Git - rspamd.git/commitdiff
* Add a simple logic of language detection for text parts (unicode script based)
author: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 11 Jul 2011 16:38:47 +0000 (20:38 +0400)
committer: Vsevolod Stakhov <vsevolod@rambler-co.ru>
Mon, 11 Jul 2011 16:38:47 +0000 (20:38 +0400)
src/lua/lua_common.h
src/lua/lua_task.c
src/message.h
src/plugins/chartable.c

index 284b34e8e93fbdad4589d28640e2154f1ae48378..4427dcae964d4bfc6562ff97298265be2cabec49 100644 (file)
@@ -16,7 +16,7 @@
 
 extern const luaL_reg null_reg[];
 
-#define RSPAMD_LUA_API_VERSION 7
+#define RSPAMD_LUA_API_VERSION 8
 
 /* Common utility functions */
 void lua_newclass (lua_State *L, const gchar *classname, const struct luaL_reg *func);
index 2bb13083b60a1afce9cc03f3a665aa214f51112f..8c90feb4b88555b9cd6c9505e65ec0aee0b0c46f 100644 (file)
@@ -109,12 +109,14 @@ LUA_FUNCTION_DEF (textpart, get_content);
 LUA_FUNCTION_DEF (textpart, is_empty);
 LUA_FUNCTION_DEF (textpart, is_html);
 LUA_FUNCTION_DEF (textpart, get_fuzzy);
+LUA_FUNCTION_DEF (textpart, get_language);
 
 /*
  * Function table for textpart objects exposed to Lua. Each entry pairs a
  * textpart function with its name; the list ends with the {NULL, NULL}
  * sentinel entry.
  * NOTE(review): LUA_INTERFACE_DEF is defined elsewhere; presumably it expands
  * to a {name, lua_<class>_<name>} initializer -- confirm against lua_common.h.
  */
 static const struct luaL_reg    textpartlib_m[] = {
        LUA_INTERFACE_DEF (textpart, get_content),
        LUA_INTERFACE_DEF (textpart, is_empty),
        LUA_INTERFACE_DEF (textpart, is_html),
        LUA_INTERFACE_DEF (textpart, get_fuzzy),
+       LUA_INTERFACE_DEF (textpart, get_language),
        {"__tostring", lua_class_tostring},
        {NULL, NULL}
 };
@@ -1240,6 +1242,105 @@ lua_textpart_get_fuzzy (lua_State * L)
        return 1;
 }
 
+/*
+ * Lua binding: get_language for text parts.
+ *
+ * Maps the Unicode script stored in part->script to a short language code
+ * using a static table indexed by GUnicodeScript value, and pushes that code
+ * as a Lua string.
+ *
+ * Pushes nil instead when the text part is NULL, when the script is
+ * G_UNICODE_SCRIPT_COMMON (0) or negative, when it lies beyond the end of the
+ * table, or when the table holds no language for it (empty string).
+ *
+ * Returns 1: exactly one result (string or nil) is handed back to Lua.
+ */
+static gint
+lua_textpart_get_language (lua_State * L)
+{
+       struct mime_text_part          *part = lua_check_textpart (L);
+       /* Indexed by GUnicodeScript; "" means "no language for this script" */
+       static const gchar              languages[][4] = {
+                       "",    /* G_UNICODE_SCRIPT_COMMON */
+                       "",    /* G_UNICODE_SCRIPT_INHERITED */
+                       "ar",  /* G_UNICODE_SCRIPT_ARABIC */
+                       "hy",  /* G_UNICODE_SCRIPT_ARMENIAN */
+                       "bn",  /* G_UNICODE_SCRIPT_BENGALI */
+                       /* Used primarily in Taiwan, but not part of the standard
+                        * zh-tw orthography  */
+                       "",    /* G_UNICODE_SCRIPT_BOPOMOFO */
+                       "chr", /* G_UNICODE_SCRIPT_CHEROKEE */
+                       "cop", /* G_UNICODE_SCRIPT_COPTIC */
+                       "ru",  /* G_UNICODE_SCRIPT_CYRILLIC */
+                       /* Deseret was used to write English */
+                       "",    /* G_UNICODE_SCRIPT_DESERET */
+                       "hi",  /* G_UNICODE_SCRIPT_DEVANAGARI */
+                       "am",  /* G_UNICODE_SCRIPT_ETHIOPIC */
+                       "ka",  /* G_UNICODE_SCRIPT_GEORGIAN */
+                       "",    /* G_UNICODE_SCRIPT_GOTHIC */
+                       "el",  /* G_UNICODE_SCRIPT_GREEK */
+                       "gu",  /* G_UNICODE_SCRIPT_GUJARATI */
+                       "pa",  /* G_UNICODE_SCRIPT_GURMUKHI */
+                       "",    /* G_UNICODE_SCRIPT_HAN */
+                       "ko",  /* G_UNICODE_SCRIPT_HANGUL */
+                       "he",  /* G_UNICODE_SCRIPT_HEBREW */
+                       "ja",  /* G_UNICODE_SCRIPT_HIRAGANA */
+                       "kn",  /* G_UNICODE_SCRIPT_KANNADA */
+                       "ja",  /* G_UNICODE_SCRIPT_KATAKANA */
+                       "km",  /* G_UNICODE_SCRIPT_KHMER */
+                       "lo",  /* G_UNICODE_SCRIPT_LAO */
+                       "en",  /* G_UNICODE_SCRIPT_LATIN */
+                       "ml",  /* G_UNICODE_SCRIPT_MALAYALAM */
+                       "mn",  /* G_UNICODE_SCRIPT_MONGOLIAN */
+                       "my",  /* G_UNICODE_SCRIPT_MYANMAR */
+                       /* Ogham was used to write old Irish */
+                       "",    /* G_UNICODE_SCRIPT_OGHAM */
+                       "",    /* G_UNICODE_SCRIPT_OLD_ITALIC */
+                       "or",  /* G_UNICODE_SCRIPT_ORIYA */
+                       "",    /* G_UNICODE_SCRIPT_RUNIC */
+                       "si",  /* G_UNICODE_SCRIPT_SINHALA */
+                       "syr", /* G_UNICODE_SCRIPT_SYRIAC */
+                       "ta",  /* G_UNICODE_SCRIPT_TAMIL */
+                       "te",  /* G_UNICODE_SCRIPT_TELUGU */
+                       "dv",  /* G_UNICODE_SCRIPT_THAANA */
+                       "th",  /* G_UNICODE_SCRIPT_THAI */
+                       "bo",  /* G_UNICODE_SCRIPT_TIBETAN */
+                       "iu",  /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
+                       "",    /* G_UNICODE_SCRIPT_YI */
+                       "tl",  /* G_UNICODE_SCRIPT_TAGALOG */
+                       /* Philippine languages/scripts */
+                       "hnn", /* G_UNICODE_SCRIPT_HANUNOO */
+                       "bku", /* G_UNICODE_SCRIPT_BUHID */
+                       "tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
+
+                       "",    /* G_UNICODE_SCRIPT_BRAILLE */
+                       "",    /* G_UNICODE_SCRIPT_CYPRIOT */
+                       "",    /* G_UNICODE_SCRIPT_LIMBU */
+                       /* Used for Somali (so) in the past */
+                       "",    /* G_UNICODE_SCRIPT_OSMANYA */
+                       /* The Shavian alphabet was designed for English */
+                       "",    /* G_UNICODE_SCRIPT_SHAVIAN */
+                       "",    /* G_UNICODE_SCRIPT_LINEAR_B */
+                       "",    /* G_UNICODE_SCRIPT_TAI_LE */
+                       "uga", /* G_UNICODE_SCRIPT_UGARITIC */
+
+                       "",    /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
+                       "bug", /* G_UNICODE_SCRIPT_BUGINESE */
+                       /* The original script for Old Church Slavonic (chu), later
+                        * written with Cyrillic */
+                       "",    /* G_UNICODE_SCRIPT_GLAGOLITIC */
+                       /* Used for Berber (ber), but Arabic script is more common */
+                       "",    /* G_UNICODE_SCRIPT_TIFINAGH */
+                       "syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
+                       "peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
+                       "",    /* G_UNICODE_SCRIPT_KHAROSHTHI */
+
+                       "",    /* G_UNICODE_SCRIPT_UNKNOWN */
+                       "",    /* G_UNICODE_SCRIPT_BALINESE */
+                       "",    /* G_UNICODE_SCRIPT_CUNEIFORM */
+                       "",    /* G_UNICODE_SCRIPT_PHOENICIAN */
+                       "",    /* G_UNICODE_SCRIPT_PHAGS_PA */
+                       "nqo"  /* G_UNICODE_SCRIPT_NKO */
+       };
+       const gchar                    *sel;
+
+       if (part != NULL && part->script > 0 && part->script < G_N_ELEMENTS (languages)) {
+               sel = languages[part->script];
+               if (*sel != '\0') {
+                       lua_pushstring (L, sel);
+                       /*
+                        * Return right away: Lua takes the topmost value as the
+                        * result, so pushing nil after the string would hide it.
+                        */
+                       return 1;
+               }
+       }
+
+       /* No usable script or no language mapped to it */
+       lua_pushnil (L);
+       return 1;
+}
+
 /* Image functions */
 static gint
 lua_image_get_width (lua_State *L)
index 5f19ab892190e7e97f352dba873b5c95ea46087b..15fd90188187f66f3648e48835a44b2c7938c628 100644 (file)
@@ -34,6 +34,7 @@ struct mime_text_part {
        fuzzy_hash_t *fuzzy;
        fuzzy_hash_t *double_fuzzy;
        GMimeObject *parent;
+       GUnicodeScript script;
 };
 
 struct received_header {
index 7432e9f6168f66839d741ffd299634863335cb72..ea30fca872b59f6fccb634dc1c00a3e8eb899e0f 100644 (file)
@@ -116,8 +116,10 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
        guchar                          *p, *p1;
        gunichar                        c, t;
        GUnicodeScript                  scc, sct;
-       guint32                         mark = 0, total = 0;
+       guint32                         mark = 0, total = 0, max = 0, i;
        guint32                         remain = part->content->len;
+       guint32                         scripts[G_UNICODE_SCRIPT_NKO];
+       GUnicodeScript                  sel = 0;
 
        p = part->content->data;
 
@@ -136,6 +138,7 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
                }
        }
        else {
+               memset (&scripts, 0, sizeof (scripts));
                while (remain > 0) {
                        c = g_utf8_get_char_validated (p, remain);
                        if (c == (gunichar) -2 || c == (gunichar) -1) {
@@ -144,6 +147,9 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
                        }
 
                        scc = g_unichar_get_script (c);
+                       if (scc < G_N_ELEMENTS (scripts)) {
+                               scripts[scc] ++;
+                       }
                        p1 = g_utf8_next_char (p);
                        remain -= p1 - p;
                        p = p1;
@@ -167,6 +173,14 @@ check_part (struct mime_text_part *part, gboolean raw_mode)
                                p = p1;
                        }
                }
+               /* Detect the most frequent Unicode script of this part */
+               for (i = 0; i < G_N_ELEMENTS (scripts); i ++) {
+                       if (scripts[i] > max) {
+                               max = scripts[i];
+                               sel = i;
+                       }
+               }
+               part->script = sel;
        }
 
        return ((double)mark / (double)total) > chartable_module_ctx->threshold;