diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-23 15:28:27 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-23 15:28:27 +0000 |
commit | 3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (patch) | |
tree | 5868e328c9ed4226a52fd0bd12dd8dfc0e4ec5f0 /src/lua | |
parent | 39b8dcb94620669ae369ab559175dde1a5c103b7 (diff) | |
download | rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.tar.gz rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.zip |
Rework language detection.
Diffstat (limited to 'src/lua')
-rw-r--r-- | src/lua/lua_task.c | 128 |
1 files changed, 3 insertions, 125 deletions
```diff
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 9a61dd5c5..df7640e46 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -1989,133 +1989,11 @@ static gint
 lua_textpart_get_language (lua_State * L)
 {
 	struct mime_text_part *part = lua_check_textpart (L);
-	static const gchar languages[][4] = {
-		"", /* G_UNICODE_SCRIPT_COMMON */
-		"", /* G_UNICODE_SCRIPT_INHERITED */
-		"ar", /* G_UNICODE_SCRIPT_ARABIC */
-		"hy", /* G_UNICODE_SCRIPT_ARMENIAN */
-		"bn", /* G_UNICODE_SCRIPT_BENGALI */
-		/* Used primarily in Taiwan, but not part of the standard
-		 * zh-tw orthography */
-		"", /* G_UNICODE_SCRIPT_BOPOMOFO */
-		"chr", /* G_UNICODE_SCRIPT_CHEROKEE */
-		"cop", /* G_UNICODE_SCRIPT_COPTIC */
-		"ru", /* G_UNICODE_SCRIPT_CYRILLIC */
-		/* Deseret was used to write English */
-		"", /* G_UNICODE_SCRIPT_DESERET */
-		"hi", /* G_UNICODE_SCRIPT_DEVANAGARI */
-		"am", /* G_UNICODE_SCRIPT_ETHIOPIC */
-		"ka", /* G_UNICODE_SCRIPT_GEORGIAN */
-		"", /* G_UNICODE_SCRIPT_GOTHIC */
-		"el", /* G_UNICODE_SCRIPT_GREEK */
-		"gu", /* G_UNICODE_SCRIPT_GUJARATI */
-		"pa", /* G_UNICODE_SCRIPT_GURMUKHI */
-		"han", /* G_UNICODE_SCRIPT_HAN */
-		"ko", /* G_UNICODE_SCRIPT_HANGUL */
-		"he", /* G_UNICODE_SCRIPT_HEBREW */
-		"ja", /* G_UNICODE_SCRIPT_HIRAGANA */
-		"kn", /* G_UNICODE_SCRIPT_KANNADA */
-		"ja", /* G_UNICODE_SCRIPT_KATAKANA */
-		"km", /* G_UNICODE_SCRIPT_KHMER */
-		"lo", /* G_UNICODE_SCRIPT_LAO */
-		"en", /* G_UNICODE_SCRIPT_LATIN */
-		"ml", /* G_UNICODE_SCRIPT_MALAYALAM */
-		"mn", /* G_UNICODE_SCRIPT_MONGOLIAN */
-		"my", /* G_UNICODE_SCRIPT_MYANMAR */
-		/* Ogham was used to write old Irish */
-		"", /* G_UNICODE_SCRIPT_OGHAM */
-		"", /* G_UNICODE_SCRIPT_OLD_ITALIC */
-		"or", /* G_UNICODE_SCRIPT_ORIYA */
-		"", /* G_UNICODE_SCRIPT_RUNIC */
-		"si", /* G_UNICODE_SCRIPT_SINHALA */
-		"syr", /* G_UNICODE_SCRIPT_SYRIAC */
-		"ta", /* G_UNICODE_SCRIPT_TAMIL */
-		"te", /* G_UNICODE_SCRIPT_TELUGU */
-		"dv", /* G_UNICODE_SCRIPT_THAANA */
-		"th", /* G_UNICODE_SCRIPT_THAI */
-		"bo", /* G_UNICODE_SCRIPT_TIBETAN */
-		"iu", /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
-		"", /* G_UNICODE_SCRIPT_YI */
-		"tl", /* G_UNICODE_SCRIPT_TAGALOG */
-		/* Phillipino languages/scripts */
-		"hnn", /* G_UNICODE_SCRIPT_HANUNOO */
-		"bku", /* G_UNICODE_SCRIPT_BUHID */
-		"tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
-
-		"", /* G_UNICODE_SCRIPT_BRAILLE */
-		"", /* G_UNICODE_SCRIPT_CYPRIOT */
-		"", /* G_UNICODE_SCRIPT_LIMBU */
-		/* Used for Somali (so) in the past */
-		"", /* G_UNICODE_SCRIPT_OSMANYA */
-		/* The Shavian alphabet was designed for English */
-		"", /* G_UNICODE_SCRIPT_SHAVIAN */
-		"", /* G_UNICODE_SCRIPT_LINEAR_B */
-		"", /* G_UNICODE_SCRIPT_TAI_LE */
-		"uga", /* G_UNICODE_SCRIPT_UGARITIC */
-
-		"", /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
-		"bug", /* G_UNICODE_SCRIPT_BUGINESE */
-		/* The original script for Old Church Slavonic (chu), later
-		 * written with Cyrillic */
-		"", /* G_UNICODE_SCRIPT_GLAGOLITIC */
-		/* Used for for Berber (ber), but Arabic script is more common */
-		"", /* G_UNICODE_SCRIPT_TIFINAGH */
-		"syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
-		"peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
-		"", /* G_UNICODE_SCRIPT_KHAROSHTHI */
-
-		"", /* G_UNICODE_SCRIPT_UNKNOWN */
-		"", /* G_UNICODE_SCRIPT_BALINESE */
-		"", /* G_UNICODE_SCRIPT_CUNEIFORM */
-		"", /* G_UNICODE_SCRIPT_PHOENICIAN */
-		"", /* G_UNICODE_SCRIPT_PHAGS_PA */
-		"nqo" /* G_UNICODE_SCRIPT_NKO */
-	};
-	const gchar *sel;
 
 	if (part != NULL) {
-		if (part->is_utf && (part->script == G_UNICODE_SCRIPT_UNKNOWN ||
-			part->script == G_UNICODE_SCRIPT_COMMON)) {
-			/* Try to detect encoding by several symbols */
-			const gchar *p, *pp;
-			gunichar c;
-			gint32 remain = part->content->len, max = 0, processed = 0;
-			gint32 scripts[G_UNICODE_SCRIPT_NKO];
-			GUnicodeScript scc, sel;
-
-			p = part->content->data;
-			memset (scripts, 0, sizeof (scripts));
-
-			while (remain > 0 && processed < 10) {
-				c = g_utf8_get_char_validated (p, remain);
-				if (c == (gunichar) -2 || c == (gunichar) -1) {
-					break;
-				}
-				scc = g_unichar_get_script (c);
-				if (scc < (gint)G_N_ELEMENTS (scripts)) {
-					scripts[scc]++;
-				}
-				pp = g_utf8_next_char (p);
-				remain -= pp - p;
-				p = pp;
-				processed ++;
-			}
-			for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
-				if (scripts[remain] > max) {
-					max = scripts[remain];
-					sel = remain;
-				}
-			}
-			part->script = sel;
-		}
-
-		if (part->script > 0 && part->script <
-			(gint)G_N_ELEMENTS (languages)) {
-			sel = languages[part->script];
-			if (*sel != '\0') {
-				lua_pushstring (L, sel);
-				return 1;
-			}
+		if (part->lang_code != NULL && part->lang_code[0] != '\0') {
+			lua_pushstring (L, part->lang_code);
+			return 1;
 		}
 	}
 
```