summaryrefslogtreecommitdiffstats
path: root/src/lua
diff options
context:
space:
mode:
author Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-23 15:28:27 +0000
committer Vsevolod Stakhov <vsevolod@highsecure.ru> 2014-12-23 15:28:27 +0000
commit 3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (patch)
tree 5868e328c9ed4226a52fd0bd12dd8dfc0e4ec5f0 /src/lua
parent 39b8dcb94620669ae369ab559175dde1a5c103b7 (diff)
download rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.tar.gz
rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.zip
Rework language detection.
Diffstat (limited to 'src/lua')
-rw-r--r-- src/lua/lua_task.c 128
1 files changed, 3 insertions, 125 deletions
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 9a61dd5c5..df7640e46 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -1989,133 +1989,11 @@ static gint
lua_textpart_get_language (lua_State * L)
{
struct mime_text_part *part = lua_check_textpart (L);
- static const gchar languages[][4] = {
- "", /* G_UNICODE_SCRIPT_COMMON */
- "", /* G_UNICODE_SCRIPT_INHERITED */
- "ar", /* G_UNICODE_SCRIPT_ARABIC */
- "hy", /* G_UNICODE_SCRIPT_ARMENIAN */
- "bn", /* G_UNICODE_SCRIPT_BENGALI */
- /* Used primarily in Taiwan, but not part of the standard
- * zh-tw orthography */
- "", /* G_UNICODE_SCRIPT_BOPOMOFO */
- "chr", /* G_UNICODE_SCRIPT_CHEROKEE */
- "cop", /* G_UNICODE_SCRIPT_COPTIC */
- "ru", /* G_UNICODE_SCRIPT_CYRILLIC */
- /* Deseret was used to write English */
- "", /* G_UNICODE_SCRIPT_DESERET */
- "hi", /* G_UNICODE_SCRIPT_DEVANAGARI */
- "am", /* G_UNICODE_SCRIPT_ETHIOPIC */
- "ka", /* G_UNICODE_SCRIPT_GEORGIAN */
- "", /* G_UNICODE_SCRIPT_GOTHIC */
- "el", /* G_UNICODE_SCRIPT_GREEK */
- "gu", /* G_UNICODE_SCRIPT_GUJARATI */
- "pa", /* G_UNICODE_SCRIPT_GURMUKHI */
- "han", /* G_UNICODE_SCRIPT_HAN */
- "ko", /* G_UNICODE_SCRIPT_HANGUL */
- "he", /* G_UNICODE_SCRIPT_HEBREW */
- "ja", /* G_UNICODE_SCRIPT_HIRAGANA */
- "kn", /* G_UNICODE_SCRIPT_KANNADA */
- "ja", /* G_UNICODE_SCRIPT_KATAKANA */
- "km", /* G_UNICODE_SCRIPT_KHMER */
- "lo", /* G_UNICODE_SCRIPT_LAO */
- "en", /* G_UNICODE_SCRIPT_LATIN */
- "ml", /* G_UNICODE_SCRIPT_MALAYALAM */
- "mn", /* G_UNICODE_SCRIPT_MONGOLIAN */
- "my", /* G_UNICODE_SCRIPT_MYANMAR */
- /* Ogham was used to write old Irish */
- "", /* G_UNICODE_SCRIPT_OGHAM */
- "", /* G_UNICODE_SCRIPT_OLD_ITALIC */
- "or", /* G_UNICODE_SCRIPT_ORIYA */
- "", /* G_UNICODE_SCRIPT_RUNIC */
- "si", /* G_UNICODE_SCRIPT_SINHALA */
- "syr", /* G_UNICODE_SCRIPT_SYRIAC */
- "ta", /* G_UNICODE_SCRIPT_TAMIL */
- "te", /* G_UNICODE_SCRIPT_TELUGU */
- "dv", /* G_UNICODE_SCRIPT_THAANA */
- "th", /* G_UNICODE_SCRIPT_THAI */
- "bo", /* G_UNICODE_SCRIPT_TIBETAN */
- "iu", /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
- "", /* G_UNICODE_SCRIPT_YI */
- "tl", /* G_UNICODE_SCRIPT_TAGALOG */
- /* Phillipino languages/scripts */
- "hnn", /* G_UNICODE_SCRIPT_HANUNOO */
- "bku", /* G_UNICODE_SCRIPT_BUHID */
- "tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
-
- "", /* G_UNICODE_SCRIPT_BRAILLE */
- "", /* G_UNICODE_SCRIPT_CYPRIOT */
- "", /* G_UNICODE_SCRIPT_LIMBU */
- /* Used for Somali (so) in the past */
- "", /* G_UNICODE_SCRIPT_OSMANYA */
- /* The Shavian alphabet was designed for English */
- "", /* G_UNICODE_SCRIPT_SHAVIAN */
- "", /* G_UNICODE_SCRIPT_LINEAR_B */
- "", /* G_UNICODE_SCRIPT_TAI_LE */
- "uga", /* G_UNICODE_SCRIPT_UGARITIC */
-
- "", /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
- "bug", /* G_UNICODE_SCRIPT_BUGINESE */
- /* The original script for Old Church Slavonic (chu), later
- * written with Cyrillic */
- "", /* G_UNICODE_SCRIPT_GLAGOLITIC */
- /* Used for for Berber (ber), but Arabic script is more common */
- "", /* G_UNICODE_SCRIPT_TIFINAGH */
- "syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
- "peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
- "", /* G_UNICODE_SCRIPT_KHAROSHTHI */
-
- "", /* G_UNICODE_SCRIPT_UNKNOWN */
- "", /* G_UNICODE_SCRIPT_BALINESE */
- "", /* G_UNICODE_SCRIPT_CUNEIFORM */
- "", /* G_UNICODE_SCRIPT_PHOENICIAN */
- "", /* G_UNICODE_SCRIPT_PHAGS_PA */
- "nqo" /* G_UNICODE_SCRIPT_NKO */
- };
- const gchar *sel;
if (part != NULL) {
- if (part->is_utf && (part->script == G_UNICODE_SCRIPT_UNKNOWN ||
- part->script == G_UNICODE_SCRIPT_COMMON)) {
- /* Try to detect encoding by several symbols */
- const gchar *p, *pp;
- gunichar c;
- gint32 remain = part->content->len, max = 0, processed = 0;
- gint32 scripts[G_UNICODE_SCRIPT_NKO];
- GUnicodeScript scc, sel;
-
- p = part->content->data;
- memset (scripts, 0, sizeof (scripts));
-
- while (remain > 0 && processed < 10) {
- c = g_utf8_get_char_validated (p, remain);
- if (c == (gunichar) -2 || c == (gunichar) -1) {
- break;
- }
- scc = g_unichar_get_script (c);
- if (scc < (gint)G_N_ELEMENTS (scripts)) {
- scripts[scc]++;
- }
- pp = g_utf8_next_char (p);
- remain -= pp - p;
- p = pp;
- processed ++;
- }
- for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
- if (scripts[remain] > max) {
- max = scripts[remain];
- sel = remain;
- }
- }
- part->script = sel;
- }
-
- if (part->script > 0 && part->script <
- (gint)G_N_ELEMENTS (languages)) {
- sel = languages[part->script];
- if (*sel != '\0') {
- lua_pushstring (L, sel);
- return 1;
- }
+ if (part->lang_code != NULL && part->lang_code[0] != '\0') {
+ lua_pushstring (L, part->lang_code);
+ return 1;
}
}