From b1cc82613d9929b2d377693b033c63c12cba91e1 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 4 Nov 2014 12:51:55 +0000 Subject: [PATCH] Detect language heuristic for text parts. --- src/lua/lua_task.c | 49 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 43 insertions(+), 6 deletions(-) diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c index 35d7b5c63..f6df133a6 100644 --- a/src/lua/lua_task.c +++ b/src/lua/lua_task.c @@ -2071,12 +2071,49 @@ lua_textpart_get_language (lua_State * L) }; const gchar *sel; - if (part != NULL && part->script > 0 && part->script < - (gint)G_N_ELEMENTS (languages)) { - sel = languages[part->script]; - if (*sel != '\0') { - lua_pushstring (L, sel); - return 1; + if (part != NULL) { + if (part->is_utf && (part->script == G_UNICODE_SCRIPT_UNKNOWN || + part->script == G_UNICODE_SCRIPT_COMMON)) { + /* Try to detect encoding by several symbols */ + const gchar *p, *pp; + gunichar c; + gint32 remain = part->content->len, max = 0, processed = 0; + gint32 scripts[G_UNICODE_SCRIPT_NKO]; + GUnicodeScript scc, sel; + + p = part->content->data; + memset (scripts, 0, sizeof (scripts)); + + while (remain > 0 && processed < 10) { + c = g_utf8_get_char_validated (p, remain); + if (c == (gunichar) -2 || c == (gunichar) -1) { + break; + } + scc = g_unichar_get_script (c); + if (scc < (gint)G_N_ELEMENTS (scripts)) { + scripts[scc]++; + } + pp = g_utf8_next_char (p); + remain -= pp - p; + p = pp; + processed ++; + } + for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) { + if (scripts[remain] > max) { + max = scripts[remain]; + sel = remain; + } + } + part->script = sel; + } + + if (part->script > 0 && part->script < + (gint)G_N_ELEMENTS (languages)) { + sel = languages[part->script]; + if (*sel != '\0') { + lua_pushstring (L, sel); + return 1; + } } } -- 2.39.5