diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-11-04 12:51:55 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-11-04 12:51:55 +0000 |
commit | b1cc82613d9929b2d377693b033c63c12cba91e1 (patch) | |
tree | 9a2298fb71cc07f08cc673836cc937b4992f9119 /src/lua/lua_task.c | |
parent | ecca6ad84f2f5da9a201dd37cb493be114b68402 (diff) | |
download | rspamd-b1cc82613d9929b2d377693b033c63c12cba91e1.tar.gz rspamd-b1cc82613d9929b2d377693b033c63c12cba91e1.zip |
Detect language heuristic for text parts.
Diffstat (limited to 'src/lua/lua_task.c')
-rw-r--r-- | src/lua/lua_task.c | 49 |
1 files changed, 43 insertions, 6 deletions
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 35d7b5c63..f6df133a6 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -2071,12 +2071,49 @@ lua_textpart_get_language (lua_State * L)
 	};
 	const gchar *sel;
 
-	if (part != NULL && part->script > 0 && part->script <
-			(gint)G_N_ELEMENTS (languages)) {
-		sel = languages[part->script];
-		if (*sel != '\0') {
-			lua_pushstring (L, sel);
-			return 1;
+	if (part != NULL) {
+		if (part->is_utf && (part->script == G_UNICODE_SCRIPT_UNKNOWN ||
+				part->script == G_UNICODE_SCRIPT_COMMON)) {
+			/* Try to detect encoding by several symbols */
+			const gchar *p, *pp;
+			gunichar c;
+			gint32 remain = part->content->len, max = 0, processed = 0;
+			gint32 scripts[G_UNICODE_SCRIPT_NKO];
+			GUnicodeScript scc, sel;
+
+			p = part->content->data;
+			memset (scripts, 0, sizeof (scripts));
+
+			while (remain > 0 && processed < 10) {
+				c = g_utf8_get_char_validated (p, remain);
+				if (c == (gunichar) -2 || c == (gunichar) -1) {
+					break;
+				}
+				scc = g_unichar_get_script (c);
+				if (scc < (gint)G_N_ELEMENTS (scripts)) {
+					scripts[scc]++;
+				}
+				pp = g_utf8_next_char (p);
+				remain -= pp - p;
+				p = pp;
+				processed ++;
+			}
+			for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
+				if (scripts[remain] > max) {
+					max = scripts[remain];
+					sel = remain;
+				}
+			}
+			part->script = sel;
+		}
+
+		if (part->script > 0 && part->script <
+				(gint)G_N_ELEMENTS (languages)) {
+			sel = languages[part->script];
+			if (*sel != '\0') {
+				lua_pushstring (L, sel);
+				return 1;
+			}
 		}
 	}
 