aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorVsevolod Stakhov <vsevolod@highsecure.ru>2014-12-23 15:28:27 +0000
committerVsevolod Stakhov <vsevolod@highsecure.ru>2014-12-23 15:28:27 +0000
commit3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (patch)
tree5868e328c9ed4226a52fd0bd12dd8dfc0e4ec5f0 /src
parent39b8dcb94620669ae369ab559175dde1a5c103b7 (diff)
downloadrspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.tar.gz
rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.zip
Rework language detection.
Diffstat (limited to 'src')
-rw-r--r--src/libmime/message.c175
-rw-r--r--src/libmime/message.h4
-rw-r--r--src/lua/lua_task.c128
3 files changed, 152 insertions, 155 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c
index 6140f3c24..3c5f83047 100644
--- a/src/libmime/message.c
+++ b/src/libmime/message.c
@@ -1025,6 +1025,148 @@ convert_text_to_utf (struct rspamd_task *task,
return result_array;
}
+struct language_match {
+ const char *code;
+ const char *name;
+ GUnicodeScript script;
+};
+
+static int
+language_elts_cmp (const void *a, const void *b)
+{
+ GUnicodeScript sc = *(const GUnicodeScript *)a;
+ const struct language_match *bb = (const struct language_match *)b;
+
+ return (sc - bb->script);
+}
+
+static void
+detect_text_language (struct mime_text_part *part)
+{
+ /* Keep sorted */
+ static const struct language_match language_codes[] = {
+ { "", "english", G_UNICODE_SCRIPT_COMMON },
+ { "", "", G_UNICODE_SCRIPT_INHERITED },
+ { "ar", "arabic", G_UNICODE_SCRIPT_ARABIC },
+ { "hy", "armenian", G_UNICODE_SCRIPT_ARMENIAN },
+ { "bn", "chineese", G_UNICODE_SCRIPT_BENGALI },
+ { "", "", G_UNICODE_SCRIPT_BOPOMOFO },
+ { "chr", "", G_UNICODE_SCRIPT_CHEROKEE },
+ { "cop", "", G_UNICODE_SCRIPT_COPTIC },
+ { "ru", "russian", G_UNICODE_SCRIPT_CYRILLIC },
+ /* Deseret was used to write English */
+ { "", "", G_UNICODE_SCRIPT_DESERET },
+ { "hi", "", G_UNICODE_SCRIPT_DEVANAGARI },
+ { "am", "", G_UNICODE_SCRIPT_ETHIOPIC },
+ { "ka", "", G_UNICODE_SCRIPT_GEORGIAN },
+ { "", "", G_UNICODE_SCRIPT_GOTHIC },
+ { "el", "greek", G_UNICODE_SCRIPT_GREEK },
+ { "gu", "", G_UNICODE_SCRIPT_GUJARATI },
+ { "pa", "", G_UNICODE_SCRIPT_GURMUKHI },
+ { "han", "chineese", G_UNICODE_SCRIPT_HAN },
+ { "ko", "", G_UNICODE_SCRIPT_HANGUL },
+ { "he", "hebrew", G_UNICODE_SCRIPT_HEBREW },
+ { "ja", "", G_UNICODE_SCRIPT_HIRAGANA },
+ { "kn", "", G_UNICODE_SCRIPT_KANNADA },
+ { "ja", "", G_UNICODE_SCRIPT_KATAKANA },
+ { "km", "", G_UNICODE_SCRIPT_KHMER },
+ { "lo", "", G_UNICODE_SCRIPT_LAO },
+ { "en", "english", G_UNICODE_SCRIPT_LATIN },
+ { "ml", "", G_UNICODE_SCRIPT_MALAYALAM },
+ { "mn", "", G_UNICODE_SCRIPT_MONGOLIAN },
+ { "my", "", G_UNICODE_SCRIPT_MYANMAR },
+ /* Ogham was used to write old Irish */
+ { "", "", G_UNICODE_SCRIPT_OGHAM },
+ { "", "", G_UNICODE_SCRIPT_OLD_ITALIC },
+ { "or", "", G_UNICODE_SCRIPT_ORIYA },
+ { "", "", G_UNICODE_SCRIPT_RUNIC },
+ { "si", "", G_UNICODE_SCRIPT_SINHALA },
+ { "syr", "", G_UNICODE_SCRIPT_SYRIAC },
+ { "ta", "", G_UNICODE_SCRIPT_TAMIL },
+ { "te", "", G_UNICODE_SCRIPT_TELUGU },
+ { "dv", "", G_UNICODE_SCRIPT_THAANA },
+ { "th", "", G_UNICODE_SCRIPT_THAI },
+ { "bo", "", G_UNICODE_SCRIPT_TIBETAN },
+ { "iu", "", G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL },
+ { "", "", G_UNICODE_SCRIPT_YI },
+ { "tl", "", G_UNICODE_SCRIPT_TAGALOG },
+ /* Phillipino languages/scripts */
+ { "hnn", "", G_UNICODE_SCRIPT_HANUNOO },
+ { "bku", "", G_UNICODE_SCRIPT_BUHID },
+ { "tbw", "", G_UNICODE_SCRIPT_TAGBANWA },
+
+ { "", "", G_UNICODE_SCRIPT_BRAILLE },
+ { "", "", G_UNICODE_SCRIPT_CYPRIOT },
+ { "", "", G_UNICODE_SCRIPT_LIMBU },
+ /* Used for Somali (so) in the past */
+ { "", "", G_UNICODE_SCRIPT_OSMANYA },
+ /* The Shavian alphabet was designed for English */
+ { "", "", G_UNICODE_SCRIPT_SHAVIAN },
+ { "", "", G_UNICODE_SCRIPT_LINEAR_B },
+ { "", "", G_UNICODE_SCRIPT_TAI_LE },
+ { "uga", "", G_UNICODE_SCRIPT_UGARITIC },
+ { "", "", G_UNICODE_SCRIPT_NEW_TAI_LUE },
+ { "bug", "", G_UNICODE_SCRIPT_BUGINESE },
+ { "", "", G_UNICODE_SCRIPT_GLAGOLITIC },
+ /* Used for for Berber (ber), but Arabic script is more common */
+ { "", "", G_UNICODE_SCRIPT_TIFINAGH },
+ { "syl", "", G_UNICODE_SCRIPT_SYLOTI_NAGRI },
+ { "peo", "", G_UNICODE_SCRIPT_OLD_PERSIAN },
+ { "", "", G_UNICODE_SCRIPT_KHAROSHTHI },
+ { "", "", G_UNICODE_SCRIPT_UNKNOWN },
+ { "", "", G_UNICODE_SCRIPT_BALINESE },
+ { "", "", G_UNICODE_SCRIPT_CUNEIFORM },
+ { "", "", G_UNICODE_SCRIPT_PHOENICIAN },
+ { "", "", G_UNICODE_SCRIPT_PHAGS_PA },
+ { "nqo", "", G_UNICODE_SCRIPT_NKO }
+ };
+ const struct language_match *lm;
+ const int max_chars = 32;
+
+ if (part != NULL) {
+ if (part->is_utf) {
+ /* Try to detect encoding by several symbols */
+ const gchar *p, *pp;
+ gunichar c;
+ gint32 remain = part->content->len, max = 0, processed = 0;
+ gint32 scripts[G_N_ELEMENTS (language_codes)];
+ GUnicodeScript scc, sel;
+
+ p = part->content->data;
+ memset (scripts, 0, sizeof (scripts));
+
+ while (remain > 0 && processed < max_chars) {
+ c = g_utf8_get_char_validated (p, remain);
+ if (c == (gunichar) -2 || c == (gunichar) -1) {
+ break;
+ }
+ if (g_unichar_isalpha (c)) {
+ scc = g_unichar_get_script (c);
+ if (scc < (gint)G_N_ELEMENTS (scripts)) {
+ scripts[scc]++;
+ }
+ processed ++;
+ }
+ pp = g_utf8_next_char (p);
+ remain -= pp - p;
+ p = pp;
+ }
+ for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
+ if (scripts[remain] > max) {
+ max = scripts[remain];
+ sel = remain;
+ }
+ }
+ part->script = sel;
+ lm = bsearch (&sel, language_codes, G_N_ELEMENTS (language_codes),
+ sizeof (language_codes[0]), &language_elts_cmp);
+
+ part->lang_code = lm->code;
+ part->language = lm->name;
+ }
+ }
+}
+
static void
process_text_part (struct rspamd_task *task,
GByteArray *part_content,
@@ -1035,9 +1177,6 @@ process_text_part (struct rspamd_task *task,
{
struct mime_text_part *text_part;
const gchar *cd;
- gchar *pos;
- gsize l;
- rspamd_fstring_t token, buf;
/* Skip attachements */
#ifndef GMIME24
@@ -1128,32 +1267,10 @@ process_text_part (struct rspamd_task *task,
}
/* Post process part */
- buf.begin = text_part->content->data;
- buf.len = text_part->content->len;
- buf.size = buf.len;
- token.begin = NULL;
- token.len = 0;
-
- text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t));
- while ((pos = rspamd_tokenizer_get_word (&buf,
- &token, &text_part->urls_offset)) != NULL) {
- if (text_part->is_utf) {
- l = g_utf8_strlen (token.begin, token.len);
- }
- else {
- l = token.len;
- }
- /*
- * XXX: make this configurable
- */
- if (l < 4) {
- token.begin = pos;
- continue;
- }
- g_array_append_val (text_part->words, token);
-
- token.begin = pos;
- }
+ detect_text_language (text_part);
+ text_part->words = rspamd_tokenize_text (text_part->content->data,
+ text_part->content->len, text_part->is_utf, 4,
+ &text_part->urls_offset);
}
#ifdef GMIME24
diff --git a/src/libmime/message.h b/src/libmime/message.h
index 8287db9b0..7b3e03883 100644
--- a/src/libmime/message.h
+++ b/src/libmime/message.h
@@ -26,6 +26,9 @@ struct mime_text_part {
gboolean is_balanced;
gboolean is_empty;
gboolean is_utf;
+ GUnicodeScript script;
+ const gchar *lang_code;
+ const gchar *language;
const gchar *real_charset;
GByteArray *orig;
GByteArray *content;
@@ -34,7 +37,6 @@ struct mime_text_part {
rspamd_fuzzy_t *fuzzy;
rspamd_fuzzy_t *double_fuzzy;
GMimeObject *parent;
- GUnicodeScript script;
rspamd_fstring_t *diff_str;
GArray *words;
};
diff --git a/src/lua/lua_task.c b/src/lua/lua_task.c
index 9a61dd5c5..df7640e46 100644
--- a/src/lua/lua_task.c
+++ b/src/lua/lua_task.c
@@ -1989,133 +1989,11 @@ static gint
lua_textpart_get_language (lua_State * L)
{
struct mime_text_part *part = lua_check_textpart (L);
- static const gchar languages[][4] = {
- "", /* G_UNICODE_SCRIPT_COMMON */
- "", /* G_UNICODE_SCRIPT_INHERITED */
- "ar", /* G_UNICODE_SCRIPT_ARABIC */
- "hy", /* G_UNICODE_SCRIPT_ARMENIAN */
- "bn", /* G_UNICODE_SCRIPT_BENGALI */
- /* Used primarily in Taiwan, but not part of the standard
- * zh-tw orthography */
- "", /* G_UNICODE_SCRIPT_BOPOMOFO */
- "chr", /* G_UNICODE_SCRIPT_CHEROKEE */
- "cop", /* G_UNICODE_SCRIPT_COPTIC */
- "ru", /* G_UNICODE_SCRIPT_CYRILLIC */
- /* Deseret was used to write English */
- "", /* G_UNICODE_SCRIPT_DESERET */
- "hi", /* G_UNICODE_SCRIPT_DEVANAGARI */
- "am", /* G_UNICODE_SCRIPT_ETHIOPIC */
- "ka", /* G_UNICODE_SCRIPT_GEORGIAN */
- "", /* G_UNICODE_SCRIPT_GOTHIC */
- "el", /* G_UNICODE_SCRIPT_GREEK */
- "gu", /* G_UNICODE_SCRIPT_GUJARATI */
- "pa", /* G_UNICODE_SCRIPT_GURMUKHI */
- "han", /* G_UNICODE_SCRIPT_HAN */
- "ko", /* G_UNICODE_SCRIPT_HANGUL */
- "he", /* G_UNICODE_SCRIPT_HEBREW */
- "ja", /* G_UNICODE_SCRIPT_HIRAGANA */
- "kn", /* G_UNICODE_SCRIPT_KANNADA */
- "ja", /* G_UNICODE_SCRIPT_KATAKANA */
- "km", /* G_UNICODE_SCRIPT_KHMER */
- "lo", /* G_UNICODE_SCRIPT_LAO */
- "en", /* G_UNICODE_SCRIPT_LATIN */
- "ml", /* G_UNICODE_SCRIPT_MALAYALAM */
- "mn", /* G_UNICODE_SCRIPT_MONGOLIAN */
- "my", /* G_UNICODE_SCRIPT_MYANMAR */
- /* Ogham was used to write old Irish */
- "", /* G_UNICODE_SCRIPT_OGHAM */
- "", /* G_UNICODE_SCRIPT_OLD_ITALIC */
- "or", /* G_UNICODE_SCRIPT_ORIYA */
- "", /* G_UNICODE_SCRIPT_RUNIC */
- "si", /* G_UNICODE_SCRIPT_SINHALA */
- "syr", /* G_UNICODE_SCRIPT_SYRIAC */
- "ta", /* G_UNICODE_SCRIPT_TAMIL */
- "te", /* G_UNICODE_SCRIPT_TELUGU */
- "dv", /* G_UNICODE_SCRIPT_THAANA */
- "th", /* G_UNICODE_SCRIPT_THAI */
- "bo", /* G_UNICODE_SCRIPT_TIBETAN */
- "iu", /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */
- "", /* G_UNICODE_SCRIPT_YI */
- "tl", /* G_UNICODE_SCRIPT_TAGALOG */
- /* Phillipino languages/scripts */
- "hnn", /* G_UNICODE_SCRIPT_HANUNOO */
- "bku", /* G_UNICODE_SCRIPT_BUHID */
- "tbw", /* G_UNICODE_SCRIPT_TAGBANWA */
-
- "", /* G_UNICODE_SCRIPT_BRAILLE */
- "", /* G_UNICODE_SCRIPT_CYPRIOT */
- "", /* G_UNICODE_SCRIPT_LIMBU */
- /* Used for Somali (so) in the past */
- "", /* G_UNICODE_SCRIPT_OSMANYA */
- /* The Shavian alphabet was designed for English */
- "", /* G_UNICODE_SCRIPT_SHAVIAN */
- "", /* G_UNICODE_SCRIPT_LINEAR_B */
- "", /* G_UNICODE_SCRIPT_TAI_LE */
- "uga", /* G_UNICODE_SCRIPT_UGARITIC */
-
- "", /* G_UNICODE_SCRIPT_NEW_TAI_LUE */
- "bug", /* G_UNICODE_SCRIPT_BUGINESE */
- /* The original script for Old Church Slavonic (chu), later
- * written with Cyrillic */
- "", /* G_UNICODE_SCRIPT_GLAGOLITIC */
- /* Used for for Berber (ber), but Arabic script is more common */
- "", /* G_UNICODE_SCRIPT_TIFINAGH */
- "syl", /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */
- "peo", /* G_UNICODE_SCRIPT_OLD_PERSIAN */
- "", /* G_UNICODE_SCRIPT_KHAROSHTHI */
-
- "", /* G_UNICODE_SCRIPT_UNKNOWN */
- "", /* G_UNICODE_SCRIPT_BALINESE */
- "", /* G_UNICODE_SCRIPT_CUNEIFORM */
- "", /* G_UNICODE_SCRIPT_PHOENICIAN */
- "", /* G_UNICODE_SCRIPT_PHAGS_PA */
- "nqo" /* G_UNICODE_SCRIPT_NKO */
- };
- const gchar *sel;
if (part != NULL) {
- if (part->is_utf && (part->script == G_UNICODE_SCRIPT_UNKNOWN ||
- part->script == G_UNICODE_SCRIPT_COMMON)) {
- /* Try to detect encoding by several symbols */
- const gchar *p, *pp;
- gunichar c;
- gint32 remain = part->content->len, max = 0, processed = 0;
- gint32 scripts[G_UNICODE_SCRIPT_NKO];
- GUnicodeScript scc, sel;
-
- p = part->content->data;
- memset (scripts, 0, sizeof (scripts));
-
- while (remain > 0 && processed < 10) {
- c = g_utf8_get_char_validated (p, remain);
- if (c == (gunichar) -2 || c == (gunichar) -1) {
- break;
- }
- scc = g_unichar_get_script (c);
- if (scc < (gint)G_N_ELEMENTS (scripts)) {
- scripts[scc]++;
- }
- pp = g_utf8_next_char (p);
- remain -= pp - p;
- p = pp;
- processed ++;
- }
- for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) {
- if (scripts[remain] > max) {
- max = scripts[remain];
- sel = remain;
- }
- }
- part->script = sel;
- }
-
- if (part->script > 0 && part->script <
- (gint)G_N_ELEMENTS (languages)) {
- sel = languages[part->script];
- if (*sel != '\0') {
- lua_pushstring (L, sel);
- return 1;
- }
+ if (part->lang_code != NULL && part->lang_code[0] != '\0') {
+ lua_pushstring (L, part->lang_code);
+ return 1;
}
}