diff options
author | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-23 15:28:27 +0000 |
---|---|---|
committer | Vsevolod Stakhov <vsevolod@highsecure.ru> | 2014-12-23 15:28:27 +0000 |
commit | 3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f (patch) | |
tree | 5868e328c9ed4226a52fd0bd12dd8dfc0e4ec5f0 /src/libmime | |
parent | 39b8dcb94620669ae369ab559175dde1a5c103b7 (diff) | |
download | rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.tar.gz rspamd-3cef9e6aef6fe8c9c7bb2aa05bce0738e0ab826f.zip |
Rework language detection.
Diffstat (limited to 'src/libmime')
-rw-r--r-- | src/libmime/message.c | 175 | ||||
-rw-r--r-- | src/libmime/message.h | 4 |
2 files changed, 149 insertions, 30 deletions
diff --git a/src/libmime/message.c b/src/libmime/message.c index 6140f3c24..3c5f83047 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -1025,6 +1025,148 @@ convert_text_to_utf (struct rspamd_task *task, return result_array; } +struct language_match { + const char *code; + const char *name; + GUnicodeScript script; +}; + +static int +language_elts_cmp (const void *a, const void *b) +{ + GUnicodeScript sc = *(const GUnicodeScript *)a; + const struct language_match *bb = (const struct language_match *)b; + + return (sc - bb->script); +} + +static void +detect_text_language (struct mime_text_part *part) +{ + /* Keep sorted */ + static const struct language_match language_codes[] = { + { "", "english", G_UNICODE_SCRIPT_COMMON }, + { "", "", G_UNICODE_SCRIPT_INHERITED }, + { "ar", "arabic", G_UNICODE_SCRIPT_ARABIC }, + { "hy", "armenian", G_UNICODE_SCRIPT_ARMENIAN }, + { "bn", "chineese", G_UNICODE_SCRIPT_BENGALI }, + { "", "", G_UNICODE_SCRIPT_BOPOMOFO }, + { "chr", "", G_UNICODE_SCRIPT_CHEROKEE }, + { "cop", "", G_UNICODE_SCRIPT_COPTIC }, + { "ru", "russian", G_UNICODE_SCRIPT_CYRILLIC }, + /* Deseret was used to write English */ + { "", "", G_UNICODE_SCRIPT_DESERET }, + { "hi", "", G_UNICODE_SCRIPT_DEVANAGARI }, + { "am", "", G_UNICODE_SCRIPT_ETHIOPIC }, + { "ka", "", G_UNICODE_SCRIPT_GEORGIAN }, + { "", "", G_UNICODE_SCRIPT_GOTHIC }, + { "el", "greek", G_UNICODE_SCRIPT_GREEK }, + { "gu", "", G_UNICODE_SCRIPT_GUJARATI }, + { "pa", "", G_UNICODE_SCRIPT_GURMUKHI }, + { "han", "chineese", G_UNICODE_SCRIPT_HAN }, + { "ko", "", G_UNICODE_SCRIPT_HANGUL }, + { "he", "hebrew", G_UNICODE_SCRIPT_HEBREW }, + { "ja", "", G_UNICODE_SCRIPT_HIRAGANA }, + { "kn", "", G_UNICODE_SCRIPT_KANNADA }, + { "ja", "", G_UNICODE_SCRIPT_KATAKANA }, + { "km", "", G_UNICODE_SCRIPT_KHMER }, + { "lo", "", G_UNICODE_SCRIPT_LAO }, + { "en", "english", G_UNICODE_SCRIPT_LATIN }, + { "ml", "", G_UNICODE_SCRIPT_MALAYALAM }, + { "mn", "", G_UNICODE_SCRIPT_MONGOLIAN }, + { "my", "", G_UNICODE_SCRIPT_MYANMAR }, + /* Ogham was used to write old Irish */ + { "", "", G_UNICODE_SCRIPT_OGHAM }, + { "", "", G_UNICODE_SCRIPT_OLD_ITALIC }, + { "or", "", G_UNICODE_SCRIPT_ORIYA }, + { "", "", G_UNICODE_SCRIPT_RUNIC }, + { "si", "", G_UNICODE_SCRIPT_SINHALA }, + { "syr", "", G_UNICODE_SCRIPT_SYRIAC }, + { "ta", "", G_UNICODE_SCRIPT_TAMIL }, + { "te", "", G_UNICODE_SCRIPT_TELUGU }, + { "dv", "", G_UNICODE_SCRIPT_THAANA }, + { "th", "", G_UNICODE_SCRIPT_THAI }, + { "bo", "", G_UNICODE_SCRIPT_TIBETAN }, + { "iu", "", G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL }, + { "", "", G_UNICODE_SCRIPT_YI }, + { "tl", "", G_UNICODE_SCRIPT_TAGALOG }, + /* Phillipino languages/scripts */ + { "hnn", "", G_UNICODE_SCRIPT_HANUNOO }, + { "bku", "", G_UNICODE_SCRIPT_BUHID }, + { "tbw", "", G_UNICODE_SCRIPT_TAGBANWA }, + + { "", "", G_UNICODE_SCRIPT_BRAILLE }, + { "", "", G_UNICODE_SCRIPT_CYPRIOT }, + { "", "", G_UNICODE_SCRIPT_LIMBU }, + /* Used for Somali (so) in the past */ + { "", "", G_UNICODE_SCRIPT_OSMANYA }, + /* The Shavian alphabet was designed for English */ + { "", "", G_UNICODE_SCRIPT_SHAVIAN }, + { "", "", G_UNICODE_SCRIPT_LINEAR_B }, + { "", "", G_UNICODE_SCRIPT_TAI_LE }, + { "uga", "", G_UNICODE_SCRIPT_UGARITIC }, + { "", "", G_UNICODE_SCRIPT_NEW_TAI_LUE }, + { "bug", "", G_UNICODE_SCRIPT_BUGINESE }, + { "", "", G_UNICODE_SCRIPT_GLAGOLITIC }, + /* Used for for Berber (ber), but Arabic script is more common */ + { "", "", G_UNICODE_SCRIPT_TIFINAGH }, + { "syl", "", G_UNICODE_SCRIPT_SYLOTI_NAGRI }, + { "peo", "", G_UNICODE_SCRIPT_OLD_PERSIAN }, + { "", "", G_UNICODE_SCRIPT_KHAROSHTHI }, + { "", "", G_UNICODE_SCRIPT_UNKNOWN }, + { "", "", G_UNICODE_SCRIPT_BALINESE }, + { "", "", G_UNICODE_SCRIPT_CUNEIFORM }, + { "", "", G_UNICODE_SCRIPT_PHOENICIAN }, + { "", "", G_UNICODE_SCRIPT_PHAGS_PA }, + { "nqo", "", G_UNICODE_SCRIPT_NKO } + }; + const struct language_match *lm; + const int max_chars = 32; + + if (part != NULL) { + if (part->is_utf) { + /* Try to detect encoding by several symbols */ + const gchar *p, *pp; + gunichar c; + gint32 remain = part->content->len, max = 0, processed = 0; + gint32 scripts[G_N_ELEMENTS (language_codes)]; + GUnicodeScript scc, sel; + + p = part->content->data; + memset (scripts, 0, sizeof (scripts)); + + while (remain > 0 && processed < max_chars) { + c = g_utf8_get_char_validated (p, remain); + if (c == (gunichar) -2 || c == (gunichar) -1) { + break; + } + if (g_unichar_isalpha (c)) { + scc = g_unichar_get_script (c); + if (scc < (gint)G_N_ELEMENTS (scripts)) { + scripts[scc]++; + } + processed ++; + } + pp = g_utf8_next_char (p); + remain -= pp - p; + p = pp; + } + for (remain = 0; remain < (gint)G_N_ELEMENTS (scripts); remain++) { + if (scripts[remain] > max) { + max = scripts[remain]; + sel = remain; + } + } + part->script = sel; + lm = bsearch (&sel, language_codes, G_N_ELEMENTS (language_codes), + sizeof (language_codes[0]), &language_elts_cmp); + + part->lang_code = lm->code; + part->language = lm->name; + } + } +} + static void process_text_part (struct rspamd_task *task, GByteArray *part_content, @@ -1035,9 +1177,6 @@ process_text_part (struct rspamd_task *task, { struct mime_text_part *text_part; const gchar *cd; - gchar *pos; - gsize l; - rspamd_fstring_t token, buf; /* Skip attachements */ #ifndef GMIME24 @@ -1128,32 +1267,10 @@ process_text_part (struct rspamd_task *task, } /* Post process part */ - buf.begin = text_part->content->data; - buf.len = text_part->content->len; - buf.size = buf.len; - token.begin = NULL; - token.len = 0; - - text_part->words = g_array_new (FALSE, FALSE, sizeof (rspamd_fstring_t)); - while ((pos = rspamd_tokenizer_get_word (&buf, - &token, &text_part->urls_offset)) != NULL) { - if (text_part->is_utf) { - l = g_utf8_strlen (token.begin, token.len); - } - else { - l = token.len; - } - /* - * XXX: make this configurable - */ - if (l < 4) { - token.begin = pos; - continue; - } - g_array_append_val (text_part->words, token); - - token.begin = pos; - } + detect_text_language (text_part); + text_part->words = rspamd_tokenize_text (text_part->content->data, + text_part->content->len, text_part->is_utf, 4, + &text_part->urls_offset); } #ifdef GMIME24 diff --git a/src/libmime/message.h b/src/libmime/message.h index 8287db9b0..7b3e03883 100644 --- a/src/libmime/message.h +++ b/src/libmime/message.h @@ -26,6 +26,9 @@ struct mime_text_part { gboolean is_balanced; gboolean is_empty; gboolean is_utf; + GUnicodeScript script; + const gchar *lang_code; + const gchar *language; const gchar *real_charset; GByteArray *orig; GByteArray *content; @@ -34,7 +37,6 @@ struct mime_text_part { rspamd_fuzzy_t *fuzzy; rspamd_fuzzy_t *double_fuzzy; GMimeObject *parent; - GUnicodeScript script; rspamd_fstring_t *diff_str; GArray *words; }; |