@@ -20,11 +20,32 @@ | |||
#include "config.h" | |||
#include "libserver/cfg_file.h" | |||
#include "libstat/stat_api.h" | |||
#include "libmime/message.h" | |||
struct rspamd_lang_detector; | |||
struct rspamd_language_elt; | |||
struct rspamd_task; | |||
/**
 * Unicode scripts expressed as single-bit flags: each enumerator occupies its
 * own bit (bits 0 through 16), so several scripts can be OR-ed into one mask.
 * NOTE(review): presumably such a mask is what the `unicode_scripts` field of
 * struct rspamd_mime_text_part holds -- confirm against the detector code.
 */
enum rspamd_unicode_scripts { | |||
RSPAMD_UNICODE_LATIN = (1 << 0), | |||
RSPAMD_UNICODE_GREEK = (1 << 1), | |||
RSPAMD_UNICODE_CYRILLIC = (1 << 2), | |||
RSPAMD_UNICODE_HEBREW = (1 << 3), | |||
RSPAMD_UNICODE_CJK = (1 << 4), | |||
RSPAMD_UNICODE_JP = (1 << 5), | |||
RSPAMD_UNICODE_ARABIC = (1 << 6), | |||
RSPAMD_UNICODE_DEVANAGARI = (1 << 7), | |||
RSPAMD_UNICODE_THAI = (1 << 8), | |||
RSPAMD_UNICODE_ARMENIAN = (1 << 9), | |||
RSPAMD_UNICODE_GEORGIAN = (1 << 10), | |||
RSPAMD_UNICODE_GUJARATI = (1 << 11), | |||
RSPAMD_UNICODE_TAMIL = (1 << 12), | |||
RSPAMD_UNICODE_TELUGU = (1 << 13), | |||
RSPAMD_UNICODE_MALAYALAM = (1 << 14), | |||
RSPAMD_UNICODE_SINHALA = (1 << 15), | |||
RSPAMD_UNICODE_HANGUL = (1 << 16), | |||
}; | |||
struct rspamd_lang_detector_res { | |||
gdouble prob; | |||
const gchar *lang; | |||
@@ -59,8 +80,8 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, | |||
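* @param part text part to run language detection on | |||
* @return TRUE if a language was detected; the caller then reads struct rspamd_lang_detector_res entries from part->languages, taking index 0 as the part's language | |||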
*/ | |||
GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, | |||
gboolean rspamd_language_detector_detect (struct rspamd_task *task, | |||
struct rspamd_lang_detector *d, | |||
GArray *ucs_tokens); | |||
struct rspamd_mime_text_part *part); | |||
#endif |
@@ -188,12 +188,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, | |||
} | |||
} | |||
static guint | |||
static void | |||
rspamd_mime_part_create_words (struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part) | |||
{ | |||
rspamd_stat_token_t *w, ucs_w; | |||
guint i, ucs_len = 0; | |||
enum rspamd_tokenize_type tok_type; | |||
if (IS_PART_UTF (part)) { | |||
@@ -215,31 +213,8 @@ rspamd_mime_part_create_words (struct rspamd_task *task, | |||
if (part->utf_words) { | |||
part->normalized_hashes = g_array_sized_new (FALSE, FALSE, | |||
sizeof (guint64), part->utf_words->len); | |||
if (IS_PART_UTF (part) && task->lang_det) { | |||
part->unicode_words = g_array_sized_new (FALSE, FALSE, | |||
sizeof (rspamd_stat_token_t), part->utf_words->len); | |||
} | |||
if (part->unicode_words) { | |||
for (i = 0; i < part->utf_words->len; i++) { | |||
w = &g_array_index (part->utf_words, rspamd_stat_token_t, | |||
i); | |||
if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { | |||
rspamd_language_detector_to_ucs (task->lang_det, | |||
task->task_pool, | |||
w, &ucs_w); | |||
g_array_append_val (part->unicode_words, ucs_w); | |||
ucs_len += ucs_w.len; | |||
} | |||
} | |||
} | |||
} | |||
return ucs_len; | |||
} | |||
static void | |||
@@ -248,12 +223,8 @@ rspamd_mime_part_detect_language (struct rspamd_task *task, | |||
{ | |||
struct rspamd_lang_detector_res *lang; | |||
if (part->unicode_words) { | |||
part->languages = rspamd_language_detector_detect (task, | |||
task->lang_det, | |||
part->unicode_words); | |||
if (part->languages->len > 0) { | |||
if (part->utf_words) { | |||
if (rspamd_language_detector_detect (task, task->lang_det, part)) { | |||
lang = g_ptr_array_index (part->languages, 0); | |||
part->language = lang->lang; | |||
@@ -103,7 +103,6 @@ struct rspamd_mime_text_part { | |||
/* Unicode content, used by libicu */ | |||
GArray *unicode_raw_content; /* unicode raw content (of UChar) */ | |||
GArray *unicode_content; /* unicode processed content (of UChar) */ | |||
GArray *unicode_words; | |||
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ | |||
struct html_content *html; | |||
@@ -120,6 +119,7 @@ struct rspamd_mime_text_part { | |||
guint empty_lines; | |||
guint capital_letters; | |||
guint numeric_characters; | |||
guint unicode_scripts; | |||
}; | |||
enum rspamd_received_type { |
@@ -248,9 +248,6 @@ rspamd_task_free (struct rspamd_task *task) | |||
if (tp->normalized_hashes) { | |||
g_array_free (tp->normalized_hashes, TRUE); | |||
} | |||
if (tp->unicode_words) { | |||
g_array_free (tp->unicode_words, TRUE); | |||
} | |||
if (tp->languages) { | |||
g_ptr_array_unref (tp->languages); | |||
} |