- Store unicode in UTF parts
- Store unicode for HTML parts
- Rename struct fields and split them into unicode/utf components
tags/1.8.0
@@ -1323,7 +1323,7 @@ rspamd_language_detector_cmp_heuristic (gconstpointer a, gconstpointer b, | |||
GPtrArray * | |||
rspamd_language_detector_detect (struct rspamd_task *task, | |||
struct rspamd_lang_detector *d, | |||
GArray *ucs_tokens, gsize words_len) | |||
GArray *ucs_tokens) | |||
{ | |||
khash_t(rspamd_candidates_hash) *candidates; | |||
GPtrArray *result; |
@@ -61,6 +61,6 @@ void rspamd_language_detector_to_ucs (struct rspamd_lang_detector *d, | |||
*/ | |||
GPtrArray * rspamd_language_detector_detect (struct rspamd_task *task, | |||
struct rspamd_lang_detector *d, | |||
GArray *ucs_tokens, gsize words_len); | |||
GArray *ucs_tokens); | |||
#endif |
@@ -67,7 +67,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, | |||
guint i, nlen, total_len = 0, short_len = 0; | |||
gdouble avg_len = 0; | |||
if (part->normalized_words) { | |||
if (part->utf_words) { | |||
#ifdef WITH_SNOWBALL | |||
static GHashTable *stemmers = NULL; | |||
@@ -97,10 +97,10 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, | |||
#endif | |||
for (i = 0; i < part->normalized_words->len; i++) { | |||
for (i = 0; i < part->utf_words->len; i++) { | |||
guint64 h; | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); | |||
r = NULL; | |||
#ifdef WITH_SNOWBALL | |||
if (stem) { | |||
@@ -156,7 +156,7 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, | |||
} | |||
} | |||
if (part->normalized_words && part->normalized_words->len) { | |||
if (part->utf_words && part->utf_words->len) { | |||
gdouble *avg_len_p, *short_len_p; | |||
avg_len_p = rspamd_mempool_get_variable (task->task_pool, | |||
@@ -205,41 +205,41 @@ rspamd_mime_part_create_words (struct rspamd_task *task, | |||
/* Ugly workaround */ | |||
if (IS_PART_HTML (part)) { | |||
part->normalized_words = rspamd_tokenize_text ( | |||
part->stripped_content->data, | |||
part->stripped_content->len, tok_type, task->cfg, | |||
part->utf_words = rspamd_tokenize_text ( | |||
part->utf_stripped_content->data, | |||
part->utf_stripped_content->len, tok_type, task->cfg, | |||
part->exceptions, | |||
NULL); | |||
} | |||
else { | |||
part->normalized_words = rspamd_tokenize_text ( | |||
part->stripped_content->data, | |||
part->stripped_content->len, tok_type, task->cfg, | |||
part->utf_words = rspamd_tokenize_text ( | |||
part->utf_stripped_content->data, | |||
part->utf_stripped_content->len, tok_type, task->cfg, | |||
part->exceptions, | |||
NULL); | |||
} | |||
if (part->normalized_words) { | |||
if (part->utf_words) { | |||
part->normalized_hashes = g_array_sized_new (FALSE, FALSE, | |||
sizeof (guint64), part->normalized_words->len); | |||
sizeof (guint64), part->utf_words->len); | |||
if (IS_PART_UTF (part) && task->lang_det) { | |||
part->ucs32_words = g_array_sized_new (FALSE, FALSE, | |||
sizeof (rspamd_stat_token_t), part->normalized_words->len); | |||
part->unicode_words = g_array_sized_new (FALSE, FALSE, | |||
sizeof (rspamd_stat_token_t), part->utf_words->len); | |||
} | |||
if (part->ucs32_words) { | |||
if (part->unicode_words) { | |||
for (i = 0; i < part->normalized_words->len; i++) { | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, | |||
for (i = 0; i < part->utf_words->len; i++) { | |||
w = &g_array_index (part->utf_words, rspamd_stat_token_t, | |||
i); | |||
if (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) { | |||
rspamd_language_detector_to_ucs (task->lang_det, | |||
task->task_pool, | |||
w, &ucs_w); | |||
g_array_append_val (part->ucs32_words, ucs_w); | |||
g_array_append_val (part->unicode_words, ucs_w); | |||
ucs_len += ucs_w.len; | |||
} | |||
} | |||
@@ -251,14 +251,14 @@ rspamd_mime_part_create_words (struct rspamd_task *task, | |||
static void | |||
rspamd_mime_part_detect_language (struct rspamd_task *task, | |||
struct rspamd_mime_text_part *part, guint ucs_len) | |||
struct rspamd_mime_text_part *part) | |||
{ | |||
struct rspamd_lang_detector_res *lang; | |||
if (part->ucs32_words) { | |||
if (part->unicode_words) { | |||
part->languages = rspamd_language_detector_detect (task, | |||
task->lang_det, | |||
part->ucs32_words, ucs_len); | |||
part->unicode_words); | |||
if (part->languages->len > 0) { | |||
lang = g_ptr_array_index (part->languages, 0); | |||
@@ -289,7 +289,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
state = seen_cr; | |||
if (p > c) { | |||
last_c = *(p - 1); | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)c, p - c); | |||
} | |||
@@ -299,11 +299,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
case seen_cr: | |||
/* Double \r\r */ | |||
if (!crlf_added) { | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)" ", 1); | |||
crlf_added = TRUE; | |||
g_ptr_array_add (part->newlines, | |||
(((gpointer) (goffset) (part->stripped_content->len)))); | |||
(((gpointer) (goffset) (part->utf_stripped_content->len)))); | |||
} | |||
part->nlines ++; | |||
@@ -326,17 +326,17 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
if (p > c) { | |||
last_c = *(p - 1); | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)c, p - c); | |||
} | |||
c = p + 1; | |||
if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) { | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)" ", 1); | |||
g_ptr_array_add (part->newlines, | |||
(((gpointer) (goffset) (part->stripped_content->len)))); | |||
(((gpointer) (goffset) (part->utf_stripped_content->len)))); | |||
crlf_added = TRUE; | |||
} | |||
else { | |||
@@ -348,13 +348,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
/* \r\n */ | |||
if (!crlf_added) { | |||
if (IS_PART_HTML (part) || g_ascii_ispunct (last_c)) { | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *) " ", 1); | |||
crlf_added = TRUE; | |||
} | |||
g_ptr_array_add (part->newlines, | |||
(((gpointer) (goffset) (part->stripped_content->len)))); | |||
(((gpointer) (goffset) (part->utf_stripped_content->len)))); | |||
} | |||
c = p + 1; | |||
@@ -364,11 +364,11 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
case seen_lf: | |||
/* Double \n\n */ | |||
if (!crlf_added) { | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)" ", 1); | |||
crlf_added = TRUE; | |||
g_ptr_array_add (part->newlines, | |||
(((gpointer) (goffset) (part->stripped_content->len)))); | |||
(((gpointer) (goffset) (part->utf_stripped_content->len)))); | |||
} | |||
part->nlines++; | |||
@@ -414,13 +414,13 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
if (!crlf_added) { | |||
g_ptr_array_add (part->newlines, | |||
(((gpointer) (goffset) (part->stripped_content->len)))); | |||
(((gpointer) (goffset) (part->utf_stripped_content->len)))); | |||
} | |||
/* Skip initial spaces */ | |||
if (G_UNLIKELY (*p == ' ')) { | |||
if (!crlf_added) { | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)" ", 1); | |||
} | |||
@@ -451,7 +451,7 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
switch (state) { | |||
case normal_char: | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)c, p - c); | |||
while (c < p) { | |||
@@ -479,10 +479,10 @@ rspamd_strip_newlines_parse (const gchar *begin, const gchar *pe, | |||
default: | |||
if (!crlf_added) { | |||
g_byte_array_append (part->stripped_content, | |||
g_byte_array_append (part->utf_stripped_content, | |||
(const guint8 *)" ", 1); | |||
g_ptr_array_add (part->newlines, | |||
(((gpointer) (goffset) (part->stripped_content->len)))); | |||
(((gpointer) (goffset) (part->utf_stripped_content->len)))); | |||
} | |||
part->nlines++; | |||
@@ -502,10 +502,10 @@ rspamd_normalize_text_part (struct rspamd_task *task, | |||
struct rspamd_process_exception *ex; | |||
/* Strip newlines */ | |||
part->stripped_content = g_byte_array_sized_new (part->content->len); | |||
part->utf_stripped_content = g_byte_array_sized_new (part->utf_content->len); | |||
part->newlines = g_ptr_array_sized_new (128); | |||
p = (const gchar *)part->content->data; | |||
end = p + part->content->len; | |||
p = (const gchar *)part->utf_content->data; | |||
end = p + part->utf_content->len; | |||
rspamd_strip_newlines_parse (p, end, part); | |||
@@ -513,7 +513,7 @@ rspamd_normalize_text_part (struct rspamd_task *task, | |||
ex = rspamd_mempool_alloc (task->task_pool, sizeof (*ex)); | |||
off = (goffset)g_ptr_array_index (part->newlines, i); | |||
g_ptr_array_index (part->newlines, i) = (gpointer)(goffset) | |||
(part->stripped_content->data + off); | |||
(part->utf_stripped_content->data + off); | |||
ex->pos = off; | |||
ex->len = 0; | |||
ex->type = RSPAMD_EXCEPTION_NEWLINE; | |||
@@ -522,7 +522,7 @@ rspamd_normalize_text_part (struct rspamd_task *task, | |||
rspamd_mempool_add_destructor (task->task_pool, | |||
(rspamd_mempool_destruct_t) free_byte_array_callback, | |||
part->stripped_content); | |||
part->utf_stripped_content); | |||
rspamd_mempool_add_destructor (task->task_pool, | |||
(rspamd_mempool_destruct_t) rspamd_ptr_array_free_hard, | |||
part->newlines); | |||
@@ -615,10 +615,10 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part | |||
g_assert (rspamd_multipattern_compile (gtube_matcher, NULL)); | |||
} | |||
if (part->content && part->content->len >= sizeof (gtube_pattern_reject) && | |||
part->content->len <= max_check_size) { | |||
if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->content->data, | |||
part->content->len, | |||
if (part->utf_content && part->utf_content->len >= sizeof (gtube_pattern_reject) && | |||
part->utf_content->len <= max_check_size) { | |||
if ((ret = rspamd_multipattern_lookup (gtube_matcher, part->utf_content->data, | |||
part->utf_content->len, | |||
rspamd_multipattern_gtube_cb, NULL, NULL)) > 0) { | |||
switch (ret) { | |||
@@ -639,7 +639,7 @@ rspamd_check_gtube (struct rspamd_task *task, struct rspamd_mime_text_part *part | |||
msg_info_task ( | |||
"<%s>: gtube %s pattern has been found in part of length %ud", | |||
task->message_id, rspamd_action_to_str (act), | |||
part->content->len); | |||
part->utf_content->len); | |||
} | |||
} | |||
} | |||
@@ -655,9 +655,86 @@ exceptions_compare_func (gconstpointer a, gconstpointer b) | |||
return ea->pos - eb->pos; | |||
} | |||
static gboolean | |||
rspamd_message_process_plain_text_part (struct rspamd_task *task, | |||
struct rspamd_mime_text_part *text_part) | |||
{ | |||
if (text_part->parsed.len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
return TRUE; | |||
} | |||
rspamd_mime_text_part_maybe_convert (task, text_part); | |||
if (text_part->utf_raw_content != NULL) { | |||
/* Different from HTML, where we also parse HTML and strip tags */ | |||
text_part->utf_content = text_part->utf_raw_content; | |||
text_part->unicode_content = text_part->unicode_raw_content; | |||
} | |||
else { | |||
/* | |||
* We ignore unconverted parts from now as it is dangerous | |||
* to treat them as text parts | |||
*/ | |||
return FALSE; | |||
} | |||
return TRUE; | |||
} | |||
static gboolean | |||
rspamd_message_process_html_text_part (struct rspamd_task *task, | |||
struct rspamd_mime_text_part *text_part) | |||
{ | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; | |||
if (text_part->parsed.len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
return TRUE; | |||
} | |||
rspamd_mime_text_part_maybe_convert (task, text_part); | |||
if (text_part->utf_raw_content == NULL) { | |||
return FALSE; | |||
} | |||
text_part->html = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (*text_part->html)); | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; | |||
text_part->utf_content = rspamd_html_process_part_full ( | |||
task->task_pool, | |||
text_part->html, | |||
text_part->utf_raw_content, | |||
&text_part->exceptions, | |||
task->urls, | |||
task->emails); | |||
if (text_part->utf_content->len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
} | |||
/* Also add unicode content */ | |||
text_part->unicode_content = g_array_sized_new (FALSE, FALSE, | |||
sizeof (UChar), text_part->utf_content->len + 1); | |||
rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content); | |||
rspamd_mempool_add_destructor (task->task_pool, | |||
(rspamd_mempool_destruct_t) free_byte_array_callback, | |||
text_part->utf_content); | |||
rspamd_mempool_add_destructor (task->task_pool, | |||
rspamd_array_free_hard, | |||
text_part->unicode_content); | |||
return TRUE; | |||
} | |||
static void | |||
rspamd_message_process_text_part (struct rspamd_task *task, | |||
struct rspamd_mime_part *mime_part) | |||
rspamd_message_process_text_part_maybe (struct rspamd_task *task, | |||
struct rspamd_mime_part *mime_part) | |||
{ | |||
struct rspamd_mime_text_part *text_part; | |||
rspamd_ftok_t html_tok, xhtml_tok; | |||
@@ -738,87 +815,31 @@ rspamd_message_process_text_part (struct rspamd_task *task, | |||
debug_task ("skip attachments for checking as text parts"); | |||
return; | |||
} | |||
if (found_html) { | |||
text_part = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (struct rspamd_mime_text_part)); | |||
text_part->raw.begin = mime_part->raw_data.begin; | |||
text_part->raw.len = mime_part->raw_data.len; | |||
text_part->parsed.begin = mime_part->parsed_data.begin; | |||
text_part->parsed.len = mime_part->parsed_data.len; | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_HTML; | |||
text_part->mime_part = mime_part; | |||
if (mime_part->parsed_data.len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
g_ptr_array_add (task->text_parts, text_part); | |||
return; | |||
} | |||
rspamd_mime_text_part_maybe_convert (task, text_part); | |||
if (text_part->utf_raw_content == NULL) { | |||
return; | |||
} | |||
text_part->html = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (*text_part->html)); | |||
text_part->mime_part = mime_part; | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED; | |||
text_part->content = rspamd_html_process_part_full ( | |||
task->task_pool, | |||
text_part->html, | |||
text_part->utf_raw_content, | |||
&text_part->exceptions, | |||
task->urls, | |||
task->emails); | |||
if (text_part->content->len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
} | |||
rspamd_mempool_add_destructor (task->task_pool, | |||
(rspamd_mempool_destruct_t) free_byte_array_callback, | |||
text_part->content); | |||
g_ptr_array_add (task->text_parts, text_part); | |||
else if (!(found_txt || found_html)) { | |||
/* Not a text part */ | |||
return; | |||
} | |||
else if (found_txt) { | |||
text_part = | |||
rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (struct rspamd_mime_text_part)); | |||
text_part->mime_part = mime_part; | |||
text_part->raw.begin = mime_part->raw_data.begin; | |||
text_part->raw.len = mime_part->raw_data.len; | |||
text_part->parsed.begin = mime_part->parsed_data.begin; | |||
text_part->parsed.len = mime_part->parsed_data.len; | |||
text_part->mime_part = mime_part; | |||
if (mime_part->parsed_data.len == 0) { | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; | |||
g_ptr_array_add (task->text_parts, text_part); | |||
return; | |||
} | |||
rspamd_mime_text_part_maybe_convert (task, text_part); | |||
text_part = rspamd_mempool_alloc0 (task->task_pool, | |||
sizeof (struct rspamd_mime_text_part)); | |||
text_part->mime_part = mime_part; | |||
text_part->raw.begin = mime_part->raw_data.begin; | |||
text_part->raw.len = mime_part->raw_data.len; | |||
text_part->parsed.begin = mime_part->parsed_data.begin; | |||
text_part->parsed.len = mime_part->parsed_data.len; | |||
if (text_part->utf_raw_content != NULL) { | |||
/* | |||
* We ignore unconverted parts from now as it is dangerous | |||
* to treat them as text parts | |||
*/ | |||
text_part->content = text_part->utf_raw_content; | |||
g_ptr_array_add (task->text_parts, text_part); | |||
} | |||
else { | |||
if (found_html) { | |||
if (!rspamd_message_process_html_text_part (task, text_part)) { | |||
return; | |||
} | |||
} | |||
else { | |||
return; | |||
if (!rspamd_message_process_plain_text_part (task, text_part)) { | |||
return; | |||
} | |||
} | |||
g_ptr_array_add (task->text_parts, text_part); | |||
mime_part->flags |= RSPAMD_MIME_PART_TEXT; | |||
mime_part->specific.txt = text_part; | |||
@@ -867,7 +888,7 @@ rspamd_message_process_text_part (struct rspamd_task *task, | |||
text_part->exceptions); | |||
} | |||
text_part->ucs_len = rspamd_mime_part_create_words (task, text_part); | |||
rspamd_mime_part_create_words (task, text_part); | |||
} | |||
/* Creates message from various data using libmagic to detect type */ | |||
@@ -1172,7 +1193,7 @@ rspamd_message_process (struct rspamd_task *task) | |||
struct rspamd_mime_part *part; | |||
part = g_ptr_array_index (task->parts, i); | |||
rspamd_message_process_text_part (task, part); | |||
rspamd_message_process_text_part_maybe (task, part); | |||
} | |||
rspamd_images_process (task); | |||
@@ -1207,7 +1228,7 @@ rspamd_message_process (struct rspamd_task *task) | |||
sel = p2; | |||
} | |||
else { | |||
if (p1->ucs_len > p2->ucs_len) { | |||
if (p1->unicode_content->len > p2->unicode_content->len) { | |||
sel = p1; | |||
} | |||
else { | |||
@@ -1215,7 +1236,7 @@ rspamd_message_process (struct rspamd_task *task) | |||
} | |||
} | |||
rspamd_mime_part_detect_language (task, sel, sel->ucs_len); | |||
rspamd_mime_part_detect_language (task, sel); | |||
if (sel->language && sel->language[0]) { | |||
/* Propagate language */ | |||
@@ -1274,13 +1295,13 @@ rspamd_message_process (struct rspamd_task *task) | |||
PTR_ARRAY_FOREACH (task->text_parts, i, text_part) { | |||
if (!text_part->language) { | |||
rspamd_mime_part_detect_language (task, text_part, text_part->ucs_len); | |||
rspamd_mime_part_detect_language (task, text_part); | |||
} | |||
rspamd_mime_part_extract_words (task, text_part); | |||
if (text_part->normalized_words) { | |||
total_words += text_part->normalized_words->len; | |||
if (text_part->utf_words) { | |||
total_words += text_part->utf_words->len; | |||
} | |||
} | |||
@@ -86,20 +86,28 @@ struct rspamd_mime_text_part { | |||
const gchar *language; | |||
GPtrArray *languages; | |||
const gchar *real_charset; | |||
/* Raw data in native encoding */ | |||
rspamd_ftok_t raw; | |||
rspamd_ftok_t parsed; /* decoded from mime encodings */ | |||
GByteArray *content; /* utf8 encoded processed content */ | |||
GArray *ucs_raw_content; /* unicode raw content (of UChar) */ | |||
/* UTF8 content */ | |||
GByteArray *utf_content; /* utf8 encoded processed content */ | |||
GByteArray *utf_raw_content; /* utf raw content */ | |||
GByteArray *stripped_content; /* utf content with no newlines */ | |||
GByteArray *utf_stripped_content; /* utf content with no newlines */ | |||
GArray *normalized_hashes; | |||
GArray *utf_words; | |||
/* Unicode content, used by libicu */ | |||
GArray *unicode_raw_content; /* unicode raw content (of UChar) */ | |||
GArray *unicode_content; /* unicode processed content (of UChar) */ | |||
GArray *unicode_words; | |||
GPtrArray *newlines; /**< positions of newlines in text, relative to content*/ | |||
struct html_content *html; | |||
GList *exceptions; /**< list of offsets of urls */ | |||
struct rspamd_mime_part *mime_part; | |||
GArray *normalized_words; | |||
GArray *ucs32_words; | |||
GArray *normalized_hashes; | |||
guint flags; | |||
guint nlines; | |||
guint spaces; | |||
@@ -110,7 +118,6 @@ struct rspamd_mime_text_part { | |||
guint empty_lines; | |||
guint capital_letters; | |||
guint numeric_characters; | |||
guint ucs_len; | |||
}; | |||
enum rspamd_received_type { |
@@ -283,18 +283,18 @@ rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task, | |||
rspamd_mime_utf8_conv_init (); | |||
utf = text_part->utf_raw_content; | |||
text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE, | |||
text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, | |||
sizeof (UChar), utf->len + 1); | |||
text_part->ucs_raw_content->len = ucnv_toUChars (utf8_converter, | |||
(UChar *)text_part->ucs_raw_content->data, | |||
text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter, | |||
(UChar *)text_part->unicode_raw_content->data, | |||
utf->len + 1, | |||
utf->data, | |||
utf->len, | |||
&uc_err); | |||
if (!U_SUCCESS (uc_err)) { | |||
g_array_free (text_part->ucs_raw_content, TRUE); | |||
text_part->ucs_raw_content = NULL; | |||
g_array_free (text_part->unicode_raw_content, TRUE); | |||
text_part->unicode_raw_content = NULL; | |||
} | |||
} | |||
@@ -311,12 +311,12 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, | |||
norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err); | |||
} | |||
if (!text_part->ucs_raw_content) { | |||
if (!text_part->unicode_raw_content) { | |||
return; | |||
} | |||
src = (UChar *)text_part->ucs_raw_content->data; | |||
nsym = text_part->ucs_raw_content->len; | |||
src = (UChar *)text_part->unicode_raw_content->data; | |||
nsym = text_part->unicode_raw_content->len; | |||
/* We can now check if we need to decompose */ | |||
end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err); | |||
@@ -346,8 +346,8 @@ rspamd_mime_text_part_normalise (struct rspamd_task *task, | |||
} | |||
else { | |||
/* Copy normalised back */ | |||
memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar)); | |||
text_part->ucs_raw_content->len = nsym; | |||
memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar)); | |||
text_part->unicode_raw_content->len = nsym; | |||
text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED; | |||
} | |||
@@ -369,16 +369,16 @@ rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task, | |||
rspamd_mime_utf8_conv_init (); | |||
if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) && | |||
text_part->ucs_raw_content) { | |||
text_part->unicode_raw_content) { | |||
clen = ucnv_getMaxCharSize (utf8_converter); | |||
dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len, | |||
dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len, | |||
clen); | |||
g_byte_array_set_size (text_part->utf_raw_content, dlen); | |||
r = ucnv_fromUChars (utf8_converter, | |||
text_part->utf_raw_content->data, | |||
dlen, | |||
(UChar *)text_part->ucs_raw_content->data, | |||
text_part->ucs_raw_content->len, | |||
(UChar *)text_part->unicode_raw_content->data, | |||
text_part->unicode_raw_content->len, | |||
&uc_err); | |||
text_part->utf_raw_content->len = r; | |||
} | |||
@@ -410,10 +410,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, | |||
} | |||
text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE, | |||
text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, | |||
sizeof (UChar), input->len + 1); | |||
r = ucnv_toUChars (conv, | |||
(UChar *)text_part->ucs_raw_content->data, | |||
(UChar *)text_part->unicode_raw_content->data, | |||
input->len + 1, | |||
input->data, | |||
input->len, | |||
@@ -426,7 +426,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, | |||
return FALSE; | |||
} | |||
text_part->ucs_raw_content->len = r; | |||
text_part->unicode_raw_content->len = r; | |||
rspamd_mime_text_part_normalise (task, text_part); | |||
/* Now, convert to utf8 */ | |||
@@ -434,7 +434,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, | |||
dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); | |||
d = rspamd_mempool_alloc (task->task_pool, dlen); | |||
r = ucnv_fromUChars (utf8_converter, d, dlen, | |||
(UChar *)text_part->ucs_raw_content->data, r, &uc_err); | |||
(UChar *)text_part->unicode_raw_content->data, r, &uc_err); | |||
if (!U_SUCCESS (uc_err)) { | |||
g_set_error (err, rspamd_iconv_error_quark (), EINVAL, | |||
@@ -750,3 +750,17 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, | |||
SET_PART_UTF (text_part); | |||
} | |||
void | |||
rspamd_utf_to_unicode (GByteArray *in, GArray *dest) | |||
{ | |||
UErrorCode uc_err = U_ZERO_ERROR; | |||
g_array_set_size (dest, in->len + 1); | |||
dest->len = ucnv_toUChars (utf8_converter, | |||
(UChar *)dest->data, | |||
in->len + 1, | |||
in->data, | |||
in->len, | |||
&uc_err); | |||
} |
@@ -86,4 +86,11 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, | |||
*/ | |||
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); | |||
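/**
 * Converts utf8 text to libicu unichars (UChar)
 * @param in input buffer with utf8 encoded bytes
 * @param dest output array of UChar elements; it is resized by this function
 * and its length is set to the number of converted units
 */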
void rspamd_utf_to_unicode (GByteArray *in, GArray *dest); | |||
#endif /* SRC_LIBMIME_MIME_ENCODING_H_ */ |
@@ -905,8 +905,8 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, | |||
raw = TRUE; | |||
} | |||
in = part->content->data; | |||
len = part->content->len; | |||
in = part->utf_content->data; | |||
len = part->utf_content->len; | |||
} | |||
} | |||
@@ -1006,9 +1006,9 @@ rspamd_re_cache_exec_re (struct rspamd_task *task, | |||
for (i = 0; i < task->text_parts->len; i++) { | |||
part = g_ptr_array_index (task->text_parts, i); | |||
if (part->stripped_content) { | |||
scvec[i + 1] = (guchar *)part->stripped_content->data; | |||
lenvec[i + 1] = part->stripped_content->len; | |||
if (part->utf_stripped_content) { | |||
scvec[i + 1] = (guchar *)part->utf_stripped_content->data; | |||
lenvec[i + 1] = part->utf_stripped_content->len; | |||
} | |||
else { | |||
scvec[i + 1] = (guchar *)""; |
@@ -242,20 +242,20 @@ rspamd_task_free (struct rspamd_task *task) | |||
for (i = 0; i < task->text_parts->len; i ++) { | |||
tp = g_ptr_array_index (task->text_parts, i); | |||
if (tp->normalized_words) { | |||
g_array_free (tp->normalized_words, TRUE); | |||
if (tp->utf_words) { | |||
g_array_free (tp->utf_words, TRUE); | |||
} | |||
if (tp->normalized_hashes) { | |||
g_array_free (tp->normalized_hashes, TRUE); | |||
} | |||
if (tp->ucs32_words) { | |||
g_array_free (tp->ucs32_words, TRUE); | |||
if (tp->unicode_words) { | |||
g_array_free (tp->unicode_words, TRUE); | |||
} | |||
if (tp->languages) { | |||
g_ptr_array_unref (tp->languages); | |||
} | |||
if (tp->ucs_raw_content) { | |||
g_array_free (tp->ucs_raw_content, TRUE); | |||
if (tp->unicode_raw_content) { | |||
g_array_free (tp->unicode_raw_content, TRUE); | |||
} | |||
} | |||
@@ -2624,7 +2624,7 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, | |||
{ | |||
struct rspamd_url_mimepart_cbdata mcbd; | |||
if (part->stripped_content == NULL || part->stripped_content->len == 0) { | |||
if (part->utf_stripped_content == NULL || part->utf_stripped_content->len == 0) { | |||
msg_warn_task ("got empty text part"); | |||
return; | |||
} | |||
@@ -2632,8 +2632,8 @@ rspamd_url_text_extract (rspamd_mempool_t *pool, | |||
mcbd.task = task; | |||
mcbd.part = part; | |||
rspamd_url_find_multiple (task->task_pool, part->stripped_content->data, | |||
part->stripped_content->len, is_html, part->newlines, | |||
rspamd_url_find_multiple (task->task_pool, part->utf_stripped_content->data, | |||
part->utf_stripped_content->len, is_html, part->newlines, | |||
rspamd_url_text_part_callback, &mcbd); | |||
} | |||
@@ -331,8 +331,8 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, | |||
for (i = 0; i < task->text_parts->len; i++) { | |||
part = g_ptr_array_index (task->text_parts, i); | |||
if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { | |||
reserved_len += part->normalized_words->len; | |||
if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { | |||
reserved_len += part->utf_words->len; | |||
} | |||
/* XXX: normal window size */ | |||
reserved_len += 5; | |||
@@ -346,9 +346,9 @@ rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx, | |||
for (i = 0; i < task->text_parts->len; i ++) { | |||
part = g_ptr_array_index (task->text_parts, i); | |||
if (!IS_PART_EMPTY (part) && part->normalized_words != NULL) { | |||
if (!IS_PART_EMPTY (part) && part->utf_words != NULL) { | |||
st_ctx->tokenizer->tokenize_func (st_ctx, task->task_pool, | |||
part->normalized_words, IS_PART_UTF (part), | |||
part->utf_words, IS_PART_UTF (part), | |||
NULL, task->tokens); | |||
} | |||
@@ -59,7 +59,7 @@ const gchar t_delimiters[255] = { | |||
/* Get next word from specified f_str_t buf */ | |||
static gboolean | |||
rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, | |||
rspamd_tokenizer_get_word_raw (rspamd_stat_token_t * buf, | |||
gchar const **cur, rspamd_stat_token_t * token, | |||
GList **exceptions, gsize *rl, gboolean unused) | |||
{ | |||
@@ -149,7 +149,7 @@ rspamd_tokenizer_get_word_compat (rspamd_stat_token_t * buf, | |||
} | |||
static gboolean | |||
rspamd_tokenizer_get_word (rspamd_stat_token_t * buf, | |||
rspamd_tokenizer_get_word_utf8 (rspamd_stat_token_t * buf, | |||
gchar const **cur, rspamd_stat_token_t * token, | |||
GList **exceptions, gsize *rl, | |||
gboolean check_signature) | |||
@@ -355,10 +355,10 @@ rspamd_tokenize_text (const gchar *text, gsize len, | |||
switch (how) { | |||
case RSPAMD_TOKENIZE_RAW: | |||
func = rspamd_tokenizer_get_word_compat; | |||
func = rspamd_tokenizer_get_word_raw; | |||
break; | |||
case RSPAMD_TOKENIZE_UTF: | |||
func = rspamd_tokenizer_get_word; | |||
func = rspamd_tokenizer_get_word_utf8; | |||
break; | |||
default: | |||
g_assert_not_reached (); |
@@ -28,7 +28,7 @@ struct rspamd_stat_tokenizer { | |||
enum rspamd_tokenize_type { | |||
RSPAMD_TOKENIZE_UTF = 0, | |||
RSPAMD_TOKENIZE_RAW, | |||
RSPAMD_TOKENIZE_UCS | |||
RSPAMD_TOKENIZE_UNICODE | |||
}; | |||
/* Compare two token nodes */ |
@@ -549,16 +549,16 @@ lua_textpart_get_content (lua_State * L) | |||
rspamd_lua_setclass (L, "rspamd{text}", -1); | |||
if (!type) { | |||
start = part->content->data; | |||
len = part->content->len; | |||
start = part->utf_content->data; | |||
len = part->utf_content->len; | |||
} | |||
else if (strcmp (type, "content") == 0) { | |||
start = part->content->data; | |||
len = part->content->len; | |||
start = part->utf_content->data; | |||
len = part->utf_content->len; | |||
} | |||
else if (strcmp (type, "content_oneline") == 0) { | |||
start = part->stripped_content->data; | |||
len = part->stripped_content->len; | |||
start = part->utf_stripped_content->data; | |||
len = part->utf_stripped_content->len; | |||
} | |||
else if (strcmp (type, "raw_parsed") == 0) { | |||
start = part->parsed.begin; | |||
@@ -618,8 +618,8 @@ lua_textpart_get_content_oneline (lua_State * L) | |||
t = lua_newuserdata (L, sizeof (*t)); | |||
rspamd_lua_setclass (L, "rspamd{text}", -1); | |||
t->start = part->stripped_content->data; | |||
t->len = part->stripped_content->len; | |||
t->start = part->utf_stripped_content->data; | |||
t->len = part->utf_stripped_content->len; | |||
t->flags = 0; | |||
return 1; | |||
@@ -636,11 +636,11 @@ lua_textpart_get_length (lua_State * L) | |||
return 1; | |||
} | |||
if (IS_PART_EMPTY (part) || part->content == NULL) { | |||
if (IS_PART_EMPTY (part) || part->utf_content == NULL) { | |||
lua_pushinteger (L, 0); | |||
} | |||
else { | |||
lua_pushinteger (L, part->content->len); | |||
lua_pushinteger (L, part->utf_content->len); | |||
} | |||
return 1; | |||
@@ -721,11 +721,11 @@ lua_textpart_get_words_count (lua_State *L) | |||
return 1; | |||
} | |||
if (IS_PART_EMPTY (part) || part->normalized_words == NULL) { | |||
if (IS_PART_EMPTY (part) || part->utf_words == NULL) { | |||
lua_pushinteger (L, 0); | |||
} | |||
else { | |||
lua_pushinteger (L, part->normalized_words->len); | |||
lua_pushinteger (L, part->utf_words->len); | |||
} | |||
return 1; | |||
@@ -743,14 +743,14 @@ lua_textpart_get_words (lua_State *L) | |||
return luaL_error (L, "invalid arguments"); | |||
} | |||
if (IS_PART_EMPTY (part) || part->normalized_words == NULL) { | |||
if (IS_PART_EMPTY (part) || part->utf_words == NULL) { | |||
lua_createtable (L, 0, 0); | |||
} | |||
else { | |||
lua_createtable (L, part->normalized_words->len, 0); | |||
lua_createtable (L, part->utf_words->len, 0); | |||
for (i = 0; i < part->normalized_words->len; i ++) { | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
for (i = 0; i < part->utf_words->len; i ++) { | |||
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); | |||
lua_pushlstring (L, w->begin, w->len); | |||
lua_rawseti (L, -2, i + 1); | |||
@@ -876,8 +876,8 @@ struct lua_shingle_data { | |||
}; | |||
#define STORE_TOKEN(i, t) do { \ | |||
if ((i) < part->normalized_words->len) { \ | |||
word = &g_array_index (part->normalized_words, rspamd_stat_token_t, (i)); \ | |||
if ((i) < part->utf_words->len) { \ | |||
word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \ | |||
sd->t.begin = word->begin; \ | |||
sd->t.len = word->len; \ | |||
} \ | |||
@@ -936,8 +936,8 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) | |||
/* Calculate direct hash */ | |||
rspamd_cryptobox_hash_init (&st, key, rspamd_cryptobox_HASHKEYBYTES); | |||
for (i = 0; i < part->normalized_words->len; i ++) { | |||
word = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
for (i = 0; i < part->utf_words->len; i ++) { | |||
word = &g_array_index (part->utf_words, rspamd_stat_token_t, i); | |||
rspamd_cryptobox_hash_update (&st, word->begin, word->len); | |||
} | |||
@@ -947,7 +947,7 @@ lua_textpart_get_fuzzy_hashes (lua_State * L) | |||
sizeof (hexdigest)); | |||
lua_pushlstring (L, hexdigest, sizeof (hexdigest) - 1); | |||
sgl = rspamd_shingles_from_text (part->normalized_words, key, | |||
sgl = rspamd_shingles_from_text (part->utf_words, key, | |||
pool, lua_shingles_filter, part, RSPAMD_SHINGLES_MUMHASH); | |||
if (sgl == NULL) { |
@@ -262,9 +262,9 @@ lua_trie_search_mime (lua_State *L) | |||
for (i = 0; i < task->text_parts->len; i ++) { | |||
part = g_ptr_array_index (task->text_parts, i); | |||
if (!IS_PART_EMPTY (part) && part->content != NULL) { | |||
text = part->content->data; | |||
len = part->content->len; | |||
if (!IS_PART_EMPTY (part) && part->utf_content != NULL) { | |||
text = part->utf_content->data; | |||
len = part->utf_content->len; | |||
if (lua_trie_search_str (L, trie, text, len) != 0) { | |||
found = TRUE; |
@@ -560,13 +560,13 @@ rspamd_chartable_process_part (struct rspamd_task *task, | |||
guint i, ncap = 0; | |||
gdouble cur_score = 0.0; | |||
if (part == NULL || part->normalized_words == NULL || | |||
part->normalized_words->len == 0) { | |||
if (part == NULL || part->utf_words == NULL || | |||
part->utf_words->len == 0) { | |||
return; | |||
} | |||
for (i = 0; i < part->normalized_words->len; i++) { | |||
w = &g_array_index (part->normalized_words, rspamd_stat_token_t, i); | |||
for (i = 0; i < part->utf_words->len; i++) { | |||
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); | |||
if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { | |||
@@ -588,7 +588,7 @@ rspamd_chartable_process_part (struct rspamd_task *task, | |||
*/ | |||
part->capital_letters += ncap; | |||
cur_score /= (gdouble)part->normalized_words->len; | |||
cur_score /= (gdouble)part->utf_words->len; | |||
if (cur_score > 2.0) { | |||
cur_score = 2.0; |
@@ -1196,7 +1196,7 @@ fuzzy_io_fin (void *ud) | |||
static GArray * | |||
fuzzy_preprocess_words (struct rspamd_mime_text_part *part, rspamd_mempool_t *pool) | |||
{ | |||
return part->normalized_words; | |||
return part->utf_words; | |||
} | |||
static void | |||
@@ -1418,8 +1418,8 @@ fuzzy_cmd_from_text_part (struct rspamd_task *task, | |||
rspamd_cryptobox_hash_init (&st, rule->hash_key->str, | |||
rule->hash_key->len); | |||
rspamd_cryptobox_hash_update (&st, part->stripped_content->data, | |||
part->stripped_content->len); | |||
rspamd_cryptobox_hash_update (&st, part->utf_stripped_content->data, | |||
part->utf_stripped_content->len); | |||
if (task->subject) { | |||
/* We also include subject */ | |||
@@ -2615,7 +2615,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, | |||
} | |||
/* Check length of part */ | |||
fac = rule->ctx->text_multiplier * part->content->len; | |||
fac = rule->ctx->text_multiplier * part->utf_content->len; | |||
if ((double)min_bytes > fac) { | |||
if (!rule->short_text_direct_hash) { | |||
msg_info_task ( | |||
@@ -2624,7 +2624,7 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, | |||
"skip fuzzy check", | |||
task->message_id, min_bytes, | |||
fac, | |||
part->content->len, | |||
part->utf_content->len, | |||
rule->ctx->text_multiplier); | |||
continue; | |||
} | |||
@@ -2635,21 +2635,21 @@ fuzzy_generate_commands (struct rspamd_task *task, struct fuzzy_rule *rule, | |||
"use direct hash", | |||
task->message_id, min_bytes, | |||
fac, | |||
part->content->len, | |||
part->utf_content->len, | |||
rule->ctx->text_multiplier); | |||
short_text = TRUE; | |||
} | |||
} | |||
if (part->normalized_words == NULL || | |||
part->normalized_words->len == 0) { | |||
if (part->utf_words == NULL || | |||
part->utf_words->len == 0) { | |||
msg_info_task ("<%s>, part hash empty, skip fuzzy check", | |||
task->message_id); | |||
continue; | |||
} | |||
if (rule->ctx->min_hash_len != 0 && | |||
part->normalized_words->len < | |||
part->utf_words->len < | |||
rule->ctx->min_hash_len) { | |||
if (!rule->short_text_direct_hash) { | |||
msg_info_task ( |