From 63ef123b048d5f1f2f6a5d172be6dc1a2629e2d7 Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Sun, 25 Nov 2018 17:03:12 +0000 Subject: [Project] Rework parts conversion and serialization --- src/libmime/message.c | 92 ++------------------- src/libmime/mime_encoding.c | 158 ++++-------------------------------- src/libmime/mime_encoding.h | 7 +- src/libstat/tokenizers/tokenizers.c | 13 ++- src/lua/lua_mimepart.c | 4 +- 5 files changed, 30 insertions(+), 244 deletions(-) (limited to 'src') diff --git a/src/libmime/message.c b/src/libmime/message.c index 4a765643a..b76fa1b23 100644 --- a/src/libmime/message.c +++ b/src/libmime/message.c @@ -61,9 +61,6 @@ static void rspamd_mime_part_extract_words (struct rspamd_task *task, struct rspamd_mime_text_part *part) { -#ifdef WITH_SNOWBALL - struct sb_stemmer *stem = NULL; -#endif rspamd_stat_token_t *w; gchar *temp_word; const guchar *r; @@ -71,92 +68,26 @@ rspamd_mime_part_extract_words (struct rspamd_task *task, gdouble avg_len = 0; if (part->utf_words) { -#ifdef WITH_SNOWBALL - - - if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) { - - if (!stemmers) { - stemmers = g_hash_table_new (rspamd_strcase_hash, - rspamd_strcase_equal); - } - - stem = g_hash_table_lookup (stemmers, part->language); - - if (stem == NULL) { - - stem = sb_stemmer_new (part->language, "UTF_8"); - - if (stem == NULL) { - msg_debug_task ( - "<%s> cannot create lemmatizer for %s language", - task->message_id, part->language); - } else { - g_hash_table_insert (stemmers, g_strdup (part->language), - stem); - } - } - } -#endif - + rspamd_stem_words (part->utf_words, task->task_pool, part->language, + task->lang_det); for (i = 0; i < part->utf_words->len; i++) { guint64 h; w = &g_array_index (part->utf_words, rspamd_stat_token_t, i); - r = NULL; -#ifdef WITH_SNOWBALL - if (stem) { - r = sb_stemmer_stem (stem, w->begin, w->len); - } -#endif - if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) { - avg_len = avg_len + (w->len - avg_len) / (double) (i + 1); - - if (r != NULL) { - nlen = strlen (r); - nlen = MIN (nlen, w->len); - temp_word = rspamd_mempool_alloc (task->task_pool, nlen); - memcpy (temp_word, r, nlen); - - if (IS_PART_UTF (part)) { - rspamd_str_lc_utf8 (temp_word, nlen); - } - else { - rspamd_str_lc (temp_word, nlen); - } - - w->begin = temp_word; - w->len = nlen; - } - else { - temp_word = rspamd_mempool_alloc (task->task_pool, w->len); - memcpy (temp_word, w->begin, w->len); - - if (IS_PART_UTF (part)) { - rspamd_str_lc_utf8 (temp_word, w->len); - } - else { - rspamd_str_lc (temp_word, w->len); - } - - w->begin = temp_word; - } - } - - if (w->len > 0) { + if (w->stemmed.len > 0) { /* * We use static hash seed if we would want to use that in shingles * computation in future */ h = rspamd_cryptobox_fast_hash_specific ( RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT, - w->begin, w->len, words_hash_seed); + w->stemmed.begin, w->stemmed.len, words_hash_seed); g_array_append_val (part->normalized_hashes, h); - total_len += w->len; + total_len += w->stemmed.len; - if (w->len <= 3) { + if (w->stemmed.len <= 3) { short_len++; } } @@ -251,6 +182,7 @@ rspamd_mime_part_create_words (struct rspamd_task *task, if (part->utf_words) { part->normalized_hashes = g_array_sized_new (FALSE, FALSE, sizeof (guint64), part->utf_words->len); + rspamd_normalize_words (part->utf_words, task->task_pool); } } @@ -757,17 +689,9 @@ rspamd_message_process_html_text_part (struct rspamd_task *task, text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY; } - /* Also add unicode content */ - text_part->unicode_content = g_array_sized_new (FALSE, FALSE, - sizeof (UChar), text_part->utf_content->len + 1); - rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content); - rspamd_mempool_add_destructor (task->task_pool, (rspamd_mempool_destruct_t) free_byte_array_callback, text_part->utf_content); - rspamd_mempool_add_destructor (task->task_pool, - rspamd_array_free_hard, - text_part->unicode_content); return TRUE; } @@ -1265,7 +1189,7 @@ rspamd_message_process (struct rspamd_task *task) sel = p2; } else { - if (p1->unicode_content->len > p2->unicode_content->len) { + if (p1->utf_content->len > p2->utf_content->len) { sel = p1; } else { diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c index 269166344..e3479c3e7 100644 --- a/src/libmime/mime_encoding.c +++ b/src/libmime/mime_encoding.c @@ -241,115 +241,6 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool, return d; } -static void -rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task, - struct rspamd_mime_text_part *text_part) -{ - GByteArray *utf; - UErrorCode uc_err = U_ZERO_ERROR; - UConverter *utf8_converter = rspamd_get_utf8_converter (); - - utf = text_part->utf_raw_content; - text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, - sizeof (UChar), utf->len + 1); - text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter, - (UChar *)text_part->unicode_raw_content->data, - utf->len + 1, - utf->data, - utf->len, - &uc_err); - - if (!U_SUCCESS (uc_err)) { - g_array_free (text_part->unicode_raw_content, TRUE); - text_part->unicode_raw_content = NULL; - } -} - -static void -rspamd_mime_text_part_normalise (struct rspamd_task *task, - struct rspamd_mime_text_part *text_part) -{ -#if U_ICU_VERSION_MAJOR_NUM >= 44 - UErrorCode uc_err = U_ZERO_ERROR; - gint32 nsym, end; - UChar *src = NULL, *dest = NULL; - const UNormalizer2 *norm = rspamd_get_unicode_normalizer (); - - if (!text_part->unicode_raw_content) { - return; - } - - src = (UChar *)text_part->unicode_raw_content->data; - nsym = text_part->unicode_raw_content->len; - - /* We can now check if we need to decompose */ - end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err); - - if (!U_SUCCESS (uc_err)) { - msg_warn_task ("cannot normalise URL, cannot check normalisation: %s", - u_errorName (uc_err)); - return; - } - - if (end == nsym) { - /* Already normalised */ - return; - } - - text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL; - dest = g_malloc (nsym * sizeof (*dest)); - memcpy (dest, src, end * sizeof (*dest)); - nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym, - src + end, nsym - end, &uc_err); - - if (!U_SUCCESS (uc_err)) { - if (uc_err != U_BUFFER_OVERFLOW_ERROR) { - msg_warn_task ("cannot normalise URL: %s", - u_errorName (uc_err)); - } - } - else { - /* Copy normalised back */ - memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar)); - text_part->unicode_raw_content->len = nsym; - text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED; - } - - g_free (dest); -#endif -} - -/* - * Recode utf from normalised unichars if needed - */ -static void -rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task, - struct rspamd_mime_text_part *text_part) -{ - UErrorCode uc_err = U_ZERO_ERROR; - guint clen, dlen; - gint r; - UConverter *utf8_converter; - - utf8_converter = rspamd_get_utf8_converter (); - - if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) && - text_part->unicode_raw_content) { - clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len, - clen); - g_byte_array_set_size (text_part->utf_raw_content, dlen); - r = ucnv_fromUChars (utf8_converter, - text_part->utf_raw_content->data, - dlen, - (UChar *)text_part->unicode_raw_content->data, - text_part->unicode_raw_content->len, - &uc_err); - text_part->utf_raw_content->len = r; - } -} - - static gboolean rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, struct rspamd_mime_text_part *text_part, @@ -358,8 +249,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, GError **err) { gchar *d; - gint32 r, clen, dlen; - + gint32 r, clen, dlen, uc_len; + UChar *tmp_buf; UErrorCode uc_err = U_ZERO_ERROR; UConverter *conv, *utf8_converter; @@ -374,11 +265,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, return FALSE; } - - text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE, - sizeof (UChar), input->len + 1); - r = ucnv_toUChars (conv, - (UChar *)text_part->unicode_raw_content->data, + tmp_buf = g_new (UChar, input->len + 1); + uc_err = U_ZERO_ERROR; + uc_len = ucnv_toUChars (conv, + tmp_buf, input->len + 1, input->data, input->len, @@ -388,33 +278,34 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task, g_set_error (err, rspamd_iconv_error_quark (), EINVAL, "cannot convert data to unicode from %s: %s", charset, u_errorName (uc_err)); + g_free (tmp_buf); + return FALSE; } - text_part->unicode_raw_content->len = r; - rspamd_mime_text_part_normalise (task, text_part); - /* Now, convert to utf8 */ clen = ucnv_getMaxCharSize (utf8_converter); - dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen); + dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen); d = rspamd_mempool_alloc (task->task_pool, dlen); r = ucnv_fromUChars (utf8_converter, d, dlen, - (UChar *)text_part->unicode_raw_content->data, r, &uc_err); + tmp_buf, uc_len, &uc_err); if (!U_SUCCESS (uc_err)) { g_set_error (err, rspamd_iconv_error_quark (), EINVAL, "cannot convert data from unicode from %s: %s", charset, u_errorName (uc_err)); + g_free (tmp_buf); return FALSE; } - msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d", - charset, input->len, r); + msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)", + charset, input->len, r, uc_len); text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool, sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4); text_part->utf_raw_content->data = d; text_part->utf_raw_content->len = r; + g_free (tmp_buf); return TRUE; } @@ -658,9 +549,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, else { SET_PART_UTF (text_part); text_part->utf_raw_content = part_content; - rspamd_mime_text_part_ucs_from_utf (task, text_part); - rspamd_mime_text_part_normalise (task, text_part); - rspamd_mime_text_part_maybe_renormalise (task, text_part); text_part->real_charset = UTF8_CHARSET; return; @@ -693,9 +581,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, part_content->len, !checked)) { SET_PART_UTF (text_part); text_part->utf_raw_content = part_content; - rspamd_mime_text_part_ucs_from_utf (task, text_part); - rspamd_mime_text_part_normalise (task, text_part); - rspamd_mime_text_part_maybe_renormalise (task, text_part); text_part->real_charset = UTF8_CHARSET; return; @@ -721,18 +606,3 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task, SET_PART_UTF (text_part); } - -void -rspamd_utf_to_unicode (GByteArray *in, GArray *dest) -{ - UErrorCode uc_err = U_ZERO_ERROR; - UConverter *utf8_converter = rspamd_get_utf8_converter (); - - g_array_set_size (dest, in->len + 1); - dest->len = ucnv_toUChars (utf8_converter, - (UChar *)dest->data, - in->len + 1, - in->data, - in->len, - &uc_err); -} diff --git a/src/libmime/mime_encoding.h b/src/libmime/mime_encoding.h index 0754bb348..5f436d99d 100644 --- a/src/libmime/mime_encoding.h +++ b/src/libmime/mime_encoding.h @@ -18,6 +18,7 @@ #include "config.h" #include "mem_pool.h" +#include "fstring.h" struct rspamd_task; struct rspamd_mime_part; @@ -86,11 +87,5 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset, */ void rspamd_mime_charset_utf_enforce (gchar *in, gsize len); -/** - * Converts utf8 to libicu unichars - * @param in - * @param dest - */ -void rspamd_utf_to_unicode (GByteArray *in, GArray *dest); #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */ diff --git a/src/libstat/tokenizers/tokenizers.c b/src/libstat/tokenizers/tokenizers.c index 32d9ba0df..9ec0c4315 100644 --- a/src/libstat/tokenizers/tokenizers.c +++ b/src/libstat/tokenizers/tokenizers.c @@ -745,28 +745,25 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool, if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) { if (stem) { - const gchar *stemmed; + const gchar *stemmed = NULL; stemmed = sb_stemmer_stem (stem, tok->normalized.begin, tok->normalized.len); - dlen = strlen (stemmed); + dlen = stemmed ? strlen (stemmed) : 0; if (dlen > 0) { - dest = rspamd_mempool_alloc (pool, dlen); + dest = rspamd_mempool_alloc (pool, dlen + 1); memcpy (dest, stemmed, dlen); - rspamd_str_lc_utf8 (dest, dlen); + dest[dlen] = '\0'; tok->stemmed.len = dlen; tok->stemmed.begin = dest; tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED; } else { /* Fallback */ - dest = rspamd_mempool_alloc (pool, tok->normalized.len); - memcpy (dest, tok->normalized.begin, tok->normalized.len); - rspamd_str_lc_utf8 (dest, tok->normalized.len); tok->stemmed.len = tok->normalized.len; - tok->stemmed.begin = dest; + tok->stemmed.begin = tok->normalized.begin; } } else { diff --git a/src/lua/lua_mimepart.c b/src/lua/lua_mimepart.c index a6fc2bfa5..9e74c87c0 100644 --- a/src/lua/lua_mimepart.c +++ b/src/lua/lua_mimepart.c @@ -923,8 +923,8 @@ struct lua_shingle_data { #define STORE_TOKEN(i, t) do { \ if ((i) < part->utf_words->len) { \ word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \ - sd->t.begin = word->begin; \ - sd->t.len = word->len; \ + sd->t.begin = word->stemmed.begin; \ + sd->t.len = word->stemmed.len; \ } \ }while (0) -- cgit v1.2.3