]> source.dussan.org Git - rspamd.git/commitdiff
[Project] Rework parts conversion and serialization
author Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 17:03:12 +0000 (17:03 +0000)
committer Vsevolod Stakhov <vsevolod@highsecure.ru>
Sun, 25 Nov 2018 17:03:12 +0000 (17:03 +0000)
src/libmime/message.c
src/libmime/mime_encoding.c
src/libmime/mime_encoding.h
src/libstat/tokenizers/tokenizers.c
src/lua/lua_mimepart.c

index 4a765643a1e00d6b62e752986a3740f9a291cfb7..b76fa1b23da7b90e3c946e80228e6db5014c2062 100644 (file)
@@ -61,9 +61,6 @@ static void
 rspamd_mime_part_extract_words (struct rspamd_task *task,
                struct rspamd_mime_text_part *part)
 {
-#ifdef WITH_SNOWBALL
-       struct sb_stemmer *stem = NULL;
-#endif
        rspamd_stat_token_t *w;
        gchar *temp_word;
        const guchar *r;
@@ -71,92 +68,26 @@ rspamd_mime_part_extract_words (struct rspamd_task *task,
        gdouble avg_len = 0;
 
        if (part->utf_words) {
-#ifdef WITH_SNOWBALL
-
-
-               if (part->language && part->language[0] != '\0' && IS_PART_UTF (part)) {
-
-                       if (!stemmers) {
-                               stemmers = g_hash_table_new (rspamd_strcase_hash,
-                                               rspamd_strcase_equal);
-                       }
-
-                       stem = g_hash_table_lookup (stemmers, part->language);
-
-                       if (stem == NULL) {
-
-                               stem = sb_stemmer_new (part->language, "UTF_8");
-
-                               if (stem == NULL) {
-                                       msg_debug_task (
-                                                       "<%s> cannot create lemmatizer for %s language",
-                                                       task->message_id, part->language);
-                               } else {
-                                       g_hash_table_insert (stemmers, g_strdup (part->language),
-                                                       stem);
-                               }
-                       }
-               }
-#endif
-
+               rspamd_stem_words (part->utf_words, task->task_pool, part->language,
+                                       task->lang_det);
 
                for (i = 0; i < part->utf_words->len; i++) {
                        guint64 h;
 
                        w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
-                       r = NULL;
-#ifdef WITH_SNOWBALL
-                       if (stem) {
-                               r = sb_stemmer_stem (stem, w->begin, w->len);
-                       }
-#endif
 
-                       if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
-                               avg_len = avg_len + (w->len - avg_len) / (double) (i + 1);
-
-                               if (r != NULL) {
-                                       nlen = strlen (r);
-                                       nlen = MIN (nlen, w->len);
-                                       temp_word = rspamd_mempool_alloc (task->task_pool, nlen);
-                                       memcpy (temp_word, r, nlen);
-
-                                       if (IS_PART_UTF (part)) {
-                                               rspamd_str_lc_utf8 (temp_word, nlen);
-                                       }
-                                       else {
-                                               rspamd_str_lc (temp_word, nlen);
-                                       }
-
-                                       w->begin = temp_word;
-                                       w->len = nlen;
-                               }
-                               else {
-                                       temp_word = rspamd_mempool_alloc (task->task_pool, w->len);
-                                       memcpy (temp_word, w->begin, w->len);
-
-                                       if (IS_PART_UTF (part)) {
-                                               rspamd_str_lc_utf8 (temp_word, w->len);
-                                       }
-                                       else {
-                                               rspamd_str_lc (temp_word, w->len);
-                                       }
-
-                                       w->begin = temp_word;
-                               }
-                       }
-
-                       if (w->len > 0) {
+                       if (w->stemmed.len > 0) {
                                /*
                                 * We use static hash seed if we would want to use that in shingles
                                 * computation in future
                                 */
                                h = rspamd_cryptobox_fast_hash_specific (
                                                RSPAMD_CRYPTOBOX_HASHFAST_INDEPENDENT,
-                                               w->begin, w->len, words_hash_seed);
+                                               w->stemmed.begin, w->stemmed.len, words_hash_seed);
                                g_array_append_val (part->normalized_hashes, h);
-                               total_len += w->len;
+                               total_len += w->stemmed.len;
 
-                               if (w->len <= 3) {
+                               if (w->stemmed.len <= 3) {
                                        short_len++;
                                }
                        }
@@ -251,6 +182,7 @@ rspamd_mime_part_create_words (struct rspamd_task *task,
        if (part->utf_words) {
                part->normalized_hashes = g_array_sized_new (FALSE, FALSE,
                                sizeof (guint64), part->utf_words->len);
+               rspamd_normalize_words (part->utf_words, task->task_pool);
        }
 
 }
@@ -757,17 +689,9 @@ rspamd_message_process_html_text_part (struct rspamd_task *task,
                text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_EMPTY;
        }
 
-       /* Also add unicode content */
-       text_part->unicode_content =  g_array_sized_new (FALSE, FALSE,
-                       sizeof (UChar), text_part->utf_content->len + 1);
-       rspamd_utf_to_unicode (text_part->utf_content, text_part->unicode_content);
-
        rspamd_mempool_add_destructor (task->task_pool,
                        (rspamd_mempool_destruct_t) free_byte_array_callback,
                        text_part->utf_content);
-       rspamd_mempool_add_destructor (task->task_pool,
-                       rspamd_array_free_hard,
-                       text_part->unicode_content);
 
        return TRUE;
 }
@@ -1265,7 +1189,7 @@ rspamd_message_process (struct rspamd_task *task)
                                                sel = p2;
                                        }
                                        else {
-                                               if (p1->unicode_content->len > p2->unicode_content->len) {
+                                               if (p1->utf_content->len > p2->utf_content->len) {
                                                        sel = p1;
                                                }
                                                else {
index 269166344215b55dce96be4aa21c155ad69dec83..e3479c3e788394fda86068300562c1379a3ebafd 100644 (file)
@@ -241,115 +241,6 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        return d;
 }
 
-static void
-rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
-                                                                       struct rspamd_mime_text_part *text_part)
-{
-       GByteArray *utf;
-       UErrorCode uc_err = U_ZERO_ERROR;
-       UConverter *utf8_converter = rspamd_get_utf8_converter ();
-
-       utf = text_part->utf_raw_content;
-       text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
-                       sizeof (UChar), utf->len + 1);
-       text_part->unicode_raw_content->len = ucnv_toUChars (utf8_converter,
-                       (UChar *)text_part->unicode_raw_content->data,
-                       utf->len + 1,
-                       utf->data,
-                       utf->len,
-                       &uc_err);
-
-       if (!U_SUCCESS (uc_err)) {
-               g_array_free (text_part->unicode_raw_content, TRUE);
-               text_part->unicode_raw_content = NULL;
-       }
-}
-
-static void
-rspamd_mime_text_part_normalise (struct rspamd_task *task,
-                                                                struct rspamd_mime_text_part *text_part)
-{
-#if U_ICU_VERSION_MAJOR_NUM >= 44
-       UErrorCode uc_err = U_ZERO_ERROR;
-       gint32 nsym, end;
-       UChar *src = NULL, *dest = NULL;
-       const UNormalizer2 *norm = rspamd_get_unicode_normalizer ();
-
-       if (!text_part->unicode_raw_content) {
-               return;
-       }
-
-       src = (UChar *)text_part->unicode_raw_content->data;
-       nsym = text_part->unicode_raw_content->len;
-
-       /* We can now check if we need to decompose */
-       end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
-
-       if (!U_SUCCESS (uc_err)) {
-               msg_warn_task ("cannot normalise URL, cannot check normalisation: %s",
-                               u_errorName (uc_err));
-               return;
-       }
-
-       if (end == nsym) {
-               /* Already normalised */
-               return;
-       }
-
-       text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL;
-       dest = g_malloc (nsym * sizeof (*dest));
-       memcpy (dest, src, end * sizeof (*dest));
-       nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
-                       src + end, nsym - end, &uc_err);
-
-       if (!U_SUCCESS (uc_err)) {
-               if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
-                       msg_warn_task ("cannot normalise URL: %s",
-                                       u_errorName (uc_err));
-               }
-       }
-       else {
-               /* Copy normalised back */
-               memcpy (text_part->unicode_raw_content->data, dest, nsym * sizeof (UChar));
-               text_part->unicode_raw_content->len = nsym;
-               text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
-       }
-
-       g_free (dest);
-#endif
-}
-
-/*
- * Recode utf from normalised unichars if needed
- */
-static void
-rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
-                                                                                struct rspamd_mime_text_part *text_part)
-{
-       UErrorCode uc_err = U_ZERO_ERROR;
-       guint clen, dlen;
-       gint r;
-       UConverter *utf8_converter;
-
-       utf8_converter = rspamd_get_utf8_converter ();
-
-       if ((text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) &&
-               text_part->unicode_raw_content) {
-               clen = ucnv_getMaxCharSize (utf8_converter);
-               dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->unicode_raw_content->len,
-                               clen);
-               g_byte_array_set_size (text_part->utf_raw_content, dlen);
-               r = ucnv_fromUChars (utf8_converter,
-                               text_part->utf_raw_content->data,
-                               dlen,
-                               (UChar *)text_part->unicode_raw_content->data,
-                               text_part->unicode_raw_content->len,
-                               &uc_err);
-               text_part->utf_raw_content->len = r;
-       }
-}
-
-
 static gboolean
 rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
                                                                        struct rspamd_mime_text_part *text_part,
@@ -358,8 +249,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
                                                                        GError **err)
 {
        gchar *d;
-       gint32 r, clen, dlen;
-
+       gint32 r, clen, dlen, uc_len;
+       UChar *tmp_buf;
        UErrorCode uc_err = U_ZERO_ERROR;
        UConverter *conv, *utf8_converter;
 
@@ -374,11 +265,10 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
                return FALSE;
        }
 
-
-       text_part->unicode_raw_content = g_array_sized_new (FALSE, FALSE,
-                       sizeof (UChar), input->len + 1);
-       r = ucnv_toUChars (conv,
-                       (UChar *)text_part->unicode_raw_content->data,
+       tmp_buf = g_new (UChar, input->len + 1);
+       uc_err = U_ZERO_ERROR;
+       uc_len = ucnv_toUChars (conv,
+                       tmp_buf,
                        input->len + 1,
                        input->data,
                        input->len,
@@ -388,33 +278,34 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
                                "cannot convert data to unicode from %s: %s",
                                charset, u_errorName (uc_err));
+               g_free (tmp_buf);
+
                return FALSE;
        }
 
-       text_part->unicode_raw_content->len = r;
-       rspamd_mime_text_part_normalise (task, text_part);
-
        /* Now, convert to utf8 */
        clen = ucnv_getMaxCharSize (utf8_converter);
-       dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+       dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
        d = rspamd_mempool_alloc (task->task_pool, dlen);
        r = ucnv_fromUChars (utf8_converter, d, dlen,
-                       (UChar *)text_part->unicode_raw_content->data, r, &uc_err);
+                       tmp_buf, uc_len, &uc_err);
 
        if (!U_SUCCESS (uc_err)) {
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
                                "cannot convert data from unicode from %s: %s",
                                charset, u_errorName (uc_err));
+               g_free (tmp_buf);
 
                return FALSE;
        }
 
-       msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d",
-                       charset, input->len, r);
+       msg_info_task ("converted from %s to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
+                       charset, input->len, r, uc_len);
        text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
                        sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
        text_part->utf_raw_content->data = d;
        text_part->utf_raw_content->len = r;
+       g_free (tmp_buf);
 
        return TRUE;
 }
@@ -658,9 +549,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                else {
                        SET_PART_UTF (text_part);
                        text_part->utf_raw_content = part_content;
-                       rspamd_mime_text_part_ucs_from_utf (task, text_part);
-                       rspamd_mime_text_part_normalise (task, text_part);
-                       rspamd_mime_text_part_maybe_renormalise (task, text_part);
                        text_part->real_charset = UTF8_CHARSET;
 
                        return;
@@ -693,9 +581,6 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                        part_content->len, !checked)) {
                SET_PART_UTF (text_part);
                text_part->utf_raw_content = part_content;
-               rspamd_mime_text_part_ucs_from_utf (task, text_part);
-               rspamd_mime_text_part_normalise (task, text_part);
-               rspamd_mime_text_part_maybe_renormalise (task, text_part);
                text_part->real_charset = UTF8_CHARSET;
 
                return;
@@ -721,18 +606,3 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 
        SET_PART_UTF (text_part);
 }
-
-void
-rspamd_utf_to_unicode (GByteArray *in, GArray *dest)
-{
-       UErrorCode uc_err = U_ZERO_ERROR;
-       UConverter *utf8_converter = rspamd_get_utf8_converter ();
-
-       g_array_set_size (dest, in->len + 1);
-       dest->len = ucnv_toUChars (utf8_converter,
-                       (UChar *)dest->data,
-                       in->len + 1,
-                       in->data,
-                       in->len,
-                       &uc_err);
-}
index 0754bb3484e1027a3f82a9172601e15ba3aac552..5f436d99dd7a9513a74fd1736821aeba6c47b8e2 100644 (file)
@@ -18,6 +18,7 @@
 
 #include "config.h"
 #include "mem_pool.h"
+#include "fstring.h"
 
 struct rspamd_task;
 struct rspamd_mime_part;
@@ -86,11 +87,5 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  */
 void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
 
-/**
- * Converts utf8 to libicu unichars
- * @param in
- * @param dest
- */
-void rspamd_utf_to_unicode (GByteArray *in, GArray *dest);
 
 #endif /* SRC_LIBMIME_MIME_ENCODING_H_ */
index 32d9ba0df3b7e2ebf019a6ea8f636b99d47678e7..9ec0c4315e909e7e08a6aa1772f05a123af2cf31 100644 (file)
@@ -745,28 +745,25 @@ rspamd_stem_words (GArray *words, rspamd_mempool_t *pool,
 
                if (tok->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
                        if (stem) {
-                               const gchar *stemmed;
+                               const gchar *stemmed = NULL;
 
                                stemmed = sb_stemmer_stem (stem,
                                                tok->normalized.begin, tok->normalized.len);
 
-                               dlen = strlen (stemmed);
+                               dlen = stemmed ? strlen (stemmed) : 0;
 
                                if (dlen > 0) {
-                                       dest = rspamd_mempool_alloc (pool, dlen);
+                                       dest = rspamd_mempool_alloc (pool, dlen + 1);
                                        memcpy (dest, stemmed, dlen);
-                                       rspamd_str_lc_utf8 (dest, dlen);
+                                       dest[dlen] = '\0';
                                        tok->stemmed.len = dlen;
                                        tok->stemmed.begin = dest;
                                        tok->flags |= RSPAMD_STAT_TOKEN_FLAG_STEMMED;
                                }
                                else {
                                        /* Fallback */
-                                       dest = rspamd_mempool_alloc (pool, tok->normalized.len);
-                                       memcpy (dest, tok->normalized.begin, tok->normalized.len);
-                                       rspamd_str_lc_utf8 (dest, tok->normalized.len);
                                        tok->stemmed.len = tok->normalized.len;
-                                       tok->stemmed.begin = dest;
+                                       tok->stemmed.begin = tok->normalized.begin;
                                }
                        }
                        else {
index a6fc2bfa5a744e01318b964f46c3db0afb3d06c0..9e74c87c09286814180f2f4c6f4d8cce0fb0cabf 100644 (file)
@@ -923,8 +923,8 @@ struct lua_shingle_data {
 #define STORE_TOKEN(i, t) do { \
     if ((i) < part->utf_words->len) { \
         word = &g_array_index (part->utf_words, rspamd_stat_token_t, (i)); \
-        sd->t.begin = word->begin; \
-        sd->t.len = word->len; \
+        sd->t.begin = word->stemmed.begin; \
+        sd->t.len = word->stemmed.len; \
     } \
     }while (0)