]> source.dussan.org Git - rspamd.git/commitdiff
[Project] Further changes in unicode operations
author: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Aug 2018 19:06:34 +0000 (20:06 +0100)
committer: Vsevolod Stakhov <vsevolod@highsecure.ru>
Thu, 23 Aug 2018 19:06:34 +0000 (20:06 +0100)
* Normalise unicode
* Add normality flag for text parts
* Store UCS in text parts
* Rework unicode conversions and operations

src/libmime/message.c
src/libmime/message.h
src/libmime/mime_encoding.c
src/libmime/mime_encoding.h
src/libserver/task.c

index 5d9cf19d1441ec9993579a2c2043e2494847a8b8..e6cb63504cad4a2d4ead8cb1826050c99d4231bd 100644 (file)
@@ -661,7 +661,6 @@ rspamd_message_process_text_part (struct rspamd_task *task,
 {
        struct rspamd_mime_text_part *text_part;
        rspamd_ftok_t html_tok, xhtml_tok;
-       GByteArray *part_content;
        gboolean found_html = FALSE, found_txt = FALSE;
        enum rspamd_action_type act;
 
@@ -756,22 +755,21 @@ rspamd_message_process_text_part (struct rspamd_task *task,
                        return;
                }
 
-               part_content = rspamd_mime_text_part_maybe_convert (task, text_part);
+               rspamd_mime_text_part_maybe_convert (task, text_part);
 
-               if (part_content == NULL) {
+               if (text_part->utf_raw_content == NULL) {
                        return;
                }
 
                text_part->html = rspamd_mempool_alloc0 (task->task_pool,
                                sizeof (*text_part->html));
                text_part->mime_part = mime_part;
-               text_part->utf_raw_content = part_content;
 
                text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_BALANCED;
                text_part->content = rspamd_html_process_part_full (
                                task->task_pool,
                                text_part->html,
-                               part_content,
+                               text_part->utf_raw_content,
                                &text_part->exceptions,
                                task->urls,
                                task->emails);
@@ -802,15 +800,14 @@ rspamd_message_process_text_part (struct rspamd_task *task,
                        return;
                }
 
-               text_part->content = rspamd_mime_text_part_maybe_convert (task,
-                               text_part);
-               text_part->utf_raw_content = text_part->content;
+               rspamd_mime_text_part_maybe_convert (task, text_part);
 
-               if (text_part->content != NULL) {
+               if (text_part->utf_raw_content != NULL) {
                        /*
                         * We ignore unconverted parts from now as it is dangerous
                         * to treat them as text parts
                         */
+                       text_part->content = text_part->utf_raw_content;
                        g_ptr_array_add (task->text_parts, text_part);
                }
                else {
index b0a7983b47bb876f7f3bf3785b46631a8f31fb59..baabb762aa9e3b6636172f80a4dfe26bd5afd53e 100644 (file)
@@ -73,6 +73,8 @@ struct rspamd_mime_part {
 #define RSPAMD_MIME_TEXT_PART_FLAG_HTML (1 << 3)
 #define RSPAMD_MIME_TEXT_PART_FLAG_8BIT (1 << 4)
 #define RSPAMD_MIME_TEXT_PART_FLAG_8BIT_ENCODED (1 << 5)
+#define RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL (1 << 6)
+#define RSPAMD_MIME_TEXT_PART_NORMALISED (1 << 7)
 
 #define IS_PART_EMPTY(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_EMPTY)
 #define IS_PART_UTF(part) ((part)->flags & RSPAMD_MIME_TEXT_PART_FLAG_UTF)
@@ -88,7 +90,7 @@ struct rspamd_mime_text_part {
        rspamd_ftok_t parsed; /* decoded from mime encodings */
        GByteArray *content; /* utf8 encoded processed content */
 
-       UChar *ucs_raw_content; /* unicode raw content */
+       GArray *ucs_raw_content; /* unicode raw content (of UChar) */
        GByteArray *utf_raw_content; /* utf raw content */
        GByteArray *stripped_content; /* utf content with no newlines */
        GPtrArray *newlines;    /**< positions of newlines in text, relative to content*/
index 605ab76499ed8ad16a43a959f649560b2fcb2040..1e284c6c272d0d11ce82b2ab7480061c21814668 100644 (file)
@@ -23,6 +23,7 @@
 #include "message.h"
 #include <unicode/ucnv.h>
 #include <unicode/ucsdet.h>
+#include <unicode/unorm2.h>
 #include <math.h>
 
 #define UTF8_CHARSET "UTF-8"
 static rspamd_regexp_t *utf_compatible_re = NULL;
 UConverter *utf8_converter = NULL;
 
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+static const UNormalizer2 *norm = NULL;
+#endif
+
 struct rspamd_charset_substitution {
        const gchar *input;
        const gchar *canon;
@@ -94,6 +99,36 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
        return conv;
 }
 
+/*
+ * Opens the shared `utf8_converter` on first use; later calls are no-ops.
+ *
+ * If the converter cannot be opened, the error is logged and the process
+ * asserts. After a successful open, ICU's SUBSTITUTE callbacks are installed
+ * for both conversion directions; the status of those two calls is not
+ * inspected.
+ */
+static inline void
+rspamd_mime_utf8_conv_init (void)
+{
+       UErrorCode icu_status = U_ZERO_ERROR;
+
+       if (utf8_converter != NULL) {
+               /* Already opened by an earlier call */
+               return;
+       }
+
+       utf8_converter = ucnv_open (UTF8_CHARSET, &icu_status);
+
+       if (U_FAILURE (icu_status)) {
+               msg_err ("FATAL error: cannot open converter for utf8: %s",
+                               u_errorName (icu_status));
+
+               g_assert_not_reached ();
+       }
+
+       /* From-unicode direction */
+       ucnv_setFromUCallBack (utf8_converter, UCNV_FROM_U_CALLBACK_SUBSTITUTE,
+                       NULL, NULL, NULL, &icu_status);
+       /* To-unicode direction */
+       ucnv_setToUCallBack (utf8_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE,
+                       NULL, NULL, NULL, &icu_status);
+}
+
 static void
 rspamd_mime_encoding_substitute_init (void)
 {
@@ -189,25 +224,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        UErrorCode uc_err = U_ZERO_ERROR;
        UConverter *conv;
 
-       if (utf8_converter == NULL) {
-               utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
-
-               if (uc_err != U_ZERO_ERROR) {
-                       g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
-                                       "cannot open converter for utf8: %s",
-                                       u_errorName (uc_err));
-
-                       return NULL;
-               }
-
-               ucnv_setFromUCallBack (utf8_converter,
-                               UCNV_FROM_U_CALLBACK_SUBSTITUTE,
-                               NULL,
-                               NULL,
-                               NULL,
-                               &uc_err);
-       }
-
+       rspamd_mime_utf8_conv_init ();
        conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
 
        if (conv == NULL) {
@@ -222,7 +239,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        uc_err = U_ZERO_ERROR;
        r = ucnv_toUChars (conv, tmp_buf, len + 1, input, len, &uc_err);
 
-       if (uc_err != U_ZERO_ERROR) {
+       if (!U_SUCCESS (uc_err)) {
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
                                        "cannot convert data to unicode from %s: %s",
                                        in_enc, u_errorName (uc_err));
@@ -237,7 +254,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        d = rspamd_mempool_alloc (pool, dlen);
        r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
 
-       if (uc_err != U_ZERO_ERROR) {
+       if (!U_SUCCESS (uc_err)) {
                g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
                                "cannot convert data from unicode from %s: %s",
                                in_enc, u_errorName (uc_err));
@@ -257,6 +274,186 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        return d;
 }
 
+/*
+ * Builds text_part->ucs_raw_content (a GArray of UChar) from the bytes in
+ * text_part->utf_raw_content using the shared UTF-8 converter.
+ *
+ * @param task currently unused
+ * @param text_part part whose utf_raw_content is converted
+ *
+ * Results:
+ *  - utf_raw_content is NULL: nothing is done, ucs_raw_content is untouched
+ *  - conversion fails: ucs_raw_content is set to NULL
+ *  - conversion succeeds: ucs_raw_content is a new array whose length is
+ *    the number of UChars reported by ICU
+ */
+static void
+rspamd_mime_text_part_ucs_from_utf (struct rspamd_task *task,
+                                                                       struct rspamd_mime_text_part *text_part)
+{
+       GByteArray *utf;
+       GArray *ucs;
+       UErrorCode uc_err = U_ZERO_ERROR;
+       gint32 nchars;
+
+       utf = text_part->utf_raw_content;
+
+       if (utf == NULL) {
+               /* No utf content attached to the part: nothing to convert */
+               return;
+       }
+
+       rspamd_mime_utf8_conv_init ();
+
+       /* Reserve utf->len + 1 UChars, the same capacity passed to ICU below */
+       ucs = g_array_sized_new (FALSE, FALSE, sizeof (UChar), utf->len + 1);
+       nchars = ucnv_toUChars (utf8_converter,
+                       (UChar *)ucs->data,
+                       utf->len + 1,
+                       utf->data,
+                       utf->len,
+                       &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               g_array_free (ucs, TRUE);
+               text_part->ucs_raw_content = NULL;
+
+               return;
+       }
+
+       /* ICU wrote into the reserved storage directly, so set the length here */
+       ucs->len = nchars;
+       text_part->ucs_raw_content = ucs;
+}
+
+static void
+rspamd_mime_text_part_normalise (struct rspamd_task *task,
+                                                                struct rspamd_mime_text_part *text_part)
+{
+#if U_ICU_VERSION_MAJOR_NUM >= 44
+       UErrorCode uc_err = U_ZERO_ERROR;
+       gint32 nsym, end;
+       UChar *src = NULL, *dest = NULL;
+
+       if (norm == NULL) {
+               norm = unorm2_getInstance (NULL, "nfkc", UNORM2_COMPOSE, &uc_err);
+       }
+
+       if (!text_part->ucs_raw_content) {
+               return;
+       }
+
+       src = (UChar *)text_part->ucs_raw_content->data;
+       nsym = text_part->ucs_raw_content->len;
+
+       /* We can now check if we need to decompose */
+       end = unorm2_spanQuickCheckYes (norm, src, nsym, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               msg_warn_task ("cannot normalise URL, cannot check normalisation: %s",
+                               u_errorName (uc_err));
+               return;
+       }
+
+       if (end == nsym) {
+               /* Already normalised */
+               return;
+       }
+
+       text_part->flags |= RSPAMD_MIME_TEXT_PART_HAS_SUBNORMAL;
+       dest = g_malloc (nsym * sizeof (*dest));
+       memcpy (dest, src, end * sizeof (*dest));
+       nsym = unorm2_normalizeSecondAndAppend (norm, dest, end, nsym,
+                       src + end, nsym - end, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               if (uc_err != U_BUFFER_OVERFLOW_ERROR) {
+                       msg_warn_task ("cannot normalise URL: %s",
+                                       u_errorName (uc_err));
+               }
+       }
+       else {
+               /* Copy normalised back */
+               memcpy (text_part->ucs_raw_content->data, dest, nsym * sizeof (UChar));
+               text_part->ucs_raw_content->len = nsym;
+               text_part->flags |= RSPAMD_MIME_TEXT_PART_NORMALISED;
+       }
+
+       g_free (dest);
+#endif
+}
+
+/*
+ * Recode utf from normalised unichars if needed
+ *
+ * When the part carries RSPAMD_MIME_TEXT_PART_NORMALISED and has both
+ * ucs_raw_content and utf_raw_content, the UChars are converted back to
+ * UTF-8 and replace the bytes in utf_raw_content. Otherwise nothing changes.
+ *
+ * The conversion goes through a temporary buffer, so utf_raw_content keeps
+ * its previous bytes and length if ICU reports a failure.
+ *
+ * @param task used for logging only
+ * @param text_part part to re-encode
+ */
+static void
+rspamd_mime_text_part_maybe_renormalise (struct rspamd_task *task,
+                                                                                struct rspamd_mime_text_part *text_part)
+{
+       UErrorCode uc_err = U_ZERO_ERROR;
+       guint clen, dlen;
+       gint r;
+       gchar *tmp;
+
+       rspamd_mime_utf8_conv_init ();
+
+       if (!(text_part->flags & RSPAMD_MIME_TEXT_PART_NORMALISED) ||
+               text_part->ucs_raw_content == NULL ||
+               text_part->utf_raw_content == NULL) {
+               return;
+       }
+
+       clen = ucnv_getMaxCharSize (utf8_converter);
+       dlen = UCNV_GET_MAX_BYTES_FOR_STRING (text_part->ucs_raw_content->len,
+                       clen);
+       tmp = g_malloc (dlen);
+       r = ucnv_fromUChars (utf8_converter,
+                       tmp,
+                       dlen,
+                       (UChar *)text_part->ucs_raw_content->data,
+                       text_part->ucs_raw_content->len,
+                       &uc_err);
+
+       if (U_SUCCESS (uc_err) && r >= 0 && (guint)r <= dlen) {
+               /* Keep the full dlen bytes allocated, but expose only r of them */
+               g_byte_array_set_size (text_part->utf_raw_content, dlen);
+
+               if (r > 0) {
+                       memcpy (text_part->utf_raw_content->data, tmp, r);
+               }
+
+               if ((guint)r < dlen) {
+                       /* Terminating zero just past the exposed length */
+                       text_part->utf_raw_content->data[r] = '\0';
+               }
+
+               text_part->utf_raw_content->len = r;
+       }
+       else {
+               msg_warn_task ("cannot convert normalised text back to utf8: %s",
+                               u_errorName (uc_err));
+       }
+
+       g_free (tmp);
+}
+
+
+/*
+ * Converts `input` from `charset` to UTF-8 for a text part.
+ *
+ * Steps: decode `input` into text_part->ucs_raw_content (GArray of UChar),
+ * normalise it via rspamd_mime_text_part_normalise, then encode the UChars
+ * as UTF-8 into memory taken from task->task_pool and attach the result as
+ * text_part->utf_raw_content.
+ *
+ * @param task provides the memory pool and logging context
+ * @param text_part receives ucs_raw_content and utf_raw_content
+ * @param input raw bytes in `charset`
+ * @param charset name of the source encoding
+ * @param err set (rspamd_iconv_error_quark, EINVAL) on failure
+ * @return TRUE on success; FALSE on failure, in which case
+ *         utf_raw_content is not modified and any ucs_raw_content
+ *         allocated here is released and reset to NULL
+ */
+static gboolean
+rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
+                                                                       struct rspamd_mime_text_part *text_part,
+                                                                       GByteArray *input,
+                                                                       const gchar *charset,
+                                                                       GError **err)
+{
+       gchar *d;
+       gint32 r, clen, dlen;
+
+       UErrorCode uc_err = U_ZERO_ERROR;
+       UConverter *conv;
+
+       rspamd_mime_utf8_conv_init ();
+       conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+
+       if (conv == NULL) {
+               g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
+                               "cannot open converter for %s: %s",
+                               charset, u_errorName (uc_err));
+
+               return FALSE;
+       }
+
+       text_part->ucs_raw_content = g_array_sized_new (FALSE, FALSE,
+                       sizeof (UChar), input->len + 1);
+       r = ucnv_toUChars (conv,
+                       (UChar *)text_part->ucs_raw_content->data,
+                       input->len + 1,
+                       input->data,
+                       input->len,
+                       &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
+                               "cannot convert data to unicode from %s: %s",
+                               charset, u_errorName (uc_err));
+               /* Do not leave a half-initialised array attached to the part */
+               g_array_free (text_part->ucs_raw_content, TRUE);
+               text_part->ucs_raw_content = NULL;
+
+               return FALSE;
+       }
+
+       text_part->ucs_raw_content->len = r;
+       rspamd_mime_text_part_normalise (task, text_part);
+       /* Normalisation may have changed the number of UChars: re-read it */
+       r = text_part->ucs_raw_content->len;
+
+       /* Now, convert to utf8 */
+       clen = ucnv_getMaxCharSize (utf8_converter);
+       dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
+       d = rspamd_mempool_alloc (task->task_pool, dlen);
+       r = ucnv_fromUChars (utf8_converter, d, dlen,
+                       (UChar *)text_part->ucs_raw_content->data, r, &uc_err);
+
+       if (!U_SUCCESS (uc_err)) {
+               g_set_error (err, rspamd_iconv_error_quark (), EINVAL,
+                               "cannot convert data from unicode from %s: %s",
+                               charset, u_errorName (uc_err));
+               g_array_free (text_part->ucs_raw_content, TRUE);
+               text_part->ucs_raw_content = NULL;
+
+               return FALSE;
+       }
+
+       /* NOTE(review): input->len is a guint; confirm that %z matches it */
+       msg_info_task ("converted from %s to UTF-8 inlen: %z, outlen: %d",
+                       charset, input->len, r);
+       /* Allocate the GByteArray structure itself, not the size of a pointer */
+       text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
+                       sizeof (*text_part->utf_raw_content));
+       text_part->utf_raw_content->data = d;
+       text_part->utf_raw_content->len = r;
+
+       return TRUE;
+}
+
 gboolean
 rspamd_mime_to_utf8_byte_array (GByteArray *in,
                GByteArray *out,
@@ -278,24 +475,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
                return TRUE;
        }
 
-       if (utf8_converter == NULL) {
-               utf8_converter = ucnv_open (UTF8_CHARSET, &uc_err);
-
-               if (uc_err != U_ZERO_ERROR) {
-                       msg_warn ("cannot open converter for utf8: %s",
-                                       u_errorName (uc_err));
-
-                       return FALSE;
-               }
-
-               ucnv_setFromUCallBack (utf8_converter,
-                               UCNV_FROM_U_CALLBACK_SUBSTITUTE,
-                               NULL,
-                               NULL,
-                               NULL,
-                               &uc_err);
-       }
-
+       rspamd_mime_utf8_conv_init ();
        conv = rspamd_mime_get_converter_cached (enc, &uc_err);
 
        if (conv == NULL) {
@@ -306,7 +486,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
        uc_err = U_ZERO_ERROR;
        r = ucnv_toUChars (conv, tmp_buf, in->len + 1, in->data, in->len, &uc_err);
 
-       if (uc_err != U_ZERO_ERROR) {
+       if (!U_SUCCESS (uc_err)) {
                g_free (tmp_buf);
 
                return FALSE;
@@ -318,7 +498,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
        g_byte_array_set_size (out, dlen);
        r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
 
-       if (uc_err != U_ZERO_ERROR) {
+       if (!U_SUCCESS (uc_err)) {
                g_free (tmp_buf);
 
                return FALSE;
@@ -461,16 +641,14 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
        return FALSE;
 }
 
-GByteArray *
+void
 rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                struct rspamd_mime_text_part *text_part)
 {
        GError *err = NULL;
-       gsize write_bytes;
        const gchar *charset = NULL;
        gboolean checked = FALSE, need_charset_heuristic = TRUE;
-       gchar *res_str;
-       GByteArray *result_array, *part_content;
+       GByteArray *part_content;
        rspamd_ftok_t charset_tok;
        struct rspamd_mime_part *part = text_part->mime_part;
 
@@ -494,8 +672,9 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
 
        if (task->cfg && task->cfg->raw_mode) {
                SET_PART_RAW (text_part);
+               text_part->utf_raw_content = part_content;
 
-               return part_content;
+               return;
        }
 
        if (part->ct->charset.len == 0) {
@@ -511,8 +690,11 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                }
                else {
                        SET_PART_UTF (text_part);
+                       rspamd_mime_text_part_ucs_from_utf (task, text_part);
+                       rspamd_mime_text_part_normalise (task, text_part);
+                       rspamd_mime_text_part_maybe_renormalise (task, text_part);
 
-                       return part_content;
+                       return;
                }
        }
        else {
@@ -530,27 +712,26 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
        if (charset == NULL) {
                msg_info_task ("<%s>: has invalid charset", task->message_id);
                SET_PART_RAW (text_part);
+               text_part->utf_raw_content = part_content;
 
-               return NULL;
+               return;
        }
 
        RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
 
        if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
                        part_content->len, !checked)) {
-               SET_PART_UTF (text_part);
+               rspamd_mime_text_part_ucs_from_utf (task, text_part);
+               rspamd_mime_text_part_normalise (task, text_part);
+               rspamd_mime_text_part_maybe_renormalise (task, text_part);
 
-               return part_content;
+               return;
        }
        else {
                charset = charset_tok.begin;
-               res_str = rspamd_mime_text_to_utf8 (task->task_pool, part_content->data,
-                               part_content->len,
-                               charset,
-                               &write_bytes,
-                               &err);
 
-               if (res_str == NULL) {
+               if (!rspamd_mime_text_part_utf8_convert (task, text_part,
+                               part_content, charset, &err)) {
                        msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
                                        task->message_id,
                                        charset,
@@ -558,14 +739,10 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                        SET_PART_RAW (text_part);
                        g_error_free (err);
 
-                       return NULL;
+                       text_part->utf_raw_content = part_content;
+                       return;
                }
        }
 
-       result_array = rspamd_mempool_alloc (task->task_pool, sizeof (GByteArray));
-       result_array->data = res_str;
-       result_array->len = write_bytes;
        SET_PART_UTF (text_part);
-
-       return result_array;
 }
index 58a799e456adb79941f47a06d9ba3d447f18d8e7..5e30efdaea3404513e3dbe523452a0a3635edafd 100644 (file)
@@ -65,7 +65,7 @@ gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in,
  * @param text_part
  * @return
  */
-GByteArray * rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
+void rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
                struct rspamd_mime_text_part *text_part);
 
 /**
index 82a80d0593f811aaf8661689537cc2ad9632d21e..bfeec990bf9153d7d3ca44f0dc1056bb603c9104 100644 (file)
@@ -254,6 +254,9 @@ rspamd_task_free (struct rspamd_task *task)
                        if (tp->languages) {
                                g_ptr_array_unref (tp->languages);
                        }
+                       if (tp->ucs_raw_content) {
+                               g_array_free (tp->ucs_raw_content, TRUE);
+                       }
                }
 
                if (task->rcpt_envelope) {