]> source.dussan.org Git - rspamd.git/commitdiff
[Minor] Unify converters usage
authorVsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 19 Nov 2019 10:05:14 +0000 (10:05 +0000)
committerVsevolod Stakhov <vsevolod@highsecure.ru>
Tue, 19 Nov 2019 10:05:14 +0000 (10:05 +0000)
src/libmime/archives.c
src/libmime/mime_encoding.c
src/libmime/mime_encoding.h
src/libmime/mime_headers.c

index 8c7e4ea90e31d50c4ce630d378e71891f6a233d6..5701ce95c61e601ff59f537c3d5da634bb1d38b0 100644 (file)
@@ -67,7 +67,8 @@ rspamd_archive_file_try_utf (struct rspamd_task *task,
                struct rspamd_charset_converter *conv;
                UConverter *utf8_converter;
 
-               conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+               conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
+                               FALSE, &uc_err);
                utf8_converter = rspamd_get_utf8_converter ();
 
                if (conv == NULL) {
index 7872fa330a3ef4341bd39fda68f77da8552af214..7ef492f9d414764318af9ebf6a24ef7ab27bdc4c 100644 (file)
@@ -135,7 +135,10 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
 
 
 struct rspamd_charset_converter *
-rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
+rspamd_mime_get_converter_cached (const gchar *enc,
+                                                                 rspamd_mempool_t *pool,
+                                                                 gboolean is_canon,
+                                                                 UErrorCode *err)
 {
        const gchar *canon_name;
        static rspamd_lru_hash_t *cache;
@@ -147,7 +150,19 @@ rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
                                rspamd_str_equal);
        }
 
-       canon_name = ucnv_getStandardName (enc, "IANA", err);
+       if (enc == NULL) {
+               return NULL;
+       }
+
+       if (!is_canon) {
+               rspamd_ftok_t cset_tok;
+
+               RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
+               canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
+       }
+       else {
+               canon_name = enc;
+       }
 
        if (canon_name == NULL) {
                return NULL;
@@ -306,7 +321,7 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
        UConverter *utf8_converter;
        struct rspamd_charset_converter *conv;
 
-       conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
+       conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
        utf8_converter = rspamd_get_utf8_converter ();
 
        if (conv == NULL) {
@@ -370,7 +385,8 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
        UConverter *utf8_converter;
        struct rspamd_charset_converter *conv;
 
-       conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+       conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
+                       TRUE, &uc_err);
        utf8_converter = rspamd_get_utf8_converter ();
 
        if (conv == NULL) {
@@ -429,6 +445,7 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
 gboolean
 rspamd_mime_to_utf8_byte_array (GByteArray *in,
                GByteArray *out,
+               rspamd_mempool_t *pool,
                const gchar *enc)
 {
        gint32 r, clen, dlen;
@@ -438,6 +455,24 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
        struct rspamd_charset_converter *conv;
        rspamd_ftok_t charset_tok;
 
+       if (in == NULL || in->len == 0) {
+               return FALSE;
+       }
+
+       if (enc == NULL) {
+               /* Assume utf ? */
+               if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
+                       g_byte_array_set_size (out, in->len);
+                       memcpy (out->data, in->data, out->len);
+
+                       return TRUE;
+               }
+               else {
+                       /* Bad stuff, keep out */
+                       return FALSE;
+               }
+       }
+
        RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
 
        if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
@@ -449,7 +484,7 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
        }
 
        utf8_converter = rspamd_get_utf8_converter ();
-       conv = rspamd_mime_get_converter_cached (enc, &uc_err);
+       conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
 
        if (conv == NULL) {
                return FALSE;
index 5224d33fb39d5333ce5428872d5f98b57dedcc3c..22f0ee818f3f98626dc9efa53e4495bc0be6ba7f 100644 (file)
@@ -47,7 +47,7 @@ const gchar *rspamd_mime_detect_charset (const rspamd_ftok_t *in,
  * @param pool
  * @param input
  * @param len
- * @param in_enc
+ * @param in_enc canon charset
  * @param olen
  * @param err
  * @return
@@ -57,14 +57,20 @@ gchar *rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
                                                                 gsize *olen, GError **err);
 
 /**
- * Converts data from `in` to `out`, returns `FALSE` if `enc` is not a valid iconv charset
+ * Converts data from `in` to `out`,
+ * returns `FALSE` if `enc` is not a valid iconv charset
+ *
+ * This function, in fact, copies `in` to `out`, replacing the content of
+ * `out` entirely.
  * @param in
  * @param out
- * @param enc
+ * @param enc validated canonical charset name. If NULL, the input is only checked for UTF-8 validity and copied as-is when valid
  * @return
  */
 gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in,
-                                                                                GByteArray *out, const gchar *enc);
+                                                                                GByteArray *out,
+                                                                                rspamd_mempool_t *pool,
+                                                                                const gchar *enc);
 
 /**
  * Maybe convert part to utf-8
@@ -83,7 +89,8 @@ void rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
  * @return
  */
 gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
-                                                                               gchar *in, gsize len, gboolean content_check);
+                                                                               gchar *in, gsize len,
+                                                                               gboolean content_check);
 
 /**
  * Ensure that all characters in string are valid utf8 chars or replace them
@@ -93,14 +100,18 @@ gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
  */
 void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
 
-/**
- * Gets cached converter
- * @param enc
- * @param err
- * @return
- */
+ /**
+  * Gets cached converter
+  * @param enc input encoding
+  * @param pool pool to use for temporary normalisation
+  * @param is_canon TRUE if `enc` is already a canonical name; if FALSE, `enc` is normalised first using `pool`
+  * @param err output error
+  * @return converter
+  */
 struct rspamd_charset_converter *rspamd_mime_get_converter_cached (
                const gchar *enc,
+               rspamd_mempool_t *pool,
+               gboolean is_canon,
                UErrorCode *err);
 
 /**
index b024bd7b148398f0872a72999361465e2d6677a0..e0c91c47858349eef2dec26f9c1dbb00066331fd 100644 (file)
@@ -512,9 +512,12 @@ rspamd_mime_headers_process (struct rspamd_task *task,
 }
 
 static void
-rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out,
-               GByteArray *token, GByteArray *decoded_token,
-               rspamd_ftok_t *old_charset, rspamd_ftok_t *new_charset)
+rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool,
+                                                                        GString *out,
+                                                                        GByteArray *token,
+                                                                        GByteArray *decoded_token,
+                                                                        rspamd_ftok_t *old_charset,
+                                                                        rspamd_ftok_t *new_charset)
 {
        if (new_charset->len == 0) {
                g_assert_not_reached ();
@@ -538,14 +541,22 @@ rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out,
        }
 
        /* We need to flush and decode old token to out string */
-       if (rspamd_mime_to_utf8_byte_array (token, decoded_token,
+       if (rspamd_mime_to_utf8_byte_array (token, decoded_token, pool,
                        rspamd_mime_detect_charset (new_charset, pool))) {
                g_string_append_len (out, decoded_token->data, decoded_token->len);
        }
 
        /* We also reset buffer */
        g_byte_array_set_size (token, 0);
-       /* Propagate charset */
+       /*
+        * Propagate charset
+        *
+        * Here be dragons: we save the original charset to allow buffer
+        * concatenation in the condition at the beginning of the function.
+        * However, this will likely cause unnecessary calls to
+        * `rspamd_mime_detect_charset`, which could be relatively expensive.
+        * But we ignore that for now...
+        */
        memcpy (old_charset, new_charset, sizeof (*old_charset));
 }