struct rspamd_charset_converter *conv;
UConverter *utf8_converter;
- conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+ conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
+ FALSE, &uc_err);
utf8_converter = rspamd_get_utf8_converter ();
if (conv == NULL) {
struct rspamd_charset_converter *
-rspamd_mime_get_converter_cached (const gchar *enc, UErrorCode *err)
+rspamd_mime_get_converter_cached (const gchar *enc,
+ rspamd_mempool_t *pool,
+ gboolean is_canon,
+ UErrorCode *err)
{
const gchar *canon_name;
static rspamd_lru_hash_t *cache;
rspamd_str_equal);
}
- canon_name = ucnv_getStandardName (enc, "IANA", err);
+ if (enc == NULL) {
+ return NULL;
+ }
+
+ if (!is_canon) {
+ rspamd_ftok_t cset_tok;
+
+ RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
+ canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
+ }
+ else {
+ canon_name = enc;
+ }
if (canon_name == NULL) {
return NULL;
UConverter *utf8_converter;
struct rspamd_charset_converter *conv;
- conv = rspamd_mime_get_converter_cached (in_enc, &uc_err);
+ conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
utf8_converter = rspamd_get_utf8_converter ();
if (conv == NULL) {
UConverter *utf8_converter;
struct rspamd_charset_converter *conv;
- conv = rspamd_mime_get_converter_cached (charset, &uc_err);
+ conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
+ TRUE, &uc_err);
utf8_converter = rspamd_get_utf8_converter ();
if (conv == NULL) {
gboolean
rspamd_mime_to_utf8_byte_array (GByteArray *in,
GByteArray *out,
+ rspamd_mempool_t *pool,
const gchar *enc)
{
gint32 r, clen, dlen;
struct rspamd_charset_converter *conv;
rspamd_ftok_t charset_tok;
+ if (in == NULL || in->len == 0) {
+ return FALSE;
+ }
+
+ if (enc == NULL) {
+ /* Assume utf ? */
+ if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
+ g_byte_array_set_size (out, in->len);
+ memcpy (out->data, in->data, out->len);
+
+ return TRUE;
+ }
+ else {
+ /* Bad stuff, keep out */
+ return FALSE;
+ }
+ }
+
RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
}
utf8_converter = rspamd_get_utf8_converter ();
- conv = rspamd_mime_get_converter_cached (enc, &uc_err);
+ conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
if (conv == NULL) {
return FALSE;
* @param pool
* @param input
* @param len
- * @param in_enc
+ * @param in_enc canonical charset name
* @param olen
* @param err
* @return
gsize *olen, GError **err);
/**
- * Converts data from `in` to `out`, returns `FALSE` if `enc` is not a valid iconv charset
+ * Converts data from `in` to `out`,
+ * returns `FALSE` if `enc` is not a valid iconv charset
+ *
+ * This function, in fact, copies `in` to `out`, replacing the content of
+ * `out` entirely.
* @param in
* @param out
- * @param enc
+ * @param pool memory pool passed on to the cached converter lookup
+ * @param enc validated canonical charset name. If NULL, only a UTF-8 validity check is performed
* @return
*/
gboolean rspamd_mime_to_utf8_byte_array (GByteArray *in,
- GByteArray *out, const gchar *enc);
+ GByteArray *out,
+ rspamd_mempool_t *pool,
+ const gchar *enc);
/**
* Maybe convert part to utf-8
* @return
*/
gboolean rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
- gchar *in, gsize len, gboolean content_check);
+ gchar *in, gsize len,
+ gboolean content_check);
/**
* Ensure that all characters in string are valid utf8 chars or replace them
*/
void rspamd_mime_charset_utf_enforce (gchar *in, gsize len);
-/**
- * Gets cached converter
- * @param enc
- * @param err
- * @return
- */
+ /**
+ * Gets cached converter
+ * @param enc input encoding
+ * @param pool pool to use for temporary normalisation
+ * @param is_canon TRUE if `enc` is already a canonical name; FALSE if it must be normalised first
+ * @param err output error
+ * @return converter
+ */
struct rspamd_charset_converter *rspamd_mime_get_converter_cached (
const gchar *enc,
+ rspamd_mempool_t *pool,
+ gboolean is_canon,
UErrorCode *err);
/**
}
static void
-rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool, GString *out,
- GByteArray *token, GByteArray *decoded_token,
- rspamd_ftok_t *old_charset, rspamd_ftok_t *new_charset)
+rspamd_mime_header_maybe_save_token (rspamd_mempool_t *pool,
+ GString *out,
+ GByteArray *token,
+ GByteArray *decoded_token,
+ rspamd_ftok_t *old_charset,
+ rspamd_ftok_t *new_charset)
{
if (new_charset->len == 0) {
g_assert_not_reached ();
}
/* We need to flush and decode old token to out string */
- if (rspamd_mime_to_utf8_byte_array (token, decoded_token,
+ if (rspamd_mime_to_utf8_byte_array (token, decoded_token, pool,
rspamd_mime_detect_charset (new_charset, pool))) {
g_string_append_len (out, decoded_token->data, decoded_token->len);
}
/* We also reset buffer */
g_byte_array_set_size (token, 0);
- /* Propagate charset */
+ /*
+ * Propagate charset
+ *
+ * Here are dragons: we save the original charset to allow buffers concat
+ * in the condition at the beginning of the function.
+ * However, it will likely cause unnecessary calls for
+ * `rspamd_mime_detect_charset` which could be relatively expensive.
+ * But we ignore that for now...
+ */
memcpy (old_charset, new_charset, sizeof (*old_charset));
}