aboutsummaryrefslogtreecommitdiffstats
path: root/src/libmime/mime_encoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/libmime/mime_encoding.c')
-rw-r--r--src/libmime/mime_encoding.c533
1 files changed, 265 insertions, 268 deletions
diff --git a/src/libmime/mime_encoding.c b/src/libmime/mime_encoding.c
index a3467fc91..48a97a4af 100644
--- a/src/libmime/mime_encoding.c
+++ b/src/libmime/mime_encoding.c
@@ -54,23 +54,22 @@ struct rspamd_charset_substitution {
static GHashTable *sub_hash = NULL;
static const UChar iso_8859_16_map[] = {
- 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
- 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
- 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
- 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
- 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
- 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
- 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
- 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
- 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
- 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
- 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
- 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
- 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
- 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
- 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
- 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF
-};
+ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
+ 0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
+ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
+ 0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
+ 0x00A0, 0x0104, 0x0105, 0x0141, 0x20AC, 0x201E, 0x0160, 0x00A7,
+ 0x0161, 0x00A9, 0x0218, 0x00AB, 0x0179, 0x00AD, 0x017A, 0x017B,
+ 0x00B0, 0x00B1, 0x010C, 0x0142, 0x017D, 0x201D, 0x00B6, 0x00B7,
+ 0x017E, 0x010D, 0x0219, 0x00BB, 0x0152, 0x0153, 0x0178, 0x017C,
+ 0x00C0, 0x00C1, 0x00C2, 0x0102, 0x00C4, 0x0106, 0x00C6, 0x00C7,
+ 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF,
+ 0x0110, 0x0143, 0x00D2, 0x00D3, 0x00D4, 0x0150, 0x00D6, 0x015A,
+ 0x0170, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x0118, 0x021A, 0x00DF,
+ 0x00E0, 0x00E1, 0x00E2, 0x0103, 0x00E4, 0x0107, 0x00E6, 0x00E7,
+ 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF,
+ 0x0111, 0x0144, 0x00F2, 0x00F3, 0x00F4, 0x0151, 0x00F6, 0x015B,
+ 0x0171, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x0119, 0x021B, 0x00FF};
struct rspamd_charset_converter {
gchar *canon_name;
@@ -82,37 +81,37 @@ struct rspamd_charset_converter {
};
static GQuark
-rspamd_charset_conv_error_quark (void)
+rspamd_charset_conv_error_quark(void)
{
- return g_quark_from_static_string ("charset conversion error");
+ return g_quark_from_static_string("charset conversion error");
}
static void
-rspamd_converter_dtor (gpointer p)
+rspamd_converter_dtor(gpointer p)
{
- struct rspamd_charset_converter *c = (struct rspamd_charset_converter *)p;
+ struct rspamd_charset_converter *c = (struct rspamd_charset_converter *) p;
if (!c->is_internal) {
- ucnv_close (c->d.conv);
+ ucnv_close(c->d.conv);
}
- g_free (c->canon_name);
- g_free (c);
+ g_free(c->canon_name);
+ g_free(c);
}
int32_t
-rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
- UChar *dest,
- int32_t destCapacity,
- const char *src,
- int32_t srcLength,
- UErrorCode *pErrorCode)
+rspamd_converter_to_uchars(struct rspamd_charset_converter *cnv,
+ UChar *dest,
+ int32_t destCapacity,
+ const char *src,
+ int32_t srcLength,
+ UErrorCode *pErrorCode)
{
if (!cnv->is_internal) {
- return ucnv_toUChars (cnv->d.conv,
- dest, destCapacity,
- src, srcLength,
- pErrorCode);
+ return ucnv_toUChars(cnv->d.conv,
+ dest, destCapacity,
+ src, srcLength,
+ pErrorCode);
}
else {
UChar *d = dest, *dend = dest + destCapacity;
@@ -120,13 +119,13 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
while (p < end && d < dend) {
if (*p <= 127) {
- *d++ = (UChar)*p;
+ *d++ = (UChar) *p;
}
else {
*d++ = cnv->d.cnv_table[*p - 128];
}
- p ++;
+ p++;
}
return d - dest;
@@ -135,19 +134,19 @@ rspamd_converter_to_uchars (struct rspamd_charset_converter *cnv,
struct rspamd_charset_converter *
-rspamd_mime_get_converter_cached (const gchar *enc,
- rspamd_mempool_t *pool,
- gboolean is_canon,
- UErrorCode *err)
+rspamd_mime_get_converter_cached(const gchar *enc,
+ rspamd_mempool_t *pool,
+ gboolean is_canon,
+ UErrorCode *err)
{
const gchar *canon_name;
static rspamd_lru_hash_t *cache;
struct rspamd_charset_converter *conv;
if (cache == NULL) {
- cache = rspamd_lru_hash_new_full (RSPAMD_CHARSET_CACHE_SIZE, NULL,
- rspamd_converter_dtor, rspamd_str_hash,
- rspamd_str_equal);
+ cache = rspamd_lru_hash_new_full(RSPAMD_CHARSET_CACHE_SIZE, NULL,
+ rspamd_converter_dtor, rspamd_str_hash,
+ rspamd_str_equal);
}
if (enc == NULL) {
@@ -157,8 +156,8 @@ rspamd_mime_get_converter_cached (const gchar *enc,
if (!is_canon) {
rspamd_ftok_t cset_tok;
- RSPAMD_FTOK_FROM_STR (&cset_tok, enc);
- canon_name = rspamd_mime_detect_charset (&cset_tok, pool);
+ RSPAMD_FTOK_FROM_STR(&cset_tok, enc);
+ canon_name = rspamd_mime_detect_charset(&cset_tok, pool);
}
else {
canon_name = enc;
@@ -168,38 +167,38 @@ rspamd_mime_get_converter_cached (const gchar *enc,
return NULL;
}
- conv = rspamd_lru_hash_lookup (cache, (gpointer)canon_name, 0);
+ conv = rspamd_lru_hash_lookup(cache, (gpointer) canon_name, 0);
if (conv == NULL) {
- if (!(strcmp (canon_name, "ISO-8859-16") == 0 ||
- strcmp (canon_name, "latin10") == 0 ||
- strcmp (canon_name, "iso-ir-226") == 0)) {
- conv = g_malloc0 (sizeof (*conv));
- conv->d.conv = ucnv_open (canon_name, err);
- conv->canon_name = g_strdup (canon_name);
+ if (!(strcmp(canon_name, "ISO-8859-16") == 0 ||
+ strcmp(canon_name, "latin10") == 0 ||
+ strcmp(canon_name, "iso-ir-226") == 0)) {
+ conv = g_malloc0(sizeof(*conv));
+ conv->d.conv = ucnv_open(canon_name, err);
+ conv->canon_name = g_strdup(canon_name);
if (conv->d.conv != NULL) {
- ucnv_setToUCallBack (conv->d.conv,
- UCNV_TO_U_CALLBACK_SUBSTITUTE,
- NULL,
- NULL,
- NULL,
- err);
- rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
+ ucnv_setToUCallBack(conv->d.conv,
+ UCNV_TO_U_CALLBACK_SUBSTITUTE,
+ NULL,
+ NULL,
+ NULL,
+ err);
+ rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0);
}
else {
- g_free (conv);
+ g_free(conv);
conv = NULL;
}
}
else {
/* ISO-8859-16 */
- conv = g_malloc0 (sizeof (*conv));
+ conv = g_malloc0(sizeof(*conv));
conv->is_internal = TRUE;
conv->d.cnv_table = iso_8859_16_map;
- conv->canon_name = g_strdup (canon_name);
+ conv->canon_name = g_strdup(canon_name);
- rspamd_lru_hash_insert (cache, conv->canon_name, conv, 0, 0);
+ rspamd_lru_hash_insert(cache, conv->canon_name, conv, 0, 0);
}
}
@@ -207,19 +206,19 @@ rspamd_mime_get_converter_cached (const gchar *enc,
}
static void
-rspamd_mime_encoding_substitute_init (void)
+rspamd_mime_encoding_substitute_init(void)
{
guint i;
- sub_hash = g_hash_table_new (rspamd_strcase_hash, rspamd_strcase_equal);
+ sub_hash = g_hash_table_new(rspamd_strcase_hash, rspamd_strcase_equal);
- for (i = 0; i < G_N_ELEMENTS (sub); i ++) {
- g_hash_table_insert (sub_hash, (void *)sub[i].input, (void *)&sub[i]);
+ for (i = 0; i < G_N_ELEMENTS(sub); i++) {
+ g_hash_table_insert(sub_hash, (void *) sub[i].input, (void *) &sub[i]);
}
}
static void
-rspamd_charset_normalize (gchar *in)
+rspamd_charset_normalize(gchar *in)
{
/*
* This is a simple routine to validate input charset
@@ -231,26 +230,26 @@ rspamd_charset_normalize (gchar *in)
begin = in;
- while (*begin && !g_ascii_isalnum (*begin)) {
- begin ++;
+ while (*begin && !g_ascii_isalnum(*begin)) {
+ begin++;
changed = TRUE;
}
- end = begin + strlen (begin) - 1;
+ end = begin + strlen(begin) - 1;
- while (end > begin && !g_ascii_isalnum (*end)) {
- end --;
+ while (end > begin && !g_ascii_isalnum(*end)) {
+ end--;
changed = TRUE;
}
if (changed) {
- memmove (in, begin, end - begin + 2);
+ memmove(in, begin, end - begin + 2);
*(end + 1) = '\0';
}
}
const gchar *
-rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
+rspamd_mime_detect_charset(const rspamd_ftok_t *in, rspamd_mempool_t *pool)
{
gchar *ret = NULL, *h, *t;
struct rspamd_charset_substitution *s;
@@ -259,27 +258,27 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
UErrorCode uc_err = U_ZERO_ERROR;
if (sub_hash == NULL) {
- rspamd_mime_encoding_substitute_init ();
+ rspamd_mime_encoding_substitute_init();
}
/* Fast path */
- RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf-8");
+ RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf-8");
- if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
+ if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) {
return UTF8_CHARSET;
}
- RSPAMD_FTOK_ASSIGN (&utf8_tok, "utf8");
+ RSPAMD_FTOK_ASSIGN(&utf8_tok, "utf8");
- if (rspamd_ftok_casecmp (in, &utf8_tok) == 0) {
+ if (rspamd_ftok_casecmp(in, &utf8_tok) == 0) {
return UTF8_CHARSET;
}
- ret = rspamd_mempool_ftokdup (pool, in);
- rspamd_charset_normalize (ret);
+ ret = rspamd_mempool_ftokdup(pool, in);
+ rspamd_charset_normalize(ret);
- if ((in->len > 3 && rspamd_lc_cmp (in->begin, "cp-", 3) == 0) ||
- (in->len > 4 && (rspamd_lc_cmp (in->begin, "ibm-", 4) == 0))) {
+ if ((in->len > 3 && rspamd_lc_cmp(in->begin, "cp-", 3) == 0) ||
+ (in->len > 4 && (rspamd_lc_cmp(in->begin, "ibm-", 4) == 0))) {
/* Try to remove '-' chars from encoding: e.g. CP-100 to CP100 */
h = ret;
t = ret;
@@ -289,43 +288,43 @@ rspamd_mime_detect_charset (const rspamd_ftok_t *in, rspamd_mempool_t *pool)
*t++ = *h;
}
- h ++;
+ h++;
}
*t = '\0';
}
- s = g_hash_table_lookup (sub_hash, ret);
+ s = g_hash_table_lookup(sub_hash, ret);
if (s) {
- ret = (char *)s->canon;
+ ret = (char *) s->canon;
}
/* Try different aliases */
- cset = ucnv_getCanonicalName (ret, "MIME", &uc_err);
+ cset = ucnv_getCanonicalName(ret, "MIME", &uc_err);
if (cset == NULL) {
uc_err = U_ZERO_ERROR;
- cset = ucnv_getCanonicalName (ret, "IANA", &uc_err);
+ cset = ucnv_getCanonicalName(ret, "IANA", &uc_err);
}
if (cset == NULL) {
uc_err = U_ZERO_ERROR;
- cset = ucnv_getCanonicalName (ret, "", &uc_err);
+ cset = ucnv_getCanonicalName(ret, "", &uc_err);
}
if (cset == NULL) {
uc_err = U_ZERO_ERROR;
- cset = ucnv_getAlias (ret, 0, &uc_err);
+ cset = ucnv_getAlias(ret, 0, &uc_err);
}
return cset;
}
gchar *
-rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
- gchar *input, gsize len, const gchar *in_enc,
- gsize *olen, GError **err)
+rspamd_mime_text_to_utf8(rspamd_mempool_t *pool,
+ gchar *input, gsize len, const gchar *in_enc,
+ gsize *olen, GError **err)
{
gchar *d;
gint32 r, clen, dlen;
@@ -337,12 +336,12 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
rspamd_ftok_t cset_tok;
/* Check if already utf8 */
- RSPAMD_FTOK_FROM_STR (&cset_tok, in_enc);
+ RSPAMD_FTOK_FROM_STR(&cset_tok, in_enc);
- if (rspamd_mime_charset_utf_check (&cset_tok, input, len,
- FALSE)) {
- d = rspamd_mempool_alloc (pool, len);
- memcpy (d, input, len);
+ if (rspamd_mime_charset_utf_check(&cset_tok, input, len,
+ FALSE)) {
+ d = rspamd_mempool_alloc(pool, len);
+ memcpy(d, input, len);
if (olen) {
*olen = len;
}
@@ -350,48 +349,48 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
return d;
}
- conv = rspamd_mime_get_converter_cached (in_enc, pool, TRUE, &uc_err);
- utf8_converter = rspamd_get_utf8_converter ();
+ conv = rspamd_mime_get_converter_cached(in_enc, pool, TRUE, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter();
if (conv == NULL) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot open converter for %s: %s",
- in_enc, u_errorName (uc_err));
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot open converter for %s: %s",
+ in_enc, u_errorName(uc_err));
return NULL;
}
- tmp_buf = g_new (UChar, len + 1);
+ tmp_buf = g_new(UChar, len + 1);
uc_err = U_ZERO_ERROR;
- r = rspamd_converter_to_uchars (conv, tmp_buf, len + 1, input, len, &uc_err);
+ r = rspamd_converter_to_uchars(conv, tmp_buf, len + 1, input, len, &uc_err);
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
"cannot convert data to unicode from %s: %s",
- in_enc, u_errorName (uc_err));
- g_free (tmp_buf);
+ in_enc, u_errorName(uc_err));
+ g_free(tmp_buf);
return NULL;
}
/* Now, convert to utf8 */
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
- d = rspamd_mempool_alloc (pool, dlen);
- r = ucnv_fromUChars (utf8_converter, d, dlen, tmp_buf, r, &uc_err);
+ clen = ucnv_getMaxCharSize(utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen);
+ d = rspamd_mempool_alloc(pool, dlen);
+ r = ucnv_fromUChars(utf8_converter, d, dlen, tmp_buf, r, &uc_err);
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data from unicode from %s: %s",
- in_enc, u_errorName (uc_err));
- g_free (tmp_buf);
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data from unicode from %s: %s",
+ in_enc, u_errorName(uc_err));
+ g_free(tmp_buf);
return NULL;
}
- msg_debug_pool ("converted from %s to UTF-8 inlen: %z, outlen: %d",
- in_enc, len, r);
- g_free (tmp_buf);
+ msg_debug_pool("converted from %s to UTF-8 inlen: %z, outlen: %d",
+ in_enc, len, r);
+ g_free(tmp_buf);
if (olen) {
*olen = r;
@@ -401,11 +400,11 @@ rspamd_mime_text_to_utf8 (rspamd_mempool_t *pool,
}
static gboolean
-rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part,
- GByteArray *input,
- const gchar *charset,
- GError **err)
+rspamd_mime_text_part_utf8_convert(struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part,
+ GByteArray *input,
+ const gchar *charset,
+ GError **err)
{
gchar *d;
gint32 r, clen, dlen, uc_len;
@@ -414,76 +413,76 @@ rspamd_mime_text_part_utf8_convert (struct rspamd_task *task,
UConverter *utf8_converter;
struct rspamd_charset_converter *conv;
- conv = rspamd_mime_get_converter_cached (charset, task->task_pool,
- TRUE, &uc_err);
- utf8_converter = rspamd_get_utf8_converter ();
+ conv = rspamd_mime_get_converter_cached(charset, task->task_pool,
+ TRUE, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter();
if (conv == NULL) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot open converter for %s: %s",
- charset, u_errorName (uc_err));
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot open converter for %s: %s",
+ charset, u_errorName(uc_err));
return FALSE;
}
- tmp_buf = g_new (UChar, input->len + 1);
+ tmp_buf = g_new(UChar, input->len + 1);
uc_err = U_ZERO_ERROR;
- uc_len = rspamd_converter_to_uchars (conv,
- tmp_buf,
- input->len + 1,
- input->data,
- input->len,
- &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data to unicode from %s: %s",
- charset, u_errorName (uc_err));
- g_free (tmp_buf);
+ uc_len = rspamd_converter_to_uchars(conv,
+ tmp_buf,
+ input->len + 1,
+ input->data,
+ input->len,
+ &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data to unicode from %s: %s",
+ charset, u_errorName(uc_err));
+ g_free(tmp_buf);
return FALSE;
}
/* Now, convert to utf8 */
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (uc_len, clen);
- d = rspamd_mempool_alloc (task->task_pool, dlen);
- r = ucnv_fromUChars (utf8_converter, d, dlen,
- tmp_buf, uc_len, &uc_err);
-
- if (!U_SUCCESS (uc_err)) {
- g_set_error (err, rspamd_charset_conv_error_quark(), EINVAL,
- "cannot convert data from unicode from %s: %s",
- charset, u_errorName (uc_err));
- g_free (tmp_buf);
+ clen = ucnv_getMaxCharSize(utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING(uc_len, clen);
+ d = rspamd_mempool_alloc(task->task_pool, dlen);
+ r = ucnv_fromUChars(utf8_converter, d, dlen,
+ tmp_buf, uc_len, &uc_err);
+
+ if (!U_SUCCESS(uc_err)) {
+ g_set_error(err, rspamd_charset_conv_error_quark(), EINVAL,
+ "cannot convert data from unicode from %s: %s",
+ charset, u_errorName(uc_err));
+ g_free(tmp_buf);
return FALSE;
}
if (text_part->mime_part && text_part->mime_part->ct) {
- msg_info_task ("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
- charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
+ msg_info_task("converted text part from %s ('%T' announced) to UTF-8 inlen: %d, outlen: %d (%d UTF16 chars)",
+ charset, &text_part->mime_part->ct->charset, input->len, r, uc_len);
}
else {
- msg_info_task ("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
- "outlen: %d (%d UTF16 chars)",
- charset, input->len, r, uc_len);
+ msg_info_task("converted text part from %s (no charset announced) to UTF-8 inlen: %d, "
+ "outlen: %d (%d UTF16 chars)",
+ charset, input->len, r, uc_len);
}
- text_part->utf_raw_content = rspamd_mempool_alloc (task->task_pool,
- sizeof (*text_part->utf_raw_content) + sizeof (gpointer) * 4);
+ text_part->utf_raw_content = rspamd_mempool_alloc(task->task_pool,
+ sizeof(*text_part->utf_raw_content) + sizeof(gpointer) * 4);
text_part->utf_raw_content->data = d;
text_part->utf_raw_content->len = r;
- g_free (tmp_buf);
+ g_free(tmp_buf);
return TRUE;
}
gboolean
-rspamd_mime_to_utf8_byte_array (GByteArray *in,
- GByteArray *out,
- rspamd_mempool_t *pool,
- const gchar *enc)
+rspamd_mime_to_utf8_byte_array(GByteArray *in,
+ GByteArray *out,
+ rspamd_mempool_t *pool,
+ const gchar *enc)
{
gint32 r, clen, dlen;
UChar *tmp_buf;
@@ -498,9 +497,9 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
if (enc == NULL) {
/* Assume utf ? */
- if (rspamd_fast_utf8_validate (in->data, in->len) == 0) {
- g_byte_array_set_size (out, in->len);
- memcpy (out->data, in->data, out->len);
+ if (rspamd_fast_utf8_validate(in->data, in->len) == 0) {
+ g_byte_array_set_size(out, in->len);
+ memcpy(out->data, in->data, out->len);
return TRUE;
}
@@ -510,55 +509,54 @@ rspamd_mime_to_utf8_byte_array (GByteArray *in,
}
}
- RSPAMD_FTOK_FROM_STR (&charset_tok, enc);
+ RSPAMD_FTOK_FROM_STR(&charset_tok, enc);
- if (rspamd_mime_charset_utf_check (&charset_tok, (gchar *)in->data, in->len,
- FALSE)) {
- g_byte_array_set_size (out, in->len);
- memcpy (out->data, in->data, out->len);
+ if (rspamd_mime_charset_utf_check(&charset_tok, (gchar *) in->data, in->len,
+ FALSE)) {
+ g_byte_array_set_size(out, in->len);
+ memcpy(out->data, in->data, out->len);
return TRUE;
}
- utf8_converter = rspamd_get_utf8_converter ();
- conv = rspamd_mime_get_converter_cached (enc, pool, TRUE, &uc_err);
+ utf8_converter = rspamd_get_utf8_converter();
+ conv = rspamd_mime_get_converter_cached(enc, pool, TRUE, &uc_err);
if (conv == NULL) {
return FALSE;
}
- tmp_buf = g_new (UChar, in->len + 1);
+ tmp_buf = g_new(UChar, in->len + 1);
uc_err = U_ZERO_ERROR;
- r = rspamd_converter_to_uchars (conv,
- tmp_buf, in->len + 1,
- in->data, in->len, &uc_err);
+ r = rspamd_converter_to_uchars(conv,
+ tmp_buf, in->len + 1,
+ in->data, in->len, &uc_err);
- if (!U_SUCCESS (uc_err)) {
- g_free (tmp_buf);
+ if (!U_SUCCESS(uc_err)) {
+ g_free(tmp_buf);
return FALSE;
}
/* Now, convert to utf8 */
- clen = ucnv_getMaxCharSize (utf8_converter);
- dlen = UCNV_GET_MAX_BYTES_FOR_STRING (r, clen);
- g_byte_array_set_size (out, dlen);
- r = ucnv_fromUChars (utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
+ clen = ucnv_getMaxCharSize(utf8_converter);
+ dlen = UCNV_GET_MAX_BYTES_FOR_STRING(r, clen);
+ g_byte_array_set_size(out, dlen);
+ r = ucnv_fromUChars(utf8_converter, out->data, dlen, tmp_buf, r, &uc_err);
- if (!U_SUCCESS (uc_err)) {
- g_free (tmp_buf);
+ if (!U_SUCCESS(uc_err)) {
+ g_free(tmp_buf);
return FALSE;
}
- g_free (tmp_buf);
+ g_free(tmp_buf);
out->len = r;
return TRUE;
}
-void
-rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
+void rspamd_mime_charset_utf_enforce(gchar *in, gsize len)
{
gchar *p, *end;
goffset err_offset;
@@ -568,25 +566,25 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
p = in;
end = in + len;
- while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate (p, len)) > 0) {
- err_offset --; /* As it returns it 1 indexed */
+ while (p < end && len > 0 && (err_offset = rspamd_fast_utf8_validate(p, len)) > 0) {
+ err_offset--; /* As it returns it 1 indexed */
gint32 cur_offset = err_offset;
while (cur_offset < len) {
gint32 tmp = cur_offset;
- U8_NEXT (p, cur_offset, len, uc);
+ U8_NEXT(p, cur_offset, len, uc);
if (uc > 0) {
/* Fill string between err_offset and tmp with `?` character */
- memset (p + err_offset, '?', tmp - err_offset);
+ memset(p + err_offset, '?', tmp - err_offset);
break;
}
}
if (uc < 0) {
/* Fill till the end */
- memset (p + err_offset, '?', len - err_offset);
+ memset(p + err_offset, '?', len - err_offset);
break;
}
@@ -596,23 +594,23 @@ rspamd_mime_charset_utf_enforce (gchar *in, gsize len)
}
const char *
-rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen,
- bool check_utf8)
+rspamd_mime_charset_find_by_content(const gchar *in, gsize inlen,
+ bool check_utf8)
{
int nconsumed;
bool is_reliable;
const gchar *ced_name;
if (check_utf8) {
- if (rspamd_fast_utf8_validate (in, inlen) == 0) {
+ if (rspamd_fast_utf8_validate(in, inlen) == 0) {
return UTF8_CHARSET;
}
}
- ced_name = ced_encoding_detect (in, inlen, NULL, NULL,
- NULL, 0, CED_EMAIL_CORPUS,
- false, &nconsumed, &is_reliable);
+ ced_name = ced_encoding_detect(in, inlen, NULL, NULL,
+ NULL, 0, CED_EMAIL_CORPUS,
+ false, &nconsumed, &is_reliable);
if (ced_name) {
@@ -623,28 +621,28 @@ rspamd_mime_charset_find_by_content (const gchar *in, gsize inlen,
}
static const char *
-rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen)
+rspamd_mime_charset_find_by_content_maybe_split(const gchar *in, gsize inlen)
{
if (inlen < RSPAMD_CHARSET_MAX_CONTENT * 3) {
- return rspamd_mime_charset_find_by_content (in, inlen, false);
+ return rspamd_mime_charset_find_by_content(in, inlen, false);
}
else {
const gchar *c1, *c2, *c3;
- c1 = rspamd_mime_charset_find_by_content (in, RSPAMD_CHARSET_MAX_CONTENT, false);
- c2 = rspamd_mime_charset_find_by_content (in + inlen / 2,
- RSPAMD_CHARSET_MAX_CONTENT, false);
- c3 = rspamd_mime_charset_find_by_content (in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
- RSPAMD_CHARSET_MAX_CONTENT, false);
+ c1 = rspamd_mime_charset_find_by_content(in, RSPAMD_CHARSET_MAX_CONTENT, false);
+ c2 = rspamd_mime_charset_find_by_content(in + inlen / 2,
+ RSPAMD_CHARSET_MAX_CONTENT, false);
+ c3 = rspamd_mime_charset_find_by_content(in + inlen - RSPAMD_CHARSET_MAX_CONTENT,
+ RSPAMD_CHARSET_MAX_CONTENT, false);
/* 7bit stuff */
- if (c1 && strcmp (c1, "US-ASCII") == 0) {
+ if (c1 && strcmp(c1, "US-ASCII") == 0) {
c1 = NULL; /* Invalid - we have 8 bit there */
}
- if (c2 && strcmp (c2, "US-ASCII") == 0) {
+ if (c2 && strcmp(c2, "US-ASCII") == 0) {
c2 = NULL; /* Invalid - we have 8 bit there */
}
- if (c3 && strcmp (c3, "US-ASCII") == 0) {
+ if (c3 && strcmp(c3, "US-ASCII") == 0) {
c3 = NULL; /* Invalid - we have 8 bit there */
}
@@ -679,45 +677,45 @@ rspamd_mime_charset_find_by_content_maybe_split (const gchar *in, gsize inlen)
}
gboolean
-rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
- gchar *in, gsize len, gboolean content_check)
+rspamd_mime_charset_utf_check(rspamd_ftok_t *charset,
+ gchar *in, gsize len, gboolean content_check)
{
const gchar *real_charset;
if (utf_compatible_re == NULL) {
- utf_compatible_re = rspamd_regexp_new (
- "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
- "i", NULL);
+ utf_compatible_re = rspamd_regexp_new(
+ "^(?:utf-?8.*)|(?:us-ascii)|(?:ascii)|(?:ansi.*)|(?:CSASCII)$",
+ "i", NULL);
}
if (charset->len == 0 ||
- rspamd_regexp_match (utf_compatible_re,
- charset->begin, charset->len, TRUE)) {
+ rspamd_regexp_match(utf_compatible_re,
+ charset->begin, charset->len, TRUE)) {
/*
* In case of UTF8 charset we still can check the content to find
* corner cases
*/
if (content_check) {
- if (rspamd_fast_utf8_validate (in, len) != 0) {
+ if (rspamd_fast_utf8_validate(in, len) != 0) {
real_charset = rspamd_mime_charset_find_by_content_maybe_split(in, len);
if (real_charset) {
- if (rspamd_regexp_match (utf_compatible_re,
- real_charset, strlen (real_charset), TRUE)) {
- RSPAMD_FTOK_ASSIGN (charset, UTF8_CHARSET);
+ if (rspamd_regexp_match(utf_compatible_re,
+ real_charset, strlen(real_charset), TRUE)) {
+ RSPAMD_FTOK_ASSIGN(charset, UTF8_CHARSET);
return TRUE;
}
else {
charset->begin = real_charset;
- charset->len = strlen (real_charset);
+ charset->len = strlen(real_charset);
return FALSE;
}
}
- rspamd_mime_charset_utf_enforce (in, len);
+ rspamd_mime_charset_utf_enforce(in, len);
}
}
@@ -727,9 +725,8 @@ rspamd_mime_charset_utf_check (rspamd_ftok_t *charset,
return FALSE;
}
-void
-rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
- struct rspamd_mime_text_part *text_part)
+void rspamd_mime_text_part_maybe_convert(struct rspamd_task *task,
+ struct rspamd_mime_text_part *text_part)
{
GError *err = NULL;
const gchar *charset = NULL;
@@ -738,21 +735,21 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
rspamd_ftok_t charset_tok;
struct rspamd_mime_part *part = text_part->mime_part;
- if (rspamd_str_has_8bit (text_part->raw.begin, text_part->raw.len)) {
+ if (rspamd_str_has_8bit(text_part->raw.begin, text_part->raw.len)) {
text_part->flags |= RSPAMD_MIME_TEXT_PART_FLAG_8BIT_RAW;
}
/* Allocate copy storage */
- part_content = g_byte_array_sized_new (text_part->parsed.len);
- memcpy (part_content->data, text_part->parsed.begin, text_part->parsed.len);
+ part_content = g_byte_array_sized_new(text_part->parsed.len);
+ memcpy(part_content->data, text_part->parsed.begin, text_part->parsed.len);
part_content->len = text_part->parsed.len;
- rspamd_mempool_notify_alloc (task->task_pool,
- part_content->len);
- rspamd_mempool_add_destructor (task->task_pool,
- (rspamd_mempool_destruct_t)g_byte_array_unref, part_content);
+ rspamd_mempool_notify_alloc(task->task_pool,
+ part_content->len);
+ rspamd_mempool_add_destructor(task->task_pool,
+ (rspamd_mempool_destruct_t) g_byte_array_unref, part_content);
- if (rspamd_str_has_8bit (text_part->parsed.begin, text_part->parsed.len)) {
- if (rspamd_fast_utf8_validate (text_part->parsed.begin, text_part->parsed.len) == 0) {
+ if (rspamd_str_has_8bit(text_part->parsed.begin, text_part->parsed.len)) {
+ if (rspamd_fast_utf8_validate(text_part->parsed.begin, text_part->parsed.len) == 0) {
/* Valid UTF, likely all good */
need_charset_heuristic = FALSE;
valid_utf8 = TRUE;
@@ -770,18 +767,18 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
if (part->ct->charset.len == 0) {
if (need_charset_heuristic) {
- charset = rspamd_mime_charset_find_by_content_maybe_split (text_part->parsed.begin,
- text_part->parsed.len);
+ charset = rspamd_mime_charset_find_by_content_maybe_split(text_part->parsed.begin,
+ text_part->parsed.len);
if (charset != NULL) {
- msg_info_task ("detected charset %s", charset);
+ msg_info_task("detected charset %s", charset);
}
checked = TRUE;
text_part->real_charset = charset;
}
else if (valid_utf8) {
- SET_PART_UTF (text_part);
+ SET_PART_UTF(text_part);
text_part->utf_raw_content = part_content;
text_part->real_charset = UTF8_CHARSET;
@@ -789,15 +786,15 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
}
}
else {
- charset = rspamd_mime_detect_charset (&part->ct->charset,
- task->task_pool);
+ charset = rspamd_mime_detect_charset(&part->ct->charset,
+ task->task_pool);
if (charset == NULL) {
/* We don't know the real charset but can try heuristic */
if (need_charset_heuristic) {
- charset = rspamd_mime_charset_find_by_content_maybe_split (part_content->data,
- part_content->len);
- msg_info_task ("detected charset: %s", charset);
+ charset = rspamd_mime_charset_find_by_content_maybe_split(part_content->data,
+ part_content->len);
+ msg_info_task("detected charset: %s", charset);
checked = TRUE;
text_part->real_charset = charset;
}
@@ -809,7 +806,7 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
else {
text_part->real_charset = charset;
- if (strcmp (charset, UTF8_CHARSET) != 0) {
+ if (strcmp(charset, UTF8_CHARSET) != 0) {
/*
* We have detected some charset, but we don't know which one,
* so we need to reset valid utf8 flag and enforce it later
@@ -820,21 +817,21 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
}
if (text_part->real_charset == NULL) {
- msg_info_task ("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
- MESSAGE_FIELD_CHECK (task, message_id), &part->ct->charset,
- part->ct->cpy);
- SET_PART_RAW (text_part);
+ msg_info_task("<%s>: has invalid charset; original charset: %T; Content-Type: \"%s\"",
+ MESSAGE_FIELD_CHECK(task, message_id), &part->ct->charset,
+ part->ct->cpy);
+ SET_PART_RAW(text_part);
text_part->utf_raw_content = part_content;
return;
}
- RSPAMD_FTOK_FROM_STR (&charset_tok, charset);
+ RSPAMD_FTOK_FROM_STR(&charset_tok, charset);
if (!valid_utf8) {
- if (rspamd_mime_charset_utf_check (&charset_tok, part_content->data,
- part_content->len, !checked)) {
- SET_PART_UTF (text_part);
+ if (rspamd_mime_charset_utf_check(&charset_tok, part_content->data,
+ part_content->len, !checked)) {
+ SET_PART_UTF(text_part);
text_part->utf_raw_content = part_content;
text_part->real_charset = UTF8_CHARSET;
@@ -843,25 +840,25 @@ rspamd_mime_text_part_maybe_convert (struct rspamd_task *task,
else {
charset = charset_tok.begin;
- if (!rspamd_mime_text_part_utf8_convert (task, text_part,
- part_content, charset, &err)) {
- msg_warn_task ("<%s>: cannot convert from %s to utf8: %s",
- MESSAGE_FIELD (task, message_id),
- charset,
- err ? err->message : "unknown problem");
- SET_PART_RAW (text_part);
- g_error_free (err);
+ if (!rspamd_mime_text_part_utf8_convert(task, text_part,
+ part_content, charset, &err)) {
+ msg_warn_task("<%s>: cannot convert from %s to utf8: %s",
+ MESSAGE_FIELD(task, message_id),
+ charset,
+ err ? err->message : "unknown problem");
+ SET_PART_RAW(text_part);
+ g_error_free(err);
text_part->utf_raw_content = part_content;
return;
}
- SET_PART_UTF (text_part);
+ SET_PART_UTF(text_part);
text_part->real_charset = charset;
}
}
else {
- SET_PART_UTF (text_part);
+ SET_PART_UTF(text_part);
text_part->utf_raw_content = part_content;
}
}